Skip to content

Commit

Permalink
integrate hep_tfds, September 2022 benchmark training (#136)
Browse files Browse the repository at this point in the history
* Initial commit

* add template dataset definitions

* Add initial CMS particle-flow dataset implementation

Also changed to a new tensorflow dataset template

* add test scripts

* Run black formatting on python files

* Add instructions to cms_pf, use manual_dir for preprocessing

* fix: ability to choose data directory for the tfrecords files

* feat: Add Delphes dataset

* fix: support loading both .pkl.bz2 and .pkl

* fix: remove extra dimension in cms_pf data items

* fix cms

* fixes for delphes

* ensure dir exists

* separate cms datasets

* clarify manual dir

* cleanup print

* added singleele and singlemu

* update 1.1

* cleanup cms datasets

* update datamodel

* added new datasets

* gen/sim 12_3_0_pre6 generation (#1)

* 1.2 format, ztt dataset

* version 1.3.0 with new gensim truth

* new dataset

* add qcd

* add some asserts

* add new features

* keep PS

* add tau as pf target

* 1.3.1 remove ps and brem (#2)

* fix HF labeling (#3)

* add new high-PU QCD dataset, update energy

* up

* fix

* Add gen jet index (#4)

* first attempt at gen jet clustering

* add other reqs

* revert test

* fix mapping to before masking particles

* fix out of index bug

* benchmark training for CMS

* move path

* move path

* remove submodule

* remove

* move

* fix import

* format

* format

* remove some dummy files

* up

* try with masking

* use a different dataset for logging the jet/met distributions

* clean

* added clic ttbar

Co-authored-by: Eric Wulff <eric.g.t.wulff@gmail.com>
Co-authored-by: Eric Wulff <eric.wulff@cern.ch>
Co-authored-by: Javier Duarte <jduarte@ucsd.edu>
Former-commit-id: fb89d79
  • Loading branch information
4 people authored Sep 2, 2022
1 parent 296edaa commit 05e14e8
Show file tree
Hide file tree
Showing 45 changed files with 2,127 additions and 961 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ jobs:
- name: Install python deps
run: |
pip install -r requirements.txt
pip install ./hep_tfds
HOROVOD_WITH_TENSORFLOW=1 pip install horovod[tensorflow,keras]
- name: Run delphes TF model
run: ./scripts/local_test_delphes_pipeline.sh
Expand All @@ -38,7 +37,6 @@ jobs:
- name: Install python deps
run: |
pip install -r requirements.txt
pip install ./hep_tfds
HOROVOD_WITH_TENSORFLOW=1 pip install horovod[tensorflow,keras]
- name: Run CMS TF model using the pipeline
run: ./scripts/local_test_cms_pipeline.sh
3 changes: 0 additions & 3 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
[submodule "hep_tfds"]
path = hep_tfds
url = https://github.com/jpata/hep_tfds
1 change: 0 additions & 1 deletion hep_tfds
Submodule hep_tfds deleted from 31baf1
235 changes: 235 additions & 0 deletions mlpf/data_clic/postprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
import bz2
import json

import networkx as nx
import numpy as np
import pandas

# PDG ids 12, 14, 16 are neutrinos; they are skipped as targets below
# because they leave no signal in the detector.
neutrinos = [12, 14, 16]
# candidate classes: 0=none, 211=charged hadron, 130=neutral hadron,
# 22=photon, 11=electron, 13=muon; class id is the index in this list
labels_ys_cand = [0, 211, 130, 22, 11, 13]

# generator-level target classes use the same scheme as the candidates,
# i.e. only species that are reconstructable in principle
labels_ys_gen = [0, 211, 130, 22, 11, 13]


def prepare_data_clic(fn):
    """Convert one CLIC ``.json.bz2`` event dump into flat ML arrays.

    For every event, the reconstruction inputs (tracks and calorimeter
    clusters) are matched via a bipartite graph to generator particles and
    to PF candidates, then flattened into per-element numpy arrays.

    Args:
        fn: path to a bz2-compressed JSON file containing, per event, the
            tables "genparticles", "clusters", "tracks" and "pfs".

    Returns:
        A list with one ``(Xs, ys_gen, ys_cand)`` tuple per event, where
        ``Xs`` has one row per track/cluster and ``ys_gen`` / ``ys_cand``
        hold the matched generator / PF particle as
        (class index, charge-stripped kinematics: px, py, pz, energy).
    """

    # track pt parametrization: pt = a * |b / omega|.
    # Defined up-front so the nested helpers below can safely close over
    # them (previously these were assigned much later in the body, relying
    # on late binding).
    a = 3 * 10**-4
    b = 5  # B-field in tesla

    def map_pdgid_to_candid(pdgid, charge):
        """Map an absolute pdgid to one of the classes in labels_ys_*."""
        if pdgid in [0, 22, 11, 13]:
            return pdgid

        # charged hadron
        if abs(charge) > 0:
            return 211

        # neutral hadron
        return 130

    def track_pt(omega):
        # omega is the track curvature; pt is inversely proportional to it
        return a * np.abs(b / omega)

    def track_as_array(df_tr, itr):
        # leading 0 marks the element type: track
        row = df_tr.loc[itr]
        return [0, row["px"], row["py"], row["pz"], row["nhits"], row["d0"], row["z0"]]

    def cluster_as_array(df_cl, icl):
        # leading 1 marks the element type: cluster
        row = df_cl.loc[icl]
        return [1, row["x"], row["y"], row["z"], row["nhits_ecal"], row["nhits_hcal"], row["energy"]]

    def gen_as_array(df_gen, igen):
        # NOTE: compare against None rather than truthiness — the
        # genparticle at index 0 is a valid match and must not be
        # silently replaced by zeros.
        if igen is not None:
            row = df_gen.loc[igen]
            return np.array([abs(row["pdgid"]), row["charge"], row["px"], row["py"], row["pz"], row["energy"]])
        else:
            return np.zeros(6)

    def pf_as_array(df_pfs, igen):
        # same index-0 caveat as in gen_as_array
        if igen is not None:
            row = df_pfs.loc[igen]
            return np.array([abs(row["type"]), row["charge"], row["px"], row["py"], row["pz"], row["energy"]])
        else:
            return np.zeros(6)

    def filter_gp(gp):
        """Keep only stable (status 1) genparticles above a small energy cut.

        Closes over the per-event df_gen from the event loop below.
        """
        row = df_gen.loc[gp]
        if row["status"] == 1 and row["energy"] > 0.2:
            return True
        return False

    def flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs):
        """Flatten one event into (Xs, ys_gen, ys_cand) row-per-element arrays."""
        Xs = []
        ys_gen = []
        ys_cand = []

        # find all track-associated particles
        for itr in range(len(df_tr)):

            k = ("tr", itr)
            gp = None
            rp = None
            if k in pairs:
                gp = pairs[k][0]
                rp = pairs[k][1]

            # normalize ysgen and yscand
            ys = gen_as_array(df_gen, gp)
            cand = pf_as_array(df_pfs, rp)
            # skip the neutrinos
            if (abs(ys[0]) in neutrinos) or (abs(cand[0]) in neutrinos):
                continue
            else:
                # replace the raw pdgid/type by its class index; the last
                # entry (charge) is consumed by the mapping and dropped
                ys[0] = labels_ys_gen.index(map_pdgid_to_candid(abs(ys[0]), ys[-1]))
                cand[0] = labels_ys_cand.index(map_pdgid_to_candid(abs(cand[0]), cand[-1]))
                ys_gen.append(np.delete(ys, -1))
                ys_cand.append(np.delete(cand, -1))
                Xs.append(track_as_array(df_tr, itr))

        # find all cluster-associated particles
        for icl in range(len(df_cl)):

            k = ("cl", icl)
            gp = None
            rp = None
            if k in pairs:
                gp = pairs[k][0]
                rp = pairs[k][1]

            # normalize ysgen and yscand
            ys = gen_as_array(df_gen, gp)
            cand = pf_as_array(df_pfs, rp)
            # skip the neutrinos
            if (abs(ys[0]) in neutrinos) or (abs(cand[0]) in neutrinos):
                continue
            else:
                ys[0] = labels_ys_gen.index(map_pdgid_to_candid(abs(ys[0]), ys[-1]))
                cand[0] = labels_ys_cand.index(map_pdgid_to_candid(abs(cand[0]), cand[-1]))
                ys_gen.append(np.delete(ys, -1))
                ys_cand.append(np.delete(cand, -1))
                Xs.append(cluster_as_array(df_cl, icl))

        # stack to (num_elements, num_features)
        Xs = np.stack(Xs, axis=-1).T
        ys_gen = np.stack(ys_gen, axis=-1).T
        ys_cand = np.stack(ys_cand, axis=-1).T

        return Xs, ys_gen, ys_cand

    data = json.load(bz2.BZ2File(fn, "r"))

    ret = []
    for iev in range(len(data)):
        df_gen = pandas.DataFrame(data[iev]["genparticles"])

        df_cl = pandas.DataFrame(data[iev]["clusters"])
        df_tr = pandas.DataFrame(data[iev]["tracks"])
        df_pfs = pandas.DataFrame(data[iev]["pfs"])

        # derive the track momentum components from curvature and angles
        df_tr["pt"] = track_pt(df_tr["omega"])
        df_tr["px"] = np.cos(df_tr["phi"]) * df_tr["pt"]
        df_tr["py"] = np.sin(df_tr["phi"]) * df_tr["pt"]
        df_tr["pz"] = df_tr["tan_lambda"] * df_tr["pt"]

        # energy-deposit contribution matrices: reco element -> genparticle
        matrix_tr_to_gp = np.zeros((len(df_tr), len(df_gen)))
        matrix_cl_to_gp = np.zeros((len(df_cl), len(df_gen)))

        for itr in range(len(df_tr)):
            gps = df_tr.loc[itr]["gp_contributions"]
            for gp, val in gps.items():
                matrix_tr_to_gp[itr, int(gp)] += val

        for icl in range(len(df_cl)):
            gps = df_cl.loc[icl]["gp_contributions"]
            for gp, val in gps.items():
                matrix_cl_to_gp[icl, int(gp)] += val

        # map each reco element (track or cluster) to its PF candidate
        reco_to_pf = {}
        for ipf in range(len(df_pfs)):
            row = df_pfs.loc[ipf]
            if row["track_idx"] != -1:
                k = ("tr", int(row["track_idx"]))
                assert not (k in reco_to_pf)
                reco_to_pf[k] = ipf
            elif row["cluster_idx"] != -1:
                k = ("cl", int(row["cluster_idx"]))
                assert not (k in reco_to_pf)
                reco_to_pf[k] = ipf
            else:
                # PF should always have a track or a cluster associated
                assert False

        # bipartite graph between genparticles and reco elements, with the
        # deposited-energy contribution as edge weight
        dg = nx.Graph()

        gps = set()

        # loop over clusters, get all genparticles associated to clusters
        for icl in range(len(df_cl)):
            row = df_cl.loc[icl]
            dg.add_node(("cl", icl))
            for gp, weight in row["gp_contributions"].items():
                gp = int(gp)
                if filter_gp(gp):
                    dg.add_node(("gp", gp))
                    gps.add(gp)
                    dg.add_edge(("gp", gp), ("cl", icl), weight=weight)

        # loop over tracks, get all genparticles associated to tracks
        for itr in range(len(df_tr)):
            row = df_tr.loc[itr]
            dg.add_node(("tr", itr))
            for gp in row["gp_contributions"].keys():
                gp = int(gp)
                if filter_gp(gp):
                    dg.add_node(("gp", gp))
                    gps.add(gp)

                    # the track is added to the genparticle with a very high weight
                    # because we always want to associate the genparticle to a track if it's possible
                    dg.add_edge(("gp", gp), ("tr", itr), weight=9999.0)

        # now loop over all the unique genparticles
        pairs = {}
        for gp in gps:
            gp_node = ("gp", gp)

            # find the neighboring reco elements (clusters and tracks)
            neighbors = list(dg.neighbors(gp_node))
            weights = [dg.edges[gp_node, n]["weight"] for n in neighbors]
            nw = zip(neighbors, weights)

            # sort the neighbors by the edge weight (deposited energy)
            nw = sorted(nw, key=lambda x: x[1], reverse=True)
            reco_obj = None
            if len(nw) > 0:
                # choose the closest neighbor as the "key" reco element
                reco_obj = nw[0][0]

                # remove the reco element from the list, so it can't be associated to anything else
                dg.remove_node(reco_obj)

            # this genparticle had a unique reco element
            if reco_obj:
                pf_obj = None
                if reco_obj and reco_obj in reco_to_pf:
                    pf_obj = reco_to_pf[reco_obj]

                assert not (reco_obj in pairs)
                pairs[reco_obj] = (gp, pf_obj)

            # otherwise the genparticle did not have a key reco element and
            # was smeared between several others; it cannot be reconstructed

        Xs, ys_gen, ys_cand = flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs)
        ret.append((Xs, ys_gen, ys_cand))
    return ret
File renamed without changes.
2 changes: 1 addition & 1 deletion mlpf/data/genjob.sh → mlpf/data_cms/genjob.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,6 @@ cmsRun step2_phase1_new.py
cmsRun step3_phase1_new.py
cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py
mv pfntuple.root pfntuple_${SEED}.root
python3 ${MLPF_PATH}/mlpf/data/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table
python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table
bzip2 -z pfntuple_${SEED}.pkl
#rm step*.root
4 changes: 2 additions & 2 deletions mlpf/data/genjob_pu.sh → mlpf/data_cms/genjob_pu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ WORKDIR=`pwd`/$SAMPLE/$SEED
mkdir -p $WORKDIR

PILEUP=Run3_Flat55To75_PoissonOOTPU
PILEUP_INPUT=filelist:${MLPF_PATH}/mlpf/data/pu_files_local.txt
PILEUP_INPUT=filelist:${MLPF_PATH}/mlpf/data_cms/pu_files_local.txt

N=100

Expand Down Expand Up @@ -65,6 +65,6 @@ cmsRun step2_phase1_new.py
cmsRun step3_phase1_new.py
cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py
mv pfntuple.root pfntuple_${SEED}.root
python3 ${MLPF_PATH}/mlpf/data/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table
python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table
bzip2 -z pfntuple_${SEED}.pkl
#rm step*.root
File renamed without changes.
File renamed without changes.
31 changes: 14 additions & 17 deletions mlpf/data/prepare_args.py → mlpf/data_cms/prepare_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,22 @@
outdir = "/hdfs/local/joosep/mlpf/gen/v2"

samples = [
# "SinglePiMinusFlatPt0p7To1000_cfi",
# "SingleGammaFlatPt1To1000_pythia8_cfi",
# "SingleElectronFlatPt1To1000_pythia8_cfi",
# "SingleTauFlatPt1To1000_cfi",
# "SinglePi0Pt1To1000_pythia8_cfi",
# "SingleProtonMinusFlatPt0p7To1000_cfi",
# "SingleNeutronFlatPt0p7To1000_cfi",
# "SingleMuFlatLogPt_100MeVto2TeV_cfi",
"SinglePiMinusFlatPt0p7To1000_cfi",
"SingleGammaFlatPt1To1000_pythia8_cfi",
"SingleElectronFlatPt1To1000_pythia8_cfi",
"SingleTauFlatPt1To1000_cfi",
"SinglePi0Pt1To1000_pythia8_cfi",
"SingleProtonMinusFlatPt0p7To1000_cfi",
"SingleNeutronFlatPt0p7To1000_cfi",
"SingleMuFlatLogPt_100MeVto2TeV_cfi",
]

samples_pu = [
"TTbar_14TeV_TuneCUETP8M1_cfi",
"ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi",
"QCDForPF_14TeV_TuneCUETP8M1_cfi",
"QCD_Pt_3000_7000_14TeV_TuneCUETP8M1_cfi",
# "TTbar_14TeV_TuneCUETP8M1_cfi",
# "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi",
# "QCDForPF_14TeV_TuneCUETP8M1_cfi",
# "QCD_Pt_3000_7000_14TeV_TuneCUETP8M1_cfi",
"SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi",
]

NUM_SAMPLES = 1000
Expand All @@ -31,14 +32,10 @@
for s in samples_pu + samples:
is_pu = s in samples_pu

num = 10
if is_pu:
num = NUM_SAMPLES

os.makedirs(outdir + "/" + s + "/raw", exist_ok=True)
os.makedirs(outdir + "/" + s + "/root", exist_ok=True)

for nsamples in range(num):
for nsamples in range(NUM_SAMPLES):
if not os.path.isfile(outdir + "/" + s + "/raw/pfntuple_{}.pkl.bz2".format(iseed)):
if is_pu:
print("sbatch mlpf/tallinn/genjob_pu.sh {} {}".format(s, iseed))
Expand Down
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 05e14e8

Please sign in to comment.