From 05e14e8bfebf1c8b118b97d4999f7cad30dfd990 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 2 Sep 2022 10:09:18 +0300 Subject: [PATCH] integrate hep_tfds, September 2022 benchmark training (#136) * Initial commit * add template dataset definitions * Add initial CMS particle-flow dataset implementation Also changed to a new tensorflow dataset template * add test scripts * Run black formatting on python files * Add instructions to cms_pf, use manual_dir for preprocessing * fix: ability to choose data directory for the tfrecords files * feat: Add Delphes dataset * fix: support loading both .pkl.bz2 and .pkl * fix: remove extra dimension in cms_pf data items * fix cms * fixes for delphes * ensure dir exists * separate cms datasets * clarify manual dir * cleanup print * added singleele and singlemu * update 1.1 * cleanup cms datasets * update datamodel * added new datasets * gen/sim 12_3_0_pre6 generation (#1) * 1.2 format, ztt dataset * version 1.3.0 with new gensim truth * new dataset * add qcd * add some asserts * add new features * keep PS * add tau as pf target * 1.3.1 remove ps and brem (#2) * fix HF labeling (#3) * add new high-PU QCD dataset, update energy * up * fix * Add gen jet index (#4) * first attempt at gen jet clustering * add other reqs * revert test * fix mapping to before masking particles * fix out of index bufg * benchmark training for CMS * move path * move path * remove submodule * remove * move * fix import * format * format * remove some dummy files * up * try with masking * use a different dataset for logging the jet/met distributions * clean * added clic ttbar Co-authored-by: Eric Wulff Co-authored-by: Eric Wulff Co-authored-by: Javier Duarte Former-commit-id: fb89d7911d151ec2edc070aa29077f289907dfd1 --- .github/workflows/test.yml | 2 - .gitmodules | 3 - hep_tfds | 1 - mlpf/data_clic/postprocessing.py | 235 +++++ mlpf/{data => data_cms}/genjob.jdl | 0 mlpf/{data => data_cms}/genjob.sh | 2 +- mlpf/{data => data_cms}/genjob_pu.sh | 4 +- mlpf/{data => data_cms}/multicrab.py | 0 mlpf/{data => data_cms}/postprocessing2.py | 0 mlpf/{data => data_cms}/prepare_args.py | 31 +- mlpf/{data => data_cms}/pu_files.txt | 0 mlpf/{data => data_cms}/run_gen.sh | 0 mlpf/heptfds/clic_pf/ttbar.py | 89 ++ mlpf/heptfds/cms_pf/cms_pf_test.py | 26 + mlpf/heptfds/cms_pf/cms_utils.py | 211 +++++ mlpf/heptfds/cms_pf/qcd.py | 62 ++ mlpf/heptfds/cms_pf/qcd_high_pt.py | 62 ++ mlpf/heptfds/cms_pf/singleele.py | 62 ++ mlpf/heptfds/cms_pf/singlegamma.py | 61 ++ mlpf/heptfds/cms_pf/singlemu.py | 62 ++ mlpf/heptfds/cms_pf/singlepi.py | 62 ++ mlpf/heptfds/cms_pf/singlepi0.py | 61 ++ mlpf/heptfds/cms_pf/singletau.py | 61 ++ mlpf/heptfds/cms_pf/ttbar.py | 66 ++ mlpf/heptfds/cms_pf/ztt.py | 63 ++ mlpf/heptfds/delphes_pf/delphes_pf.py | 159 ++++ mlpf/heptfds/delphes_pf/delphes_pf_test.py | 26 + mlpf/tallinn/cms-gen.sh | 2 +- mlpf/tallinn/genjob.sh | 2 +- mlpf/tallinn/genjob_pu.sh | 2 +- mlpf/tallinn/submit-test-eventloss.sh | 1 - notebooks/clic.ipynb | 874 +++++------------- notebooks/cms-mlpf.ipynb | 407 ++++++-- notebooks/simvalidation.ipynb | 2 +- parameters/cms-gen.yaml | 16 +- parameters/test-eventloss/baseline-clspt.yaml | 232 ----- .../baseline-mask_reg_cls0.yaml | 19 +- parameters/test-eventloss/baseline.yaml | 19 +- parameters/test-eventloss/genjet_logcosh.yaml | 19 +- .../genjet_logcosh_mask_reg_cls0.yaml | 19 +- parameters/test-eventloss/genjet_mse.yaml | 19 +- parameters/test-eventloss/h2d.yaml | 19 +- parameters/test-eventloss/swd.yaml | 19 +- scripts/local_test_cms_pipeline.sh | 4 +- 
scripts/local_test_delphes_pipeline.sh | 2 +- 45 files changed, 2127 insertions(+), 961 deletions(-) delete mode 160000 hep_tfds create mode 100644 mlpf/data_clic/postprocessing.py rename mlpf/{data => data_cms}/genjob.jdl (100%) rename mlpf/{data => data_cms}/genjob.sh (92%) rename mlpf/{data => data_cms}/genjob_pu.sh (89%) rename mlpf/{data => data_cms}/multicrab.py (100%) rename mlpf/{data => data_cms}/postprocessing2.py (100%) rename mlpf/{data => data_cms}/prepare_args.py (53%) rename mlpf/{data => data_cms}/pu_files.txt (100%) rename mlpf/{data => data_cms}/run_gen.sh (100%) create mode 100644 mlpf/heptfds/clic_pf/ttbar.py create mode 100644 mlpf/heptfds/cms_pf/cms_pf_test.py create mode 100644 mlpf/heptfds/cms_pf/cms_utils.py create mode 100644 mlpf/heptfds/cms_pf/qcd.py create mode 100644 mlpf/heptfds/cms_pf/qcd_high_pt.py create mode 100644 mlpf/heptfds/cms_pf/singleele.py create mode 100644 mlpf/heptfds/cms_pf/singlegamma.py create mode 100644 mlpf/heptfds/cms_pf/singlemu.py create mode 100644 mlpf/heptfds/cms_pf/singlepi.py create mode 100644 mlpf/heptfds/cms_pf/singlepi0.py create mode 100644 mlpf/heptfds/cms_pf/singletau.py create mode 100644 mlpf/heptfds/cms_pf/ttbar.py create mode 100644 mlpf/heptfds/cms_pf/ztt.py create mode 100644 mlpf/heptfds/delphes_pf/delphes_pf.py create mode 100644 mlpf/heptfds/delphes_pf/delphes_pf_test.py delete mode 100644 parameters/test-eventloss/baseline-clspt.yaml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a35a3e9ac..eaf2791d3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,7 +21,6 @@ jobs: - name: Install python deps run: | pip install -r requirements.txt - pip install ./hep_tfds HOROVOD_WITH_TENSORFLOW=1 pip install horovod[tensorflow,keras] - name: Run delphes TF model run: ./scripts/local_test_delphes_pipeline.sh @@ -38,7 +37,6 @@ jobs: - name: Install python deps run: | pip install -r requirements.txt - pip install ./hep_tfds HOROVOD_WITH_TENSORFLOW=1 pip install horovod[tensorflow,keras] - name: Run CMS TF model using the pipeline run: ./scripts/local_test_cms_pipeline.sh diff --git a/.gitmodules b/.gitmodules index 07f659ca1..e69de29bb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "hep_tfds"] - path = hep_tfds - url = https://github.com/jpata/hep_tfds diff --git a/hep_tfds b/hep_tfds deleted file mode 160000 index 31baf14de..000000000 --- a/hep_tfds +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 31baf14defc53dcd1d7555e4a3945083e45e9304 diff --git a/mlpf/data_clic/postprocessing.py b/mlpf/data_clic/postprocessing.py new file mode 100644 index 000000000..0f83fdc79 --- /dev/null +++ b/mlpf/data_clic/postprocessing.py @@ -0,0 +1,235 @@ +import bz2 +import json + +import networkx as nx +import numpy as np +import pandas + +# 12,14,16 are neutrinos. 
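+# neutrinos are unobservable in the detector, so both gen and candidate neutrinos are skipped when building the targets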
+neutrinos = [12, 14, 16] +labels_ys_cand = [0, 211, 130, 22, 11, 13] + +# this is what I can reconstruct +labels_ys_gen = [0, 211, 130, 22, 11, 13] + + +def prepare_data_clic(fn): + def map_pdgid_to_candid(pdgid, charge): + if pdgid in [0, 22, 11, 13]: + return pdgid + + # charged hadron + if abs(charge) > 0: + return 211 + + # neutral hadron + return 130 + + def track_pt(omega): + return a * np.abs(b / omega) + + def track_as_array(df_tr, itr): + row = df_tr.loc[itr] + return [0, row["px"], row["py"], row["pz"], row["nhits"], row["d0"], row["z0"]] + + def cluster_as_array(df_cl, icl): + row = df_cl.loc[icl] + return [1, row["x"], row["y"], row["z"], row["nhits_ecal"], row["nhits_hcal"], row["energy"]] + + def gen_as_array(df_gen, igen): + if igen: + row = df_gen.loc[igen] + return np.array([abs(row["pdgid"]), row["charge"], row["px"], row["py"], row["pz"], row["energy"]]) + else: + return np.zeros(6) + + def pf_as_array(df_pfs, igen): + if igen: + row = df_pfs.loc[igen] + return np.array([abs(row["type"]), row["charge"], row["px"], row["py"], row["pz"], row["energy"]]) + else: + return np.zeros(6) + + def filter_gp(gp): + row = df_gen.loc[gp] + if row["status"] == 1 and row["energy"] > 0.2: + return True + return False + + def flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs): + Xs = [] + ys_gen = [] + ys_cand = [] + + # find all track-associated particles + for itr in range(len(df_tr)): + + k = ("tr", itr) + gp = None + rp = None + if k in pairs: + gp = pairs[k][0] + rp = pairs[k][1] + + # normalize ysgen and yscand + ys = gen_as_array(df_gen, gp) + cand = pf_as_array(df_pfs, rp) + # skip the neutrinos + if (abs(ys[0]) in neutrinos) or (abs(cand[0]) in neutrinos): + continue + else: + ys[0] = labels_ys_gen.index(map_pdgid_to_candid(abs(ys[0]), ys[-1])) + cand[0] = labels_ys_cand.index(map_pdgid_to_candid(abs(cand[0]), cand[-1])) + ys_gen.append(np.delete(ys, -1)) + ys_cand.append(np.delete(cand, -1)) + Xs.append(track_as_array(df_tr, itr)) + + # find all cluster-associated particles + for icl in range(len(df_cl)): + + k = ("cl", icl) + gp = None + rp = None + if k in pairs: + gp = pairs[k][0] + rp = pairs[k][1] + + # normalize ysgen and yscand + ys = gen_as_array(df_gen, gp) + cand = pf_as_array(df_pfs, rp) + # skip the neutrinos + if (abs(ys[0]) in neutrinos) or (abs(cand[0]) in neutrinos): + continue + else: + ys[0] = labels_ys_gen.index(map_pdgid_to_candid(abs(ys[0]), ys[-1])) + cand[0] = labels_ys_cand.index(map_pdgid_to_candid(abs(cand[0]), cand[-1])) + # if icl == 5: + # print(ys[0], ys[-1]) + ys_gen.append(np.delete(ys, -1)) + ys_cand.append(np.delete(cand, -1)) + Xs.append(cluster_as_array(df_cl, icl)) + + Xs = np.stack(Xs, axis=-1).T + ys_gen = np.stack(ys_gen, axis=-1).T + # print("ys_gen flatten",ys_gen[:10]) + ys_cand = np.stack(ys_cand, axis=-1).T + + return Xs, ys_gen, ys_cand + + data = json.load(bz2.BZ2File(fn, "r")) + a = 3 * 10**-4 + b = 5 # B-field in tesla + + ret = [] + for iev in range(len(data)): + df_gen = pandas.DataFrame(data[iev]["genparticles"]) + + # df_hit = pandas.DataFrame(data[iev]["track_hits"]) + df_cl = pandas.DataFrame(data[iev]["clusters"]) + df_tr = pandas.DataFrame(data[iev]["tracks"]) + # df_ecal = pandas.DataFrame(data[iev]["ecal_hits"]) + # df_hcal = pandas.DataFrame(data[iev]["hcal_hits"]) + df_pfs = pandas.DataFrame(data[iev]["pfs"]) + + df_tr["pt"] = track_pt(df_tr["omega"]) + df_tr["px"] = np.cos(df_tr["phi"]) * df_tr["pt"] + df_tr["py"] = np.sin(df_tr["phi"]) * df_tr["pt"] + df_tr["pz"] = df_tr["tan_lambda"] * df_tr["pt"] + + 
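+        # build association matrices between reco elements and genparticles:
+        # entry [i, j] accumulates the contribution of genparticle j to track (or cluster) i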
matrix_tr_to_gp = np.zeros((len(df_tr), len(df_gen))) + matrix_cl_to_gp = np.zeros((len(df_cl), len(df_gen))) + + for itr in range(len(df_tr)): + gps = df_tr.loc[itr]["gp_contributions"] + for gp, val in gps.items(): + matrix_tr_to_gp[itr, int(gp)] += val + + for icl in range(len(df_cl)): + gps = df_cl.loc[icl]["gp_contributions"] + for gp, val in gps.items(): + matrix_cl_to_gp[icl, int(gp)] += val + + reco_to_pf = {} + for ipf in range(len(df_pfs)): + row = df_pfs.loc[ipf] + if row["track_idx"] != -1: + k = ("tr", int(row["track_idx"])) + assert not (k in reco_to_pf) + reco_to_pf[k] = ipf + elif row["cluster_idx"] != -1: + k = ("cl", int(row["cluster_idx"])) + assert not (k in reco_to_pf) + reco_to_pf[k] = ipf + else: + # PF should always have a track or a cluster associated + assert False + + dg = nx.Graph() + + gps = set() + + # loop over clusters, get all genparticles associated to clusters + for icl in range(len(df_cl)): + row = df_cl.loc[icl] + dg.add_node(("cl", icl)) + for gp, weight in row["gp_contributions"].items(): + gp = int(gp) + if filter_gp(gp): + dg.add_node(("gp", gp)) + gps.add(gp) + dg.add_edge(("gp", gp), ("cl", icl), weight=weight) + + # loop over tracks, get all genparticles associated to tracks + for itr in range(len(df_tr)): + row = df_tr.loc[itr] + dg.add_node(("tr", itr)) + for gp in row["gp_contributions"].keys(): + gp = int(gp) + if filter_gp(gp): + dg.add_node(("gp", gp)) + gps.add(gp) + + # the track is added to the genparticle with a very high weight + # because we always want to associate the genparticle to a track if it's possible + dg.add_edge(("gp", gp), ("tr", itr), weight=9999.0) + + # uniqe genparticles + gps = set(gps) + + # now loop over all the genparticles + pairs = {} + for gp in gps: + gp_node = ("gp", gp) + + # find the neighboring reco elements (clusters and tracks) + neighbors = list(dg.neighbors(gp_node)) + weights = [dg.edges[gp_node, n]["weight"] for n in neighbors] + nw = zip(neighbors, weights) + + # sort the neighbors by the edge weight (deposited energy) + nw = sorted(nw, key=lambda x: x[1], reverse=True) + reco_obj = None + if len(nw) > 0: + # choose the closest neighbor as the "key" reco element + reco_obj = nw[0][0] + + # remove the reco element from the list, so it can't be associated to anything else + dg.remove_node(reco_obj) + + # this genparticle had a unique reco element + if reco_obj: + pf_obj = None + if reco_obj and reco_obj in reco_to_pf: + pf_obj = reco_to_pf[reco_obj] + + assert not (reco_obj in pairs) + pairs[reco_obj] = (gp, pf_obj) + + # this is a case where a genparticle did not have a key reco element, but instead was smeared between others + # else: + # print("genparticle {} is merged and cannot be reconstructed".format(gp)) + # print(df_gen.loc[gp]) + + Xs, ys_gen, ys_cand = flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs) + ret.append((Xs, ys_gen, ys_cand)) + return ret diff --git a/mlpf/data/genjob.jdl b/mlpf/data_cms/genjob.jdl similarity index 100% rename from mlpf/data/genjob.jdl rename to mlpf/data_cms/genjob.jdl diff --git a/mlpf/data/genjob.sh b/mlpf/data_cms/genjob.sh similarity index 92% rename from mlpf/data/genjob.sh rename to mlpf/data_cms/genjob.sh index 0044e3aa8..907cb13ca 100755 --- a/mlpf/data/genjob.sh +++ b/mlpf/data_cms/genjob.sh @@ -63,6 +63,6 @@ cmsRun step2_phase1_new.py cmsRun step3_phase1_new.py cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py mv pfntuple.root pfntuple_${SEED}.root -python3 ${MLPF_PATH}/mlpf/data/postprocessing2.py --input pfntuple_${SEED}.root 
--outpath ./ --save-normalized-table +python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table bzip2 -z pfntuple_${SEED}.pkl #rm step*.root diff --git a/mlpf/data/genjob_pu.sh b/mlpf/data_cms/genjob_pu.sh similarity index 89% rename from mlpf/data/genjob_pu.sh rename to mlpf/data_cms/genjob_pu.sh index dc8721a9a..8a2fc0fdc 100755 --- a/mlpf/data/genjob_pu.sh +++ b/mlpf/data_cms/genjob_pu.sh @@ -13,7 +13,7 @@ WORKDIR=`pwd`/$SAMPLE/$SEED mkdir -p $WORKDIR PILEUP=Run3_Flat55To75_PoissonOOTPU -PILEUP_INPUT=filelist:${MLPF_PATH}/mlpf/data/pu_files_local.txt +PILEUP_INPUT=filelist:${MLPF_PATH}/mlpf/data_cms/pu_files_local.txt N=100 @@ -65,6 +65,6 @@ cmsRun step2_phase1_new.py cmsRun step3_phase1_new.py cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py mv pfntuple.root pfntuple_${SEED}.root -python3 ${MLPF_PATH}/mlpf/data/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table +python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table bzip2 -z pfntuple_${SEED}.pkl #rm step*.root diff --git a/mlpf/data/multicrab.py b/mlpf/data_cms/multicrab.py similarity index 100% rename from mlpf/data/multicrab.py rename to mlpf/data_cms/multicrab.py diff --git a/mlpf/data/postprocessing2.py b/mlpf/data_cms/postprocessing2.py similarity index 100% rename from mlpf/data/postprocessing2.py rename to mlpf/data_cms/postprocessing2.py diff --git a/mlpf/data/prepare_args.py b/mlpf/data_cms/prepare_args.py similarity index 53% rename from mlpf/data/prepare_args.py rename to mlpf/data_cms/prepare_args.py index 6053c512e..c10106a8a 100644 --- a/mlpf/data/prepare_args.py +++ b/mlpf/data_cms/prepare_args.py @@ -6,21 +6,22 @@ outdir = "/hdfs/local/joosep/mlpf/gen/v2" samples = [ - # "SinglePiMinusFlatPt0p7To1000_cfi", - # "SingleGammaFlatPt1To1000_pythia8_cfi", - # "SingleElectronFlatPt1To1000_pythia8_cfi", - # "SingleTauFlatPt1To1000_cfi", - # "SinglePi0Pt1To1000_pythia8_cfi", - # "SingleProtonMinusFlatPt0p7To1000_cfi", - # "SingleNeutronFlatPt0p7To1000_cfi", - # "SingleMuFlatLogPt_100MeVto2TeV_cfi", + "SinglePiMinusFlatPt0p7To1000_cfi", + "SingleGammaFlatPt1To1000_pythia8_cfi", + "SingleElectronFlatPt1To1000_pythia8_cfi", + "SingleTauFlatPt1To1000_cfi", + "SinglePi0Pt1To1000_pythia8_cfi", + "SingleProtonMinusFlatPt0p7To1000_cfi", + "SingleNeutronFlatPt0p7To1000_cfi", + "SingleMuFlatLogPt_100MeVto2TeV_cfi", ] samples_pu = [ - "TTbar_14TeV_TuneCUETP8M1_cfi", - "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", - "QCDForPF_14TeV_TuneCUETP8M1_cfi", - "QCD_Pt_3000_7000_14TeV_TuneCUETP8M1_cfi", + # "TTbar_14TeV_TuneCUETP8M1_cfi", + # "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", + # "QCDForPF_14TeV_TuneCUETP8M1_cfi", + # "QCD_Pt_3000_7000_14TeV_TuneCUETP8M1_cfi", + "SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", ] NUM_SAMPLES = 1000 @@ -31,14 +32,10 @@ for s in samples_pu + samples: is_pu = s in samples_pu - num = 10 - if is_pu: - num = NUM_SAMPLES - os.makedirs(outdir + "/" + s + "/raw", exist_ok=True) os.makedirs(outdir + "/" + s + "/root", exist_ok=True) - for nsamples in range(num): + for nsamples in range(NUM_SAMPLES): if not os.path.isfile(outdir + "/" + s + "/raw/pfntuple_{}.pkl.bz2".format(iseed)): if is_pu: print("sbatch mlpf/tallinn/genjob_pu.sh {} {}".format(s, iseed)) diff --git a/mlpf/data/pu_files.txt b/mlpf/data_cms/pu_files.txt similarity index 100% rename from mlpf/data/pu_files.txt rename to mlpf/data_cms/pu_files.txt diff --git 
a/mlpf/data/run_gen.sh b/mlpf/data_cms/run_gen.sh similarity index 100% rename from mlpf/data/run_gen.sh rename to mlpf/data_cms/run_gen.sh
diff --git a/mlpf/heptfds/clic_pf/ttbar.py b/mlpf/heptfds/clic_pf/ttbar.py new file mode 100644 index 000000000..d2eae2ecc --- /dev/null +++ b/mlpf/heptfds/clic_pf/ttbar.py @@ -0,0 +1,89 @@
+from pathlib import Path
+
+import numpy as np
+import tensorflow as tf
+import tensorflow_datasets as tfds
+
+from mlpf.data_clic.postprocessing import prepare_data_clic
+
+_DESCRIPTION = """
+CLIC ttbar dataset (ee collisions at 380 GeV, Pythia6)
+"""
+
+_CITATION = """
+"""
+
+PADDED_NUM_ELEM_SIZE = 300
+
+# these labels are for tracks from track_as_array
+X_FEATURES_TRK = ["type", "px", "py", "pz", "nhits", "d0", "z0"]
+
+# these labels are for clusters from cluster_as_array
+X_FEATURES_CL = ["type", "x", "y", "z", "nhits_ecal", "nhits_hcal", "energy"]
+
+Y_FEATURES = ["type", "charge", "px", "py", "pz"]
+
+
+def split_sample(path, pad_size, train_frac=0.8):
+    files = sorted(list(path.glob("*.json.bz2")))
+    print("Found {} files in {}".format(len(files), path))
+    assert len(files) > 0
+    idx_split = int(train_frac * len(files))
+    files_train = files[:idx_split]
+    files_test = files[idx_split:]
+    assert len(files_train) > 0
+    assert len(files_test) > 0
+    return {"train": generate_examples(files_train, pad_size), "test": generate_examples(files_test, pad_size)}
+
+
+def generate_examples(files, pad_size):
+    for fi in files:
+        ret = prepare_data_clic(fi)
+        # prepare_data_clic returns (X, ys_gen, ys_cand) per event, in that order
+        for iev, (X, ygen, ycand) in enumerate(ret):
+            X = X[:pad_size]
+            X = np.pad(X, [(0, pad_size - X.shape[0]), (0, 0)])
+            ygen = ygen[:pad_size]
+            ygen = np.pad(ygen, [(0, pad_size - ygen.shape[0]), (0, 0)])
+            ycand = ycand[:pad_size]
+            ycand = np.pad(ycand, [(0, pad_size - ycand.shape[0]), (0, 0)])
+
+            yield str(fi) + "_" + str(iev), {
+                "X": X.astype(np.float32),
+                "ygen": ygen.astype(np.float32),
+                "ycand": ycand.astype(np.float32),
+            }
+
+
+class ClicTtbarPf(tfds.core.GeneratorBasedBuilder):
+    VERSION = tfds.core.Version("1.0.0")
+    RELEASE_NOTES = {
+        "1.0.0": "Initial release.",
+    }
+    MANUAL_DOWNLOAD_INSTRUCTIONS = """
+    """
+
+    def _info(self) -> tfds.core.DatasetInfo:
+        """Returns the dataset metadata."""
+        return tfds.core.DatasetInfo(
+            builder=self,
+            description=_DESCRIPTION,
+            features=tfds.features.FeaturesDict(
+                {
+                    "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, 7), dtype=tf.float32),
+                    "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, 5), dtype=tf.float32),
+                    "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, 5), dtype=tf.float32),
+                }
+            ),
+            supervised_keys=None,
+            homepage="",
+            citation=_CITATION,
+            metadata=tfds.core.MetadataDict(
+                x_features_track=X_FEATURES_TRK, x_features_cluster=X_FEATURES_CL, y_features=Y_FEATURES
+            ),
+        )
+
+    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
+        return split_sample(Path("data/clic/gev380ee_pythia6_ttbar_rfull201/raw"), PADDED_NUM_ELEM_SIZE)
+
+    def _generate_examples(self, files):
+        return generate_examples(files, PADDED_NUM_ELEM_SIZE)
diff --git a/mlpf/heptfds/cms_pf/cms_pf_test.py b/mlpf/heptfds/cms_pf/cms_pf_test.py new file mode 100644 index 000000000..995ec388b --- /dev/null +++ b/mlpf/heptfds/cms_pf/cms_pf_test.py @@ -0,0 +1,26 @@ +"""cms_pf dataset.""" + +import tensorflow_datasets as tfds + +from .
import cms_pf + + +class CmsPfTest(tfds.testing.DatasetBuilderTestCase): + """Tests for cms_pf dataset.""" + + # TODO(cms_pf): + DATASET_CLASS = cms_pf.CmsPf + SPLITS = { + "train": 3, # Number of fake train example + "test": 1, # Number of fake test example + } + + # If you are calling `download/download_and_extract` with a dict, like: + # dl_manager.download({'some_key': 'http://a.org/out.txt', ...}) + # then the tests needs to provide the fake output paths relative to the + # fake data directory + # DL_EXTRACT_RESULT = {'some_key': 'output_file1.txt', ...} + + +if __name__ == "__main__": + tfds.testing.test_main() diff --git a/mlpf/heptfds/cms_pf/cms_utils.py b/mlpf/heptfds/cms_pf/cms_utils.py new file mode 100644 index 000000000..e551c09aa --- /dev/null +++ b/mlpf/heptfds/cms_pf/cms_utils.py @@ -0,0 +1,211 @@ +import bz2 +import pickle + +import awkward as ak +import fastjet +import numpy as np +import vector +from numpy.lib.recfunctions import append_fields + +# https://github.com/ahlinist/cmssw/blob/1df62491f48ef964d198f574cdfcccfd17c70425/DataFormats/ParticleFlowReco/interface/PFBlockElement.h#L33 +ELEM_LABELS_CMS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] +ELEM_NAMES_CMS = ["NONE", "TRACK", "PS1", "PS2", "ECAL", "HCAL", "GSF", "BREM", "HFEM", "HFHAD", "SC", "HO"] + +# https://github.com/cms-sw/cmssw/blob/master/DataFormats/ParticleFlowCandidate/src/PFCandidate.cc#L254 +CLASS_LABELS_CMS = [0, 211, 130, 1, 2, 22, 11, 13] +CLASS_NAMES_CMS = ["none", "ch.had", "n.had", "HFHAD", "HFEM", "gamma", "ele", "mu"] +CLASS_NAMES_LONG_CMS = ["none" "charged hadron", "neutral hadron", "hfem", "hfhad", "photon", "electron", "muon"] + +X_FEATURES = [ + "typ_idx", + "pt", + "eta", + "phi", + "e", + "layer", + "depth", + "charge", + "trajpoint", + "eta_ecal", + "phi_ecal", + "eta_hcal", + "phi_hcal", + "muon_dt_hits", + "muon_csc_hits", + "muon_type", + "px", + "py", + "pz", + "deltap", + "sigmadeltap", + "gsf_electronseed_trkorecal", + "gsf_electronseed_dnn1", + "gsf_electronseed_dnn2", + "gsf_electronseed_dnn3", + "gsf_electronseed_dnn4", + "gsf_electronseed_dnn5", + "num_hits", + "cluster_flags", + "corr_energy", + "corr_energy_err", + "vx", + "vy", + "vz", + "pterror", + "etaerror", + "phierror", + "lambd", + "lambdaerror", + "theta", + "thetaerror", +] + +Y_FEATURES = ["typ_idx", "charge", "pt", "eta", "sin_phi", "cos_phi", "e", "jet_idx"] + + +def prepare_data_cms(fn, padded_num_elem_size): + Xs = [] + ygens = [] + ycands = [] + + # prepare jet definition and min jet pt for clustering gen jets + jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4) + min_jet_pt = 5.0 # GeV + + if fn.endswith(".pkl"): + data = pickle.load(open(fn, "rb"), encoding="iso-8859-1") + elif fn.endswith(".pkl.bz2"): + data = pickle.load(bz2.BZ2File(fn, "rb")) + + for event in data: + Xelem = event["Xelem"] + ygen = event["ygen"] + ycand = event["ycand"] + + # remove PS and BREM from inputs + msk_ps = (Xelem["typ"] == 2) | (Xelem["typ"] == 3) | (Xelem["typ"] == 7) + + Xelem = Xelem[~msk_ps] + ygen = ygen[~msk_ps] + ycand = ycand[~msk_ps] + + Xelem = append_fields( + Xelem, "typ_idx", np.array([ELEM_LABELS_CMS.index(int(i)) for i in Xelem["typ"]], dtype=np.float32) + ) + ygen = append_fields( + ygen, "typ_idx", np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ygen["typ"]], dtype=np.float32) + ) + ygen = append_fields(ygen, "jet_idx", np.zeros(ygen["typ"].shape, dtype=np.float32)) + ycand = append_fields( + ycand, + "typ_idx", + np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ycand["typ"]], 
dtype=np.float32), + ) + ycand = append_fields(ycand, "jet_idx", np.zeros(ycand["typ"].shape, dtype=np.float32)) + + Xelem_flat = np.stack( + [Xelem[k].view(np.float32).data for k in X_FEATURES], + axis=-1, + ) + ygen_flat = np.stack( + [ygen[k].view(np.float32).data for k in Y_FEATURES], + axis=-1, + ) + ycand_flat = np.stack( + [ycand[k].view(np.float32).data for k in Y_FEATURES], + axis=-1, + ) + + # take care of outliers + Xelem_flat[np.isnan(Xelem_flat)] = 0 + Xelem_flat[np.abs(Xelem_flat) > 1e4] = 0 + ygen_flat[np.isnan(ygen_flat)] = 0 + ygen_flat[np.abs(ygen_flat) > 1e4] = 0 + ycand_flat[np.isnan(ycand_flat)] = 0 + ycand_flat[np.abs(ycand_flat) > 1e4] = 0 + + X = Xelem_flat[:padded_num_elem_size] + X = np.pad(X, [(0, padded_num_elem_size - X.shape[0]), (0, 0)]) + + ygen = ygen_flat[:padded_num_elem_size] + ygen = np.pad(ygen, [(0, padded_num_elem_size - ygen.shape[0]), (0, 0)]) + + ycand = ycand_flat[:padded_num_elem_size] + ycand = np.pad(ycand, [(0, padded_num_elem_size - ycand.shape[0]), (0, 0)]) + + X = np.expand_dims(X, 0) + ygen = np.expand_dims(ygen, 0) + ycand = np.expand_dims(ycand, 0) + + # prepare gen candidates for clustering + cls_id = ygen[:, :, 0] + valid = cls_id != 0 + # save mapping of index after masking -> index before masking as numpy array + # inspired from: + # https://stackoverflow.com/questions/432112/1044443#comment54747416_1044443 + cumsum = np.cumsum(valid) - 1 + _, index_mapping = np.unique(cumsum, return_index=True) + + pt = ak.from_iter([y[m] for y, m in zip(ygen[:, :, Y_FEATURES.index("pt")], valid)]) + eta = ak.from_iter([y[m] for y, m in zip(ygen[:, :, Y_FEATURES.index("eta")], valid)]) + phi = np.arctan2(ygen[:, :, Y_FEATURES.index("sin_phi")], ygen[:, :, Y_FEATURES.index("cos_phi")]) + phi = ak.from_iter([y[m] for y, m in zip(phi, valid)]) + e = ak.from_iter([y[m] for y, m in zip(ygen[:, :, Y_FEATURES.index("e")], valid)]) + vec = vector.arr({"pt": pt, "eta": eta, "phi": phi, "e": e}) + + # cluster jets, sort jet indices in descending order by pt + cluster = fastjet.ClusterSequence(vec.to_xyzt(), jetdef) + jets = cluster.inclusive_jets(min_pt=min_jet_pt) + sorted_jet_idx = ak.argsort(jets.pt, axis=-1, ascending=False).to_list()[0] + # retrieve corresponding indices of constituents + constituent_idx = cluster.constituent_index(min_pt=min_jet_pt).to_list()[0] + + # add index information to ygen and ycand + # index jets in descending order by pt starting from 1: + # 0 is null (unclustered), + # 1 is 1st highest-pt jet, + # 2 is 2nd highest-pt jet, ... 
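+        # assign the same gen-jet index to ygen and ycand at matching element positions,
+        # so that target and PF-candidate particles can be grouped by gen jet downstream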
+        for jet_idx in sorted_jet_idx:
+            jet_constituents = [
+                index_mapping[idx] for idx in constituent_idx[jet_idx]
+            ]  # map back to constituent index *before* masking
+            ygen[0, jet_constituents, Y_FEATURES.index("jet_idx")] = jet_idx + 1  # jet index starts from 1
+            ycand[0, jet_constituents, Y_FEATURES.index("jet_idx")] = jet_idx + 1
+
+        Xs.append(X)
+        ygens.append(ygen)
+        ycands.append(ycand)
+
+    X = [np.concatenate(Xs)]
+    ygen = [np.concatenate(ygens)]
+    ycand = [np.concatenate(ycands)]
+
+    return X, ygen, ycand
+
+
+def split_sample(path, pad_size, train_frac=0.8):
+    files = sorted(list(path.glob("*.pkl*")))
+    print("Found {} files in {}".format(len(files), path))
+    assert len(files) > 0
+    idx_split = int(train_frac * len(files))
+    files_train = files[:idx_split]
+    files_test = files[idx_split:]
+    assert len(files_train) > 0
+    assert len(files_test) > 0
+    return {"train": generate_examples(files_train, pad_size), "test": generate_examples(files_test, pad_size)}
+
+
+def generate_examples(files, pad_size):
+    """Yields examples."""
+
+    for fi in files:
+        X, ygen, ycand = prepare_data_cms(str(fi), pad_size)
+        for ii in range(X[0].shape[0]):
+            x = X[0][ii]
+            yg = ygen[0][ii]
+            yc = ycand[0][ii]
+            yield str(fi) + "_" + str(ii), {
+                "X": x,
+                "ygen": yg,
+                "ycand": yc,
+            }
diff --git a/mlpf/heptfds/cms_pf/qcd.py b/mlpf/heptfds/cms_pf/qcd.py new file mode 100644 index 000000000..6d5a735c6 --- /dev/null +++ b/mlpf/heptfds/cms_pf/qcd.py @@ -0,0 +1,62 @@
+"""CMS PF QCD dataset."""
+
+import cms_utils
+import tensorflow as tf
+import tensorflow_datasets as tfds
+
+X_FEATURES = cms_utils.X_FEATURES
+Y_FEATURES = cms_utils.Y_FEATURES
+
+_DESCRIPTION = """
+Dataset generated with CMSSW and full detector sim.
+
+QCD events with PU~55 in a Run3 setup.
+"""
+
+# TODO(cms_pf): BibTeX citation
+_CITATION = """
+"""
+
+PADDED_NUM_ELEM_SIZE = 6400
+
+
+class CmsPfQcd(tfds.core.GeneratorBasedBuilder):
+    """DatasetBuilder for cms_pf dataset."""
+
+    VERSION = tfds.core.Version("1.4.0")
+    RELEASE_NOTES = {
+        "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle",
+        "1.3.1": "Remove PS again",
+        "1.4.0": "Add gen jet index information",
+    }
+    MANUAL_DOWNLOAD_INSTRUCTIONS = """
+    FIXME
+    """
+
+    def _info(self) -> tfds.core.DatasetInfo:
+        """Returns the dataset metadata."""
+        # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object
+        return tfds.core.DatasetInfo(
+            builder=self,
+            description=_DESCRIPTION,
+            features=tfds.features.FeaturesDict(
+                {
+                    "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32),
+                    "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32),
+                    "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32),
+                }
+            ),
+            supervised_keys=("X", "ycand"),
+            homepage="",
+            citation=_CITATION,
+            metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES),
+        )
+
+    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
+        """Returns SplitGenerators."""
+        path = dl_manager.manual_dir
+        sample_dir = "QCDForPF_14TeV_TuneCUETP8M1_cfi"
+        return cms_utils.split_sample(path / sample_dir / "raw", PADDED_NUM_ELEM_SIZE)
+
+    def _generate_examples(self, files):
+        return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE)
diff --git a/mlpf/heptfds/cms_pf/qcd_high_pt.py b/mlpf/heptfds/cms_pf/qcd_high_pt.py new file mode 100644 index 000000000..8ac97ca32 --- /dev/null +++ b/mlpf/heptfds/cms_pf/qcd_high_pt.py @@ -0,0 +1,62 @@ +"""CMS PF QCD High Pt
dataset.""" + +import cms_utils +import tensorflow as tf +import tensorflow_datasets as tfds + +X_FEATURES = cms_utils.X_FEATURES +Y_FEATURES = cms_utils.Y_FEATURES + +_DESCRIPTION = """ +Dataset generated with CMSSW and full detector sim. + +QCD highpt events with PU~55 in a Run3 setup. +""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + +PADDED_NUM_ELEM_SIZE = 6400 + + +class CmsPfQcdHighPt(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf dataset.""" + + VERSION = tfds.core.Version("1.4.0") + RELEASE_NOTES = { + "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", + "1.3.1": "Remove PS again", + "1.4.0": "Add gen jet index information", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + FIXME + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "QCD_Pt_3000_7000_14TeV_TuneCUETP8M1_cfi" + return cms_utils.split_sample(path / sample_dir / "raw", PADDED_NUM_ELEM_SIZE) + + def _generate_examples(self, files): + return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE) diff --git a/mlpf/heptfds/cms_pf/singleele.py b/mlpf/heptfds/cms_pf/singleele.py new file mode 100644 index 000000000..d58d816b0 --- /dev/null +++ b/mlpf/heptfds/cms_pf/singleele.py @@ -0,0 +1,62 @@ +"""CMS PF SinglePi dataset.""" + +import cms_utils +import tensorflow as tf +import tensorflow_datasets as tfds + +X_FEATURES = cms_utils.X_FEATURES +Y_FEATURES = cms_utils.Y_FEATURES + +_DESCRIPTION = """ +Dataset generated with CMSSW and full detector sim. + +SingleElectron events. 
+""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + +PADDED_NUM_ELEM_SIZE = 320 + + +class CmsPfSingleElectron(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf_singlepi dataset.""" + + VERSION = tfds.core.Version("1.2.0") + RELEASE_NOTES = { + "1.0.0": "Initial release.", + "1.1.0": "Initial release.", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleElectronFlatPt1To100_pythia8_cfi data/ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "SingleElectronFlatPt1To100_pythia8_cfi" + return cms_utils.split_sample(path / sample_dir / "raw", PADDED_NUM_ELEM_SIZE) + + def _generate_examples(self, files): + return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE) diff --git a/mlpf/heptfds/cms_pf/singlegamma.py b/mlpf/heptfds/cms_pf/singlegamma.py new file mode 100644 index 000000000..1269ee3f4 --- /dev/null +++ b/mlpf/heptfds/cms_pf/singlegamma.py @@ -0,0 +1,61 @@ +"""CMS PF SinglePi dataset.""" + +import cms_utils +import tensorflow as tf +import tensorflow_datasets as tfds + +X_FEATURES = cms_utils.X_FEATURES +Y_FEATURES = cms_utils.Y_FEATURES + +_DESCRIPTION = """ +Dataset generated with CMSSW and full detector sim. + +SingleGamma events. 
+""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + +PADDED_NUM_ELEM_SIZE = 320 + + +class CmsPfSingleGamma(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf_singlegamma dataset.""" + + VERSION = tfds.core.Version("1.2.0") + RELEASE_NOTES = { + "1.1.0": "Initial release", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleGammaFlatPt10To100_pythia8_cfi data/ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "SingleGammaFlatPt10To100_pythia8_cfi" + return cms_utils.split_sample(path / sample_dir / "raw", PADDED_NUM_ELEM_SIZE) + + def _generate_examples(self, files): + return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE) diff --git a/mlpf/heptfds/cms_pf/singlemu.py b/mlpf/heptfds/cms_pf/singlemu.py new file mode 100644 index 000000000..783a45d21 --- /dev/null +++ b/mlpf/heptfds/cms_pf/singlemu.py @@ -0,0 +1,62 @@ +"""CMS PF SinglePi dataset.""" + +import cms_utils +import tensorflow as tf +import tensorflow_datasets as tfds + +X_FEATURES = cms_utils.X_FEATURES +Y_FEATURES = cms_utils.Y_FEATURES + +_DESCRIPTION = """ +Dataset generated with CMSSW and full detector sim. + +SingleMu events. 
+""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + +PADDED_NUM_ELEM_SIZE = 320 + + +class CmsPfSingleMu(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf_singlepi dataset.""" + + VERSION = tfds.core.Version("1.2.0") + RELEASE_NOTES = { + "1.0.0": "Initial release.", + "1.1.0": "Add muon type, fix electron GSF association", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleMuFlatPt0p7To10_cfi data/ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "SingleMuFlatPt0p7To10_cfi" + return cms_utils.split_sample(path / sample_dir / "raw", PADDED_NUM_ELEM_SIZE) + + def _generate_examples(self, files): + return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE) diff --git a/mlpf/heptfds/cms_pf/singlepi.py b/mlpf/heptfds/cms_pf/singlepi.py new file mode 100644 index 000000000..3f6a3c812 --- /dev/null +++ b/mlpf/heptfds/cms_pf/singlepi.py @@ -0,0 +1,62 @@ +"""CMS PF SinglePi dataset.""" + +import cms_utils +import tensorflow as tf +import tensorflow_datasets as tfds + +X_FEATURES = cms_utils.X_FEATURES +Y_FEATURES = cms_utils.Y_FEATURES + +_DESCRIPTION = """ +Dataset generated with CMSSW and full detector sim. + +SinglePi events. 
+""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + +PADDED_NUM_ELEM_SIZE = 320 + + +class CmsPfSinglePi(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf_singlepi dataset.""" + + VERSION = tfds.core.Version("1.2.0") + RELEASE_NOTES = { + "1.0.0": "Initial release.", + "1.1.0": "Add muon type, fix electron GSF association", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SinglePiFlatPt0p7To10_cfi data/ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "SinglePiFlatPt0p7To10_cfi" + return cms_utils.split_sample(path / sample_dir / "raw", PADDED_NUM_ELEM_SIZE) + + def _generate_examples(self, files): + return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE) diff --git a/mlpf/heptfds/cms_pf/singlepi0.py b/mlpf/heptfds/cms_pf/singlepi0.py new file mode 100644 index 000000000..5f91359b6 --- /dev/null +++ b/mlpf/heptfds/cms_pf/singlepi0.py @@ -0,0 +1,61 @@ +"""CMS PF SinglePi dataset.""" + +import cms_utils +import tensorflow as tf +import tensorflow_datasets as tfds + +X_FEATURES = cms_utils.X_FEATURES +Y_FEATURES = cms_utils.Y_FEATURES + +_DESCRIPTION = """ +Dataset generated with CMSSW and full detector sim. + +SinglePi0 events. 
+""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + +PADDED_NUM_ELEM_SIZE = 320 + + +class CmsPfSinglePi0(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf_singlepi0 dataset.""" + + VERSION = tfds.core.Version("1.2.0") + RELEASE_NOTES = { + "1.1.0": "Initial release", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SinglePi0E10_pythia8_cfi data/ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "SinglePi0E10_pythia8_cfi" + return cms_utils.split_sample(path / sample_dir / "raw", PADDED_NUM_ELEM_SIZE) + + def _generate_examples(self, files): + return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE) diff --git a/mlpf/heptfds/cms_pf/singletau.py b/mlpf/heptfds/cms_pf/singletau.py new file mode 100644 index 000000000..0dc4efb1a --- /dev/null +++ b/mlpf/heptfds/cms_pf/singletau.py @@ -0,0 +1,61 @@ +"""CMS PF SinglePi dataset.""" + +import cms_utils +import tensorflow as tf +import tensorflow_datasets as tfds + +X_FEATURES = cms_utils.X_FEATURES +Y_FEATURES = cms_utils.Y_FEATURES + +_DESCRIPTION = """ +Dataset generated with CMSSW and full detector sim. + +SingleTau events. 
+""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + +PADDED_NUM_ELEM_SIZE = 320 + + +class CmsPfSingleTau(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf_singletau dataset.""" + + VERSION = tfds.core.Version("1.2.0") + RELEASE_NOTES = { + "1.1.0": "Add muon type, fix electron GSF association", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleTauFlatPt2To150_cfi data/ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "SingleTauFlatPt2To150_cfi" + return cms_utils.split_sample(path / sample_dir / "raw", PADDED_NUM_ELEM_SIZE) + + def _generate_examples(self, files): + return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE) diff --git a/mlpf/heptfds/cms_pf/ttbar.py b/mlpf/heptfds/cms_pf/ttbar.py new file mode 100644 index 000000000..48a8ac40c --- /dev/null +++ b/mlpf/heptfds/cms_pf/ttbar.py @@ -0,0 +1,66 @@ +"""CMS PF TTbar dataset.""" + +import cms_utils +import tensorflow as tf +import tensorflow_datasets as tfds + +X_FEATURES = cms_utils.X_FEATURES +Y_FEATURES = cms_utils.Y_FEATURES + +_DESCRIPTION = """ +Dataset generated with CMSSW and full detector sim. + +TTbar events with PU~55 in a Run3 setup. 
+""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + +PADDED_NUM_ELEM_SIZE = 6400 + + +class CmsPfTtbar(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf dataset.""" + + VERSION = tfds.core.Version("1.4.0") + RELEASE_NOTES = { + "1.0.0": "Initial release.", + "1.1.0": "Add muon type, fix electron GSF association", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", + "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", + "1.3.1": "Remove PS again", + "1.4.0": "Add gen jet index information", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + mkdir -p data + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi data/ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "TTbar_14TeV_TuneCUETP8M1_cfi" + return cms_utils.split_sample(path / sample_dir / "raw", PADDED_NUM_ELEM_SIZE) + + def _generate_examples(self, files): + return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE) diff --git a/mlpf/heptfds/cms_pf/ztt.py b/mlpf/heptfds/cms_pf/ztt.py new file mode 100644 index 000000000..4796ace66 --- /dev/null +++ b/mlpf/heptfds/cms_pf/ztt.py @@ -0,0 +1,63 @@ +"""CMS PF ZTT dataset.""" + +import cms_utils +import tensorflow as tf +import tensorflow_datasets as tfds + +X_FEATURES = cms_utils.X_FEATURES +Y_FEATURES = cms_utils.Y_FEATURES + +_DESCRIPTION = """ +Dataset generated with CMSSW and full detector sim. + +ZTT events with PU~55 in a Run3 setup. 
+""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + +PADDED_NUM_ELEM_SIZE = 6400 + + +class CmsPfZtt(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf dataset.""" + + VERSION = tfds.core.Version("1.4.0") + RELEASE_NOTES = { + "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", + "1.3.1": "Remove PS again", + "1.4.0": "Add gen jet index information", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + mkdir -p data + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi data/ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi" + return cms_utils.split_sample(path / sample_dir / "raw", PADDED_NUM_ELEM_SIZE) + + def _generate_examples(self, files): + return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE) diff --git a/mlpf/heptfds/delphes_pf/delphes_pf.py b/mlpf/heptfds/delphes_pf/delphes_pf.py new file mode 100644 index 000000000..f0ec1ac3b --- /dev/null +++ b/mlpf/heptfds/delphes_pf/delphes_pf.py @@ -0,0 +1,159 @@ +"""delphes_pf dataset.""" + +import bz2 +import os +import pickle +import resource +from pathlib import Path + +import numpy as np +import tensorflow as tf +import tensorflow_datasets as tfds + +# Increase python's soft limit on number of open files to accomodate tensorflow_datasets sharding +# https://github.com/tensorflow/datasets/issues/1441 +low, high = resource.getrlimit(resource.RLIMIT_NOFILE) +resource.setrlimit(resource.RLIMIT_NOFILE, (high, high)) + + +_DESCRIPTION = """ +Dataset generated with Delphes. + +TTbar events with PU~200. 
+""" + +# TODO(delphes_pf): BibTeX citation +_CITATION = """ +""" + +DELPHES_CLASS_NAMES = ["none" "charged hadron", "neutral hadron", "hfem", "hfhad", "photon", "electron", "muon"] +PADDED_NUM_ELEM_SIZE = 6400 + +# based on delphes/ntuplizer.py +X_FEATURES = [ + "typ_idx", + "pt", + "eta", + "sin_phi", + "cos_phi", + "e", + "eta_outer", + "sin_phi_outer", + "cos_phi_outer", + "charge", + "is_gen_muon", + "is_gen_electron", +] + + +class DelphesPf(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for delphes_pf dataset.""" + + VERSION = tfds.core.Version("1.0.0") + RELEASE_NOTES = { + "1.0.0": "Initial release.", + } + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(delphes_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(6400, 12), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(6400, 7), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(6400, 7), dtype=tf.float32), + } + ), + # If there's a common (input, target) tuple from the + # features, specify them here. They'll be used if + # `as_supervised=True` in `builder.as_dataset`. + supervised_keys=("X", "ygen"), # Set to `None` to disable + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + delphes_dir = dl_manager.download_dir / "delphes_pf" + if delphes_dir.exists(): + print("INFO: Data already exists. Please delete {} if you want to download data again.".format(delphes_dir)) + else: + get_delphes_from_zenodo(download_dir=dl_manager.download_dir / "delphes_pf") + + ttbar_dir = delphes_dir / "pythia8_ttbar/raw" + qcd_dir = delphes_dir / "pythia8_qcd/val" + + if not ttbar_dir.exists(): + ttbar_dir.mkdir(parents=True) + for ttbar_file in delphes_dir.glob("*ttbar*.pkl.bz2"): + ttbar_file.rename(ttbar_dir / ttbar_file.name) + if not qcd_dir.exists(): + qcd_dir.mkdir(parents=True) + for qcd_file in delphes_dir.glob("*qcd*.pkl.bz2"): + qcd_file.rename(qcd_dir / qcd_file.name) + + return { + "train": self._generate_examples(delphes_dir / "pythia8_ttbar/raw"), + "test": self._generate_examples(delphes_dir / "pythia8_qcd/val"), + } + + def _generate_examples(self, path): + """Yields examples.""" + for fi in path.glob("*.pkl.bz2"): + X, ygen, ycand = self.prepare_data_delphes(str(fi)) + for ibatch in range(X.shape[0]): + yield str(fi) + "_" + str(ibatch), { + "X": X[ibatch], + "ygen": ygen[ibatch], + "ycand": ycand[ibatch], + } + + def prepare_data_delphes(self, fname): + + if fname.endswith(".pkl"): + data = pickle.load(open(fname, "rb")) + elif fname.endswith(".pkl.bz2"): + data = pickle.load(bz2.BZ2File(fname, "rb")) + else: + raise Exception("Unknown file: {}".format(fname)) + + # make all inputs and outputs the same size with padding + Xs = [] + ygens = [] + ycands = [] + for i in range(len(data["X"])): + X = np.array(data["X"][i][:PADDED_NUM_ELEM_SIZE], np.float32) + X = np.pad(X, [(0, PADDED_NUM_ELEM_SIZE - X.shape[0]), (0, 0)]) + + ygen = np.array(data["ygen"][i][:PADDED_NUM_ELEM_SIZE], np.float32) + ygen = np.pad(ygen, [(0, PADDED_NUM_ELEM_SIZE - ygen.shape[0]), (0, 0)]) + + ycand = np.array(data["ycand"][i][:PADDED_NUM_ELEM_SIZE], np.float32) + ycand = np.pad(ycand, [(0, PADDED_NUM_ELEM_SIZE - ycand.shape[0]), (0, 0)]) + + X = np.expand_dims(X, 0) + ygen = 
np.expand_dims(ygen, 0) + ycand = np.expand_dims(ycand, 0) + + Xs.append(X) + ygens.append(ygen) + ycands.append(ycand) + + X = np.concatenate(Xs) + ygen = np.concatenate(ygens) + ycand = np.concatenate(ycands) + + del data + return X, ygen, ycand + + +def get_delphes_from_zenodo(download_dir="."): + # url = 'https://zenodo.org/record/4559324' + zenodo_doi = "10.5281/zenodo.4559324" + print("Downloading data from {} to {}".format(zenodo_doi, download_dir)) + os.system("zenodo_get -d {} -o {}".format(zenodo_doi, download_dir)) + return Path(download_dir) diff --git a/mlpf/heptfds/delphes_pf/delphes_pf_test.py b/mlpf/heptfds/delphes_pf/delphes_pf_test.py new file mode 100644 index 000000000..6e62a0db3 --- /dev/null +++ b/mlpf/heptfds/delphes_pf/delphes_pf_test.py @@ -0,0 +1,26 @@ +"""delphes_pf dataset.""" + +import tensorflow_datasets as tfds + +from . import delphes_pf + + +class DelphesPfTest(tfds.testing.DatasetBuilderTestCase): + """Tests for delphes_pf dataset.""" + + # TODO(delphes_pf): + DATASET_CLASS = delphes_pf.DelphesPf + SPLITS = { + "train": 3, # Number of fake train example + "test": 1, # Number of fake test example + } + + # If you are calling `download/download_and_extract` with a dict, like: + # dl_manager.download({'some_key': 'http://a.org/out.txt', ...}) + # then the tests needs to provide the fake output paths relative to the + # fake data directory + # DL_EXTRACT_RESULT = {'some_key': 'output_file1.txt', ...} + + +if __name__ == "__main__": + tfds.testing.test_main() diff --git a/mlpf/tallinn/cms-gen.sh b/mlpf/tallinn/cms-gen.sh index 8254ac9af..8915b281c 100755 --- a/mlpf/tallinn/cms-gen.sh +++ b/mlpf/tallinn/cms-gen.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH -p gpu -#SBATCH --gpus 8 +#SBATCH --gpus 4 #SBATCH --mem-per-gpu=8G IMG=/home/software/singularity/tf-2.9.0.simg diff --git a/mlpf/tallinn/genjob.sh b/mlpf/tallinn/genjob.sh index 66e3d2c0a..5e6474fb1 100644 --- a/mlpf/tallinn/genjob.sh +++ b/mlpf/tallinn/genjob.sh @@ -13,7 +13,7 @@ SEED=$2 mkdir -p $WORKDIR cd $WORKDIR -/home/joosep/particleflow/mlpf/data/genjob.sh $SAMPLE $SEED +/home/joosep/particleflow/mlpf/data_cms/genjob.sh $SAMPLE $SEED #cp $WORKDIR/$SAMPLE/$SEED/pfntuple_*.root /hdfs/local/joosep/mlpf/gen/v2/$SAMPLE/root/ cp $WORKDIR/$SAMPLE/$SEED/pfntuple_*.pkl.bz2 /hdfs/local/joosep/mlpf/gen/v2/$SAMPLE/raw/ diff --git a/mlpf/tallinn/genjob_pu.sh b/mlpf/tallinn/genjob_pu.sh index 63e60ffda..afd272204 100644 --- a/mlpf/tallinn/genjob_pu.sh +++ b/mlpf/tallinn/genjob_pu.sh @@ -13,7 +13,7 @@ SEED=$2 mkdir -p $WORKDIR cd $WORKDIR -/home/joosep/particleflow/mlpf/data/genjob_pu.sh $SAMPLE $SEED +/home/joosep/particleflow/mlpf/data_cms/genjob_pu.sh $SAMPLE $SEED #cp $WORKDIR/$SAMPLE/$SEED/pfntuple_*.root /hdfs/local/joosep/mlpf/gen/v2/$SAMPLE/root/ cp $WORKDIR/$SAMPLE/$SEED/pfntuple_*.pkl.bz2 /hdfs/local/joosep/mlpf/gen/v2/$SAMPLE/raw/ diff --git a/mlpf/tallinn/submit-test-eventloss.sh b/mlpf/tallinn/submit-test-eventloss.sh index 86470f9a0..9e9bd5c15 100644 --- a/mlpf/tallinn/submit-test-eventloss.sh +++ b/mlpf/tallinn/submit-test-eventloss.sh @@ -1,7 +1,6 @@ sbatch mlpf/tallinn/cms-mlpf-test.sh parameters/test-eventloss/baseline.yaml sbatch mlpf/tallinn/cms-mlpf-test.sh parameters/test-eventloss/baseline-mask_reg_cls0.yaml sbatch mlpf/tallinn/cms-mlpf-test.sh parameters/test-eventloss/genjet_logcosh_mask_reg_cls0.yaml -sbatch mlpf/tallinn/cms-mlpf-test.sh parameters/test-eventloss/baseline-clspt.yaml sbatch mlpf/tallinn/cms-mlpf-test.sh parameters/test-eventloss/swd.yaml sbatch mlpf/tallinn/cms-mlpf-test.sh 
parameters/test-eventloss/h2d.yaml sbatch mlpf/tallinn/cms-mlpf-test.sh parameters/test-eventloss/genjet_mse.yaml diff --git a/notebooks/clic.ipynb b/notebooks/clic.ipynb index 83226bf0a..7cf43c93e 100644 --- a/notebooks/clic.ipynb +++ b/notebooks/clic.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "id": "foster-monte", "metadata": {}, "outputs": [], @@ -16,659 +16,299 @@ "import matplotlib.pyplot as plt\n", "import plotly.graph_objects as go\n", "import networkx as nx\n", - "from networkx.drawing.nx_pydot import graphviz_layout" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "grave-trauma", - "metadata": {}, - "outputs": [], - "source": [ - "# data = json.load(bz2.BZ2File(\"/home/joosep/Downloads/pythia6_ttbar_0001_pandora.json.bz2\", \"r\"))\n", - "data = json.load(bz2.BZ2File(\"/home/joosep/Downloads/pythia6_ttbar_0001_pandora_0.json.bz2\", \"r\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "loose-paradise", - "metadata": {}, - "outputs": [], - "source": [ - "# http://flc.desy.de/lcnotes/notes/localfsExplorer_read?currentPath=/afs/desy.de/group/flc/lcnotes/LC-DET-2006-004.pdf\n", - "a = 3 * 10**-4\n", - "b = 5 # B-field in tesla" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "commercial-bedroom", - "metadata": {}, - "outputs": [], - "source": [ - "def track_pt(omega):\n", - " return a * np.abs(b / omega)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "efficient-harrison", - "metadata": {}, - "outputs": [], - "source": [ - "iev = 3\n", - "df_gen = pandas.DataFrame(data[iev][\"genparticles\"])\n", - "\n", - "df_hit = pandas.DataFrame(data[iev][\"track_hits\"])\n", - "df_cl = pandas.DataFrame(data[iev][\"clusters\"])\n", - "df_tr = pandas.DataFrame(data[iev][\"tracks\"])\n", - "df_ecal = pandas.DataFrame(data[iev][\"ecal_hits\"])\n", - "df_hcal = pandas.DataFrame(data[iev][\"hcal_hits\"])\n", - "df_pfs = pandas.DataFrame(data[iev][\"pfs\"])\n", - "\n", - "df_tr[\"pt\"] = track_pt(df_tr[\"omega\"])\n", - "df_tr[\"px\"] = np.cos(df_tr[\"phi\"]) * df_tr[\"pt\"]\n", - "df_tr[\"py\"] = np.sin(df_tr[\"phi\"]) * df_tr[\"pt\"]\n", - "df_tr[\"pz\"] = df_tr[\"tan_lambda\"] * df_tr[\"pt\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28545ed0", - "metadata": {}, - "outputs": [], - "source": [ - "df_gen[df_gen[\"pdgid\"].abs() == 15]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6cc1ff5", - "metadata": {}, - "outputs": [], - "source": [ - "df_hit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9089cfae", - "metadata": {}, - "outputs": [], - "source": [ - "df_ecal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d2e01940", - "metadata": {}, - "outputs": [], - "source": [ - "df_hcal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e22678f", - "metadata": {}, - "outputs": [], - "source": [ - "df_tr" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d4d980b", - "metadata": {}, - "outputs": [], - "source": [ - "df_cl" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "efc9be54", - "metadata": {}, - "outputs": [], - "source": [ - "df_gen" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a43ec1b", - "metadata": {}, - "outputs": [], - "source": [ - "df_pfs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b534a31", - "metadata": 
{}, - "outputs": [], - "source": [ - "g = nx.DiGraph()\n", - "for igen in range(len(df_gen)):\n", - " g.add_node(\"gen{}\".format(igen), typ=int(df_gen.iloc[igen][\"pdgid\"]), e=df_gen.iloc[igen][\"energy\"])\n", - "\n", - "for igen in range(len(df_gen)):\n", - " idx_parent0 = int(df_gen.iloc[igen][\"idx_parent0\"])\n", - " if idx_parent0 != -1:\n", - " g.add_edge(\"gen{}\".format(idx_parent0), \"gen{}\".format(igen))\n", - "\n", - "for icl in range(len(df_cl)):\n", - " g.add_node(\"clu{}\".format(icl), typ=df_cl.iloc[icl][\"type\"], e=df_cl.iloc[icl][\"energy\"])\n", - " for gp, gp_w in df_cl.iloc[icl][\"gp_contributions\"].items():\n", - " if gp_w / df_cl.iloc[icl][\"energy\"] > 0.2:\n", - " g.add_edge(\"gen{}\".format(gp), \"clu{}\".format(icl))\n", - "\n", - "for itr in range(len(df_tr)):\n", - " g.add_node(\"tra{}\".format(itr), typ=0, e=df_tr.iloc[itr][\"pt\"])\n", - " for gp, gp_w in df_tr.iloc[itr][\"gp_contributions\"].items():\n", - " if gp_w / df_tr.iloc[itr][\"nhits\"] > 0.2:\n", - " g.add_edge(\"gen{}\".format(gp), \"tra{}\".format(itr))\n", - "\n", - "for ipf in range(len(df_pfs)):\n", - " g.add_node(\"pfo{}\".format(ipf), typ=int(df_pfs.iloc[ipf][\"type\"]), e=df_pfs.iloc[ipf][\"energy\"])\n", - " cl_idx = int(df_pfs.iloc[ipf][\"cluster_idx\"])\n", - " if cl_idx != -1:\n", - " g.add_edge(\"clu{}\".format(cl_idx), \"pfo{}\".format(ipf))\n", - "\n", - " tr_idx = int(df_pfs.iloc[ipf][\"track_idx\"])\n", - " if tr_idx != -1:\n", - " g.add_edge(\"tra{}\".format(tr_idx), \"pfo{}\".format(ipf))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d321f39f", - "metadata": {}, - "outputs": [], - "source": [ - "def node_color(node):\n", - " if node.startswith(\"gen\"):\n", - " if abs(g.nodes[node][\"typ\"]) == 15:\n", - " return \"purple\"\n", - " return \"red\"\n", - " elif node.startswith(\"clu\"):\n", - " return \"blue\"\n", - " elif node.startswith(\"tra\"):\n", - " return \"green\"\n", - " else:\n", - " return \"gray\"\n", - "\n", - "\n", - "def node_label(node):\n", - " typ = node[:4]\n", - " l = \"{}\".format(g.nodes[node][\"typ\"])\n", - " return l" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51f8016d", - "metadata": {}, - "outputs": [], - "source": [ - "plt.figure(figsize=(50, 30))\n", - "pos = graphviz_layout(g, prog=\"dot\")\n", - "nx.draw_networkx_nodes(\n", - " g,\n", - " pos,\n", - " node_size=[5 * g.nodes[n][\"e\"] for n in g.nodes],\n", - " node_color=[node_color(n) for n in g.nodes],\n", - ")\n", - "nx.draw_networkx_labels(g, pos, labels={n: node_label(n) for n in g.nodes}, font_size=5)\n", - "nx.draw_networkx_edges(g, pos, node_size=100.0)\n", - "plt.savefig(\"plot.svg\")" + "from networkx.drawing.nx_pydot import graphviz_layout\n", + "import glob" ] }, { "cell_type": "code", - "execution_count": null, - "id": "9f4b6e10", - "metadata": {}, - "outputs": [], - "source": [ - "matrix_tr_to_gp = np.zeros((len(df_tr), len(df_gen)))\n", - "matrix_cl_to_gp = np.zeros((len(df_cl), len(df_gen)))\n", - "\n", - "for itr in range(len(df_tr)):\n", - " gps = df_tr.loc[itr][\"gp_contributions\"]\n", - " for gp, val in gps.items():\n", - " matrix_tr_to_gp[itr, int(gp)] += val\n", - "\n", - "for icl in range(len(df_cl)):\n", - " gps = df_cl.loc[icl][\"gp_contributions\"]\n", - " for gp, val in gps.items():\n", - " matrix_cl_to_gp[icl, int(gp)] += val" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cleared-vessel", + "execution_count": 97, + "id": "5a0eb769", "metadata": {}, "outputs": [], "source": [ - "import 
networkx as nx" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "suspended-worst", - "metadata": {}, - "outputs": [], - "source": [ - "def filter_gp(gp):\n", - " row = df_gen.loc[gp]\n", - " if row[\"status\"] == 1 and row[\"energy\"] > 0.2:\n", - " return True\n", - " return False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "geographic-sailing", - "metadata": {}, - "outputs": [], - "source": [ - "reco_to_pf = {}\n", - "for ipf in range(len(df_pfs)):\n", - " row = df_pfs.loc[ipf]\n", - " if row[\"track_idx\"] != -1:\n", - " k = (\"tr\", int(row[\"track_idx\"]))\n", - " assert not (k in reco_to_pf)\n", - " reco_to_pf[k] = ipf\n", - " elif row[\"cluster_idx\"] != -1:\n", - " k = (\"cl\", int(row[\"cluster_idx\"]))\n", - " assert not (k in reco_to_pf)\n", - " reco_to_pf[k] = ipf\n", - " else:\n", - " # PF should always have a track or a cluster associated\n", - " assert False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "documented-crash", - "metadata": {}, - "outputs": [], - "source": [ - "dg = nx.Graph()\n", - "\n", - "gps = set()\n", - "\n", - "# loop over clusters, get all genparticles associated to clusters\n", - "for icl in range(len(df_cl)):\n", - " row = df_cl.loc[icl]\n", - " dg.add_node((\"cl\", icl))\n", - " for gp, weight in row[\"gp_contributions\"].items():\n", - " gp = int(gp)\n", - " if filter_gp(gp):\n", - " dg.add_node((\"gp\", gp))\n", - " gps.add(gp)\n", - " dg.add_edge((\"gp\", gp), (\"cl\", icl), weight=weight)\n", - "\n", - "\n", - "# loop over tracks, get all genparticles associated to tracks\n", - "for itr in range(len(df_tr)):\n", - " row = df_tr.loc[itr]\n", - " dg.add_node((\"tr\", itr))\n", - " for gp in row[\"gp_contributions\"].keys():\n", - " gp = int(gp)\n", - " if filter_gp(gp):\n", - " dg.add_node((\"gp\", gp))\n", - " gps.add(gp)\n", - "\n", - " # the track is added to the genparticle with a very high weight\n", - " # because we always want to associate the genparticle to a track if it's possible\n", - " dg.add_edge((\"gp\", gp), (\"tr\", itr), weight=9999.0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9081aa3", - "metadata": {}, - "outputs": [], - "source": [ - "node_labels = {k: \"{}={}\".format(k[0], k[1]) for k in dg.nodes}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1fe2061", - "metadata": {}, - "outputs": [], - "source": [ - "plt.figure(figsize=(20, 20))\n", - "pos = nx.nx_pydot.pydot_layout(dg)\n", - "nx.draw_networkx(dg, pos=pos, labels=node_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "736406aa", - "metadata": {}, - "outputs": [], - "source": [ - "# uniqe genparticles\n", - "gps = set(gps)\n", - "\n", - "# now loop over all the genparticles\n", - "pairs = {}\n", - "for gp in gps:\n", - " gp_node = (\"gp\", gp)\n", - "\n", - " # find the neighboring reco elements (clusters and tracks)\n", - " neighbors = list(dg.neighbors(gp_node))\n", - " weights = [dg.edges[gp_node, n][\"weight\"] for n in neighbors]\n", - " nw = zip(neighbors, weights)\n", - "\n", - " # sort the neighbors by the edge weight (deposited energy)\n", - " nw = sorted(nw, key=lambda x: x[1], reverse=True)\n", - " reco_obj = None\n", - " if len(nw) > 0:\n", - " # choose the closest neighbor as the \"key\" reco element\n", - " reco_obj = nw[0][0]\n", - "\n", - " # remove the reco element from the list, so it can't be associated to anything else\n", - " dg.remove_node(reco_obj)\n", - "\n", - " # this genparticle had a 
unique reco element\n", - " if reco_obj:\n", - " pf_obj = None\n", - " if reco_obj and reco_obj in reco_to_pf:\n", - " pf_obj = reco_to_pf[reco_obj]\n", - "\n", - " assert not (reco_obj in pairs)\n", - " pairs[reco_obj] = (gp, pf_obj)\n", - "\n", - " # this is a case where a genparticle did not have a key reco element, but instead was smeared between others\n", - " else:\n", - " print(\"genparticle {} is merged and cannot be reconstructed\".format(gp))\n", - " print(df_gen.loc[gp])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "amazing-landing", - "metadata": {}, - "outputs": [], - "source": [ - "len(df_tr), len(df_cl), len(pairs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "proud-going", - "metadata": {}, - "outputs": [], - "source": [ - "def track_as_array(df_tr, itr):\n", - " row = df_tr.loc[itr]\n", - " return [0, row[\"px\"], row[\"py\"], row[\"pz\"], row[\"nhits\"], row[\"d0\"], row[\"z0\"]]\n", + "# 12,14,16 are neutrinos.\n", + "neutrinos = [12, 14, 16]\n", + "labels_ys_cand = [0, 211, 130, 22, 11, 13]\n", + "\n", + "# this is what I can reconstruct\n", + "labels_ys_gen = [0, 211, 130, 22, 11, 13]\n", + "\n", + "\n", + "def prepare_data_clic(fn):\n", + "\n", + " batched_data = []\n", + "\n", + " def map_pdgid_to_candid(pdgid, charge):\n", + " if pdgid in [0, 22, 11, 13]:\n", + " return pdgid\n", + "\n", + " # charged hadron\n", + " if abs(charge) > 0:\n", + " return 211\n", + "\n", + " # neutral hadron\n", + " return 130\n", + "\n", + " def track_pt(omega):\n", + " return a * np.abs(b / omega)\n", + "\n", + " def track_as_array(df_tr, itr):\n", + " row = df_tr.loc[itr]\n", + " return [0, row[\"px\"], row[\"py\"], row[\"pz\"], row[\"nhits\"], row[\"d0\"], row[\"z0\"]]\n", + "\n", + " def cluster_as_array(df_cl, icl):\n", + " row = df_cl.loc[icl]\n", + " return [1, row[\"x\"], row[\"y\"], row[\"z\"], row[\"nhits_ecal\"], row[\"nhits_hcal\"], row[\"energy\"]]\n", + "\n", + " def gen_as_array(df_gen, igen):\n", + " if igen:\n", + " row = df_gen.loc[igen]\n", + " return np.array([abs(row[\"pdgid\"]), row[\"charge\"], row[\"px\"], row[\"py\"], row[\"pz\"], row[\"energy\"]])\n", + " else:\n", + " return np.zeros(6)\n", + "\n", + " def pf_as_array(df_pfs, igen):\n", + " if igen:\n", + " row = df_pfs.loc[igen]\n", + " return np.array([abs(row[\"type\"]), row[\"charge\"], row[\"px\"], row[\"py\"], row[\"pz\"], row[\"energy\"]])\n", + " else:\n", + " return np.zeros(6)\n", + "\n", + " def filter_gp(gp):\n", + " row = df_gen.loc[gp]\n", + " if row[\"status\"] == 1 and row[\"energy\"] > 0.2:\n", + " return True\n", + " return False\n", + "\n", + " def flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs):\n", + " Xs = []\n", + " ys_gen = []\n", + " ys_cand = []\n", + "\n", + " # find all track-associated particles\n", + " for itr in range(len(df_tr)):\n", + "\n", + " k = (\"tr\", itr)\n", + " gp = None\n", + " rp = None\n", + " if k in pairs:\n", + " gp = pairs[k][0]\n", + " rp = pairs[k][1]\n", + "\n", + " # normalize ysgen and yscand\n", + " ys = gen_as_array(df_gen, gp)\n", + " cand = pf_as_array(df_pfs, rp)\n", + " # skip the neutrinos\n", + " if (abs(ys[0]) in neutrinos) or (abs(cand[0]) in neutrinos):\n", + " continue\n", + " else:\n", + " ys[0] = labels_ys_gen.index(map_pdgid_to_candid(abs(ys[0]), ys[-1]))\n", + " cand[0] = labels_ys_cand.index(map_pdgid_to_candid(abs(cand[0]), cand[-1]))\n", + " ys_gen.append(np.delete(ys, -1))\n", + " ys_cand.append(np.delete(cand, -1))\n", + " Xs.append(track_as_array(df_tr, itr))\n", + "\n", + " # 
find all cluster-associated particles\n", + " for icl in range(len(df_cl)):\n", + "\n", + " k = (\"cl\", icl)\n", + " gp = None\n", + " rp = None\n", + " if k in pairs:\n", + " gp = pairs[k][0]\n", + " rp = pairs[k][1]\n", + "\n", + " # normalize ysgen and yscand\n", + " ys = gen_as_array(df_gen, gp)\n", + " cand = pf_as_array(df_pfs, rp)\n", + " # skip the neutrinos\n", + " if (abs(ys[0]) in neutrinos) or (abs(cand[0]) in neutrinos):\n", + " continue\n", + " else:\n", + " ys[0] = labels_ys_gen.index(map_pdgid_to_candid(abs(ys[0]), ys[-1]))\n", + " cand[0] = labels_ys_cand.index(map_pdgid_to_candid(abs(cand[0]), cand[-1]))\n", + " # if icl == 5:\n", + " # print(ys[0], ys[-1])\n", + " ys_gen.append(np.delete(ys, -1))\n", + " ys_cand.append(np.delete(cand, -1))\n", + " Xs.append(cluster_as_array(df_cl, icl))\n", + "\n", + " Xs = np.stack(Xs, axis=-1).T\n", + " ys_gen = np.stack(ys_gen, axis=-1).T\n", + " # print(\"ys_gen flatten\",ys_gen[:10])\n", + " ys_cand = np.stack(ys_cand, axis=-1).T\n", + "\n", + " return Xs, ys_gen, ys_cand\n", + "\n", + " data = json.load(bz2.BZ2File(fn, \"r\"))\n", + " a = 3 * 10**-4\n", + " b = 5 # B-field in tesla\n", + "\n", + " ret = []\n", + " for iev in range(len(data)):\n", + " df_gen = pandas.DataFrame(data[iev][\"genparticles\"])\n", + "\n", + " # df_hit = pandas.DataFrame(data[iev][\"track_hits\"])\n", + " df_cl = pandas.DataFrame(data[iev][\"clusters\"])\n", + " df_tr = pandas.DataFrame(data[iev][\"tracks\"])\n", + " # df_ecal = pandas.DataFrame(data[iev][\"ecal_hits\"])\n", + " # df_hcal = pandas.DataFrame(data[iev][\"hcal_hits\"])\n", + " df_pfs = pandas.DataFrame(data[iev][\"pfs\"])\n", + "\n", + " df_tr[\"pt\"] = track_pt(df_tr[\"omega\"])\n", + " df_tr[\"px\"] = np.cos(df_tr[\"phi\"]) * df_tr[\"pt\"]\n", + " df_tr[\"py\"] = np.sin(df_tr[\"phi\"]) * df_tr[\"pt\"]\n", + " df_tr[\"pz\"] = df_tr[\"tan_lambda\"] * df_tr[\"pt\"]\n", + "\n", + " matrix_tr_to_gp = np.zeros((len(df_tr), len(df_gen)))\n", + " matrix_cl_to_gp = np.zeros((len(df_cl), len(df_gen)))\n", + "\n", + " for itr in range(len(df_tr)):\n", + " gps = df_tr.loc[itr][\"gp_contributions\"]\n", + " for gp, val in gps.items():\n", + " matrix_tr_to_gp[itr, int(gp)] += val\n", + "\n", + " for icl in range(len(df_cl)):\n", + " gps = df_cl.loc[icl][\"gp_contributions\"]\n", + " for gp, val in gps.items():\n", + " matrix_cl_to_gp[icl, int(gp)] += val\n", + "\n", + " reco_to_pf = {}\n", + " for ipf in range(len(df_pfs)):\n", + " row = df_pfs.loc[ipf]\n", + " if row[\"track_idx\"] != -1:\n", + " k = (\"tr\", int(row[\"track_idx\"]))\n", + " assert not (k in reco_to_pf)\n", + " reco_to_pf[k] = ipf\n", + " elif row[\"cluster_idx\"] != -1:\n", + " k = (\"cl\", int(row[\"cluster_idx\"]))\n", + " assert not (k in reco_to_pf)\n", + " reco_to_pf[k] = ipf\n", + " else:\n", + " # PF should always have a track or a cluster associated\n", + " assert False\n", + "\n", + " dg = nx.Graph()\n", + "\n", + " gps = set()\n", + "\n", + " # loop over clusters, get all genparticles associated to clusters\n", + " for icl in range(len(df_cl)):\n", + " row = df_cl.loc[icl]\n", + " dg.add_node((\"cl\", icl))\n", + " for gp, weight in row[\"gp_contributions\"].items():\n", + " gp = int(gp)\n", + " if filter_gp(gp):\n", + " dg.add_node((\"gp\", gp))\n", + " gps.add(gp)\n", + " dg.add_edge((\"gp\", gp), (\"cl\", icl), weight=weight)\n", + "\n", + " # loop over tracks, get all genparticles associated to tracks\n", + " for itr in range(len(df_tr)):\n", + " row = df_tr.loc[itr]\n", + " dg.add_node((\"tr\", itr))\n", + " for gp in 
row[\"gp_contributions\"].keys():\n", + " gp = int(gp)\n", + " if filter_gp(gp):\n", + " dg.add_node((\"gp\", gp))\n", + " gps.add(gp)\n", "\n", + " # the track is added to the genparticle with a very high weight\n", + " # because we always want to associate the genparticle to a track if it's possible\n", + " dg.add_edge((\"gp\", gp), (\"tr\", itr), weight=9999.0)\n", "\n", - "def cluster_as_array(df_cl, icl):\n", - " row = df_cl.loc[icl]\n", - " return [1, row[\"x\"], row[\"y\"], row[\"z\"], row[\"nhits_ecal\"], row[\"nhits_hcal\"], 0.0]\n", + " # uniqe genparticles\n", + " gps = set(gps)\n", "\n", + " # now loop over all the genparticles\n", + " pairs = {}\n", + " for gp in gps:\n", + " gp_node = (\"gp\", gp)\n", "\n", - "def gen_as_array(df_gen, igen):\n", - " if igen:\n", - " row = df_gen.loc[igen]\n", - " return np.array([abs(row[\"pdgid\"]), row[\"charge\"], row[\"px\"], row[\"py\"], row[\"pz\"], row[\"energy\"]])\n", - " else:\n", - " return np.zeros(6)\n", + " # find the neighboring reco elements (clusters and tracks)\n", + " neighbors = list(dg.neighbors(gp_node))\n", + " weights = [dg.edges[gp_node, n][\"weight\"] for n in neighbors]\n", + " nw = zip(neighbors, weights)\n", "\n", + " # sort the neighbors by the edge weight (deposited energy)\n", + " nw = sorted(nw, key=lambda x: x[1], reverse=True)\n", + " reco_obj = None\n", + " if len(nw) > 0:\n", + " # choose the closest neighbor as the \"key\" reco element\n", + " reco_obj = nw[0][0]\n", "\n", - "def pf_as_array(df_pfs, igen):\n", - " if igen:\n", - " row = df_pfs.loc[igen]\n", - " return np.array([abs(row[\"type\"]), row[\"charge\"], row[\"px\"], row[\"py\"], row[\"pz\"], row[\"energy\"]])\n", - " else:\n", - " return np.zeros(6)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "recent-folder", - "metadata": {}, - "outputs": [], - "source": [ - "def flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs):\n", - " Xs = []\n", - " ys_gen = []\n", - " ys_cand = []\n", - "\n", - " # find all track-associated particles\n", - " for itr in range(len(df_tr)):\n", - " Xs.append(track_as_array(df_tr, itr))\n", - "\n", - " k = (\"tr\", itr)\n", - " gp = None\n", - " rp = None\n", - " if k in pairs:\n", - " gp = pairs[k][0]\n", - " rp = pairs[k][1]\n", - " ys_gen.append(gen_as_array(df_gen, gp))\n", - " ys_cand.append(pf_as_array(df_pfs, rp))\n", - "\n", - " # find all cluster-associated particles\n", - " for icl in range(len(df_cl)):\n", - " Xs.append(cluster_as_array(df_cl, icl))\n", - "\n", - " k = (\"cl\", icl)\n", - " gp = None\n", - " rp = None\n", - " if k in pairs:\n", - " gp = pairs[k][0]\n", - " rp = pairs[k][1]\n", - " ys_gen.append(gen_as_array(df_gen, gp))\n", - " ys_cand.append(pf_as_array(df_pfs, rp))\n", - "\n", - " Xs = np.stack(Xs, axis=-1).T\n", - " ys_gen = np.stack(ys_gen, axis=-1).T\n", - " ys_cand = np.stack(ys_cand, axis=-1).T\n", - "\n", - " return Xs, ys_gen, ys_cand" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "owned-rolling", - "metadata": {}, - "outputs": [], - "source": [ - "Xs, ys_gen, ys_cand = flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs)\n", - "len(Xs), len(ys_gen), len(ys_cand)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c022fce0", - "metadata": {}, - "outputs": [], - "source": [ - "import sklearn\n", - "import sklearn.metrics" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16dde9e2", - "metadata": {}, - "outputs": [], - "source": [ - "np.unique(ys_gen[:, 0])" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "012ef075", - "metadata": {}, - "outputs": [], - "source": [ - "np.unique(ys_cand[:, 0])" + " # remove the reco element from the list, so it can't be associated to anything else\n", + " dg.remove_node(reco_obj)\n", + "\n", + " # this genparticle had a unique reco element\n", + " if reco_obj:\n", + " pf_obj = None\n", + " if reco_obj and reco_obj in reco_to_pf:\n", + " pf_obj = reco_to_pf[reco_obj]\n", + "\n", + " assert not (reco_obj in pairs)\n", + " pairs[reco_obj] = (gp, pf_obj)\n", + "\n", + " # this is a case where a genparticle did not have a key reco element, but instead was smeared between others\n", + " # else:\n", + " # print(\"genparticle {} is merged and cannot be reconstructed\".format(gp))\n", + " # print(df_gen.loc[gp])\n", + "\n", + " Xs, ys_gen, ys_cand = flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs)\n", + " ret.append((Xs, ys_gen, ys_cand))\n", + " return ret" ] }, { "cell_type": "code", - "execution_count": null, - "id": "e9c5b8cd", + "execution_count": 118, + "id": "935316c1", "metadata": {}, "outputs": [], "source": [ - "labels = [0, 13, 11, 22, 130, 211, 321, 2112, 2212]\n", - "labels_text = {\n", - " 0: \"none\",\n", - " 13: \"mu\",\n", - " 11: \"el\",\n", - " 22: \"$\\gamma$\",\n", - " 130: \"$K^0_L$\",\n", - " 211: \"$\\pi^\\pm$\",\n", - " 321: \"$K^+$\",\n", - " 2112: \"n\",\n", - " 2212: \"p\",\n", - "}\n", - "cm = sklearn.metrics.confusion_matrix(ys_gen[:, 0], ys_cand[:, 0], labels=labels, normalize=\"true\")" + "ret = []\n", + "for fi in list(glob.glob(\"../data/clic/gev380ee_pythia6_ttbar_rfull201/raw/*.json.bz2\"))[:500]:\n", + " ret += prepare_data_clic(fi)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "8817f3e5", + "execution_count": 120, + "id": "7d05d4c5", "metadata": {}, "outputs": [], "source": [ - "plt.imshow(cm, cmap=\"Blues\")\n", - "plt.xticks(range(len(labels)), [labels_text[l] for l in labels], rotation=90)\n", - "plt.yticks(range(len(labels)), [labels_text[l] for l in labels])\n", - "plt.xlabel(\"reco\")\n", - "plt.ylabel(\"gen\")" + "num_elems = [len(r[0]) for r in ret]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "swedish-special", + "execution_count": 123, + "id": "415989d8", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAOoElEQVR4nO3dXYycV33H8e+vCYQKKM7L1rJs002LpYobgmWlrkCoJYISp6pTCVCqqrGQJd8ECUSr1pSLUqkXTqWSNhKK5DZRHUSBiBfFItCSmiDUiwQ2EJy3pllSR7HlxAZCACFQA/9ezHE7bHa9493ZHfvM9yON5jzneWbnf/TEv5w5M/NMqgpJUl9+adIFSJLGz3CXpA4Z7pLUIcNdkjpkuEtShy6edAEAV1xxRc3Ozk66DEm6oDz44IPfqaqZxfadF+E+OzvL3NzcpMuQpAtKkqeX2ueyjCR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdei8+IaqdMbs/ntW/NhjB64bYyXShc2ZuyR1yHCXpA4Z7pLUIcNdkjpkuEtSh/y0jKaen9BRj5y5S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtSh0YK9yTHkjyc5KEkc63vsiT3Jnmy3V/a+pPk1iTzSY4m2b6WA5AkvdS5zNx/t6quqqodbXs/cKSqtgFH2jbAtcC2dtsH3DauYiVJo1nNssxu4FBrHwKuH+q/swbuBzYk2bSK55EknaNRw72ALyV5MMm+1rexqk629rPAxtbeDDwz9NjjrU+StE5GvSrkm6vqRJJfBe5N8p/DO6uqktS5PHH7n8Q+gNe+9rXn8lBJ0jJGmrlX1Yl2fwr4HHA18NyZ5ZZ2f6odfgLYOvTwLa1v4d88WFU7qmrHzMzMykcgSXqJZcM9ySuTvPpMG3g78AhwGNjTDtsD3N3ah4Eb26dmdgIvDC3fSJLWwSjLMhuBzyU5c/y/VNW/Jvk6cFeSvcDTwLvb8V8AdgHzwI+B94y9aknSWS0b7lX1FPCGRfq/C1yzSH8BN42lOknSivgNVUnqkOEuSR0y3CWpQ4a7JHXIcJekDo36DVVNodn996z4sccOXDfGSiSdK2fuktQhZ+7qxmpeaUi9ceYuSR0y3CWpQy7LaE24RCJNljN3SeqQ4S5JHTLcJalDrrlLq7DS9xb8kpfWmjN3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktShkcM9yUVJvpnk8237yiQPJJlP8qkkL2/9l7Tt+bZ/do1qlyQt4Vxm7u8DHh/avhm4papeBzwP7G39e4HnW/8t7ThJ0joaKdyTbAGuA/6pbQd4K/Dpdsgh4PrW3t22afuvacdLktbJqDP3vwf+HPh5274c+H5Vvdi2jwObW3sz8AxA2/9CO/4XJNmXZC7J3OnTp1dWvSRpUcuGe5LfB05V1YPjfOKqOlhVO6pqx8zMzDj/tCRNvVF+Q/VNwB8k2QW8AvgV4B+ADUkubrPzLcCJdvwJYCtwPMnFwGuA7469cknSkpaduVfVB6tqS1XNAjcAX66qPwbuA97ZDtsD3N3ah9s2bf+Xq6rGWrUk6axW8zn3vwA+kGSewZr67a3/duDy1v8BYP/qSpQknatRlmX+T1V9BfhKaz8FXL3IMT8B3jWG2iRJK+Q3VCWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdOqcvMUkaj9n996z4sccOXDfGStQrZ+6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOuTlB6bAar7qLunC5MxdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1KFlwz3JK5J8Lcm3kjya5K9b/5VJHkgyn+RTSV7e+i9p2/Nt/+waj0GStMAoM/efAm+tqjcAVwHvSLITuBm4papeBzwP7G3H7wWeb/23tOMkSeto2XCvgR+1zZe1WwFvBT7d+g8B17f27rZN239NkoyrYEnS8kZac09yUZKHgFPAvcC3ge9X1YvtkOPA5tbeDDwD0Pa/AFy+yN/cl2Quydzp06dXNQhJ0i8aKdyr6mdVdRWwBbga+M3VPnFVHayqHVW1Y2ZmZrV/TpI05Jw+LVNV3wfuA34b2JDkzFUltwAnWvsEsBWg7X8N8N1xFCtJGs0on5aZSbKhtX8ZeBvwOIOQf2c7bA9wd2sfbtu0/V+uqhpjzZKkZYxyPfdNwKEkFzH4n8FdVfX5JI8Bn0zyN8A3gdvb8bcDH0syD3wPuGEN6pYkncWy4V5VR4E3LtL/FIP194X9PwHeNZbqJEkr4jdUJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjp08aQL0Ghm998z6RIkXUCcuUtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1KFlwz3J1iT3JXksyaNJ3tf6L0tyb5In2/2lrT9Jbk0yn+Roku1rPQhJ0i8aZeb+IvCnVfV6YCdwU5LXA/uBI1W1DTjStgGuBba12z7gtrFXLUk6q2XDvapOVtU3WvuHwOPAZmA3cKgddgi4vrV3A3fWwP3AhiSbxl24JGlp57TmnmQWeCPwALCxqk62Xc8CG1t7M/DM0MOOt76Ff2tfkrkkc6dPnz7XuiVJZzFyuCd5FfAZ4P1V9YPhfVVVQJ3LE1fVwaraUVU7ZmZmzuWhkqRljHThsCQvYxDsH6+qz7bu55JsqqqTbdnlVOs/AWwdeviW1idpDFZ6EbljB64bcyU6n43yaZkAtwOPV9VHhnYdBva09h7g7qH+G9unZnYCLwwt30iS1sEoM/c3AX8CPJzkodb3l8AB4K4ke4GngXe3fV8AdgHzwI+B94yzYEnS8pYN96r6DyBL7L5mkeMLuGmVdUmSVsFvqEpShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHVopB/r0Pis9IcWpNVazX97/tDHhceZuyR1yHCXpA4Z7pLUIcNdkjrkG6or4Juiks53ztwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDy4Z7kjuSnEryyFDfZUnuTfJku7+09SfJrUnmkxxNsn0ti5ckLW6Umfs/A+9Y0LcfOFJV24AjbRvgWmBbu+0DbhtPmZKkc7FsuFfVV4HvLejeDRxq7UPA9UP9d9bA/cCGJJvGVKskaUQrXXPfWFUnW/tZYGNrbwaeGTrueOt7iST7kswlmTt9+vQKy5AkLWbVb6hWVQG1gscdrKodVbVjZmZmtWVIkoasNNyfO7Pc0u5Ptf4TwNah47a0PknSOlppuB8G9rT2HuDuof4b26
dmdgIvDC3fSJLWybI/1pHkE8DvAFckOQ78FXAAuCvJXuBp4N3t8C8Au4B54MfAe9agZknSMpYN96r6oyV2XbPIsQXctNqi1ou/qCSpV35DVZI65G+oSlrWSl/lHjtw3Zgr0aicuUtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOnTB/1iHP5UnSS/lzF2SOmS4S1KHLvhlGUnnr9Usm/r7q6vjzF2SOmS4S1KHDHdJ6pBr7pLOSytdr3etfsCZuyR1yHCXpA65LCOpK378cmBNZu5J3pHkiSTzSfavxXNIkpY29pl7kouAjwJvA44DX09yuKoeG/dzSdI49fQm7losy1wNzFfVUwBJPgnsBgx3SV06H5eC1iLcNwPPDG0fB35r4UFJ9gH72uaPkjyxBrUs5wrgOxN43vPFNI/fsU+n827suXlVD/+1pXZM7A3VqjoIHJzU8wMkmauqHZOsYZKmefyO3bH3bi3eUD0BbB3a3tL6JEnrZC3C/evAtiRXJnk5cANweA2eR5K0hLEvy1TVi0neC/wbcBFwR1U9Ou7nGZOJLgudB6Z5/I59Ok3N2FNVk65BkjRmXn5AkjpkuEtSh6Yq3JMcS/JwkoeSzLW+y5Lcm+TJdn/ppOschyR3JDmV5JGhvkXHmoFb2+UijibZPrnKV2+JsX84yYl27h9Ksmto3wfb2J9I8nuTqXo8kmxNcl+Sx5I8muR9rb/7c3+WsU/FuX+JqpqaG3AMuGJB398C+1t7P3DzpOsc01jfAmwHHllurMAu4ItAgJ3AA5Oufw3G/mHgzxY59vXAt4BLgCuBbwMXTXoMqxj7JmB7a78a+K82xu7P/VnGPhXnfuFtqmbuS9gNHGrtQ8D1kytlfKrqq8D3FnQvNdbdwJ01cD+wIcmmdSl0DSwx9qXsBj5ZVT+tqv8G5hlcQuOCVFUnq+obrf1D4HEG3xrv/tyfZexL6ercLzRt4V7Al5I82C5/ALCxqk629rPAxsmUti6WGutil4w42z+KC9V729LDHUPLb92OPcks8EbgAabs3C8YO0zZuYfpC/c3V9V24FrgpiRvGd5Zg9dqU/HZ0Gkaa3Mb8BvAVcBJ4O8mWs0aS/Iq4DPA+6vqB8P7ej/3i4x9qs79GVMV7lV1ot2fAj7H4CXYc2dehrb7U5OrcM0tNdbuLxlRVc9V1c+q6ufAP/L/L7+7G3uSlzEIt49X1Wdb91Sc+8XGPk3nftjUhHuSVyZ59Zk28HbgEQaXRtjTDtsD3D2ZCtfFUmM9DNzYPjmxE3hh6CV8FxasI/8hg3MPg7HfkOSSJFcC24CvrXd945IkwO3A41X1kaFd3Z/7pcY+Lef+JSb9ju563YBfZ/DO+LeAR4EPtf7LgSPAk8C/A5dNutYxjfcTDF6C/g+DtcS9S42VwSclPsrg0wIPAzsmXf8ajP1jbWxHGfyj3jR0/Ifa2J8Arp10/asc+5sZLLkcBR5qt13TcO7PMvapOPcLb15+QJI6NDXLMpI0TQx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1KH/BaR1SRZsN+isAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ - "gen_x = []\n", - "gen_y = []\n", - "gen_z = []\n", - "\n", - "mult = 10\n", - "for i in range(len(df_gen.loc[gps])):\n", - " gen_x += [0.0, mult * df_gen[\"px\"].values[i]]\n", - " gen_y += [0.0, mult * df_gen[\"py\"].values[i]]\n", - " gen_z += [0.0, mult * df_gen[\"pz\"].values[i]]\n", - "\n", - "points_gen = go.Scatter3d(x=gen_x, y=gen_z, z=gen_y, mode=\"lines\", line=dict(color=\"rgba(0, 0, 0, 1.0)\"), name=\"gen\")\n", - "\n", - "trk_x = []\n", - "trk_y = []\n", - "trk_z = []\n", - "\n", - "mult = 40\n", - "for i in range(len(df_tr)):\n", - " trk_x += [0.0, mult * df_tr[\"px\"].values[i]]\n", - " trk_y += [0.0, mult * df_tr[\"py\"].values[i]]\n", - " trk_z += [0.0, mult * df_tr[\"pz\"].values[i]]\n", - "\n", - "points_trk = go.Scatter3d(x=trk_x, y=trk_y, z=trk_z, mode=\"lines\", line=dict(color=\"rgba(0, 255, 0, 1.0)\"), name=\"tracks\")\n", - "\n", - "points_ecal = go.Scatter3d(\n", - " x=df_ecal[\"x\"].values,\n", - " y=df_ecal[\"z\"].values,\n", - " z=df_ecal[\"y\"].values,\n", - " mode=\"markers\",\n", - " marker={\"symbol\": \"square\", \"opacity\": 0.5, \"color\": \"blue\", \"size\": 1.0},\n", - " name=\"ECAL\",\n", - ")\n", - "\n", - "points_hcal = go.Scatter3d(\n", - " x=df_hcal[\"x\"].values,\n", - " y=df_hcal[\"z\"].values,\n", - " z=df_hcal[\"y\"].values,\n", - " mode=\"markers\",\n", - " marker={\"symbol\": \"square\", \"opacity\": 0.5, \"color\": \"red\", \"size\": 1.0},\n", - " name=\"HCAL\",\n", - ")\n", - "\n", - "points_clusters = go.Scatter3d(\n", - " x=df_cl[\"x\"].values,\n", - " y=df_cl[\"z\"].values,\n", - " z=df_cl[\"y\"].values,\n", - " mode=\"markers\",\n", - " marker={\"symbol\": \"square\", \"opacity\": 0.8, \"color\": \"gray\", \"size\": 5.0},\n", - " name=\"clusters\",\n", - ")\n", - "\n", - "points_hit = go.Scatter3d(\n", - " x=df_hit[\"x\"].values,\n", - " y=df_hit[\"z\"].values,\n", - " z=df_hit[\"y\"].values,\n", - " mode=\"markers\",\n", - " marker={\"symbol\": \"square\", \"opacity\": 0.8, \"color\": \"green\", \"size\": 1.0},\n", - " name=\"hits\",\n", - ")\n", - "\n", - "fig = go.Figure(data=[points_gen, points_trk, points_ecal, points_hcal, points_clusters, points_hit])\n", - "\n", - "fig.update_layout(\n", - " autosize=False,\n", - " width=700,\n", - " height=500,\n", - " margin=go.layout.Margin(\n", - " l=50,\n", - " r=0,\n", - " b=0,\n", - " t=50,\n", - " ),\n", - " scene_camera={\"eye\": dict(x=0.8, y=0.8, z=0.8)},\n", - ")\n", - "\n", - "fig.show()" + "plt.hist(num_elems, bins=21);" ] }, { "cell_type": "code", "execution_count": null, - "id": "composed-toyota", + "id": "604cc0cf", "metadata": {}, "outputs": [], "source": [] @@ -676,7 +316,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/notebooks/cms-mlpf.ipynb b/notebooks/cms-mlpf.ipynb index a7ef8ac87..c7621088a 100644 --- a/notebooks/cms-mlpf.ipynb +++ b/notebooks/cms-mlpf.ipynb @@ -52,6 +52,16 @@ "!ls -lrt ../experiments/" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "19f44820", + "metadata": {}, + "outputs": [], + "source": [ + "!du -csh ../experiments/cms-gen_20220831_084535_573597.gpu0.local/evaluation/epoch_19/*" + ] + }, { "cell_type": "code", "execution_count": null, @@ -65,9 +75,13 @@ "source": [ "# These can be overriden from the command line using `papermill cms-mlpf.ipynb -p path new/path/...`\n", "backend = \"tf\"\n", - 
"sample = \"ttbar\"\n", - "path = \"../experiments/genjet_logcosh_20220828_091350_455904.gpu0.local/evaluation/epoch_49/cms_pf_ttbar/\"\n", - "PAPERMILL_OUTPUT_PATH = \"./\"" + "sample = \"qcd_high_pt\"\n", + "# sample_nice = r\"$Z\\rightarrow \\tau \\tau$+PU events\"\n", + "sample_nice = \"high-$p_T$ QCD+PU events\"\n", + "# sample_nice = \"QCD+PU events\"\n", + "# sample_nice = \"$\\mathrm{t}\\bar{\\mathrm{t}}\"\n", + "path = \"../experiments/cms-gen_20220831_084535_573597.gpu0.local/evaluation/epoch_19/cms_pf_qcd_high_pt/\"\n", + "PAPERMILL_OUTPUT_PATH = path" ] }, { @@ -117,7 +131,7 @@ "\n", "\n", "def sample_label(ax, additional_text=\"\", x=0.01, y=0.87):\n", - " plt.text(x, y, \"$\\mathrm{t}\\overline{\\mathrm{t}}$ events\" + additional_text, ha=\"left\", transform=ax.transAxes)" + " plt.text(x, y, sample_nice + additional_text, ha=\"left\", transform=ax.transAxes)" ] }, { @@ -165,8 +179,8 @@ "ELEM_LABELS_CMS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]\n", "ELEM_NAMES_CMS = [\"NONE\", \"TRACK\", \"PS1\", \"PS2\", \"ECAL\", \"HCAL\", \"GSF\", \"BREM\", \"HFEM\", \"HFHAD\", \"SC\", \"HO\"]\n", "\n", - "CLASS_LABELS_CMS = [0, 211, 130, 1, 2, 22, 11, 13, 15]\n", - "CLASS_NAMES_CMS = [\"none\", \"ch.had\", \"n.had\", \"HFEM\", \"HFHAD\", \"$\\gamma$\", \"$e^\\pm$\", \"$\\mu^\\pm$\", r\"$\\tau$\"]\n", + "CLASS_LABELS_CMS = [0, 211, 130, 1, 2, 22, 11, 13]\n", + "CLASS_NAMES_CMS = [\"none\", \"ch.had\", \"n.had\", \"HFEM\", \"HFHAD\", \"$\\gamma$\", \"$e^\\pm$\", \"$\\mu^\\pm$\"]\n", "\n", "class_names = {k: v for k, v in zip(CLASS_LABELS_CMS, CLASS_NAMES_CMS)}" ] @@ -320,7 +334,7 @@ "metadata": {}, "outputs": [], "source": [ - "b = np.linspace(-5, 5, 100)\n", + "b = np.linspace(-7, 7, 61)\n", "\n", "plt.figure()\n", "ax = plt.axes()\n", @@ -423,7 +437,7 @@ "source": [ "fig = plt.figure()\n", "ax = plt.axes()\n", - "b = np.linspace(-2, 20, 101)\n", + "b = np.linspace(-2, 50, 101)\n", "vals_a = (cand_met - gen_met) / gen_met\n", "vals_b = (pred_met - gen_met) / gen_met\n", "\n", @@ -448,6 +462,238 @@ "plt.savefig(\"{}/met.pdf\".format(outpath), bbox_inches=\"tight\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ce908fd", + "metadata": {}, + "outputs": [], + "source": [ + "b = np.linspace(-1, 100, 100)\n", + "plt.hist(cand_met / gen_met, bins=b, histtype=\"step\", lw=2, label=\"PF\")\n", + "plt.hist(pred_met / gen_met, bins=b, histtype=\"step\", lw=2, label=\"MLPF\")\n", + "plt.xlabel(\"MET reco/gen\")\n", + "plt.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d675ed6", + "metadata": {}, + "outputs": [], + "source": [ + "mask_bad = (np.abs(yvals[\"gen_pt\"] - yvals[\"pred_pt\"]) > 10)[:, :, 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3764e00b", + "metadata": {}, + "outputs": [], + "source": [ + "bad = np.round(\n", + " np.concatenate(\n", + " [\n", + " X[mask_bad],\n", + " yvals[\"gen_cls_id\"][mask_bad],\n", + " yvals[\"pred_cls_id\"][mask_bad],\n", + " yvals[\"gen_pt\"][mask_bad],\n", + " yvals[\"pred_pt\"][mask_bad],\n", + " np.abs(yvals[\"gen_pt\"][mask_bad] - yvals[\"pred_pt\"][mask_bad]),\n", + " yvals[\"cand_pt\"][mask_bad],\n", + " ],\n", + " axis=-1,\n", + " ),\n", + " 2,\n", + ")\n", + "\n", + "df = pandas.DataFrame(bad[np.argsort(bad[:, -2])[::-1]])\n", + "df_hcal = df[df[0] == 5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de176913", + "metadata": {}, + "outputs": [], + "source": [ + "yvals[\"gen_energy\"][mask_bad]" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "14a4c06c", + "metadata": {}, + "outputs": [], + "source": [ + "msk_lowpt = (\n", + " (yvals[\"cand_pt\"] > 0) & (yvals[\"pred_pt\"] > 0) & (yvals[\"gen_pt\"] > 0) & (yvals[\"gen_pt\"] < 5) & (X[:, :, 0:1] == 5)\n", + ")[:, :, 0]\n", + "msk_highpt = ((yvals[\"cand_pt\"] > 0) & (yvals[\"pred_pt\"] > 0) & (yvals[\"gen_pt\"] >= 5) & (X[:, :, 0:1] == 5))[:, :, 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74706d5f", + "metadata": {}, + "outputs": [], + "source": [ + "b = np.logspace(0, 6, 100)\n", + "plt.hist(\n", + " yvals[\"pred_pt\"][msk_lowpt] / yvals[\"gen_pt\"][msk_lowpt],\n", + " bins=b,\n", + " density=1,\n", + " histtype=\"step\",\n", + " lw=2,\n", + " label=\"Gen pT < 5 GeV\",\n", + ")\n", + "plt.hist(\n", + " yvals[\"cand_pt\"][msk_lowpt] / yvals[\"gen_pt\"][msk_lowpt],\n", + " bins=b,\n", + " density=1,\n", + " histtype=\"step\",\n", + " lw=2,\n", + " label=\"Gen pT < 5 GeV\",\n", + ")\n", + "\n", + "# plt.yscale(\"log\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(\"particle pT (MLPF / gen)\")\n", + "plt.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15bdc6d0", + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(\n", + " yvals[\"pred_pt\"][msk_highpt] / yvals[\"gen_pt\"][msk_highpt],\n", + " bins=b,\n", + " density=1,\n", + " histtype=\"step\",\n", + " lw=2,\n", + " label=\"Gen pT > 5 GeV\",\n", + ")\n", + "plt.hist(\n", + " yvals[\"cand_pt\"][msk_highpt] / yvals[\"gen_pt\"][msk_highpt],\n", + " bins=b,\n", + " density=1,\n", + " histtype=\"step\",\n", + " lw=2,\n", + " label=\"Gen pT > 5 GeV\",\n", + ")\n", + "\n", + "# plt.yscale(\"log\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(\"particle pT (MLPF / gen)\")\n", + "plt.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2536d47d", + "metadata": {}, + "outputs": [], + "source": [ + "sdf = df[:1000]\n", + "for typ in np.unique(sdf[0]):\n", + " _sdf = sdf[sdf[0] == typ]\n", + " plt.scatter(_sdf[43], _sdf[44], label=int(typ))\n", + "\n", + "plt.legend()\n", + "plt.xlabel(\"Gen pT\")\n", + "plt.ylabel(\"Pred pT\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3446395", + "metadata": {}, + "outputs": [], + "source": [ + "df_hcal_goodcls = df_hcal[(df_hcal[41] == 2) & (df_hcal[42] == 2)][:1000]\n", + "plt.scatter(df_hcal_goodcls[43], df_hcal_goodcls[44])\n", + "plt.xlabel(\"Gen pT\")\n", + "plt.ylabel(\"Pred pT\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea9e2244", + "metadata": {}, + "outputs": [], + "source": [ + "plt.scatter(df_hcal_goodcls[43], df_hcal_goodcls[46])\n", + "plt.xlabel(\"Gen pT\")\n", + "plt.ylabel(\"PF pT\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02bc001b", + "metadata": {}, + "outputs": [], + "source": [ + "df_hcal_badcls = df_hcal[df_hcal[41] != df_hcal[42]]\n", + "plt.scatter(df_hcal_badcls[43], df_hcal_badcls[44])\n", + "plt.xlabel(\"Gen pT\")\n", + "plt.ylabel(\"Pred pT\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "599d8dff", + "metadata": {}, + "outputs": [], + "source": [ + "plt.scatter(df_hcal_badcls[43], df_hcal_badcls[46])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25578397", + "metadata": {}, + "outputs": [], + "source": [ + "b = np.linspace(0, 3000, 100)\n", + "plt.hist(df_hcal_goodcls[45], bins=b, histtype=\"step\", lw=2, label=\"HCAL elems, good cls\")\n", + "plt.hist(df_hcal_badcls[45], bins=b, 
histtype=\"step\", lw=2, label=\"HCAL elems, bad cls\")\n", + "plt.yscale(\"log\")\n", + "plt.xlabel(\"|Gen-Pred| pT\")\n", + "plt.ylabel(\"Number of elems\")\n", + "plt.legend(loc=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70b0264a", + "metadata": {}, + "outputs": [], + "source": [ + "b = np.linspace(0, 3000, 100)\n", + "plt.hist(df_hcal_goodcls[46], bins=b, histtype=\"step\", lw=2, label=\"HCAL elems, good cls\", density=1)\n", + "plt.hist(df_hcal_badcls[46], bins=b, histtype=\"step\", lw=2, label=\"HCAL elems, bad cls\", density=1)\n", + "plt.yscale(\"log\")\n", + "plt.xlabel(\"PF pT\")\n", + "plt.ylabel(\"Number of elems\")\n", + "plt.legend(loc=1)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -523,28 +769,6 @@ ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "075b6a9f", - "metadata": {}, - "outputs": [], - "source": [ - "fig = plt.figure()\n", - "ax = plt.axes()\n", - "\n", - "plt.scatter(np.sum(yvals[\"gen_energy\"], axis=1), np.sum(yvals[\"cand_energy\"], axis=1), alpha=0.5, label=\"PF\")\n", - "plt.scatter(np.sum(yvals[\"gen_energy\"], axis=1), np.sum(yvals[\"pred_energy\"], axis=1), alpha=0.5, label=\"MLPF\")\n", - "plt.plot([10000, 80000], [10000, 80000], color=\"black\")\n", - "plt.legend(loc=4)\n", - "cms_label(ax)\n", - "sample_label(ax)\n", - "plt.xlabel(\"Gen $\\sum E$ [GeV]\")\n", - "plt.ylabel(\"Reconstructed $\\sum E$ [GeV]\")\n", - "\n", - "plt.savefig(\"{}/sum_energy.pdf\".format(outpath), bbox_inches=\"tight\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -569,6 +793,49 @@ "plt.savefig(\"{}/sum_pt.pdf\".format(outpath), bbox_inches=\"tight\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "83c813a4", + "metadata": {}, + "outputs": [], + "source": [ + "def reso_plot(pid, variable, bins):\n", + " fig = plt.figure()\n", + " ax = plt.axes()\n", + "\n", + " msk = (yvals_f[\"gen_cls_id\"] == pid) & (yvals_f[\"cand_cls_id\"] != 0) & (yvals_f[\"pred_cls_id\"] != 0)\n", + " vals_gen = yvals_f[\"gen_{}\".format(variable)][msk]\n", + " vals_cand = yvals_f[\"cand_{}\".format(variable)][msk]\n", + " vals_mlpf = yvals_f[\"pred_{}\".format(variable)][msk]\n", + "\n", + " b = np.linspace(-2, 15, 100)\n", + "\n", + " reso_1 = (vals_cand - vals_gen) / vals_gen\n", + " reso_2 = (vals_mlpf - vals_gen) / vals_gen\n", + " plt.hist(\n", + " reso_1,\n", + " bins=bins,\n", + " histtype=\"step\",\n", + " lw=2,\n", + " label=\"PF, $\\mu={:.2f}, \\sigma={:.2f}$\".format(np.mean(reso_1), np.std(reso_1)),\n", + " )\n", + " plt.hist(\n", + " reso_2,\n", + " bins=bins,\n", + " histtype=\"step\",\n", + " lw=2,\n", + " label=\"MLPF, $\\mu={:.2f}, \\sigma={:.2f}$\".format(np.mean(reso_2), np.std(reso_2)),\n", + " )\n", + " plt.yscale(\"log\")\n", + " plt.xlabel(r\"$\\frac{E_\\mathrm{reco} - E_\\mathrm{gen}}{E_\\mathrm{gen}}$\")\n", + " plt.ylabel(\"Number of particles / bin\")\n", + " cms_label(ax)\n", + " sample_label(ax, \", ch. 
had.\")\n", + " plt.legend(loc=(0.4, 0.7))\n", + " plt.ylim(1, 1e9)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -578,31 +845,7 @@ "source": [ "pid = 1\n", "\n", - "fig = plt.figure()\n", - "ax = plt.axes()\n", - "\n", - "msk = (yvals_f[\"gen_cls_id\"] == pid) & (yvals_f[\"cand_cls_id\"] != 0) & (yvals_f[\"pred_cls_id\"] != 0)\n", - "vals_gen = yvals_f[\"gen_energy\"][msk]\n", - "vals_cand = yvals_f[\"cand_energy\"][msk]\n", - "vals_mlpf = yvals_f[\"pred_energy\"][msk]\n", - "\n", - "b = np.linspace(-2, 15, 100)\n", "\n", - "reso_1 = (vals_cand - vals_gen) / vals_gen\n", - "reso_2 = (vals_mlpf - vals_gen) / vals_gen\n", - "plt.hist(\n", - " reso_1, bins=b, histtype=\"step\", lw=2, label=\"PF, $\\mu={:.2f}, \\sigma={:.2f}$\".format(np.mean(reso_1), np.std(reso_1))\n", - ")\n", - "plt.hist(\n", - " reso_2, bins=b, histtype=\"step\", lw=2, label=\"MLPF, $\\mu={:.2f}, \\sigma={:.2f}$\".format(np.mean(reso_2), np.std(reso_2))\n", - ")\n", - "plt.yscale(\"log\")\n", - "plt.xlabel(r\"$\\frac{E_\\mathrm{reco} - E_\\mathrm{gen}}{E_\\mathrm{gen}}$\")\n", - "plt.ylabel(\"Number of particles / bin\")\n", - "cms_label(ax)\n", - "sample_label(ax, \", ch. had.\")\n", - "plt.legend(loc=(0.4, 0.7))\n", - "plt.ylim(1, 1e9)\n", "plt.savefig(\"{}/energy_res_ch_had.pdf\".format(outpath), bbox_inches=\"tight\")" ] }, @@ -976,6 +1219,43 @@ "plt.savefig(\"{}/eta_res_n_had.pdf\".format(outpath), bbox_inches=\"tight\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "afff4ce7", + "metadata": {}, + "outputs": [], + "source": [ + "pid = 5\n", + "\n", + "fig = plt.figure()\n", + "ax = plt.axes()\n", + "\n", + "msk = (yvals_f[\"gen_cls_id\"] == pid) & (yvals_f[\"cand_cls_id\"] == pid) & (yvals_f[\"pred_cls_id\"] == pid)\n", + "vals_gen = yvals_f[\"gen_eta\"][msk]\n", + "vals_cand = yvals_f[\"cand_eta\"][msk]\n", + "vals_mlpf = yvals_f[\"pred_eta\"][msk]\n", + "\n", + "b = np.linspace(-10, 10, 100)\n", + "\n", + "reso_1 = vals_cand - vals_gen\n", + "reso_2 = vals_mlpf - vals_gen\n", + "plt.hist(\n", + " reso_1, bins=b, histtype=\"step\", lw=2, label=\"PF, $\\mu={:.2f}, \\sigma={:.2f}$\".format(np.mean(reso_1), np.std(reso_1))\n", + ")\n", + "plt.hist(\n", + " reso_2, bins=b, histtype=\"step\", lw=2, label=\"MLPF, $\\mu={:.2f}, \\sigma={:.2f}$\".format(np.mean(reso_2), np.std(reso_2))\n", + ")\n", + "plt.yscale(\"log\")\n", + "plt.xlabel(r\"$\\eta_\\mathrm{reco} - \\eta_\\mathrm{gen}$\")\n", + "plt.ylabel(\"Number of particles / bin\")\n", + "cms_label(ax)\n", + "sample_label(ax, \", gamma\")\n", + "plt.legend(loc=(0.0, 0.7))\n", + "plt.ylim(1, 1e8)\n", + "plt.savefig(\"{}/eta_res_gamma.pdf\".format(outpath), bbox_inches=\"tight\")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1426,7 +1706,7 @@ "metadata": {}, "outputs": [], "source": [ - "p0 = loss_plot(history[\"loss\"].values, history[\"val_loss\"].values, margin=0.01)\n", + "p0 = loss_plot(history[\"loss\"].values, history[\"val_loss\"].values, margin=0.1)\n", "plt.ylabel(\"Total loss\")\n", "plt.savefig(\"{}/loss.pdf\".format(outpath), bbox_inches=\"tight\")" ] @@ -1454,11 +1734,26 @@ "val_reg_loss = sum(\n", " [history[\"val_{}_loss\".format(l)].values for l in [\"energy\", \"pt\", \"eta\", \"sin_phi\", \"cos_phi\", \"charge\"]]\n", ")\n", - "p0 = loss_plot(reg_loss, val_reg_loss, margin=0.1)\n", + "p0 = loss_plot(reg_loss, val_reg_loss, margin=0.02)\n", "plt.ylabel(\"Regression loss\")\n", "plt.savefig(\"{}/reg_loss.pdf\".format(outpath), bbox_inches=\"tight\")" ] }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "61d59797", + "metadata": {}, + "outputs": [], + "source": [ + "if \"pt_e_eta_phi_loss\" in history.keys():\n", + " reg_loss = sum([history[\"{}_loss\".format(l)].values for l in [\"pt_e_eta_phi\"]])\n", + " val_reg_loss = sum([history[\"val_{}_loss\".format(l)].values for l in [\"pt_e_eta_phi\"]])\n", + " p0 = loss_plot(reg_loss, val_reg_loss, margin=0.1)\n", + " plt.ylabel(\"Event loss\")\n", + " plt.savefig(\"{}/event_loss.pdf\".format(outpath), bbox_inches=\"tight\")" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/simvalidation.ipynb b/notebooks/simvalidation.ipynb index d3bae01d8..d2654e142 100644 --- a/notebooks/simvalidation.ipynb +++ b/notebooks/simvalidation.ipynb @@ -112,7 +112,7 @@ ")\n", "\n", "# pickle_data = sum([\n", - "# pickle.load(open(f, \"rb\")) for f in list(glob.glob(\"../mlpf/data/ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi/1/pfntuple_1.pkl\"))], [])" + "# pickle.load(open(f, \"rb\")) for f in list(glob.glob(\"../mlpf/data_cms/ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi/1/pfntuple_1.pkl\"))], [])" ] }, { diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml index a6589ba55..04b10c8ca 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/cms-gen.yaml @@ -46,8 +46,8 @@ loss: eta_loss: type: Huber delta: 0.1 - event_loss: none #none, sliced_wasserstein, gen_jet, hist_2d - event_loss_coef: 0.0 + event_loss: gen_jet_logcosh #none, sliced_wasserstein, gen_jet_logcosh, gen_jet_mse, hist_2d + event_loss_coef: 1.0 tensorflow: eager: no @@ -56,9 +56,9 @@ setup: train: yes weights: weights_config: - lr: 0.00002 - num_events_validation: 100 - num_epochs: 100 + lr: 0.001 + num_events_validation: 200 + num_epochs: 50 dtype: float32 trainable: classification_loss_type: sigmoid_focal_crossentropy @@ -135,8 +135,8 @@ parameters: normalize_degrees: yes activation: elu - num_graph_layers_id: 3 - num_graph_layers_reg: 3 + num_graph_layers_id: 2 + num_graph_layers_reg: 2 output_decoding: activation: elu regression_use_classification: yes @@ -218,7 +218,7 @@ raytune: train_test_datasets: physical: - batch_per_gpu: 5 + batch_per_gpu: 10 datasets: - cms_pf_ttbar - cms_pf_ztt diff --git a/parameters/test-eventloss/baseline-clspt.yaml b/parameters/test-eventloss/baseline-clspt.yaml deleted file mode 100644 index a6730445a..000000000 --- a/parameters/test-eventloss/baseline-clspt.yaml +++ /dev/null @@ -1,232 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: gen - num_input_features: 41 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7, tau=8) - num_output_classes: 9 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - cls_weight_by_pt: yes - -loss: - classification_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - energy_loss: - type: Huber - pt_loss: - type: Huber - sin_phi_loss: - type: Huber - delta: 0.1 - cos_phi_loss: - type: Huber - delta: 0.1 - eta_loss: - type: Huber - delta: 0.1 - event_loss: none - event_loss_coef: 0.0 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 0.0005 - num_events_validation: 100 - num_epochs: 50 - dtype: float32 - trainable: - classification_loss_type: sigmoid_focal_crossentropy 
- lr_schedule: none # exponentialdecay, onecycle, none - optimizer: adam # adam, adamw, sgd - horovod_enabled: False - -optimizer: - adam: - amsgrad: no - #pcgrad does not work with LossScaleOptimizer, so it must be disabled for float16 - pcgrad: yes - adamw: - amsgrad: yes - weight_decay: 0.001 - sgd: - nesterov: no - momentum: 0.9 - -# LR Schedules -exponentialdecay: - decay_steps: 2000 - decay_rate: 0.99 - staircase: yes -onecycle: - mom_min: 0.85 - mom_max: 0.95 - warmup_ratio: 0.3 - div_factor: 25.0 - final_div: 100000.0 - -parameters: - model: gnn_dense - input_encoding: cms - node_update_mode: concat - do_node_encoding: no - node_encoding_hidden_dim: 128 - - combined_graph_layer: - bin_size: 100 - max_num_bins: 200 - distance_dim: 64 - layernorm: yes - dropout: 0.0 - dist_activation: elu - ffn_dist_num_layers: 2 - ffn_dist_hidden_dim: 128 - - # MPNN - #kernel: - # type: NodePairTrainableKernel - # activation: elu - #num_node_messages: 1 - #node_message: - # type: NodeMessageLearnable - # output_dim: 64 - # hidden_dim: 128 - # num_layers: 2 - # activation: elu - #activation: elu - - # GCN - kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 - dist_norm: l2 - num_node_messages: 2 - node_message: - type: GHConvDense - output_dim: 128 - activation: elu - #if this is enabled, it will break float16 training - normalize_degrees: yes - activation: elu - - num_graph_layers_id: 2 - num_graph_layers_reg: 2 - output_decoding: - activation: elu - regression_use_classification: yes - dropout: 0.0 - - id_dim_decrease: yes - charge_dim_decrease: yes - pt_dim_decrease: yes - eta_dim_decrease: yes - phi_dim_decrease: yes - energy_dim_decrease: yes - - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 - layernorm: yes - mask_reg_cls0: no - - skip_connection: yes - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -callbacks: - checkpoint: - monitor: "val_loss" - plot_freq: 1 - tensorboard: - dump_history: yes - hist_freq: 1 - -hypertune: - algorithm: hyperband # random, bayesian, hyperband - random: - objective: val_loss - max_trials: 100 - bayesian: - objective: val_loss - max_trials: 100 - num_initial_points: 2 - hyperband: - objective: val_loss - max_epochs: 10 - factor: 3 - iterations: 1 - executions_per_trial: 1 - -raytune: - local_dir: # Note: please specify an absolute path - sched: asha # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_test_datasets: - physical: - batch_per_gpu: 5 - datasets: - - cms_pf_ttbar - -validation_datasets: - - cms_pf_ttbar - -datasets: - cms_pf_ttbar: - version: 1.4.0 - data_dir: - manual_dir: diff --git a/parameters/test-eventloss/baseline-mask_reg_cls0.yaml b/parameters/test-eventloss/baseline-mask_reg_cls0.yaml index 3a59e47c3..425d8446d 100644 --- a/parameters/test-eventloss/baseline-mask_reg_cls0.yaml +++ b/parameters/test-eventloss/baseline-mask_reg_cls0.yaml @@ -57,7 +57,7 @@ setup: weights: weights_config: lr: 0.0005 - num_events_validation: 100 + num_events_validation: 200 num_epochs: 50 dtype: float32 trainable: @@ 
-221,12 +221,27 @@ train_test_datasets: batch_per_gpu: 5 datasets: - cms_pf_ttbar + - cms_pf_ztt + - cms_pf_qcd + - cms_pf_qcd_high_pt validation_datasets: - - cms_pf_ttbar + - cms_pf_qcd_high_pt datasets: cms_pf_ttbar: version: 1.4.0 data_dir: manual_dir: + cms_pf_ztt: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd_high_pt: + version: 1.4.0 + data_dir: + manual_dir: diff --git a/parameters/test-eventloss/baseline.yaml b/parameters/test-eventloss/baseline.yaml index 52dc9acef..520df0d7b 100644 --- a/parameters/test-eventloss/baseline.yaml +++ b/parameters/test-eventloss/baseline.yaml @@ -57,7 +57,7 @@ setup: weights: weights_config: lr: 0.0005 - num_events_validation: 100 + num_events_validation: 200 num_epochs: 50 dtype: float32 trainable: @@ -221,12 +221,27 @@ train_test_datasets: batch_per_gpu: 5 datasets: - cms_pf_ttbar + - cms_pf_ztt + - cms_pf_qcd + - cms_pf_qcd_high_pt validation_datasets: - - cms_pf_ttbar + - cms_pf_qcd_high_pt datasets: cms_pf_ttbar: version: 1.4.0 data_dir: manual_dir: + cms_pf_ztt: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd_high_pt: + version: 1.4.0 + data_dir: + manual_dir: diff --git a/parameters/test-eventloss/genjet_logcosh.yaml b/parameters/test-eventloss/genjet_logcosh.yaml index 5f9120dcd..90d0fd3c0 100644 --- a/parameters/test-eventloss/genjet_logcosh.yaml +++ b/parameters/test-eventloss/genjet_logcosh.yaml @@ -57,7 +57,7 @@ setup: weights: weights_config: lr: 0.0005 - num_events_validation: 100 + num_events_validation: 200 num_epochs: 50 dtype: float32 trainable: @@ -221,12 +221,27 @@ train_test_datasets: batch_per_gpu: 5 datasets: - cms_pf_ttbar + - cms_pf_ztt + - cms_pf_qcd + - cms_pf_qcd_high_pt validation_datasets: - - cms_pf_ttbar + - cms_pf_qcd_high_pt datasets: cms_pf_ttbar: version: 1.4.0 data_dir: manual_dir: + cms_pf_ztt: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd_high_pt: + version: 1.4.0 + data_dir: + manual_dir: diff --git a/parameters/test-eventloss/genjet_logcosh_mask_reg_cls0.yaml b/parameters/test-eventloss/genjet_logcosh_mask_reg_cls0.yaml index c326d1763..b52988379 100644 --- a/parameters/test-eventloss/genjet_logcosh_mask_reg_cls0.yaml +++ b/parameters/test-eventloss/genjet_logcosh_mask_reg_cls0.yaml @@ -57,7 +57,7 @@ setup: weights: weights_config: lr: 0.0005 - num_events_validation: 100 + num_events_validation: 200 num_epochs: 50 dtype: float32 trainable: @@ -221,12 +221,27 @@ train_test_datasets: batch_per_gpu: 5 datasets: - cms_pf_ttbar + - cms_pf_ztt + - cms_pf_qcd + - cms_pf_qcd_high_pt validation_datasets: - - cms_pf_ttbar + - cms_pf_qcd_high_pt datasets: cms_pf_ttbar: version: 1.4.0 data_dir: manual_dir: + cms_pf_ztt: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd_high_pt: + version: 1.4.0 + data_dir: + manual_dir: diff --git a/parameters/test-eventloss/genjet_mse.yaml b/parameters/test-eventloss/genjet_mse.yaml index 9c5c7e70d..8f9b36e7d 100644 --- a/parameters/test-eventloss/genjet_mse.yaml +++ b/parameters/test-eventloss/genjet_mse.yaml @@ -57,7 +57,7 @@ setup: weights: weights_config: lr: 0.0005 - num_events_validation: 100 + num_events_validation: 200 num_epochs: 50 dtype: float32 trainable: @@ -221,12 +221,27 @@ train_test_datasets: batch_per_gpu: 5 datasets: - cms_pf_ttbar + - cms_pf_ztt + - cms_pf_qcd + - cms_pf_qcd_high_pt validation_datasets: - - 
cms_pf_ttbar + - cms_pf_qcd_high_pt datasets: cms_pf_ttbar: version: 1.4.0 data_dir: manual_dir: + cms_pf_ztt: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd_high_pt: + version: 1.4.0 + data_dir: + manual_dir: diff --git a/parameters/test-eventloss/h2d.yaml b/parameters/test-eventloss/h2d.yaml index 71281af1e..efa4f0bf2 100644 --- a/parameters/test-eventloss/h2d.yaml +++ b/parameters/test-eventloss/h2d.yaml @@ -57,7 +57,7 @@ setup: weights: weights_config: lr: 0.0005 - num_events_validation: 100 + num_events_validation: 200 num_epochs: 50 dtype: float32 trainable: @@ -221,12 +221,27 @@ train_test_datasets: batch_per_gpu: 5 datasets: - cms_pf_ttbar + - cms_pf_ztt + - cms_pf_qcd + - cms_pf_qcd_high_pt validation_datasets: - - cms_pf_ttbar + - cms_pf_qcd_high_pt datasets: cms_pf_ttbar: version: 1.4.0 data_dir: manual_dir: + cms_pf_ztt: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd_high_pt: + version: 1.4.0 + data_dir: + manual_dir: diff --git a/parameters/test-eventloss/swd.yaml b/parameters/test-eventloss/swd.yaml index c0cf78a2f..0a0c5084b 100644 --- a/parameters/test-eventloss/swd.yaml +++ b/parameters/test-eventloss/swd.yaml @@ -57,7 +57,7 @@ setup: weights: weights_config: lr: 0.0005 - num_events_validation: 100 + num_events_validation: 200 num_epochs: 50 dtype: float32 trainable: @@ -221,12 +221,27 @@ train_test_datasets: batch_per_gpu: 5 datasets: - cms_pf_ttbar + - cms_pf_ztt + - cms_pf_qcd + - cms_pf_qcd_high_pt validation_datasets: - - cms_pf_ttbar + - cms_pf_qcd_high_pt datasets: cms_pf_ttbar: version: 1.4.0 data_dir: manual_dir: + cms_pf_ztt: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd: + version: 1.4.0 + data_dir: + manual_dir: + cms_pf_qcd_high_pt: + version: 1.4.0 + data_dir: + manual_dir: diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index 02e9e9c0e..8112ea1b9 100755 --- a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -17,7 +17,7 @@ cd ../../.. 
rm -Rf local_test_data/TTbar_14TeV_TuneCUETP8M1_cfi/raw mkdir -p local_test_data/TTbar_14TeV_TuneCUETP8M1_cfi/raw for file in `\ls -1 local_test_data/TTbar_14TeV_TuneCUETP8M1_cfi/root/*.root`; do - python mlpf/data/postprocessing2.py \ + python mlpf/data_cms/postprocessing2.py \ --input $file \ --outpath local_test_data/TTbar_14TeV_TuneCUETP8M1_cfi/raw \ --save-normalized-table --num-events 10 @@ -25,7 +25,7 @@ done mkdir -p experiments -tfds build hep_tfds/heptfds/cms_pf/ttbar --manual_dir ./local_test_data +tfds build mlpf/heptfds/cms_pf/ttbar --manual_dir ./local_test_data #Run a simple training on a few events python mlpf/pipeline.py train -c parameters/cms-gen.yaml --nepochs 1 --customize pipeline_test diff --git a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh index e95b78c02..878cb5700 100755 --- a/scripts/local_test_delphes_pipeline.sh +++ b/scripts/local_test_delphes_pipeline.sh @@ -11,7 +11,7 @@ mv tev14_pythia8_ttbar_0_0.pkl.bz2 data/delphes_pf/pythia8_ttbar/raw/ wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_qcd_10_0.pkl.bz2 mv tev14_pythia8_qcd_10_0.pkl.bz2 data/delphes_pf/pythia8_qcd/val/ -tfds build hep_tfds/heptfds/delphes_pf --download_dir data/ +tfds build mlpf/heptfds/delphes_pf --download_dir data/ #Run a simple training on a few events python mlpf/pipeline.py train -c parameters/delphes.yaml --nepochs 1 --ntrain 5 --ntest 5 --customize pipeline_test
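
Usage note: after one of the local test scripts above has run tfds build, the resulting dataset can be loaded back for inspection with standard tensorflow_datasets calls. The following is a minimal sketch, not part of the patch itself; it assumes the build wrote to the default ~/tensorflow_datasets directory (otherwise pass data_dir=...), and that the delphes_pf builder module has been imported beforehand so the dataset name is registered with tensorflow_datasets.

    # Sketch: load the delphes_pf dataset produced by `tfds build` and check the
    # padded shapes declared in the builder above. How the builder module is made
    # importable (e.g. from mlpf/heptfds) is an assumption; adjust to your setup.
    import tensorflow as tf
    import tensorflow_datasets as tfds

    ds, info = tfds.load("delphes_pf", split="train", with_info=True)
    print(info.features)  # X: (6400, 12), ygen: (6400, 7), ycand: (6400, 7)

    for ex in ds.take(1):
        # each example is one event, zero-padded to 6400 elements by
        # prepare_data_delphes; rows whose typ_idx (column 0 of X) is 0 are padding
        n_real = int(tf.reduce_sum(tf.cast(ex["X"][:, 0] != 0, tf.int32)))
        print(ex["X"].shape, ex["ygen"].shape, ex["ycand"].shape, n_real)

Padding every event to the fixed 6400-element size trades some disk and memory for simplicity: every batch has a static shape, so the training pipeline can batch events directly without ragged tensors.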