diff --git a/fcc/p8_ee_ZH_Htautau_ecm380.cmd b/fcc/p8_ee_ZH_Htautau_ecm380.cmd new file mode 100644 index 000000000..142564055 --- /dev/null +++ b/fcc/p8_ee_ZH_Htautau_ecm380.cmd @@ -0,0 +1,27 @@ +Random:setSeed = on +Main:numberOfEvents = 1000 ! number of events to generate +Main:timesAllowErrors = 5 ! how many aborts before run stops + +! 2) Settings related to output in init(), next() and stat(). +Init:showChangedSettings = on ! list changed settings +Init:showChangedParticleData = off ! list changed particle data +Next:numberCount = 100 ! print message every n events +Next:numberShowInfo = 1 ! print event information n times +Next:numberShowProcess = 1 ! print process record n times +Next:numberShowEvent = 0 ! print event record n times + +Beams:idA = 11 ! first beam, e+ = 11 +Beams:idB = -11 ! second beam, e- = -11 + +! 3) Hard process : ZH at 365 GeV +Beams:eCM = 380 ! CM energy of collision +HiggsSM:ffbar2HZ = on + +! 4) Settings for the event generation process in the Pythia8 library. +PartonLevel:ISR = on ! initial-state radiation +PartonLevel:FSR = on ! final-state radiation + +! 5) Non-standard settings; exemplifies tuning possibilities. +25:m0 = 125.0 ! Higgs mass +25:onMode = off +25:onIfAny = 15 \ No newline at end of file diff --git a/fcc/p8_ee_gg_ecm365.cmd b/fcc/p8_ee_gg_ecm365.cmd new file mode 100644 index 000000000..95c2a39a8 --- /dev/null +++ b/fcc/p8_ee_gg_ecm365.cmd @@ -0,0 +1,27 @@ +Random:setSeed = on +Main:numberOfEvents = 3000 ! number of events to generate +Main:timesAllowErrors = 5 ! how many aborts before run stops + + +! 2) Settings related to output in init(), next() and stat(). +Init:showChangedSettings = on ! list changed settings +Init:showChangedParticleData = off ! list changed particle data +Next:numberCount = 100 ! print message every n events +Next:numberShowInfo = 1 ! print event information n times +Next:numberShowProcess = 1 ! print process record n times +Next:numberShowEvent = 0 ! print event record n times +Stat:showPartonLevel = off + +! 3) Beam parameter settings. Values below agree with default ones. +Beams:idA = 11 ! first beam, e- = 11 +Beams:idB = -11 ! second beam, e+ = -11 + +! 4) Hard process : photon collisions at 365 +Beams:eCM = 365 +PhotonCollision:gmgm2qqbar = on +PhotonCollision:gmgm2ccbar = on +PhotonCollision:gmgm2bbbar = on + +PartonLevel:ISR = on ! initial-state radiation +PartonLevel:FSR = on ! final-state radiation + diff --git a/fcc/p8_ee_qcd_ecm365.cmd b/fcc/p8_ee_qcd_ecm365.cmd new file mode 100644 index 000000000..8d2bfb389 --- /dev/null +++ b/fcc/p8_ee_qcd_ecm365.cmd @@ -0,0 +1,27 @@ +Random:setSeed = on +Main:numberOfEvents = 6000 ! number of events to generate +Main:timesAllowErrors = 5 ! how many aborts before run stops + + +! 2) Settings related to output in init(), next() and stat(). +Init:showChangedSettings = on ! list changed settings +Init:showChangedParticleData = off ! list changed particle data +Next:numberCount = 100 ! print message every n events +Next:numberShowInfo = 100 ! print event information n times +Next:numberShowProcess = 100 ! print process record n times +Next:numberShowEvent = 100 ! print event record n times +Stat:showPartonLevel = off + +! 3) Beam parameter settings. +Beams:idA = 11 ! first beam, e- = 11 +Beams:idB = -11 ! second beam, e+ = -11 + +! s-channel gamma/Z +Beams:eCM = 365 ! CM energy of collision +HardQCD:all = on +WeakSingleBoson:ffbar2ffbar(s:gmZ) = on + +! Decay Z to quarks +23:onMode = off +23:onIfAny = 1 2 3 4 5 6 + diff --git a/fcc/p8_ee_qcd_ecm380.cmd b/fcc/p8_ee_qcd_ecm380.cmd new file mode 100644 index 000000000..35c64fae1 --- /dev/null +++ b/fcc/p8_ee_qcd_ecm380.cmd @@ -0,0 +1,22 @@ +Random:setSeed = on +Main:numberOfEvents = 6000 ! number of events to generate +Main:timesAllowErrors = 5 ! how many aborts before run stops + + +! 2) Settings related to output in init(), next() and stat(). +Init:showChangedSettings = on ! list changed settings +Init:showChangedParticleData = off ! list changed particle data +Next:numberCount = 100 ! print message every n events +Next:numberShowInfo = 100 ! print event information n times +Next:numberShowProcess = 100 ! print process record n times +Next:numberShowEvent = 100 ! print event record n times +Stat:showPartonLevel = off + +! 3) Beam parameter settings. +Beams:idA = 11 ! first beam, e- = 11 +Beams:idB = -11 ! second beam, e+ = -11 + +! s-channel gamma/Z +Beams:eCM = 380 ! CM energy of collision +HardQCD:all = on +WeakSingleBoson:ffbar2ffbar(s:gmZ) = on diff --git a/fcc/postprocessing.py b/fcc/postprocessing.py new file mode 100644 index 000000000..75daf8e76 --- /dev/null +++ b/fcc/postprocessing.py @@ -0,0 +1,759 @@ +import bz2 +import numpy as np +import awkward +import matplotlib.pyplot as plt +import uproot +import vector +import glob +import networkx as nx +import tqdm +import numba +import sys +from scipy.sparse import coo_matrix + +track_coll = "SiTracks_Refitted" +mc_coll = "MCParticles" + +#the feature matrices will be saved in this order +particle_feature_order = ["PDG", "charge", "pt", "eta", "phi", "energy"] + +#arrange track and cluster features such that pt (et), eta, phi, p (energy) are in the same spot +#so we can easily use them in skip connections +track_feature_order = [ + "type", "pt", "eta", "phi", "p", + "chi2", "ndf", "dEdx", "dEdxError", + "radiusOfInnermostHit", "tanLambda", "D0", "omega", + "Z0", "time" +] +cluster_feature_order = [ + "type", "et", "eta", "phi", "energy", + "position.x", "position.y", "position.z", "iTheta", + "energy_ecal", "energy_hcal", "energy_other", "num_hits", + "sigma_x", "sigma_y", "sigma_z" +] + +def track_pt(omega): + a = 3 * 10**-4 + b = 4 # B-field in tesla, from clicRec_e4h_input + + return a * np.abs(b / omega) + +def map_pdgid_to_candid(pdgid, charge): + if pdgid == 0: + return 0 + + #photon, electron, muon + if pdgid in [22, 11, 13]: + return pdgid + + # charged hadron + if abs(charge) > 0: + return 211 + + # neutral hadron + return 130 + +def map_charged_to_neutral(pdg): + if pdg == 0: + return 0 + if pdg == 11 or pdg == 22: + return 22 + return 130 + +def map_neutral_to_charged(pdg): + if pdg == 130 or pdg == 22: + return 211 + return pdg + +def sanitize(arr): + arr[np.isnan(arr)] = 0.0 + arr[np.isinf(arr)] = 0.0 + +class EventData: + def __init__(self, + gen_features, + hit_features, + cluster_features, + track_features, + genparticle_to_hit, + genparticle_to_track, + hit_to_cluster, + ): + self.gen_features = gen_features + self.hit_features = hit_features + self.cluster_features = cluster_features + self.track_features = track_features + self.genparticle_to_hit = genparticle_to_hit + self.genparticle_to_track = genparticle_to_track + self.hit_to_cluster = hit_to_cluster + +def get_cluster_subdet_energies(hit_list, hit_data, collectionIDs_reverse, iev): + """ + This function calculates the energy contribution from each of four subdetectors in a particle physics experiment, based on a list of hits and their corresponding data. + + Args: + hit_list: a list of tuples, where each tuple contains a collection ID and a hit index + hit_data: a dictionary containing data for each hit in the experiment, organized by collection + collectionIDs_reverse: a dictionary mapping collection IDs to collection names + iev: the event number for the current event + + Returns: + A tuple containing the energy contributions from each of the four subdetectors: + (ecal_energy, hcal_energy, muon_energy, other_energy) + """ + + ecal_energy = 0.0 + hcal_energy = 0.0 + muon_energy = 0.0 + other_energy = 0.0 + + for coll_id, hit_idx in hit_list: + coll = collectionIDs_reverse[coll_id] + hit_energy = hit_data[coll][iev][coll+".energy"][hit_idx] + + if coll.startswith("ECAL"): + ecal_energy += hit_energy + elif coll.startswith("HCAL"): + hcal_energy += hit_energy + elif coll == "MUON": + muon_energy += hit_energy + else: + other_energy += hit_energy + + return ecal_energy, hcal_energy, muon_energy, other_energy + +def hits_to_features(hit_data, iev, coll, feats): + feat_arr = {f: hit_data[coll + "." + f][iev] for f in feats} + + sdcoll = "subdetector" + feat_arr[sdcoll] = np.zeros(len(feat_arr["type"]), dtype=np.int32) + if coll.startswith("ECAL"): + feat_arr[sdcoll][:] = 0 + elif coll.startswith("HCAL"): + feat_arr[sdcoll][:] = 1 + else: + feat_arr[sdcoll][:] = 2 + return awkward.Record(feat_arr) + +def get_calohit_matrix_and_genadj(hit_data, calohit_links, iev): + feats = ["type", "cellID", "energy", "energyError", "time", "position.x", "position.y", "position.z"] + + hit_idx_global = 0 + hit_idx_global_to_local = {} + hit_feature_matrix = [] + for col in sorted(hit_data.keys()): + icol = collectionIDs[col] + hit_features = hits_to_features(hit_data[col], iev, col, feats) + hit_feature_matrix.append(hit_features) + for ihit in range(len(hit_data[col][col+".energy"][iev])): + hit_idx_global_to_local[hit_idx_global] = (icol, ihit) + hit_idx_global += 1 + hit_idx_local_to_global = {v: k for k, v in hit_idx_global_to_local.items()} + hit_feature_matrix = awkward.Record({ + k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) for k in hit_feature_matrix[0].fields}) + + #add all edges from genparticle to calohit + calohit_to_gen_weight = calohit_links["CalohitMCTruthLink"]["CalohitMCTruthLink.weight"][iev] + calohit_to_gen_calo_colid = calohit_links["CalohitMCTruthLink#0"]["CalohitMCTruthLink#0.collectionID"][iev] + calohit_to_gen_gen_colid = calohit_links["CalohitMCTruthLink#1"]["CalohitMCTruthLink#1.collectionID"][iev] + calohit_to_gen_calo_idx = calohit_links["CalohitMCTruthLink#0"]["CalohitMCTruthLink#0.index"][iev] + calohit_to_gen_gen_idx = calohit_links["CalohitMCTruthLink#1"]["CalohitMCTruthLink#1.index"][iev] + genparticle_to_hit_matrix_coo0 = [] + genparticle_to_hit_matrix_coo1 = [] + genparticle_to_hit_matrix_w = [] + for calo_colid, calo_idx, gen_colid, gen_idx, w in zip(calohit_to_gen_calo_colid, calohit_to_gen_calo_idx, calohit_to_gen_gen_colid, calohit_to_gen_gen_idx, calohit_to_gen_weight): + genparticle_to_hit_matrix_coo0.append(gen_idx) + genparticle_to_hit_matrix_coo1.append(hit_idx_local_to_global[(calo_colid, calo_idx)]) + genparticle_to_hit_matrix_w.append(w) + + return hit_feature_matrix, (genparticle_to_hit_matrix_coo0, genparticle_to_hit_matrix_coo1, genparticle_to_hit_matrix_w), hit_idx_local_to_global + +def hit_cluster_adj(prop_data, hit_idx_local_to_global, iev): + coll_arr = prop_data["PandoraClusters#1"]["PandoraClusters#1.collectionID"][iev] + idx_arr = prop_data["PandoraClusters#1"]["PandoraClusters#1.index"][iev] + hits_begin = prop_data["PandoraClusters"]["PandoraClusters.hits_begin"][iev] + hits_end = prop_data["PandoraClusters"]["PandoraClusters.hits_end"][iev] + + #index in the array of all hits + hit_to_cluster_matrix_coo0 = [] + #index in the cluster array + hit_to_cluster_matrix_coo1 = [] + + #weight + hit_to_cluster_matrix_w = [] + + #loop over all clusters + for icluster in range(len(hits_begin)): + + #get the slice in the hit array corresponding to this cluster + hbeg = hits_begin[icluster] + hend = hits_end[icluster] + idx_range = idx_arr[hbeg:hend] + coll_range = coll_arr[hbeg:hend] + + #add edges from hit to cluster + for icol, idx in zip(coll_range, idx_range): + hit_to_cluster_matrix_coo0.append(hit_idx_local_to_global[(icol, idx)]) + hit_to_cluster_matrix_coo1.append(icluster) + hit_to_cluster_matrix_w.append(1.0) + return hit_to_cluster_matrix_coo0, hit_to_cluster_matrix_coo1, hit_to_cluster_matrix_w + +def gen_to_features(prop_data, iev): + gen_arr = prop_data[mc_coll][iev] + gen_arr = {k.replace(mc_coll+".", ""): gen_arr[k] for k in gen_arr.fields} + + MCParticles_p4 = vector.awk(awkward.zip({ + "mass": gen_arr["mass"], + "x": gen_arr["momentum.x"], + "y": gen_arr["momentum.y"], + "z": gen_arr["momentum.z"]})) + gen_arr["pt"] = MCParticles_p4.pt + gen_arr["eta"] = MCParticles_p4.eta + gen_arr["phi"] = MCParticles_p4.phi + gen_arr["energy"] = MCParticles_p4.energy + + return awkward.Record({ + "PDG": gen_arr["PDG"], + "generatorStatus": gen_arr["generatorStatus"], + "charge": gen_arr["charge"], + "pt": gen_arr["pt"], + "eta": gen_arr["eta"], + "phi": gen_arr["phi"], + "energy": gen_arr["energy"], + }) + +def genparticle_track_adj(sitrack_links): + trk_to_gen_trkidx = sitrack_links["SiTracksMCTruthLink#0"]["SiTracksMCTruthLink#0.index"][iev] + trk_to_gen_genidx = sitrack_links["SiTracksMCTruthLink#1"]["SiTracksMCTruthLink#1.index"][iev] + trk_to_gen_w = sitrack_links["SiTracksMCTruthLink"]["SiTracksMCTruthLink.weight"][iev] + + genparticle_to_track_matrix_coo0 = awkward.to_numpy(trk_to_gen_genidx) + genparticle_to_track_matrix_coo1 = awkward.to_numpy(trk_to_gen_trkidx) + genparticle_to_track_matrix_w = awkward.to_numpy(trk_to_gen_w) + + return genparticle_to_track_matrix_coo0, genparticle_to_track_matrix_coo1, genparticle_to_track_matrix_w + +def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): + cluster_arr = prop_data["PandoraClusters"][iev] + feats = ["type", "position.x", "position.y", "position.z", "iTheta", "phi", "energy"] + ret = {feat: cluster_arr["PandoraClusters." + feat] for feat in feats} + + hit_idx = np.array(hit_to_cluster[0]) + cluster_idx = np.array(hit_to_cluster[1]) + cl_energy_ecal = [] + cl_energy_hcal = [] + cl_energy_other = [] + num_hits = [] + cl_sigma_x = [] + cl_sigma_y = [] + cl_sigma_z = [] + + n_cl = len(ret["energy"]) + for cl in range(n_cl): + msk_cl = cluster_idx == cl + hits = hit_idx[msk_cl] + + num_hits.append(len(hits)) + + subdets = hit_features["subdetector"][hits] + + hits_energy = hit_features["energy"][hits] + + hits_posx = hit_features["position.x"][hits] + hits_posy = hit_features["position.y"][hits] + hits_posz = hit_features["position.z"][hits] + + energy_ecal = np.sum(hits_energy[subdets==0]) + energy_hcal = np.sum(hits_energy[subdets==1]) + energy_other = np.sum(hits_energy[subdets==2]) + + cl_energy_ecal.append(energy_ecal) + cl_energy_hcal.append(energy_hcal) + cl_energy_other.append(energy_other) + + cl_sigma_x.append(np.std(hits_posx)) + cl_sigma_y.append(np.std(hits_posy)) + cl_sigma_z.append(np.std(hits_posz)) + + ret["energy_ecal"] = np.array(cl_energy_ecal) + ret["energy_hcal"] = np.array(cl_energy_hcal) + ret["energy_other"] = np.array(cl_energy_other) + ret["num_hits"] = np.array(num_hits) + ret["sigma_x"] = np.array(cl_sigma_x) + ret["sigma_y"] = np.array(cl_sigma_y) + ret["sigma_z"] = np.array(cl_sigma_z) + + tt = np.tan(ret["iTheta"] / 2.0) + eta = awkward.to_numpy(-np.log(tt, where=tt>0)) + eta[tt<=0] = 0.0 + ret["eta"] = eta + + costheta = np.cos(ret["iTheta"]) + ez = ret["energy"]*costheta + ret["et"] = np.sqrt(ret["energy"]**2 - ez**2) + + #override cluster type with 1 + ret["type"] = 2*np.ones(n_cl, dtype=np.float32) + + return awkward.Record(ret) + +def track_to_features(prop_data, iev): + track_arr = prop_data[track_coll][iev] + feats_from_track = ["type", "chi2", "ndf", "dEdx", "dEdxError", "radiusOfInnermostHit"] + ret = {feat: track_arr[track_coll + "." + feat] for feat in feats_from_track} + n_tr = len(ret["type"]) + + #FIXME: add additional track features from track state + + #get the index of the first track state + trackstate_idx = prop_data[track_coll][track_coll + ".trackStates_begin"][iev] + #get the properties of the track at the first track state (at the origin) + for k in ["tanLambda", "D0", "phi", "omega", "Z0", "time"]: + ret[k] = prop_data["SiTracks_1"]["SiTracks_1." + k][iev][trackstate_idx] + + ret["pt"] = track_pt(ret["omega"]) + ret["px"] = np.cos(ret["phi"]) * ret["pt"] + ret["py"] = np.sin(ret["phi"]) * ret["pt"] + ret["pz"] = ret["tanLambda"] * ret["pt"] + ret["p"] = np.sqrt(ret["px"]**2 + ret["py"]**2 + ret["pz"]**2) + cos_theta = np.divide(ret["pz"], ret["p"], where=ret["p"]>0) + theta = np.arccos(cos_theta) + tt = np.tan(theta / 2.0) + eta = awkward.to_numpy(-np.log(tt, where=tt>0)) + eta[tt<=0] = 0.0 + ret["eta"] = eta + + #override track type with 1 + ret["type"] = 1*np.ones(n_tr, dtype=np.float32) + + return awkward.Record(ret) + +def filter_adj(adj, all_to_filtered): + i0s_new = [] + i1s_new = [] + ws_new = [] + for i0, i1, w in zip(*adj): + if i0 in all_to_filtered: + i0_new = all_to_filtered[i0] + i0s_new.append(i0_new) + i1s_new.append(i1) + ws_new.append(w) + return np.array(i0s_new), np.array(i1s_new), np.array(ws_new) + +def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack_links, iev): + gen_features = gen_to_features(prop_data, iev) + hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj(hit_data, calohit_links, iev) + hit_to_cluster = hit_cluster_adj(prop_data, hit_idx_local_to_global, iev) + cluster_features = cluster_to_features(prop_data, hit_features, hit_to_cluster, iev) + track_features = track_to_features(prop_data, iev) + genparticle_to_track = genparticle_track_adj(sitrack_links) + + n_gp = awkward.count(gen_features["PDG"]) + n_track = awkward.count(track_features["type"]) + n_hit = awkward.count(hit_features["type"]) + n_cluster = awkward.count(cluster_features["type"]) + + if len(genparticle_to_track[0])>0: + gp_to_track = coo_matrix( + (genparticle_to_track[2], + (genparticle_to_track[0], genparticle_to_track[1])), + shape=(n_gp, n_track) + ).max(axis=1).todense() + else: + gp_to_track = np.zeros((n_gp, 1)) + + gp_to_calohit = coo_matrix( + (genparticle_to_hit[2], + (genparticle_to_hit[0], genparticle_to_hit[1])), + shape=(n_gp, n_hit) + ) + calohit_to_cluster = coo_matrix( + (hit_to_cluster[2], + (hit_to_cluster[0], hit_to_cluster[1])), + shape=(n_hit, n_cluster) + ) + gp_to_cluster = (gp_to_calohit*calohit_to_cluster).sum(axis=1) + + #60% of the hits of a track must come from the genparticle + gp_in_tracker = np.array(gp_to_track>=0.6)[:, 0] + + #at least 10% of the energy of the genparticle should be matched to a calorimeter cluster + gp_in_calo = (np.array(gp_to_cluster)[:, 0]/gen_features["energy"])>0.1 + + gp_interacted_with_detector = gp_in_tracker | gp_in_calo + + mask_visible = ( + (gen_features["generatorStatus"]==1) & + (gen_features["PDG"]!=12) & + (gen_features["PDG"]!=14) & + (gen_features["PDG"]!=16) & + (gen_features["energy"]>0.01) & + gp_interacted_with_detector + ) + idx_all_masked = np.where(mask_visible)[0] + genpart_idx_all_to_filtered = {idx_all: idx_filtered for idx_filtered, idx_all in enumerate(idx_all_masked)} + + gen_features = awkward.Record({ + feat: gen_features[feat][mask_visible] for feat in gen_features.fields + }) + + genparticle_to_hit = filter_adj(genparticle_to_hit, genpart_idx_all_to_filtered) + genparticle_to_track = filter_adj(genparticle_to_track, genpart_idx_all_to_filtered) + + return EventData( + gen_features, + hit_features, + cluster_features, + track_features, + genparticle_to_hit, + genparticle_to_track, + hit_to_cluster + ) + +def assign_genparticles_to_obj_and_merge(gpdata): + + n_gp = awkward.count(gpdata.gen_features["PDG"]) + n_track = awkward.count(gpdata.track_features["type"]) + n_hit = awkward.count(gpdata.hit_features["type"]) + n_cluster = awkward.count(gpdata.cluster_features["type"]) + + gp_to_track = np.array(coo_matrix( + (gpdata.genparticle_to_track[2], + (gpdata.genparticle_to_track[0], gpdata.genparticle_to_track[1])), + shape=(n_gp, n_track) + ).todense()) + + gp_to_calohit = coo_matrix( + (gpdata.genparticle_to_hit[2], + (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), + shape=(n_gp, n_hit) + ) + calohit_to_cluster = coo_matrix( + (gpdata.hit_to_cluster[2], + (gpdata.hit_to_cluster[0], gpdata.hit_to_cluster[1])), + shape=(n_hit, n_cluster) + ) + + gp_to_cluster = np.array((gp_to_calohit*calohit_to_cluster).todense()) + + #map each genparticle to a track or a cluster + gp_to_obj = -1*np.ones((n_gp, 2), dtype=np.int32) + set_used_tracks = set([]) + set_used_clusters = set([]) + gps_sorted_energy = sorted(range(n_gp), key=lambda x: gpdata.gen_features["energy"][x], reverse=True) + + for igp in gps_sorted_energy: + + #first check if we can match the genparticle to a track + matched_tracks = gp_to_track[igp] + trks = np.where(matched_tracks)[0] + trks = sorted(trks, key=lambda x: matched_tracks[x], reverse=True) + for trk in trks: + #if the track was not already used for something else + if trk not in set_used_tracks: + gp_to_obj[igp, 0] = trk + set_used_tracks.add(trk) + break + + #if there was no matched track, try a cluster + if gp_to_obj[igp, 0] == -1: + matched_clusters = gp_to_cluster[igp] + clusters = np.where(matched_clusters)[0] + clusters = sorted(clusters, key=lambda x: matched_clusters[x], reverse=True) + for cl in clusters: + if cl not in set_used_clusters: + gp_to_obj[igp, 1] = cl + set_used_clusters.add(cl) + break + + #the genparticles that could not be matched to a track or cluster are merged to the closest genparticle + unmatched = np.where((gp_to_obj[:, 0]==-1) & (gp_to_obj[:, 1]==-1))[0] + mask_gp_unmatched = np.ones(n_gp, dtype=bool) + + pt_arr = np.array(awkward.to_numpy(gpdata.gen_features["pt"])) + eta_arr = np.array(awkward.to_numpy(gpdata.gen_features["eta"])) + phi_arr = np.array(awkward.to_numpy(gpdata.gen_features["phi"])) + energy_arr = np.array(awkward.to_numpy(gpdata.gen_features["energy"])) + + #now merge unmatched genparticles to their closest genparticle + for igp_unmatched in unmatched: + mask_gp_unmatched[igp_unmatched] = False + idx_best_cluster = np.argmax(gp_to_cluster[igp_unmatched]) + idx_gp_bestcluster = np.where(gp_to_obj[:, 1]==idx_best_cluster)[0] + + #if the genparticle is not matched to any cluster, then it left a few hits to some other track + #this is rare, happens only for low-pT particles and we don"t want to try to reconstruct it + if (len(idx_gp_bestcluster)!=1): + print("unmatched pt=", pt_arr[igp_unmatched]) + continue + + idx_gp_bestcluster = idx_gp_bestcluster[0] + + vec0 = vector.obj( + pt=gpdata.gen_features["pt"][igp_unmatched], + eta=gpdata.gen_features["eta"][igp_unmatched], + phi=gpdata.gen_features["phi"][igp_unmatched], + e=gpdata.gen_features["energy"][igp_unmatched], + ) + vec1 = vector.obj( + pt=gpdata.gen_features["pt"][idx_gp_bestcluster], + eta=gpdata.gen_features["eta"][idx_gp_bestcluster], + phi=gpdata.gen_features["phi"][idx_gp_bestcluster], + e=gpdata.gen_features["energy"][idx_gp_bestcluster], + ) + vec = vec0+vec1 + pt_arr[idx_gp_bestcluster] = vec.pt + eta_arr[idx_gp_bestcluster] = vec.eta + phi_arr[idx_gp_bestcluster] = vec.phi + energy_arr[idx_gp_bestcluster] = vec.energy + + gen_features_new = { + "PDG": np.abs(gpdata.gen_features["PDG"][mask_gp_unmatched]), + "charge": gpdata.gen_features["charge"][mask_gp_unmatched], + "pt": pt_arr[mask_gp_unmatched], + "eta": eta_arr[mask_gp_unmatched], + "phi": phi_arr[mask_gp_unmatched], + "energy": energy_arr[mask_gp_unmatched], + } + assert((np.sum(gen_features_new["energy"])-np.sum(gpdata.gen_features["energy"])) < 1e-2) + + idx_all_masked = np.where(mask_gp_unmatched)[0] + genpart_idx_all_to_filtered = {idx_all: idx_filtered for idx_filtered, idx_all in enumerate(idx_all_masked)} + genparticle_to_hit = filter_adj(gpdata.genparticle_to_hit, genpart_idx_all_to_filtered) + genparticle_to_track = filter_adj(gpdata.genparticle_to_track, genpart_idx_all_to_filtered) + gp_to_obj = gp_to_obj[mask_gp_unmatched] + + return EventData( + gen_features_new, + gpdata.hit_features, + gpdata.cluster_features, + gpdata.track_features, + genparticle_to_hit, + genparticle_to_track, + gpdata.hit_to_cluster + ), gp_to_obj + + +#for each PF element (track, cluster), get the index of the best-matched particle (gen or reco) +#if the PF element has no best-matched particle, returns -1 +def assign_to_recoobj(n_obj, obj_to_ptcl, used_particles): + obj_to_ptcl_all = -1 * np.ones(n_obj, dtype=np.int64) + for iobj in range(n_obj): + if iobj in obj_to_ptcl: + iptcl = obj_to_ptcl[iobj] + obj_to_ptcl_all[iobj] = iptcl + assert(used_particles[iptcl] == 0) + used_particles[iptcl] = 1 + return obj_to_ptcl_all + +def get_recoptcl_to_obj(n_rps, reco_arr): + track_to_rp = {} + cluster_to_rp = {} + for irp in range(n_rps): + assigned = False + trks_begin = reco_arr["tracks_begin"][irp] + trks_end = reco_arr["tracks_end"][irp] + for itrk in range(trks_begin, trks_end): + assert(itrk not in track_to_rp) + track_to_rp[itrk] = irp + assigned = True + + #only look for clusters if tracks were not found + if not assigned: + cls_begin = reco_arr["clusters_begin"][irp] + cls_end = reco_arr["clusters_end"][irp] + for icls in range(cls_begin, cls_end): + assert(icls not in cluster_to_rp) + cluster_to_rp[icls] = irp + return track_to_rp, cluster_to_rp + +def get_reco_properties(prop_data, iev): + reco_arr = prop_data["MergedRecoParticles"][iev] + reco_arr = {k.replace("MergedRecoParticles.", ""): reco_arr[k] for k in reco_arr.fields} + + reco_p4 = vector.awk(awkward.zip({ + "mass": reco_arr["mass"], + "x": reco_arr["momentum.x"], + "y": reco_arr["momentum.y"], + "z": reco_arr["momentum.z"]})) + reco_arr["pt"] = reco_p4.pt + reco_arr["eta"] = reco_p4.eta + reco_arr["phi"] = reco_p4.phi + reco_arr["energy"] = reco_p4.energy + + msk = reco_arr["type"]!=0 + reco_arr = awkward.Record({k: reco_arr[k][msk] for k in reco_arr.keys()}) + return reco_arr + +def get_particle_feature_matrix(pfelem_to_particle, feature_dict, features): + feats = [] + for feat in features: + feat_arr = feature_dict[feat] + if len(feat_arr)==0: + feat_arr_reordered = feat_arr + else: + feat_arr_reordered = awkward.to_numpy(feat_arr[pfelem_to_particle]) + feat_arr_reordered[pfelem_to_particle==-1] = 0.0 + feats.append(feat_arr_reordered) + feats = np.array(feats) + return feats.T + +def get_feature_matrix(feature_dict, features): + feats = [] + for feat in features: + feat_arr = awkward.to_numpy(feature_dict[feat]) + feats.append(feat_arr) + feats = np.array(feats) + return feats.T + +if __name__ == "__main__": + + fn = sys.argv[1] + fi = uproot.open(fn) + + arrs = fi["events"] + + collectionIDs = {k: v for k, v in + zip(fi.get("metadata").arrays("CollectionIDs")["CollectionIDs"]["m_names"][0], + fi.get("metadata").arrays("CollectionIDs")["CollectionIDs"]["m_collectionIDs"][0])} + collectionIDs_reverse = {v: k for k, v in collectionIDs.items()} + + prop_data = arrs.arrays([mc_coll, track_coll, "SiTracks_1", "PandoraClusters", "PandoraClusters#1", "PandoraClusters#0", "MergedRecoParticles"]) + calohit_links = arrs.arrays(["CalohitMCTruthLink", "CalohitMCTruthLink#0", "CalohitMCTruthLink#1"]) + sitrack_links = arrs.arrays(["SiTracksMCTruthLink", "SiTracksMCTruthLink#0", "SiTracksMCTruthLink#1"]) + + hit_data = { + "ECALBarrel": arrs["ECALBarrel"].array(), + "ECALEndcap": arrs["ECALEndcap"].array(), + "ECALOther": arrs["ECALOther"].array(), + "HCALBarrel": arrs["HCALBarrel"].array(), + "HCALEndcap": arrs["HCALEndcap"].array(), + "HCALOther": arrs["HCALOther"].array(), + "MUON": arrs["MUON"].array(), + } + + ret = [] + for iev in tqdm.tqdm(range(arrs.num_entries)): + + #get the reco particles + reco_arr = get_reco_properties(prop_data, iev) + reco_type = np.abs(reco_arr["type"]) + n_rps = len(reco_type) + reco_features = awkward.Record({ + "PDG": np.abs(reco_type), + "charge": reco_arr["charge"], + "pt": reco_arr["pt"], + "eta": reco_arr["eta"], + "phi": reco_arr["phi"], + "energy": reco_arr["energy"] + }) + + #get the genparticles and the links between genparticles and tracks/clusters + gpdata = get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack_links, iev) + + #find the reconstructable genparticles and associate them to the best track/cluster + gpdata_cleaned, gp_to_obj = assign_genparticles_to_obj_and_merge(gpdata) + + n_tracks = len(gpdata_cleaned.track_features["type"]) + n_clusters = len(gpdata_cleaned.cluster_features["type"]) + n_gps = len(gpdata_cleaned.gen_features["PDG"]) + + assert(len(gp_to_obj) == len(gpdata_cleaned.gen_features["PDG"])) + assert(gp_to_obj.shape[1] == 2) + + #for each reco particle, find the tracks and clusters associated with it + #construct track/cluster -> recoparticle maps + track_to_rp, cluster_to_rp = get_recoptcl_to_obj(n_rps, reco_arr) + + #get the track/cluster -> genparticle map + track_to_gp = {itrk: igp for igp, itrk in enumerate(gp_to_obj[:, 0]) if itrk != -1} + cluster_to_gp = {icl: igp for igp, icl in enumerate(gp_to_obj[:, 1]) if icl != -1} + + used_gps = np.zeros(n_gps, dtype=np.int64) + track_to_gp_all = assign_to_recoobj(n_tracks, track_to_gp, used_gps) + cluster_to_gp_all = assign_to_recoobj(n_clusters, cluster_to_gp, used_gps) + #all genparticles must be assigned to some PFElement + assert(np.all(used_gps == 1)) + + used_rps = np.zeros(n_rps, dtype=np.int64) + track_to_rp_all = assign_to_recoobj(n_tracks, track_to_rp, used_rps) + cluster_to_rp_all = assign_to_recoobj(n_clusters, cluster_to_rp, used_rps) + #all reco particles must be assigned to some PFElement + assert(np.all(used_rps == 1)) + + gps_track = get_particle_feature_matrix( + track_to_gp_all, + gpdata_cleaned.gen_features, + particle_feature_order + ) + gps_track[:, 0] = np.array([ + map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])] + ) + gps_cluster = get_particle_feature_matrix( + cluster_to_gp_all, + gpdata_cleaned.gen_features, + particle_feature_order + ) + gps_cluster[:, 0] = np.array([ + map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_cluster[:, 0], gps_cluster[:, 1])] + ) + gps_cluster[:, 1] = 0 + + rps_track = get_particle_feature_matrix( + track_to_rp_all, + reco_features, + particle_feature_order + ) + rps_track[:, 0] = np.array([ + map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(rps_track[:, 0], rps_track[:, 1])] + ) + rps_cluster = get_particle_feature_matrix( + cluster_to_rp_all, + reco_features, + particle_feature_order + ) + rps_cluster[:, 0] = np.array([ + map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_cluster[:, 0], rps_cluster[:, 1])] + ) + rps_cluster[:, 1] = 0 + + #all initial gen/reco particle energy must be reconstructable + assert(abs( + np.sum(gps_track[:, 5]) + np.sum(gps_cluster[:, 5]) - np.sum(gpdata_cleaned.gen_features["energy"]) + ) < 1e-2) + + assert(abs( + np.sum(rps_track[:, 5]) + np.sum(rps_cluster[:, 5]) - np.sum(reco_features["energy"]) + ) < 1e-2) + + + #we don"t want to try to reconstruct charged particles from primary clusters, make sure the charge is 0 + assert(np.all(gps_cluster[:, 1] == 0)) + assert(np.all(rps_cluster[:, 1] == 0)) + + X_track = get_feature_matrix(gpdata_cleaned.track_features, track_feature_order) + X_cluster = get_feature_matrix(gpdata_cleaned.cluster_features, cluster_feature_order) + ygen_track = gps_track + ygen_cluster = gps_cluster + ycand_track = rps_track + ycand_cluster = rps_cluster + + sanitize(X_track) + sanitize(X_cluster) + sanitize(ygen_track) + sanitize(ygen_cluster) + sanitize(ycand_track) + sanitize(ycand_cluster) + + this_ev = awkward.Record({ + "X_track": X_track, + "X_cluster": X_cluster, + "ygen_track": ygen_track, + "ygen_cluster": ygen_cluster, + "ycand_track": ycand_track, + "ycand_cluster": ycand_cluster + }) + ret.append(this_ev) + + ret = awkward.Record({k: awkward.from_iter([r[k] for r in ret]) for k in ret[0].fields}) + awkward.to_parquet(ret, fn.replace(".root", ".parquet")) diff --git a/fcc/run_sim.sh b/fcc/run_sim.sh index 5a1c4dd52..361117d97 100755 --- a/fcc/run_sim.sh +++ b/fcc/run_sim.sh @@ -17,7 +17,11 @@ NUM=$1 #SAMPLE=p8_ee_Z_Ztautau_ecm125 #SAMPLE=p8_ee_tt_ecm365 -SAMPLE=p8_ee_ZZ_fullhad_ecm365 +#SAMPLE=p8_ee_ZZ_fullhad_ecm365 +#SAMPLE=p8_ee_qcd_ecm365 +#SAMPLE=p8_ee_qcd_ecm380 +#SAMPLE=p8_ee_ZH_Htautau_ecm380 +SAMPLE=p8_ee_qcd_ecm380 mkdir -p $OUTDIR/$SAMPLE diff --git a/mlpf/customizations.py b/mlpf/customizations.py index d8597c79b..7dde643cb 100644 --- a/mlpf/customizations.py +++ b/mlpf/customizations.py @@ -16,12 +16,12 @@ def customize_pipeline_test(config): config["evaluation_datasets"] = {"cms_pf_ttbar": {"batch_size": 2, "num_events": -1}} # For CLIC, keep only ttbar - if "clic_ttbar_pf" in config["datasets"]: - config["train_test_datasets"]["physical"]["datasets"] = ["clic_ttbar_pf"] + if "clic_edm_ttbar_pf" in config["datasets"]: + config["train_test_datasets"]["physical"]["datasets"] = ["clic_edm_ttbar_pf"] config["train_test_datasets"] = {"physical": config["train_test_datasets"]["physical"]} config["train_test_datasets"]["physical"]["batch_per_gpu"] = 50 - config["validation_dataset"] = "clic_ttbar_pf" - config["evaluation_datasets"] = {"clic_ttbar_pf": {"batch_size": 50, "num_events": -1}} + config["validation_dataset"] = "clic_edm_ttbar_pf" + config["evaluation_datasets"] = {"clic_edm_ttbar_pf": {"batch_size": 50, "num_events": -1}} # validate only on a small number of events config["validation_num_events"] = config["validation_batch_size"] * 2 diff --git a/mlpf/heptfds/clic_pf/higgsbb.py b/mlpf/heptfds/clic_pf/higgsbb.py index 7fc93212e..799cfc526 100644 --- a/mlpf/heptfds/clic_pf/higgsbb.py +++ b/mlpf/heptfds/clic_pf/higgsbb.py @@ -20,11 +20,14 @@ class ClicHiggsBbbarPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.0.0") + VERSION = tfds.core.Version("1.1.0") RELEASE_NOTES = { "1.0.0": "Initial release.", + "1.1.0": "Fix postprocessing bug with charge", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + mkdir -p data + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic $MANUAL_DIR/ """ def _info(self) -> tfds.core.DatasetInfo: @@ -56,7 +59,8 @@ def _info(self) -> tfds.core.DatasetInfo: ) def _split_generators(self, dl_manager: tfds.download.DownloadManager): - return split_sample(Path("data/clic/gev380ee_pythia6_higgs_bbar_full201/")) + path = dl_manager.manual_dir + return split_sample(Path(path / "gev380ee_pythia6_higgs_bbar_full201")) def _generate_examples(self, files): return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf/higgsgg.py b/mlpf/heptfds/clic_pf/higgsgg.py index 08fa20f2b..709e43bca 100644 --- a/mlpf/heptfds/clic_pf/higgsgg.py +++ b/mlpf/heptfds/clic_pf/higgsgg.py @@ -20,11 +20,14 @@ class ClicHiggsGgPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.0.0") + VERSION = tfds.core.Version("1.1.0") RELEASE_NOTES = { "1.0.0": "Initial release.", + "1.1.0": "Fix postprocessing bug with charge", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + mkdir -p data + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic $MANUAL_DIR/ """ def _info(self) -> tfds.core.DatasetInfo: @@ -56,7 +59,8 @@ def _info(self) -> tfds.core.DatasetInfo: ) def _split_generators(self, dl_manager: tfds.download.DownloadManager): - return split_sample(Path("data/clic/gev380ee_pythia6_higgs_gamgam_full201/")) + path = dl_manager.manual_dir + return split_sample(Path(path / "gev380ee_pythia6_higgs_gamgam_full201")) def _generate_examples(self, files): return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf/higgszz4l.py b/mlpf/heptfds/clic_pf/higgszz4l.py index d82ebe45a..94351aec6 100644 --- a/mlpf/heptfds/clic_pf/higgszz4l.py +++ b/mlpf/heptfds/clic_pf/higgszz4l.py @@ -20,11 +20,14 @@ class ClicHiggsZz4lPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.0.0") + VERSION = tfds.core.Version("1.1.0") RELEASE_NOTES = { "1.0.0": "Initial release.", + "1.1.0": "Fix postprocessing bug with charge", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + mkdir -p data + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic $MANUAL_DIR/ """ def _info(self) -> tfds.core.DatasetInfo: @@ -56,7 +59,8 @@ def _info(self) -> tfds.core.DatasetInfo: ) def _split_generators(self, dl_manager: tfds.download.DownloadManager): - return split_sample(Path("data/clic/gev380ee_pythia6_higgs_zz_4l_full201/")) + path = dl_manager.manual_dir + return split_sample(Path(path / "gev380ee_pythia6_higgs_zz_4l_full201")) def _generate_examples(self, files): return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf/qcd.py b/mlpf/heptfds/clic_pf/qcd.py index 78fd71c55..a571b25ef 100644 --- a/mlpf/heptfds/clic_pf/qcd.py +++ b/mlpf/heptfds/clic_pf/qcd.py @@ -20,11 +20,14 @@ class ClicQcdPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.0.0") + VERSION = tfds.core.Version("1.1.0") RELEASE_NOTES = { "1.0.0": "Initial release.", + "1.1.0": "Fix postprocessing bug with charge", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + mkdir -p data + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic $MANUAL_DIR/ """ def _info(self) -> tfds.core.DatasetInfo: @@ -56,7 +59,8 @@ def _info(self) -> tfds.core.DatasetInfo: ) def _split_generators(self, dl_manager: tfds.download.DownloadManager): - return split_sample(Path("data/clic/gev380ee_pythia6_qcd_all_rfull201/")) + path = dl_manager.manual_dir + return split_sample(Path(path / "gev380ee_pythia6_qcd_all_rfull201")) def _generate_examples(self, files): return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf/ttbar.py b/mlpf/heptfds/clic_pf/ttbar.py index 2e6182e49..150b9ed4a 100644 --- a/mlpf/heptfds/clic_pf/ttbar.py +++ b/mlpf/heptfds/clic_pf/ttbar.py @@ -20,11 +20,14 @@ class ClicTtbarPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.0.0") + VERSION = tfds.core.Version("1.1.0") RELEASE_NOTES = { "1.0.0": "Initial release.", + "1.1.0": "Fix postprocessing bug with charge", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + mkdir -p data + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic $MANUAL_DIR/ """ def _info(self) -> tfds.core.DatasetInfo: @@ -56,7 +59,8 @@ def _info(self) -> tfds.core.DatasetInfo: ) def _split_generators(self, dl_manager: tfds.download.DownloadManager): - return split_sample(Path("data/clic/gev380ee_pythia6_ttbar_rfull201/")) + path = dl_manager.manual_dir + return split_sample(Path(path / "gev380ee_pythia6_ttbar_rfull201")) def _generate_examples(self, files): return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf/zpoleee.py b/mlpf/heptfds/clic_pf/zpoleee.py index 38be9eb3c..19e57432a 100644 --- a/mlpf/heptfds/clic_pf/zpoleee.py +++ b/mlpf/heptfds/clic_pf/zpoleee.py @@ -20,11 +20,14 @@ class ClicZpoleeePf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.0.0") + VERSION = tfds.core.Version("1.1.0") RELEASE_NOTES = { "1.0.0": "Initial release.", + "1.1.0": "Fix postprocessing bug with charge", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ + mkdir -p data + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic $MANUAL_DIR/ """ def _info(self) -> tfds.core.DatasetInfo: @@ -56,7 +59,8 @@ def _info(self) -> tfds.core.DatasetInfo: ) def _split_generators(self, dl_manager: tfds.download.DownloadManager): - return split_sample(Path("data/clic/gev380ee_pythia6_zpole_ee_rfull201/")) + path = dl_manager.manual_dir + return split_sample(Path(path / "gev380ee_pythia6_zpole_ee_rfull201")) def _generate_examples(self, files): return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/qcd.py b/mlpf/heptfds/clic_pf_edm4hep/qcd.py new file mode 100644 index 000000000..a720f12d7 --- /dev/null +++ b/mlpf/heptfds/clic_pf_edm4hep/qcd.py @@ -0,0 +1,62 @@ +from pathlib import Path + +import tensorflow as tf +from utils_edm import ( + X_FEATURES_CL, + X_FEATURES_TRK, + Y_FEATURES, + generate_examples, + split_sample, +) + +import tensorflow_datasets as tfds + +_DESCRIPTION = """ +CLIC EDM4HEP dataset with ee -> gamma/Z* -> quarks +""" + +_CITATION = """ +""" + + +class ClicEdmQcdPf(tfds.core.GeneratorBasedBuilder): + VERSION = tfds.core.Version("1.0.0") + RELEASE_NOTES = { + "1.0.0": "Initial release.", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor( + shape=( + None, + max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), + ), + dtype=tf.float32, + ), + "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=None, + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict( + x_features_track=X_FEATURES_TRK, + x_features_cluster=X_FEATURES_CL, + y_features=Y_FEATURES, + ), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + return split_sample(Path("data/p8_ee_qcd_ecm365/")) + + def _generate_examples(self, files): + return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py new file mode 100644 index 000000000..e054e5a21 --- /dev/null +++ b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py @@ -0,0 +1,62 @@ +from pathlib import Path + +import tensorflow as tf +from utils_edm import ( + X_FEATURES_CL, + X_FEATURES_TRK, + Y_FEATURES, + generate_examples, + split_sample, +) + +import tensorflow_datasets as tfds + +_DESCRIPTION = """ +CLIC EDM4HEP dataset with ttbar +""" + +_CITATION = """ +""" + + +class ClicEdmTtbarPf(tfds.core.GeneratorBasedBuilder): + VERSION = tfds.core.Version("1.0.0") + RELEASE_NOTES = { + "1.0.0": "Initial release.", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor( + shape=( + None, + max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), + ), + dtype=tf.float32, + ), + "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=None, + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict( + x_features_track=X_FEATURES_TRK, + x_features_cluster=X_FEATURES_CL, + y_features=Y_FEATURES, + ), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + return split_sample(Path("data/p8_ee_tt_ecm365/")) + + def _generate_examples(self, files): + return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py new file mode 100644 index 000000000..877446b86 --- /dev/null +++ b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py @@ -0,0 +1,169 @@ +import awkward as ak +import fastjet +import numpy as np +import vector + +# from fcc/postprocessing.py +X_FEATURES_TRK = [ + "type", + "pt", + "eta", + "phi", + "p", + "chi2", + "ndf", + "dEdx", + "dEdxError", + "radiusOfInnermostHit", + "tanLambda", + "D0", + "omega", + "Z0", + "time", +] +X_FEATURES_CL = [ + "type", + "et", + "eta", + "phi", + "energy", + "position.x", + "position.y", + "position.z", + "iTheta", + "energy_ecal", + "energy_hcal", + "energy_other", + "num_hits", + "sigma_x", + "sigma_y", + "sigma_z", +] + +Y_FEATURES = ["PDG", "charge", "pt", "eta", "phi", "energy", "jet_idx"] +labels = [0, 211, 130, 22, 11, 13] + + +def split_sample(path, test_frac=0.8): + files = sorted(list(path.glob("*.parquet"))) + print("Found {} files in {}".format(files, path)) + assert len(files) > 0 + idx_split = int(test_frac * len(files)) + files_train = files[:idx_split] + files_test = files[idx_split:] + assert len(files_train) > 0 + assert len(files_test) > 0 + return { + "train": generate_examples(files_train), + "test": generate_examples(files_test), + } + + +def generate_examples(files): + jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4) + min_jet_pt = 1.0 # GeV + for fi in files: + ret = ak.from_parquet(fi) + + X_track = ret["X_track"] + X_cluster = ret["X_cluster"] + + assert len(X_track) == len(X_cluster) + nev = len(X_track) + + for iev in range(nev): + + X1 = ak.to_numpy(X_track[iev]) + X2 = ak.to_numpy(X_cluster[iev]) + + if len(X1) == 0 or len(X2) == 0: + continue + + ygen_track = ak.to_numpy(ret["ygen_track"][iev]) + ygen_cluster = ak.to_numpy(ret["ygen_cluster"][iev]) + ycand_track = ak.to_numpy(ret["ycand_track"][iev]) + ycand_cluster = ak.to_numpy(ret["ycand_cluster"][iev]) + + if len(ygen_track) == 0 or len(ygen_cluster) == 0: + continue + + # pad feature dim between tracks and clusters to the same size + if X1.shape[1] < X2.shape[1]: + X1 = np.pad(X1, [[0, 0], [0, X2.shape[1] - X1.shape[1]]]) + if X2.shape[1] < X1.shape[1]: + X2 = np.pad(X2, [[0, 0], [0, X1.shape[1] - X2.shape[1]]]) + + # concatenate tracks and clusters in features and targets + X = np.concatenate([X1, X2]) + ygen = np.concatenate([ygen_track, ygen_cluster]) + ycand = np.concatenate([ycand_track, ycand_cluster]) + + assert ygen.shape[0] == X.shape[0] + assert ycand.shape[0] == X.shape[0] + + # add jet_idx column + ygen = np.concatenate( + [ + ygen.astype(np.float32), + np.zeros((len(ygen), 1), dtype=np.float32), + ], + axis=-1, + ) + ycand = np.concatenate( + [ + ycand.astype(np.float32), + np.zeros((len(ycand), 1), dtype=np.float32), + ], + axis=-1, + ) + + # replace PID with index in labels array + arr = np.array([labels.index(p) for p in ygen[:, 0]]) + ygen[:, 0][:] = arr[:] + arr = np.array([labels.index(p) for p in ycand[:, 0]]) + ycand[:, 0][:] = arr[:] + + # prepare gen candidates for clustering + cls_id = ygen[..., 0] + valid = cls_id != 0 + # save mapping of index after masking -> index before masking as numpy array + # inspired from: + # https://stackoverflow.com/questions/432112/1044443#comment54747416_1044443 + cumsum = np.cumsum(valid) - 1 + _, index_mapping = np.unique(cumsum, return_index=True) + + pt = ygen[valid, Y_FEATURES.index("pt")] + eta = ygen[valid, Y_FEATURES.index("eta")] + phi = ygen[valid, Y_FEATURES.index("phi")] + energy = ygen[valid, Y_FEATURES.index("energy")] + vec = vector.awk(ak.zip({"pt": pt, "eta": eta, "phi": phi, "energy": energy})) + + # cluster jets, sort jet indices in descending order by pt + cluster = fastjet.ClusterSequence(vec.to_xyzt(), jetdef) + jets = vector.awk(cluster.inclusive_jets(min_pt=min_jet_pt)) + sorted_jet_idx = ak.argsort(jets.pt, axis=-1, ascending=False).to_list() + # retrieve corresponding indices of constituents + constituent_idx = cluster.constituent_index(min_pt=min_jet_pt).to_list() + + # add index information to ygen and ycand + # index jets in descending order by pt starting from 1: + # 0 is null (unclustered), + # 1 is 1st highest-pt jet, + # 2 is 2nd highest-pt jet, ... + for jet_idx in sorted_jet_idx: + jet_constituents = [ + index_mapping[idx] for idx in constituent_idx[jet_idx] + ] # map back to constituent index *before* masking + ygen[jet_constituents, Y_FEATURES.index("jet_idx")] = jet_idx + 1 # jet index starts from 1 + ycand[jet_constituents, Y_FEATURES.index("jet_idx")] = jet_idx + 1 + + yield str(fi) + "_" + str(iev), { + "X": X.astype(np.float32), + "ygen": ygen, + "ycand": ycand, + } + + +if __name__ == "__main__": + for ex in generate_examples(["data/p8_ee_ZZ_fullhad_ecm365/reco_p8_ee_ZZ_fullhad_ecm365_1.parquet"]): + print(ex[0], ex[1]["X"].shape, ex[1]["ygen"].shape, ex[1]["ycand"].shape) diff --git a/mlpf/heptfds/clic_pf_edm4hep/zh_htautau.py b/mlpf/heptfds/clic_pf_edm4hep/zh_htautau.py new file mode 100644 index 000000000..579e4c690 --- /dev/null +++ b/mlpf/heptfds/clic_pf_edm4hep/zh_htautau.py @@ -0,0 +1,62 @@ +from pathlib import Path + +import tensorflow as tf +from utils_edm import ( + X_FEATURES_CL, + X_FEATURES_TRK, + Y_FEATURES, + generate_examples, + split_sample, +) + +import tensorflow_datasets as tfds + +_DESCRIPTION = """ +CLIC EDM4HEP dataset with ZH, H->tautau +""" + +_CITATION = """ +""" + + +class ClicEdmZhHtautauPf(tfds.core.GeneratorBasedBuilder): + VERSION = tfds.core.Version("1.0.0") + RELEASE_NOTES = { + "1.0.0": "Initial release.", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor( + shape=( + None, + max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), + ), + dtype=tf.float32, + ), + "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=None, + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict( + x_features_track=X_FEATURES_TRK, + x_features_cluster=X_FEATURES_CL, + y_features=Y_FEATURES, + ), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + return split_sample(Path("data/p8_ee_ZH_Htautau_ecm380/")) + + def _generate_examples(self, files): + return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/zzfullhad.py b/mlpf/heptfds/clic_pf_edm4hep/zzfullhad.py new file mode 100644 index 000000000..8fb70abed --- /dev/null +++ b/mlpf/heptfds/clic_pf_edm4hep/zzfullhad.py @@ -0,0 +1,62 @@ +from pathlib import Path + +import tensorflow as tf +from utils_edm import ( + X_FEATURES_CL, + X_FEATURES_TRK, + Y_FEATURES, + generate_examples, + split_sample, +) + +import tensorflow_datasets as tfds + +_DESCRIPTION = """ +CLIC EDM4HEP dataset with ZZ->fullhadronic +""" + +_CITATION = """ +""" + + +class ClicEdmZzFullhadPf(tfds.core.GeneratorBasedBuilder): + VERSION = tfds.core.Version("1.0.0") + RELEASE_NOTES = { + "1.0.0": "Initial release.", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor( + shape=( + None, + max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), + ), + dtype=tf.float32, + ), + "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=None, + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict( + x_features_track=X_FEATURES_TRK, + x_features_cluster=X_FEATURES_CL, + y_features=Y_FEATURES, + ), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + return split_sample(Path("data/p8_ee_ZZ_fullhad_ecm365/")) + + def _generate_examples(self, files): + return generate_examples(files) diff --git a/mlpf/plotting/plot_utils.py b/mlpf/plotting/plot_utils.py index 2b5853a88..f2307c881 100644 --- a/mlpf/plotting/plot_utils.py +++ b/mlpf/plotting/plot_utils.py @@ -69,6 +69,9 @@ "cms_pf_qcd_high_pt": r"CMS high-$p_T$ QCD+PU events", "cms_pf_ttbar": r"CMS $\mathrm{t}\overline{\mathrm{t}}$+PU events", "cms_pf_single_neutron": r"CMS single neutron particle gun events", + "clic_edm_ttbar_pf": r"CLIC $ee \rightarrow \mathrm{t}\overline{\mathrm{t}}$", + "clic_edm_qcd_pf": r"CLIC $ee \rightarrow \gamma/\mathrm{Z}^* \rightarrow \mathrm{hadrons}$", + "clic_edm_zz_fullhad_pf": r"CLIC $ee \rightarrow \mathrm{ZZ} \rightarrow \mathrm{hadrons}$", } diff --git a/mlpf/tfmodel/datasets/BaseDatasetFactory.py b/mlpf/tfmodel/datasets/BaseDatasetFactory.py index 9cd37f2f1..1495bd6b4 100644 --- a/mlpf/tfmodel/datasets/BaseDatasetFactory.py +++ b/mlpf/tfmodel/datasets/BaseDatasetFactory.py @@ -53,20 +53,13 @@ def unpack_target_cms(y, num_output_classes, config): def unpack_target_clic(y, num_output_classes, config): msk_pid = tf.cast(y[..., 0:1] != 0, tf.float32) - px = y[..., 2:3] * msk_pid - py = y[..., 3:4] * msk_pid - pz = y[..., 4:5] * msk_pid + pt = y[..., 2:3] * msk_pid + eta = y[..., 3:4] * msk_pid + phi = y[..., 4:5] * msk_pid energy = y[..., 5:6] * msk_pid - pt = tf.math.sqrt(px**2 + py**2) * msk_pid - p = tf.math.sqrt(px**2 + py**2 + pz**2) * msk_pid - - cos_theta = tf.math.divide_no_nan(pz, p) - theta = tf.math.acos(cos_theta) - eta = -tf.math.log(tf.math.tan(theta / 2.0)) * msk_pid - - sin_phi = tf.math.divide_no_nan(py, pt) * msk_pid - cos_phi = tf.math.divide_no_nan(px, pt) * msk_pid + sin_phi = tf.math.sin(phi) * msk_pid + cos_phi = tf.math.cos(phi) * msk_pid ret = { "cls": tf.one_hot(tf.cast(y[..., 0], tf.int32), num_output_classes), @@ -81,7 +74,7 @@ def unpack_target_clic(y, num_output_classes, config): if config["loss"]["event_loss"] != "none": jet_idx = y[..., 6:7] * msk_pid pt_e_eta_phi = tf.concat([pt, energy, eta, sin_phi, cos_phi, jet_idx, msk_pid], axis=-1) - ret["pt_e_eta_phi"] = pt_e_eta_phi * msk_pid + ret["pt_e_eta_phi"] = pt_e_eta_phi if config["loss"]["met_loss"] != "none": px = pt * cos_phi @@ -122,6 +115,9 @@ def func(data_item): X = data_item["X"] y = data_item["y{}".format(target_particles)] + X = tf.clip_by_value(X, -1e12, 1e12) + y = tf.clip_by_value(y, -1e12, 1e12) + # mask to keep only nonzero (not zero-padded due to batching) elements msk_elems = tf.cast(X[..., 0:1] != 0, tf.float32) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index b64b028b1..7492be892 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -202,19 +202,7 @@ def call(self, X): # X[:, :, 1:] - all the other non-categorical features Xprop = X[:, :, 1:] - px = X[:, :, 1:2] - py = X[:, :, 2:3] - pz = X[:, :, 3:4] - pt = tf.math.sqrt(px**2 + py**2) - p = tf.math.sqrt(px**2 + py**2 + pz**2) - cos_theta = tf.math.divide_no_nan(pz, p) - theta = tf.math.acos(cos_theta) - eta = -tf.math.log(tf.math.tan(theta / 2.0)) - - sin_phi = tf.math.divide_no_nan(py, pt) - cos_phi = tf.math.divide_no_nan(px, pt) - - return tf.concat([Xid, pt, sin_phi, cos_phi, eta, Xprop], axis=-1) + return tf.concat([Xid, Xprop], axis=-1) """ @@ -876,32 +864,19 @@ def call(self, args, training=False): out_charge = self.ffn_charge(X_encoded, training=training) out_charge = out_charge * msk_input_outtype + orig_pt = tf.cast(X_input[:, :, 1:2], out_dtype) orig_eta = tf.cast(X_input[:, :, 2:3], out_dtype) # FIXME: better schema propagation between hep_tfds # skip connection from raw input values - if self.schema == "cms": + if self.schema == "cms" or self.schema == "clic": orig_sin_phi = tf.cast(tf.math.sin(X_input[:, :, 3:4]) * msk_input, out_dtype) orig_cos_phi = tf.cast(tf.math.cos(X_input[:, :, 3:4]) * msk_input, out_dtype) orig_energy = tf.cast(X_input[:, :, 4:5] * msk_input, out_dtype) - orig_pt = X_input[:, :, 1:2] elif self.schema == "delphes": orig_sin_phi = tf.cast(X_input[:, :, 3:4] * msk_input, out_dtype) orig_cos_phi = tf.cast(X_input[:, :, 4:5] * msk_input, out_dtype) orig_energy = tf.cast(X_input[:, :, 5:6] * msk_input, out_dtype) - orig_pt = X_input[:, :, 1:2] - elif self.schema == "clic": - px = X_input[:, :, 1:2] - py = X_input[:, :, 2:3] - pz = X_input[:, :, 3:4] - orig_pt = tf.math.sqrt(px**2 + py**2) - p = tf.math.sqrt(px**2 + py**2 + pz**2) - cos_theta = tf.math.divide_no_nan(pz, p) - theta = tf.math.acos(cos_theta) - orig_eta = -tf.math.log(tf.math.tan(theta / 2.0)) - orig_sin_phi = tf.math.divide_no_nan(py, orig_pt) - orig_cos_phi = tf.math.divide_no_nan(px, orig_pt) - orig_energy = p if self.regression_use_classification: X_encoded = tf.concat( @@ -920,9 +895,11 @@ def call(self, args, training=False): pred_eta = orig_eta + pred_eta_corr[:, :, 0:1] pred_sin_phi = orig_sin_phi + pred_phi_corr[:, :, 0:1] pred_cos_phi = orig_cos_phi + pred_phi_corr[:, :, 1:2] - # pred_eta = tf.clip_by_value(pred_eta, -7,7) - # pred_sin_phi = tf.clip_by_value(pred_sin_phi, -1,1) - # pred_cos_phi = tf.clip_by_value(pred_cos_phi, -1,1) + + # FIXME: check that this is helpful + pred_eta = tf.clip_by_value(pred_eta, -7, 7) + pred_sin_phi = tf.clip_by_value(pred_sin_phi, -1, 1) + pred_cos_phi = tf.clip_by_value(pred_cos_phi, -1, 1) X_encoded_energy = tf.concat([X_encoded, X_encoded_energy], axis=-1) if self.regression_use_classification: diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index f18438746..818ed0b61 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -115,7 +115,7 @@ def epoch_end(self, epoch, logs, comet_experiment=None): N_jets = len(awkward.flatten(yvals["jets_gen_pt"])) N_jets_matched_pred = len(yvals["jet_gen_to_pred_genpt"]) for name, val in [ - ("jet_matched_frac", N_jets_matched_pred / N_jets), + ("jet_matched_frac", N_jets_matched_pred / N_jets if N_jets > 0 else float("nan")), ("jet_wd", jet_distances["wd"]), ("jet_iqr", jet_distances["iqr"]), ("jet_med", jet_distances["p50"]), @@ -486,6 +486,9 @@ def eval_model( energy = awkward.from_iter([np.array(v[m], np.float32) for v, m in zip(awkvals[typ]["energy"], valid)]) phi = awkward.from_iter([np.array(v[m], np.float32) for v, m in zip(phi, valid)]) + if verbose: + print(typ, pt) + # If there were no particles, build dummy arrays with the correct datatype if len(awkward.flatten(pt)) == 0: pt = build_dummy_array(len(pt), np.float64) diff --git a/notebooks/clic.ipynb b/notebooks/clic.ipynb index c3ca29d94..1d15f457a 100644 --- a/notebooks/clic.ipynb +++ b/notebooks/clic.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "foster-monte", "metadata": {}, "outputs": [], @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "ad0a7854", "metadata": { "tags": [ @@ -37,16 +37,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "935316c1", "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:38<00:00, 38.65s/it]\n" + ] + } + ], "source": [ "# Load the datasets, process to flattened (X,ygen,ycand) format\n", "ret = []\n", - "filelist = list(glob.glob(\"{}/*.parquet\".format(path)))\n", + "filelist = list(glob.glob(\"{}/*.parquet\".format(path)))[:1]\n", "print(len(filelist))\n", "for fi in tqdm.tqdm(filelist):\n", " ret += postprocessing.prepare_data_clic(fi)" @@ -54,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "47c33634", "metadata": {}, "outputs": [], @@ -64,6 +79,27 @@ "ycand = awkward.from_iter([r[2] for r in ret])" ] }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ae5a2ea8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([0., 1., 2., 3., 4., 5.]), array([9625, 6801, 661, 4465, 166, 128]))" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.unique(awkward.to_numpy(awkward.flatten(ygen[:, :, 0])), return_counts=True)" + ] + }, { "cell_type": "markdown", "id": "1a5ecc39", diff --git a/parameters/clic.yaml b/parameters/clic.yaml index ede8aaa11..e836893b0 100644 --- a/parameters/clic.yaml +++ b/parameters/clic.yaml @@ -3,13 +3,13 @@ backend: tensorflow dataset: schema: clic target_particles: gen - num_input_features: 12 + num_input_features: 16 num_output_features: 5 #(none=0, track=1, cluster=2) num_input_classes: 3 #(none=0, ch.had=1, n.had=2, gamma=3, e=4, mu=5) num_output_classes: 6 - #(px, py, pz, E) + #(pt, eta, phi, E) num_momentum_outputs: 4 cls_weight_by_pt: no reg_weight_by_pt: no @@ -235,44 +235,34 @@ train_test_datasets: physical: batch_per_gpu: 1 datasets: - - clic_ttbar_pf - - clic_qcd_pf - - clic_higgs_bbbar_pf - - clic_higgs_gg_pf - - clic_higgs_zz4l_pf - - clic_zpoleee_pf + - clic_edm_ttbar_pf + - clic_edm_zz_fullhad_pf + - clic_edm_qcd_pf + - clic_edm_zh_htautau_pf -validation_dataset: clic_ttbar_pf +validation_dataset: clic_edm_ttbar_pf validation_batch_size: 50 validation_num_events: 500 evaluation_datasets: - clic_ttbar_pf: + clic_edm_ttbar_pf: batch_size: 50 num_events: -1 datasets: - clic_ttbar_pf: + clic_edm_ttbar_pf: version: 1.0.0 data_dir: manual_dir: - clic_qcd_pf: + clic_edm_zz_fullhad_pf: version: 1.0.0 data_dir: manual_dir: - clic_higgs_bbbar_pf: + clic_edm_qcd_pf: version: 1.0.0 data_dir: manual_dir: - clic_higgs_gg_pf: - version: 1.0.0 - data_dir: - manual_dir: - clic_higgs_zz4l_pf: - version: 1.0.0 - data_dir: - manual_dir: - clic_zpoleee_pf: + clic_edm_zh_htautau_pf: version: 1.0.0 data_dir: manual_dir: diff --git a/scripts/generate_tfds.sh b/scripts/generate_tfds.sh index f68b07e14..486c34787 100755 --- a/scripts/generate_tfds.sh +++ b/scripts/generate_tfds.sh @@ -1,10 +1,11 @@ #!/bin/bash # Tallinn -MANUAL_DIR=/local/joosep/mlpf/cms/v2/ -DATA_DIR=/scratch-persistent/joosep/tensorflow_datasets -IMG=/home/software/singularity/tf-2.10.0.simg -CMD="singularity exec -B /local -B /scratch-persistent--env PYTHONPATH=$PYTHONPATH $IMG tfds build " +export MANUAL_DIR=/local/joosep/mlpf/cms/v2 +export DATA_DIR=/scratch-persistent/joosep/tensorflow_datasets +export IMG=/home/software/singularity/tf-2.11.0.simg +export PYTHONPATH=`pwd`/mlpf +export CMD="singularity exec -B /local -B /scratch-persistent --env PYTHONPATH=$PYTHONPATH $IMG tfds build " # Desktop # IMG=/home/joosep/HEP-KBFI/singularity/tf-2.10.0.simg @@ -29,12 +30,11 @@ $CMD mlpf/heptfds/cms_pf/singletau --data_dir $DATA_DIR --manual_dir $MANUAL_DIR wait # CLIC -$CMD mlpf/heptfds/clic_pf/qcd --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qcd.log & -$CMD mlpf/heptfds/clic_pf/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar.log & -$CMD mlpf/heptfds/clic_pf/zpoleee --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_zpoleee.log & -$CMD mlpf/heptfds/clic_pf/higgsbb --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_higgsbb.log & -$CMD mlpf/heptfds/clic_pf/higgszz4l --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_higgszz4l.log & -$CMD mlpf/heptfds/clic_pf/higgsgg --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_higgsgg.log & +export MANUAL_DIR=/local/joosep/mlpf/clic_edm4hep +$CMD mlpf/heptfds/clic_pf_edm4hep/qcd --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qcd.log & +$CMD mlpf/heptfds/clic_pf_edm4hep/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar.log & +$CMD mlpf/heptfds/clic_pf_edm4hep/zh_htautau --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_zh_htautau.log & +$CMD mlpf/heptfds/clic_pf/zzfullhad --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_zzfullhad.log & wait $CMD mlpf/heptfds/delphes_pf/delphes_pf &> logs/tfds_delphes.log & diff --git a/scripts/local_test_clic_pipeline.sh b/scripts/local_test_clic_pipeline.sh index dcddc5bd1..bb3c57b6b 100755 --- a/scripts/local_test_clic_pipeline.sh +++ b/scripts/local_test_clic_pipeline.sh @@ -3,27 +3,27 @@ set -e export TFDS_DATA_DIR=`pwd`/tensorflow_datasets export PYTHONPATH=`pwd`/mlpf:$PYTHONPATH -rm -Rf data/clic/gev380ee_pythia6_ttbar_rfull201 -mkdir -p data/clic/gev380ee_pythia6_ttbar_rfull201 -cd data/clic/gev380ee_pythia6_ttbar_rfull201 +rm -Rf data/p8_ee_tt_ecm365 +mkdir -p data/p8_ee_tt_ecm365 +cd data/p8_ee_tt_ecm365 #download some test data -wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/clic/gev380ee_pythia6_ttbar_rfull201/pythia6_ttbar_0001_pandora.parquet -wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/clic/gev380ee_pythia6_ttbar_rfull201/pythia6_ttbar_0002_pandora.parquet -wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/clic/gev380ee_pythia6_ttbar_rfull201/pythia6_ttbar_0003_pandora.parquet -wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/clic/gev380ee_pythia6_ttbar_rfull201/pythia6_ttbar_0004_pandora.parquet -wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/clic/gev380ee_pythia6_ttbar_rfull201/pythia6_ttbar_0005_pandora.parquet +wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/clic_edm4hep/reco_p8_ee_tt_ecm365_1.root +wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/clic_edm4hep/reco_p8_ee_tt_ecm365_2.root -cd ../../.. +cd ../.. -#run the clic data validation notebook -cd notebooks -papermill --inject-output-path --log-output -p path ../data/clic/gev380ee_pythia6_ttbar_rfull201/ clic.ipynb ./out.ipynb -cd .. +python3 fcc/postprocessing.py data/p8_ee_tt_ecm365/reco_p8_ee_tt_ecm365_1.root +python3 fcc/postprocessing.py data/p8_ee_tt_ecm365/reco_p8_ee_tt_ecm365_2.root -tfds build mlpf/heptfds/clic_pf/ttbar --manual_dir `pwd` +tfds build mlpf/heptfds/clic_pf_edm4hep/ttbar -#Train, evaluate and make plots +# #run the clic data validation notebook +# cd notebooks +# papermill --inject-output-path --log-output -p path ../data/clic/gev380ee_pythia6_ttbar_rfull201/ clic.ipynb ./out.ipynb +# cd .. + +# #Train, evaluate and make plots python mlpf/pipeline.py train --config parameters/clic.yaml --nepochs 1 --customize pipeline_test python mlpf/pipeline.py evaluate --nevents 5 --customize pipeline_test --train-dir ./experiments/clic* --weights ./experiments/clic*/weights/weights-01-*.hdf5 python mlpf/pipeline.py plots --train-dir ./experiments/clic*