diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 79b31baaa..ccea816fa 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -43,6 +43,18 @@ jobs:
       - run: pip install -r requirements.txt
       - run: ./scripts/local_test_clic_pipeline.sh

+  tf-clic-hits-pipeline:
+    runs-on: ubuntu-20.04
+    needs: [deps]
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.8.10'
+          cache: 'pip'
+      - run: pip install -r requirements.txt
+      - run: ./scripts/local_test_clic_hits_pipeline.sh
+
   tf-delphes-pipeline:
     runs-on: ubuntu-20.04
     needs: [deps]
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2cd994fce..420e9b7de 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 default_language_version:
   python: python3
-exclude: ^(delphes/tev14_pythia8_*.)|^(images/)|^(clic/dumper_hepsim.py)|^(mlpf/pyg/__init__.py)|^(fcc/.*)
+exclude: ^(delphes/tev14_pythia8_*.)|^(images/)|^(clic/dumper_hepsim.py)|^(mlpf/pyg/__init__.py)|^(fcc/clicRec_e4h_input.py)|^(fcc/clic_steer.py)|^(fcc/pythia.py)|^(fcc/PandoraSettings)

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/fcc/check_files.py b/fcc/check_files.py
index 51fa96d94..91f4e4893 100644
--- a/fcc/check_files.py
+++ b/fcc/check_files.py
@@ -1,28 +1,51 @@
 import os

-#check for file presence in this path
-outpath = "/local/joosep/clic_edm4hep_2023_02_27/"
+# check for file presence in this path
+outpath = "/local/joosep/clic_edm4hep_2023_05_05/"

-#pythia card, start seed, end seed
+# pythia card, start seed, end seed
 samples = [
-    ("p8_ee_tt_ecm380", 1, 10011),
-    ("p8_ee_qq_ecm380", 100001, 120011),
+    ("p8_ee_tt_ecm380", 1, 10011),
+    ("p8_ee_qq_ecm380", 100001, 120011),
     ("p8_ee_ZH_Htautau_ecm380", 200001, 210011),
     ("p8_ee_WW_fullhad_ecm380", 300001, 310011),
 ]

 samples_pu = [
-    ("p8_ee_tt_ecm380", 1, 10001),
+    ("p8_ee_tt_ecm380", 1, 10001),
+]
+
+samples_gun = [
+    ("neutron", 1, 101),
+    ("kaon0L", 1, 101),
+    ("pi-", 1, 101),
+    ("pi+", 1, 101),
+    ("pi0", 1, 101),
+    ("mu-", 1, 101),
+    ("mu+", 1, 101),
+    ("e-", 1, 101),
+    ("e+", 1, 101),
+    ("gamma", 1, 101),
 ]

 if __name__ == "__main__":
-    #for sname, seed0, seed1 in samples:
+    # basic samples
+    # for sname, seed0, seed1 in samples:
     #    for seed in range(seed0, seed1):
     #        #check if output file exists, and print out batch submission if it doesn't
     #        if not os.path.isfile("{}/{}/reco_{}_{}.root".format(outpath, sname, sname, seed)):
-    #            print("sbatch run_sim.sh {} {}".format(seed, sname))
-    for sname, seed0, seed1 in samples_pu:
+    #            print("sbatch run_sim.sh {} {}".format(seed, sname))
+
+    # PU
+    # for sname, seed0, seed1 in samples_pu:
+    #     for seed in range(seed0, seed1):
+    #         # check if output file exists, and print out batch submission if it doesn't
+    #         if not os.path.isfile("{}/{}_PU10/reco_{}_{}.root".format(outpath, sname, sname, seed)):
+    #             print("sbatch run_sim_pu.sh {} {} p8_ee_gg_ecm380".format(seed, sname))
+
+    # gun
+    for sname, seed0, seed1 in samples_gun:
         for seed in range(seed0, seed1):
-            #check if output file exists, and print out batch submission if it doesn't
-            if not os.path.isfile("{}/{}/reco_{}_{}.root".format(outpath, sname, sname, seed)):
-                print("sbatch run_sim_pu.sh {} {} p8_ee_gg_ecm380".format(seed, sname))
+            # check if output file exists, and print out batch submission if it doesn't
+            if not os.path.isfile("{}/{}/reco_{}_{}.root".format(outpath, sname, sname, seed)):
+                print("sbatch run_sim_gun.sh {} {}".format(seed, sname))
diff --git
a/fcc/clicRec_e4h_input.py b/fcc/clicRec_e4h_input.py index 7e35ee274..d4e094cf4 100644 --- a/fcc/clicRec_e4h_input.py +++ b/fcc/clicRec_e4h_input.py @@ -2360,4 +2360,23 @@ from Configurables import ApplicationMgr -ApplicationMgr(TopAlg=algList, EvtSel="NONE", EvtMax=3, ExtSvc=[evtsvc], OutputLevel=WARNING) +SequencerTimerTool().OutputLevel = INFO +TIMER = TimingAuditor("TIMER") +TIMER.addTool(SequencerTimerTool, name="TIMER") +TIMER.TIMER.HistoProduce = True +TIMER.TIMER.OutputLevel = INFO + +toolsvc = ToolSvc() +auditorsvc = AuditorSvc() +auditorsvc.Auditors += [TIMER] +RootHistSvc().OutputFile = "timing_histos.root" + +ApplicationMgr( + TopAlg=algList, + EvtSel="NONE", + EvtMax=3, + ExtSvc=[evtsvc, toolsvc, auditorsvc], + OutputLevel=WARNING, + AuditAlgorithms=True, + HistogramPersistency = "ROOT") + diff --git a/fcc/clic_steer.py b/fcc/clic_steer.py index c2b584ef3..1e98b2ccd 100644 --- a/fcc/clic_steer.py +++ b/fcc/clic_steer.py @@ -139,7 +139,7 @@ ## Setting a distribution will set isotrop = True ## SIM.gun.distribution = None -SIM.gun.energy = 10000.0 +SIM.gun.energy = None ## isotropic distribution for the particle gun ## diff --git a/fcc/main19.cc b/fcc/main19.cc index ae792b5f3..9b3dd2f87 100644 --- a/fcc/main19.cc +++ b/fcc/main19.cc @@ -54,15 +54,15 @@ int main(int argc, char *argv[]) { std::cerr << "./main SEED NPU" << std::endl; return 1; } - + std::string seedStr = std::string("Random:seed = ").append(std::string(argv[1])); // Average number of pileup events per signal event. double nPileupAvg = atoi(argv[2]); - + // Shift each PU event by this time delta in time to mimic ee overlay double timeDelta = 0.5; - + Pythia8ToHepMC ToHepMC; ToHepMC.setNewFile("pythia.hepmc"); @@ -84,18 +84,18 @@ int main(int argc, char *argv[]) { // Select the number of pileup events to generate. int nPileup = poisson(nPileupAvg, pythiaPileup.rndm); - + // create a random index permutation from [0, nPileup) std::vector puVectorInds; for (int npu=0; npu0)) - eta[tt<=0] = 0.0 + eta = awkward.to_numpy(-np.log(tt, where=tt > 0)) + eta[tt <= 0] = 0.0 ret["eta"] = eta costheta = np.cos(ret["iTheta"]) - ez = ret["energy"]*costheta - ret["et"] = np.sqrt(ret["energy"]**2 - ez**2) + ez = ret["energy"] * costheta + ret["et"] = np.sqrt(ret["energy"] ** 2 - ez**2) + + # cluster is always type 2 + ret["elemtype"] = 2 * np.ones(n_cl, dtype=np.float32) - #override cluster type with 1 - ret["type"] = 2*np.ones(n_cl, dtype=np.float32) - ret["sin_phi"] = np.sin(ret["phi"]) ret["cos_phi"] = np.cos(ret["phi"]) return awkward.Record(ret) + def track_to_features(prop_data, iev): track_arr = prop_data[track_coll][iev] feats_from_track = ["type", "chi2", "ndf", "dEdx", "dEdxError", "radiusOfInnermostHit"] ret = {feat: track_arr[track_coll + "." + feat] for feat in feats_from_track} n_tr = len(ret["type"]) - #FIXME: add additional track features from track state - - #get the index of the first track state + # get the index of the first track state trackstate_idx = prop_data[track_coll][track_coll + ".trackStates_begin"][iev] - #get the properties of the track at the first track state (at the origin) + # get the properties of the track at the first track state (at the origin) for k in ["tanLambda", "D0", "phi", "omega", "Z0", "time"]: ret[k] = prop_data["SiTracks_1"]["SiTracks_1." 
+ k][iev][trackstate_idx] @@ -320,22 +387,23 @@ def track_to_features(prop_data, iev): ret["px"] = np.cos(ret["phi"]) * ret["pt"] ret["py"] = np.sin(ret["phi"]) * ret["pt"] ret["pz"] = ret["tanLambda"] * ret["pt"] - ret["p"] = np.sqrt(ret["px"]**2 + ret["py"]**2 + ret["pz"]**2) - cos_theta = np.divide(ret["pz"], ret["p"], where=ret["p"]>0) + ret["p"] = np.sqrt(ret["px"] ** 2 + ret["py"] ** 2 + ret["pz"] ** 2) + cos_theta = np.divide(ret["pz"], ret["p"], where=ret["p"] > 0) theta = np.arccos(cos_theta) tt = np.tan(theta / 2.0) - eta = awkward.to_numpy(-np.log(tt, where=tt>0)) - eta[tt<=0] = 0.0 + eta = awkward.to_numpy(-np.log(tt, where=tt > 0)) + eta[tt <= 0] = 0.0 ret["eta"] = eta ret["sin_phi"] = np.sin(ret["phi"]) ret["cos_phi"] = np.cos(ret["phi"]) - #override track type with 1 - ret["type"] = 1*np.ones(n_tr, dtype=np.float32) + # track is always type 1 + ret["elemtype"] = 1 * np.ones(n_tr, dtype=np.float32) return awkward.Record(ret) + def filter_adj(adj, all_to_filtered): i0s_new = [] i1s_new = [] @@ -348,9 +416,12 @@ def filter_adj(adj, all_to_filtered): ws_new.append(w) return np.array(i0s_new), np.array(i1s_new), np.array(ws_new) + def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack_links, iev, collectionIDs): gen_features = gen_to_features(prop_data, iev) - hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj(hit_data, calohit_links, iev, collectionIDs) + hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj( + hit_data, calohit_links, iev, collectionIDs + ) hit_to_cluster = hit_cluster_adj(prop_data, hit_idx_local_to_global, iev) cluster_features = cluster_to_features(prop_data, hit_features, hit_to_cluster, iev) track_features = track_to_features(prop_data, iev) @@ -361,49 +432,33 @@ def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack n_hit = awkward.count(hit_features["type"]) n_cluster = awkward.count(cluster_features["type"]) - if len(genparticle_to_track[0])>0: - gp_to_track = coo_matrix( - (genparticle_to_track[2], - (genparticle_to_track[0], genparticle_to_track[1])), - shape=(n_gp, n_track) - ).max(axis=1).todense() + if len(genparticle_to_track[0]) > 0: + gp_to_track = ( + coo_matrix((genparticle_to_track[2], (genparticle_to_track[0], genparticle_to_track[1])), shape=(n_gp, n_track)) + .max(axis=1) + .todense() + ) else: gp_to_track = np.zeros((n_gp, 1)) - gp_to_calohit = coo_matrix( - (genparticle_to_hit[2], - (genparticle_to_hit[0], genparticle_to_hit[1])), - shape=(n_gp, n_hit) - ) - calohit_to_cluster = coo_matrix( - (hit_to_cluster[2], - (hit_to_cluster[0], hit_to_cluster[1])), - shape=(n_hit, n_cluster) - ) - gp_to_cluster = (gp_to_calohit*calohit_to_cluster).sum(axis=1) + gp_to_calohit = coo_matrix((genparticle_to_hit[2], (genparticle_to_hit[0], genparticle_to_hit[1])), shape=(n_gp, n_hit)) + calohit_to_cluster = coo_matrix((hit_to_cluster[2], (hit_to_cluster[0], hit_to_cluster[1])), shape=(n_hit, n_cluster)) + gp_to_cluster = (gp_to_calohit * calohit_to_cluster).sum(axis=1) - #60% of the hits of a track must come from the genparticle - gp_in_tracker = np.array(gp_to_track>=0.6)[:, 0] + # 60% of the hits of a track must come from the genparticle + gp_in_tracker = np.array(gp_to_track >= 0.6)[:, 0] - #at least 10% of the energy of the genparticle should be matched to a calorimeter cluster - gp_in_calo = (np.array(gp_to_cluster)[:, 0]/gen_features["energy"])>0.1 + # at least 10% of the energy of the genparticle should be matched to a 
calorimeter cluster + gp_in_calo = (np.array(gp_to_cluster)[:, 0] / gen_features["energy"]) > 0.1 gp_interacted_with_detector = gp_in_tracker | gp_in_calo - mask_visible = ( - (gen_features["generatorStatus"]==1) & - (gen_features["PDG"]!=12) & - (gen_features["PDG"]!=14) & - (gen_features["PDG"]!=16) & - (gen_features["energy"]>0.01) & - gp_interacted_with_detector - ) + mask_visible = (gen_features["energy"] > 0.01) & gp_interacted_with_detector + print("gps total={} visible={}".format(n_gp, np.sum(mask_visible))) idx_all_masked = np.where(mask_visible)[0] genpart_idx_all_to_filtered = {idx_all: idx_filtered for idx_filtered, idx_all in enumerate(idx_all_masked)} - gen_features = awkward.Record({ - feat: gen_features[feat][mask_visible] for feat in gen_features.fields - }) + gen_features = awkward.Record({feat: gen_features[feat][mask_visible] for feat in gen_features.fields}) genparticle_to_hit = filter_adj(genparticle_to_hit, genpart_idx_all_to_filtered) genparticle_to_track = filter_adj(genparticle_to_track, genpart_idx_all_to_filtered) @@ -415,9 +470,11 @@ def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack track_features, genparticle_to_hit, genparticle_to_track, - hit_to_cluster + hit_to_cluster, + ([], []), ) + def assign_genparticles_to_obj_and_merge(gpdata): n_gp = awkward.count(gpdata.gen_features["PDG"]) @@ -425,45 +482,42 @@ def assign_genparticles_to_obj_and_merge(gpdata): n_hit = awkward.count(gpdata.hit_features["type"]) n_cluster = awkward.count(gpdata.cluster_features["type"]) - gp_to_track = np.array(coo_matrix( - (gpdata.genparticle_to_track[2], - (gpdata.genparticle_to_track[0], gpdata.genparticle_to_track[1])), - shape=(n_gp, n_track) - ).todense()) + gp_to_track = np.array( + coo_matrix( + (gpdata.genparticle_to_track[2], (gpdata.genparticle_to_track[0], gpdata.genparticle_to_track[1])), + shape=(n_gp, n_track), + ).todense() + ) gp_to_calohit = coo_matrix( - (gpdata.genparticle_to_hit[2], - (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), - shape=(n_gp, n_hit) + (gpdata.genparticle_to_hit[2], (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), shape=(n_gp, n_hit) ) calohit_to_cluster = coo_matrix( - (gpdata.hit_to_cluster[2], - (gpdata.hit_to_cluster[0], gpdata.hit_to_cluster[1])), - shape=(n_hit, n_cluster) + (gpdata.hit_to_cluster[2], (gpdata.hit_to_cluster[0], gpdata.hit_to_cluster[1])), shape=(n_hit, n_cluster) ) - gp_to_cluster = np.array((gp_to_calohit*calohit_to_cluster).todense()) + gp_to_cluster = np.array((gp_to_calohit * calohit_to_cluster).todense()) - #map each genparticle to a track or a cluster - gp_to_obj = -1*np.ones((n_gp, 2), dtype=np.int32) + # map each genparticle to a track or a cluster + gp_to_obj = -1 * np.ones((n_gp, 2), dtype=np.int32) set_used_tracks = set([]) set_used_clusters = set([]) gps_sorted_energy = sorted(range(n_gp), key=lambda x: gpdata.gen_features["energy"][x], reverse=True) for igp in gps_sorted_energy: - #first check if we can match the genparticle to a track + # first check if we can match the genparticle to a track matched_tracks = gp_to_track[igp] trks = np.where(matched_tracks)[0] trks = sorted(trks, key=lambda x: matched_tracks[x], reverse=True) for trk in trks: - #if the track was not already used for something else + # if the track was not already used for something else if trk not in set_used_tracks: gp_to_obj[igp, 0] = trk set_used_tracks.add(trk) break - #if there was no matched track, try a cluster + # if there was no matched track, try a cluster if 
gp_to_obj[igp, 0] == -1: matched_clusters = gp_to_cluster[igp] clusters = np.where(matched_clusters)[0] @@ -474,8 +528,8 @@ def assign_genparticles_to_obj_and_merge(gpdata): set_used_clusters.add(cl) break - #the genparticles that could not be matched to a track or cluster are merged to the closest genparticle - unmatched = np.where((gp_to_obj[:, 0]==-1) & (gp_to_obj[:, 1]==-1))[0] + # the genparticles that could not be matched to a track or cluster are merged to the closest genparticle + unmatched = np.where((gp_to_obj[:, 0] == -1) & (gp_to_obj[:, 1] == -1))[0] mask_gp_unmatched = np.ones(n_gp, dtype=bool) pt_arr = np.array(awkward.to_numpy(gpdata.gen_features["pt"])) @@ -483,20 +537,25 @@ def assign_genparticles_to_obj_and_merge(gpdata): phi_arr = np.array(awkward.to_numpy(gpdata.gen_features["phi"])) energy_arr = np.array(awkward.to_numpy(gpdata.gen_features["energy"])) - #now merge unmatched genparticles to their closest genparticle + # now merge unmatched genparticles to their closest genparticle + gp_merges_gp0 = [] + gp_merges_gp1 = [] for igp_unmatched in unmatched: mask_gp_unmatched[igp_unmatched] = False idx_best_cluster = np.argmax(gp_to_cluster[igp_unmatched]) - idx_gp_bestcluster = np.where(gp_to_obj[:, 1]==idx_best_cluster)[0] + idx_gp_bestcluster = np.where(gp_to_obj[:, 1] == idx_best_cluster)[0] - #if the genparticle is not matched to any cluster, then it left a few hits to some other track - #this is rare, happens only for low-pT particles and we don"t want to try to reconstruct it - if (len(idx_gp_bestcluster)!=1): + # if the genparticle is not matched to any cluster, then it left a few hits to some other track + # this is rare, happens only for low-pT particles and we don't want to try to reconstruct it + if len(idx_gp_bestcluster) != 1: print("unmatched pt=", pt_arr[igp_unmatched]) continue idx_gp_bestcluster = idx_gp_bestcluster[0] + gp_merges_gp0.append(idx_gp_bestcluster) + gp_merges_gp1.append(igp_unmatched) + vec0 = vector.obj( pt=gpdata.gen_features["pt"][igp_unmatched], eta=gpdata.gen_features["eta"][igp_unmatched], @@ -509,7 +568,7 @@ def assign_genparticles_to_obj_and_merge(gpdata): phi=gpdata.gen_features["phi"][idx_gp_bestcluster], e=gpdata.gen_features["energy"][idx_gp_bestcluster], ) - vec = vec0+vec1 + vec = vec0 + vec1 pt_arr[idx_gp_bestcluster] = vec.pt eta_arr[idx_gp_bestcluster] = vec.eta phi_arr[idx_gp_bestcluster] = vec.phi @@ -524,7 +583,7 @@ def assign_genparticles_to_obj_and_merge(gpdata): "cos_phi": np.cos(phi_arr[mask_gp_unmatched]), "energy": energy_arr[mask_gp_unmatched], } - assert((np.sum(gen_features_new["energy"])-np.sum(gpdata.gen_features["energy"])) < 1e-2) + assert (np.sum(gen_features_new["energy"]) - np.sum(gpdata.gen_features["energy"])) < 1e-2 idx_all_masked = np.where(mask_gp_unmatched)[0] genpart_idx_all_to_filtered = {idx_all: idx_filtered for idx_filtered, idx_all in enumerate(idx_all_masked)} @@ -532,26 +591,30 @@ def assign_genparticles_to_obj_and_merge(gpdata): genparticle_to_track = filter_adj(gpdata.genparticle_to_track, genpart_idx_all_to_filtered) gp_to_obj = gp_to_obj[mask_gp_unmatched] - return EventData( - gen_features_new, - gpdata.hit_features, - gpdata.cluster_features, - gpdata.track_features, - genparticle_to_hit, - genparticle_to_track, - gpdata.hit_to_cluster - ), gp_to_obj + return ( + EventData( + gen_features_new, + gpdata.hit_features, + gpdata.cluster_features, + gpdata.track_features, + genparticle_to_hit, + genparticle_to_track, + gpdata.hit_to_cluster, + (gp_merges_gp0, gp_merges_gp1), + ), + gp_to_obj, 
+ ) -#for each PF element (track, cluster), get the index of the best-matched particle (gen or reco) -#if the PF element has no best-matched particle, returns -1 +# for each PF element (track, cluster), get the index of the best-matched particle (gen or reco) +# if the PF element has no best-matched particle, returns -1 def assign_to_recoobj(n_obj, obj_to_ptcl, used_particles): obj_to_ptcl_all = -1 * np.ones(n_obj, dtype=np.int64) for iobj in range(n_obj): if iobj in obj_to_ptcl: iptcl = obj_to_ptcl[iobj] obj_to_ptcl_all[iobj] = iptcl - assert(used_particles[iptcl] == 0) + assert used_particles[iptcl] == 0 used_particles[iptcl] = 1 return obj_to_ptcl_all @@ -560,66 +623,69 @@ def get_recoptcl_to_obj(n_rps, reco_arr, idx_rp_to_track, idx_rp_to_cluster): track_to_rp = {} cluster_to_rp = {} - #loop over the reco particles + # loop over the reco particles for irp in range(n_rps): assigned = False - #find and loop over tracks associated to the reco particle + # find and loop over tracks associated to the reco particle trks_begin = reco_arr["tracks_begin"][irp] trks_end = reco_arr["tracks_end"][irp] for itrk in range(trks_begin, trks_end): - #get the index of the track in the track collection + # get the index of the track in the track collection itrk_real = idx_rp_to_track[itrk] - assert(itrk_real not in track_to_rp) + assert itrk_real not in track_to_rp track_to_rp[itrk_real] = irp assigned = True - #only look for clusters if tracks were not found + # only look for clusters if tracks were not found if not assigned: - #find and loop over clusters associated to the reco particle + # find and loop over clusters associated to the reco particle cls_begin = reco_arr["clusters_begin"][irp] cls_end = reco_arr["clusters_end"][irp] for icls in range(cls_begin, cls_end): - #get the index of the cluster in the cluster collection + # get the index of the cluster in the cluster collection icls_real = idx_rp_to_cluster[icls] - assert(icls_real not in cluster_to_rp) + assert icls_real not in cluster_to_rp cluster_to_rp[icls_real] = irp return track_to_rp, cluster_to_rp + def get_reco_properties(prop_data, iev): reco_arr = prop_data["MergedRecoParticles"][iev] reco_arr = {k.replace("MergedRecoParticles.", ""): reco_arr[k] for k in reco_arr.fields} - reco_p4 = vector.awk(awkward.zip({ - "mass": reco_arr["mass"], - "x": reco_arr["momentum.x"], - "y": reco_arr["momentum.y"], - "z": reco_arr["momentum.z"]})) + reco_p4 = vector.awk( + awkward.zip( + {"mass": reco_arr["mass"], "x": reco_arr["momentum.x"], "y": reco_arr["momentum.y"], "z": reco_arr["momentum.z"]} + ) + ) reco_arr["pt"] = reco_p4.pt reco_arr["eta"] = reco_p4.eta reco_arr["phi"] = reco_p4.phi reco_arr["energy"] = reco_p4.energy - msk = reco_arr["type"]!=0 + msk = reco_arr["type"] != 0 reco_arr = awkward.Record({k: reco_arr[k][msk] for k in reco_arr.keys()}) return reco_arr + def get_particle_feature_matrix(pfelem_to_particle, feature_dict, features): feats = [] for feat in features: feat_arr = feature_dict[feat] - if len(feat_arr)==0: + if len(feat_arr) == 0: feat_arr_reordered = feat_arr else: feat_arr_reordered = awkward.to_numpy(feat_arr[pfelem_to_particle]) - feat_arr_reordered[pfelem_to_particle==-1] = 0.0 + feat_arr_reordered[pfelem_to_particle == -1] = 0.0 feats.append(feat_arr_reordered) feats = np.array(feats) return feats.T + def get_feature_matrix(feature_dict, features): feats = [] for feat in features: @@ -628,26 +694,42 @@ def get_feature_matrix(feature_dict, features): feats = np.array(feats) return feats.T + def process_one_file(fn, 
ofn): - #output exists, do not recreate + # output exists, do not recreate if os.path.isfile(ofn): + print("{} exists".format(ofn)) return fi = uproot.open(fn) - + arrs = fi["events"] - - collectionIDs = {k: v for k, v in - zip(fi.get("metadata").arrays("CollectionIDs")["CollectionIDs"]["m_names"][0], - fi.get("metadata").arrays("CollectionIDs")["CollectionIDs"]["m_collectionIDs"][0])} - collectionIDs_reverse = {v: k for k, v in collectionIDs.items()} - - prop_data = arrs.arrays([mc_coll, track_coll, "SiTracks_1", "PandoraClusters", "PandoraClusters#1", "PandoraClusters#0", "MergedRecoParticles"]) + + collectionIDs = { + k: v + for k, v in zip( + fi.get("metadata").arrays("CollectionIDs")["CollectionIDs"]["m_names"][0], + fi.get("metadata").arrays("CollectionIDs")["CollectionIDs"]["m_collectionIDs"][0], + ) + } + + prop_data = arrs.arrays( + [ + mc_coll, + track_coll, + "SiTracks_1", + "PandoraClusters", + "PandoraClusters#1", + "PandoraClusters#0", + "MergedRecoParticles", + ] + ) calohit_links = arrs.arrays(["CalohitMCTruthLink", "CalohitMCTruthLink#0", "CalohitMCTruthLink#1"]) sitrack_links = arrs.arrays(["SiTracksMCTruthLink", "SiTracksMCTruthLink#0", "SiTracksMCTruthLink#1"]) - #maps the recoparticle track/cluster index (in tracks_begin,end and clusters_begin,end) to the index in the track/cluster collection + # maps the recoparticle track/cluster index (in tracks_begin,end and clusters_begin,end) + # to the index in the track/cluster collection idx_rp_to_cluster = arrs["MergedRecoParticles#0/MergedRecoParticles#0.index"].array() idx_rp_to_track = arrs["MergedRecoParticles#1/MergedRecoParticles#1.index"].array() @@ -662,104 +744,87 @@ def process_one_file(fn, ofn): } ret = [] - for iev in tqdm.tqdm(range(arrs.num_entries)): + for iev in range(arrs.num_entries): - #get the reco particles + # get the reco particles reco_arr = get_reco_properties(prop_data, iev) reco_type = np.abs(reco_arr["type"]) n_rps = len(reco_type) - reco_features = awkward.Record({ - "PDG": np.abs(reco_type), - "charge": reco_arr["charge"], - "pt": reco_arr["pt"], - "eta": reco_arr["eta"], - "sin_phi": np.sin(reco_arr["phi"]), - "cos_phi": np.cos(reco_arr["phi"]), - "energy": reco_arr["energy"] - }) - - #get the genparticles and the links between genparticles and tracks/clusters + reco_features = awkward.Record( + { + "PDG": np.abs(reco_type), + "charge": reco_arr["charge"], + "pt": reco_arr["pt"], + "eta": reco_arr["eta"], + "sin_phi": np.sin(reco_arr["phi"]), + "cos_phi": np.cos(reco_arr["phi"]), + "energy": reco_arr["energy"], + } + ) + + # get the genparticles and the links between genparticles and tracks/clusters gpdata = get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack_links, iev, collectionIDs) - #find the reconstructable genparticles and associate them to the best track/cluster + # find the reconstructable genparticles and associate them to the best track/cluster gpdata_cleaned, gp_to_obj = assign_genparticles_to_obj_and_merge(gpdata) n_tracks = len(gpdata_cleaned.track_features["type"]) n_clusters = len(gpdata_cleaned.cluster_features["type"]) n_gps = len(gpdata_cleaned.gen_features["PDG"]) - assert(len(gp_to_obj) == len(gpdata_cleaned.gen_features["PDG"])) - assert(gp_to_obj.shape[1] == 2) - - #for each reco particle, find the tracks and clusters associated with it - #construct track/cluster -> recoparticle maps + assert len(gp_to_obj) == len(gpdata_cleaned.gen_features["PDG"]) + assert gp_to_obj.shape[1] == 2 + + # for each reco particle, find the tracks and clusters 
associated with it + # construct track/cluster -> recoparticle maps track_to_rp, cluster_to_rp = get_recoptcl_to_obj(n_rps, reco_arr, idx_rp_to_track[iev], idx_rp_to_cluster[iev]) - #get the track/cluster -> genparticle map + # get the track/cluster -> genparticle map track_to_gp = {itrk: igp for igp, itrk in enumerate(gp_to_obj[:, 0]) if itrk != -1} cluster_to_gp = {icl: igp for igp, icl in enumerate(gp_to_obj[:, 1]) if icl != -1} used_gps = np.zeros(n_gps, dtype=np.int64) track_to_gp_all = assign_to_recoobj(n_tracks, track_to_gp, used_gps) cluster_to_gp_all = assign_to_recoobj(n_clusters, cluster_to_gp, used_gps) - #all genparticles must be assigned to some PFElement - assert(np.all(used_gps == 1)) + # all genparticles must be assigned to some PFElement + assert np.all(used_gps == 1) used_rps = np.zeros(n_rps, dtype=np.int64) track_to_rp_all = assign_to_recoobj(n_tracks, track_to_rp, used_rps) cluster_to_rp_all = assign_to_recoobj(n_clusters, cluster_to_rp, used_rps) - #all reco particles must be assigned to some PFElement - assert(np.all(used_rps == 1)) + # all reco particles must be assigned to some PFElement + assert np.all(used_rps == 1) - gps_track = get_particle_feature_matrix( - track_to_gp_all, - gpdata_cleaned.gen_features, - particle_feature_order - ) - gps_track[:, 0] = np.array([ - map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])] + gps_track = get_particle_feature_matrix(track_to_gp_all, gpdata_cleaned.gen_features, particle_feature_order) + gps_track[:, 0] = np.array( + [map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])] ) - gps_cluster = get_particle_feature_matrix( - cluster_to_gp_all, - gpdata_cleaned.gen_features, - particle_feature_order - ) - gps_cluster[:, 0] = np.array([ - map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_cluster[:, 0], gps_cluster[:, 1])] + gps_cluster = get_particle_feature_matrix(cluster_to_gp_all, gpdata_cleaned.gen_features, particle_feature_order) + gps_cluster[:, 0] = np.array( + [map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_cluster[:, 0], gps_cluster[:, 1])] ) gps_cluster[:, 1] = 0 - rps_track = get_particle_feature_matrix( - track_to_rp_all, - reco_features, - particle_feature_order - ) - rps_track[:, 0] = np.array([ - map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(rps_track[:, 0], rps_track[:, 1])] + rps_track = get_particle_feature_matrix(track_to_rp_all, reco_features, particle_feature_order) + rps_track[:, 0] = np.array( + [map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(rps_track[:, 0], rps_track[:, 1])] ) - rps_cluster = get_particle_feature_matrix( - cluster_to_rp_all, - reco_features, - particle_feature_order - ) - rps_cluster[:, 0] = np.array([ - map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_cluster[:, 0], rps_cluster[:, 1])] + rps_cluster = get_particle_feature_matrix(cluster_to_rp_all, reco_features, particle_feature_order) + rps_cluster[:, 0] = np.array( + [map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_cluster[:, 0], rps_cluster[:, 1])] ) rps_cluster[:, 1] = 0 - #all initial gen/reco particle energy must be reconstructable - assert(abs( - np.sum(gps_track[:, 6]) + np.sum(gps_cluster[:, 6]) - np.sum(gpdata_cleaned.gen_features["energy"]) - ) < 1e-2) - - assert(abs( - np.sum(rps_track[:, 6]) + np.sum(rps_cluster[:, 6]) - np.sum(reco_features["energy"]) - ) < 1e-2) + # all initial gen/reco particle energy 
must be reconstructable + assert ( + abs(np.sum(gps_track[:, 6]) + np.sum(gps_cluster[:, 6]) - np.sum(gpdata_cleaned.gen_features["energy"])) < 1e-2 + ) + assert abs(np.sum(rps_track[:, 6]) + np.sum(rps_cluster[:, 6]) - np.sum(reco_features["energy"])) < 1e-2 - #we don"t want to try to reconstruct charged particles from primary clusters, make sure the charge is 0 - assert(np.all(gps_cluster[:, 1] == 0)) - assert(np.all(rps_cluster[:, 1] == 0)) + # we don"t want to try to reconstruct charged particles from primary clusters, make sure the charge is 0 + assert np.all(gps_cluster[:, 1] == 0) + assert np.all(rps_cluster[:, 1] == 0) X_track = get_feature_matrix(gpdata_cleaned.track_features, track_feature_order) X_cluster = get_feature_matrix(gpdata_cleaned.cluster_features, cluster_feature_order) @@ -775,27 +840,30 @@ def process_one_file(fn, ofn): sanitize(ycand_track) sanitize(ycand_cluster) - this_ev = awkward.Record({ - "X_track": X_track, - "X_cluster": X_cluster, - "ygen_track": ygen_track, - "ygen_cluster": ygen_cluster, - "ycand_track": ycand_track, - "ycand_cluster": ycand_cluster - }) + this_ev = awkward.Record( + { + "X_track": X_track, + "X_cluster": X_cluster, + "ygen_track": ygen_track, + "ygen_cluster": ygen_cluster, + "ycand_track": ycand_track, + "ycand_cluster": ycand_cluster, + } + ) ret.append(this_ev) ret = awkward.Record({k: awkward.from_iter([r[k] for r in ret]) for k in ret[0].fields}) awkward.to_parquet(ret, ofn) + def process_sample(sample): inp = "/local/joosep/clic_edm4hep_2023_02_27/" - outp = "/local/joosep/mlpf/clic_edm4hep_2023_04_27/" + outp = "/local/joosep/mlpf/clic_edm4hep_2023_05_09/" - pool = multiprocessing.Pool(30) + pool = multiprocessing.Pool(16) - inpath_samp = inp + samp - outpath_samp = outp + samp + inpath_samp = inp + sample + outpath_samp = outp + sample infiles = list(glob.glob(inpath_samp + "/*.root")) if not os.path.isdir(outpath_samp): os.makedirs(outpath_samp) @@ -806,6 +874,7 @@ def process_sample(sample): args.append((inf, of)) pool.starmap(process_one_file, args) + if __name__ == "__main__": if len(sys.argv) == 2: process_sample(sys.argv[1]) diff --git a/fcc/postprocessing_hits.py b/fcc/postprocessing_hits.py index f804f5fbe..3009fa90e 100644 --- a/fcc/postprocessing_hits.py +++ b/fcc/postprocessing_hits.py @@ -1,38 +1,57 @@ -import bz2 import numpy as np import awkward -import matplotlib.pyplot as plt import uproot -import vector import glob -import networkx as nx -import tqdm -import numba import os import sys import multiprocessing from scipy.sparse import coo_matrix -track_coll = "SiTracks_Refitted" -mc_coll = "MCParticles" +from postprocessing import map_pdgid_to_candid, map_charged_to_neutral, map_neutral_to_charged, sanitize -#the feature matrices will be saved in this order -particle_feature_order = ["PDG", "charge", "pt", "eta", "sin_phi", "cos_phi", "energy"] +from postprocessing import track_coll, mc_coll, particle_feature_order -#arrange track and cluster features such that pt (et), eta, phi, p (energy) are in the same spot -#so we can easily use them in skip connections track_feature_order = [ - "elemtype", "pt", "eta", "sin_phi", "cos_phi", "p", - "chi2", "ndf", - "radiusOfInnermostHit", "tanLambda", "D0", "omega", - "referencePoint.x", "referencePoint.y", "referencePoint.z", - "Z0", "time", "type" + "elemtype", + "pt", + "eta", + "sin_phi", + "cos_phi", + "p", + "chi2", + "ndf", + "radiusOfInnermostHit", + "tanLambda", + "D0", + "omega", + "Z0", + "time", + "type", ] hit_feature_order = [ - "elemtype", "et", "eta", 
"sin_phi", "cos_phi", "energy", - "position.x", "position.y", "position.z", "time", "subdetector", "type" + "elemtype", + "et", + "eta", + "sin_phi", + "cos_phi", + "energy", + "position.x", + "position.y", + "position.z", + "time", + "subdetector", + "type", ] +from postprocessing import ( + get_genparticles_and_adjacencies, + assign_to_recoobj, + get_reco_properties, + get_particle_feature_matrix, + get_feature_matrix, +) + + def build_dummy_array(num, dtype=np.int64): return awkward.Array( awkward.contents.ListOffsetArray( @@ -41,373 +60,46 @@ def build_dummy_array(num, dtype=np.int64): ) ) -def track_pt(omega): - a = 3 * 10**-4 - b = 4 # B-field in tesla, from clicRec_e4h_input - - return a * np.abs(b / omega) - -def map_pdgid_to_candid(pdgid, charge): - if pdgid == 0: - return 0 - - #photon, electron, muon - if pdgid in [22, 11, 13]: - return pdgid - - # charged hadron - if abs(charge) > 0: - return 211 - - # neutral hadron - return 130 - -def map_charged_to_neutral(pdg): - if pdg == 0: - return 0 - if pdg == 11 or pdg == 22: - return 22 - return 130 - -def map_neutral_to_charged(pdg): - if pdg == 130 or pdg == 22: - return 211 - return pdg - -def sanitize(arr): - arr[np.isnan(arr)] = 0.0 - arr[np.isinf(arr)] = 0.0 - -class EventData: - def __init__(self, - gen_features, - hit_features, - track_features, - genparticle_to_hit, - genparticle_to_track, - hit_to_cluster - ): - self.gen_features = gen_features - self.hit_features = hit_features - self.track_features = track_features - self.genparticle_to_hit = genparticle_to_hit - self.genparticle_to_track = genparticle_to_track - self.hit_to_cluster = hit_to_cluster - -def get_cluster_subdet_energies(hit_list, hit_data, collectionIDs_reverse, iev): - """ - This function calculates the energy contribution from each of four subdetectors in a particle physics experiment, based on a list of hits and their corresponding data. - - Args: - hit_list: a list of tuples, where each tuple contains a collection ID and a hit index - hit_data: a dictionary containing data for each hit in the experiment, organized by collection - collectionIDs_reverse: a dictionary mapping collection IDs to collection names - iev: the event number for the current event - - Returns: - A tuple containing the energy contributions from each of the four subdetectors: - (ecal_energy, hcal_energy, muon_energy, other_energy) - """ - - ecal_energy = 0.0 - hcal_energy = 0.0 - muon_energy = 0.0 - other_energy = 0.0 - - for coll_id, hit_idx in hit_list: - coll = collectionIDs_reverse[coll_id] - hit_energy = hit_data[coll][iev][coll+".energy"][hit_idx] - - if coll.startswith("ECAL"): - ecal_energy += hit_energy - elif coll.startswith("HCAL"): - hcal_energy += hit_energy - elif coll == "MUON": - muon_energy += hit_energy - else: - other_energy += hit_energy - - return ecal_energy, hcal_energy, muon_energy, other_energy - -def hits_to_features(hit_data, iev, coll, feats): - feat_arr = {f: hit_data[coll + "." 
+ f][iev] for f in feats} - - #set the subdetector type - sdcoll = "subdetector" - feat_arr[sdcoll] = np.zeros(len(feat_arr["type"]), dtype=np.int32) - if coll.startswith("ECAL"): - feat_arr[sdcoll][:] = 0 - elif coll.startswith("HCAL"): - feat_arr[sdcoll][:] = 1 - else: - feat_arr[sdcoll][:] = 2 - - #hit elemtype is always 2 - feat_arr["elemtype"] = 2*np.ones(len(feat_arr["type"]), dtype=np.int32) - - #precompute some approximate et, eta, phi - pos_mag = np.sqrt(feat_arr["position.x"]**2 + feat_arr["position.y"]**2 + feat_arr["position.z"]**2) - px = (feat_arr["position.x"] / pos_mag) * feat_arr["energy"] - py = (feat_arr["position.y"] / pos_mag) * feat_arr["energy"] - pz = (feat_arr["position.z"] / pos_mag) * feat_arr["energy"] - feat_arr["et"] = np.sqrt(px**2+py**2) - feat_arr["eta"] = 0.5*np.log((feat_arr["energy"] + pz)/(feat_arr["energy"] - pz)) - feat_arr["sin_phi"] = py/feat_arr["energy"] - feat_arr["cos_phi"] = px/feat_arr["energy"] - - return awkward.Record(feat_arr) - -def get_calohit_matrix_and_genadj(hit_data, calohit_links, iev, collectionIDs): - feats = ["type", "cellID", "energy", "energyError", "time", "position.x", "position.y", "position.z"] - - hit_idx_global = 0 - hit_idx_global_to_local = {} - hit_feature_matrix = [] - for col in sorted(hit_data.keys()): - icol = collectionIDs[col] - hit_features = hits_to_features(hit_data[col], iev, col, feats) - hit_feature_matrix.append(hit_features) - for ihit in range(len(hit_data[col][col+".energy"][iev])): - hit_idx_global_to_local[hit_idx_global] = (icol, ihit) - hit_idx_global += 1 - hit_idx_local_to_global = {v: k for k, v in hit_idx_global_to_local.items()} - hit_feature_matrix = awkward.Record({ - k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) for k in hit_feature_matrix[0].fields}) - - #add all edges from genparticle to calohit - calohit_to_gen_weight = calohit_links["CalohitMCTruthLink"]["CalohitMCTruthLink.weight"][iev] - calohit_to_gen_calo_colid = calohit_links["CalohitMCTruthLink#0"]["CalohitMCTruthLink#0.collectionID"][iev] - calohit_to_gen_gen_colid = calohit_links["CalohitMCTruthLink#1"]["CalohitMCTruthLink#1.collectionID"][iev] - calohit_to_gen_calo_idx = calohit_links["CalohitMCTruthLink#0"]["CalohitMCTruthLink#0.index"][iev] - calohit_to_gen_gen_idx = calohit_links["CalohitMCTruthLink#1"]["CalohitMCTruthLink#1.index"][iev] - genparticle_to_hit_matrix_coo0 = [] - genparticle_to_hit_matrix_coo1 = [] - genparticle_to_hit_matrix_w = [] - for calo_colid, calo_idx, gen_colid, gen_idx, w in zip(calohit_to_gen_calo_colid, calohit_to_gen_calo_idx, calohit_to_gen_gen_colid, calohit_to_gen_gen_idx, calohit_to_gen_weight): - genparticle_to_hit_matrix_coo0.append(gen_idx) - genparticle_to_hit_matrix_coo1.append(hit_idx_local_to_global[(calo_colid, calo_idx)]) - genparticle_to_hit_matrix_w.append(w) - - return hit_feature_matrix, (genparticle_to_hit_matrix_coo0, genparticle_to_hit_matrix_coo1, genparticle_to_hit_matrix_w), hit_idx_local_to_global - -def hit_cluster_adj(prop_data, hit_idx_local_to_global, iev): - coll_arr = prop_data["PandoraClusters#1"]["PandoraClusters#1.collectionID"][iev] - idx_arr = prop_data["PandoraClusters#1"]["PandoraClusters#1.index"][iev] - hits_begin = prop_data["PandoraClusters"]["PandoraClusters.hits_begin"][iev] - hits_end = prop_data["PandoraClusters"]["PandoraClusters.hits_end"][iev] - - #index in the array of all hits - hit_to_cluster_matrix_coo0 = [] - #index in the cluster array - hit_to_cluster_matrix_coo1 = [] - - #weight - 
hit_to_cluster_matrix_w = [] - - #loop over all clusters - for icluster in range(len(hits_begin)): - - #get the slice in the hit array corresponding to this cluster - hbeg = hits_begin[icluster] - hend = hits_end[icluster] - idx_range = idx_arr[hbeg:hend] - coll_range = coll_arr[hbeg:hend] - - #add edges from hit to cluster - for icol, idx in zip(coll_range, idx_range): - hit_to_cluster_matrix_coo0.append(hit_idx_local_to_global[(icol, idx)]) - hit_to_cluster_matrix_coo1.append(icluster) - hit_to_cluster_matrix_w.append(1.0) - return np.array(hit_to_cluster_matrix_coo0), np.array(hit_to_cluster_matrix_coo1), np.array(hit_to_cluster_matrix_w) - -def gen_to_features(prop_data, iev): - gen_arr = prop_data[mc_coll][iev] - gen_arr = {k.replace(mc_coll+".", ""): gen_arr[k] for k in gen_arr.fields} - - MCParticles_p4 = vector.awk(awkward.zip({ - "mass": gen_arr["mass"], - "x": gen_arr["momentum.x"], - "y": gen_arr["momentum.y"], - "z": gen_arr["momentum.z"]})) - gen_arr["pt"] = MCParticles_p4.pt - gen_arr["eta"] = MCParticles_p4.eta - gen_arr["phi"] = MCParticles_p4.phi - gen_arr["energy"] = MCParticles_p4.energy - - return awkward.Record({ - "PDG": gen_arr["PDG"], - "generatorStatus": gen_arr["generatorStatus"], - "charge": gen_arr["charge"], - "pt": gen_arr["pt"], - "eta": gen_arr["eta"], - "phi": gen_arr["phi"], - "sin_phi": np.sin(gen_arr["phi"]), - "cos_phi": np.cos(gen_arr["phi"]), - "energy": gen_arr["energy"], - }) - -def genparticle_track_adj(sitrack_links, iev): - trk_to_gen_trkidx = sitrack_links["SiTracksMCTruthLink#0"]["SiTracksMCTruthLink#0.index"][iev] - trk_to_gen_genidx = sitrack_links["SiTracksMCTruthLink#1"]["SiTracksMCTruthLink#1.index"][iev] - trk_to_gen_w = sitrack_links["SiTracksMCTruthLink"]["SiTracksMCTruthLink.weight"][iev] - - genparticle_to_track_matrix_coo0 = awkward.to_numpy(trk_to_gen_genidx) - genparticle_to_track_matrix_coo1 = awkward.to_numpy(trk_to_gen_trkidx) - genparticle_to_track_matrix_w = awkward.to_numpy(trk_to_gen_w) - - return genparticle_to_track_matrix_coo0, genparticle_to_track_matrix_coo1, genparticle_to_track_matrix_w - - -def track_to_features(prop_data, iev): - track_arr = prop_data[track_coll][iev] - feats_from_track = ["type", "chi2", "ndf", "dEdx", "dEdxError", "radiusOfInnermostHit"] - ret = {feat: track_arr[track_coll + "." + feat] for feat in feats_from_track} - n_tr = len(ret["type"]) - - #FIXME: add additional track features from track state - - #get the index of the first track state - trackstate_idx = prop_data[track_coll][track_coll + ".trackStates_begin"][iev] - #get the properties of the track at the first track state (at the origin) - for k in ["tanLambda", "D0", "phi", "omega", "Z0", "time", "referencePoint.x", "referencePoint.y", "referencePoint.z"]: - ret[k] = prop_data["SiTracks_1"]["SiTracks_1." 
+ k][iev][trackstate_idx] - - ret["pt"] = track_pt(ret["omega"]) - ret["px"] = np.cos(ret["phi"]) * ret["pt"] - ret["py"] = np.sin(ret["phi"]) * ret["pt"] - ret["pz"] = ret["tanLambda"] * ret["pt"] - ret["p"] = np.sqrt(ret["px"]**2 + ret["py"]**2 + ret["pz"]**2) - cos_theta = np.divide(ret["pz"], ret["p"], where=ret["p"]>0) - theta = np.arccos(cos_theta) - tt = np.tan(theta / 2.0) - eta = awkward.to_numpy(-np.log(tt, where=tt>0)) - eta[tt<=0] = 0.0 - ret["eta"] = eta - - ret["sin_phi"] = np.sin(ret["phi"]) - ret["cos_phi"] = np.cos(ret["phi"]) - - #override track type with 1 - ret["elemtype"] = 1*np.ones(n_tr, dtype=np.int32) - - return awkward.Record(ret) - -def filter_adj(adj, all_to_filtered): - i0s_new = [] - i1s_new = [] - ws_new = [] - for i0, i1, w in zip(*adj): - if i0 in all_to_filtered: - i0_new = all_to_filtered[i0] - i0s_new.append(i0_new) - i1s_new.append(i1) - ws_new.append(w) - return np.array(i0s_new), np.array(i1s_new), np.array(ws_new) - -def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack_links, iev, collectionIDs): - gen_features = gen_to_features(prop_data, iev) - hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj(hit_data, calohit_links, iev, collectionIDs) - hit_to_cluster = hit_cluster_adj(prop_data, hit_idx_local_to_global, iev) - track_features = track_to_features(prop_data, iev) - genparticle_to_track = genparticle_track_adj(sitrack_links, iev) - - n_gp = awkward.count(gen_features["PDG"]) - n_track = awkward.count(track_features["type"]) - n_hit = awkward.count(hit_features["type"]) - - if len(genparticle_to_track[0])>0: - gp_to_track = np.array(coo_matrix( - (genparticle_to_track[2], - (genparticle_to_track[0], genparticle_to_track[1])), - shape=(n_gp, n_track) - ).max(axis=1).todense())[:, 0] - else: - gp_to_track = np.zeros(n_gp) - - if len(genparticle_to_hit[0])>0: - gp_to_calohit = np.array(coo_matrix( - (genparticle_to_hit[2], - (genparticle_to_hit[0], genparticle_to_hit[1])), - shape=(n_gp, n_hit) - ).max(axis=1).todense())[:, 0] - else: - gp_to_calohit = np.zeros(n_gp) - - #60% of the hits of a track must come from the genparticle - gp_in_tracker = gp_to_track>=0.6 - - #the particle should deposit energy to some calo hit - gp_in_calo = gp_to_calohit>0.0 - - gp_interacted_with_detector = gp_in_tracker | gp_in_calo - - #get status 1 particles that are not neutrinos - #and have energy > 100 MeV - mask_visible = ( - (gen_features["generatorStatus"]==1) & - (gen_features["PDG"]!=12) & - (gen_features["PDG"]!=14) & - (gen_features["PDG"]!=16) & - (gen_features["energy"]>0.1) & - gp_interacted_with_detector - ) - idx_all_masked = np.where(mask_visible)[0] - genpart_idx_all_to_filtered = {idx_all: idx_filtered for idx_filtered, idx_all in enumerate(idx_all_masked)} - - gen_features = awkward.Record({ - feat: gen_features[feat][mask_visible] for feat in gen_features.fields - }) - - genparticle_to_hit = filter_adj(genparticle_to_hit, genpart_idx_all_to_filtered) - genparticle_to_track = filter_adj(genparticle_to_track, genpart_idx_all_to_filtered) - - return EventData( - gen_features, - hit_features, - track_features, - genparticle_to_hit, - genparticle_to_track, - hit_to_cluster - ) -def assign_genparticles_to_obj_and_merge(gpdata): +def assign_genparticles_to_obj(gpdata): n_gp = awkward.count(gpdata.gen_features["PDG"]) n_track = awkward.count(gpdata.track_features["type"]) n_hit = awkward.count(gpdata.hit_features["type"]) - gp_to_track = np.array(coo_matrix( - (gpdata.genparticle_to_track[2], 
- (gpdata.genparticle_to_track[0], gpdata.genparticle_to_track[1])), - shape=(n_gp, n_track) - ).todense()) + gp_to_track = np.array( + coo_matrix( + (gpdata.genparticle_to_track[2], (gpdata.genparticle_to_track[0], gpdata.genparticle_to_track[1])), + shape=(n_gp, n_track), + ).todense() + ) - gp_to_calohit = np.array(coo_matrix( - (gpdata.genparticle_to_hit[2], - (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), - shape=(n_gp, n_hit) - ).todense()) + gp_to_calohit = np.array( + coo_matrix( + (gpdata.genparticle_to_hit[2], (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), shape=(n_gp, n_hit) + ).todense() + ) - #map each genparticle to a track or calohit - gp_to_obj = -1*np.ones((n_gp, 2), dtype=np.int32) + # map each genparticle to a track or calohit + gp_to_obj = -1 * np.ones((n_gp, 2), dtype=np.int32) set_used_tracks = set([]) set_used_calohits = set([]) gps_sorted_energy = sorted(range(n_gp), key=lambda x: gpdata.gen_features["energy"][x], reverse=True) for igp in gps_sorted_energy: - #first check if we can match the genparticle to a track + # first check if we can match the genparticle to a track matched_tracks = gp_to_track[igp] trks = np.where(matched_tracks)[0] trks = sorted(trks, key=lambda x: matched_tracks[x], reverse=True) for trk in trks: - #if the track was not already used for something else + # if the track was not already used for something else if trk not in set_used_tracks: gp_to_obj[igp, 0] = trk set_used_tracks.add(trk) break - #if there was no matched track, try a calohit + # if there was no matched track, try a calohit if gp_to_obj[igp, 0] == -1: matched_calohits = np.where(gp_to_calohit[igp])[0] calohits = sorted(matched_calohits, key=lambda x: gp_to_calohit[igp, x], reverse=True) @@ -417,114 +109,86 @@ def assign_genparticles_to_obj_and_merge(gpdata): set_used_calohits.add(calohit) break - unmatched = (gp_to_obj[:, 0]!=-1) & (gp_to_obj[:, 1]!=-1) + # unmatched = (gp_to_obj[:, 0] != -1) & (gp_to_obj[:, 1] != -1) return gp_to_obj -#for each PF element (track, cluster), get the index of the best-matched particle (gen or reco) -#if the PF element has no best-matched particle, returns -1 -def assign_to_recoobj(n_obj, obj_to_ptcl, used_particles): - obj_to_ptcl_all = -1 * np.ones(n_obj, dtype=np.int64) - for iobj in range(n_obj): - if iobj in obj_to_ptcl: - iptcl = obj_to_ptcl[iobj] - obj_to_ptcl_all[iobj] = iptcl - assert(used_particles[iptcl] == 0) - used_particles[iptcl] = 1 - return obj_to_ptcl_all - def get_recoptcl_to_obj(n_rps, reco_arr, gpdata, idx_rp_to_track, idx_rp_to_cluster): track_to_rp = {} calohit_to_rp = {} for irp in range(n_rps): assigned = False + + # get the tracks of the reco particle trks_begin = reco_arr["tracks_begin"][irp] trks_end = reco_arr["tracks_end"][irp] for itrk in range(trks_begin, trks_end): + + # get the index of the track itrk_real = idx_rp_to_track[itrk] - assert(itrk_real not in track_to_rp) + assert itrk_real not in track_to_rp track_to_rp[itrk_real] = irp assigned = True - #only look for calohits if tracks were not found + # only look for calohits if tracks were not found if not assigned: + + # loop over clusters of the reco particle cls_begin = reco_arr["clusters_begin"][irp] cls_end = reco_arr["clusters_end"][irp] for icls in range(cls_begin, cls_end): + + # get the index of the cluster icls_real = idx_rp_to_cluster[icls] - #find hits of the cluster - calohit_inds = gpdata.hit_to_cluster[0][gpdata.hit_to_cluster[1]==icls_real] + # find hits of the cluster + calohit_inds = 
gpdata.hit_to_cluster[0][gpdata.hit_to_cluster[1] == icls_real] - #get the highest-energy hit + # get the highest-energy hit calohits_e_ascending = np.argsort(gpdata.hit_features["energy"][calohit_inds]) highest_e_hit = calohit_inds[calohits_e_ascending[-1]] - assert(highest_e_hit not in calohit_to_rp) + assert highest_e_hit not in calohit_to_rp calohit_to_rp[highest_e_hit] = irp assigned = True break return track_to_rp, calohit_to_rp -def get_reco_properties(prop_data, iev): - reco_arr = prop_data["MergedRecoParticles"][iev] - reco_arr = {k.replace("MergedRecoParticles.", ""): reco_arr[k] for k in reco_arr.fields} - - reco_p4 = vector.awk(awkward.zip({ - "mass": reco_arr["mass"], - "x": reco_arr["momentum.x"], - "y": reco_arr["momentum.y"], - "z": reco_arr["momentum.z"]})) - reco_arr["pt"] = reco_p4.pt - reco_arr["eta"] = reco_p4.eta - reco_arr["phi"] = reco_p4.phi - reco_arr["energy"] = reco_p4.energy - - msk = reco_arr["type"]!=0 - reco_arr = awkward.Record({k: reco_arr[k][msk] for k in reco_arr.keys()}) - return reco_arr - -def get_particle_feature_matrix(pfelem_to_particle, feature_dict, features): - feats = [] - for feat in features: - feat_arr = feature_dict[feat] - if len(feat_arr)==0: - feat_arr_reordered = feat_arr - else: - feat_arr_reordered = awkward.to_numpy(feat_arr[pfelem_to_particle]) - feat_arr_reordered[pfelem_to_particle==-1] = 0.0 - feats.append(feat_arr_reordered) - feats = np.array(feats) - return feats.T - -def get_feature_matrix(feature_dict, features): - feats = [] - for feat in features: - feat_arr = awkward.to_numpy(feature_dict[feat]) - feats.append(feat_arr) - feats = np.array(feats) - return feats.T def process_one_file(fn, ofn): - #output exists, do not recreate + # output exists, do not recreate if os.path.isfile(ofn): return print(fn) fi = uproot.open(fn) - + arrs = fi["events"] - - collectionIDs = {k: v for k, v in - zip(fi.get("metadata").arrays("CollectionIDs")["CollectionIDs"]["m_names"][0], - fi.get("metadata").arrays("CollectionIDs")["CollectionIDs"]["m_collectionIDs"][0])} - collectionIDs_reverse = {v: k for k, v in collectionIDs.items()} - - prop_data = arrs.arrays([mc_coll, track_coll, "SiTracks_1", "PandoraClusters", "PandoraClusters#1", "PandoraClusters#0", "MergedRecoParticles"]) + + collectionIDs = { + k: v + for k, v in zip( + fi.get("metadata").arrays("CollectionIDs")["CollectionIDs"]["m_names"][0], + fi.get("metadata").arrays("CollectionIDs")["CollectionIDs"]["m_collectionIDs"][0], + ) + } + + prop_data = arrs.arrays( + [ + mc_coll, + track_coll, + "SiTracks_1", + "PandoraClusters", + "PandoraClusters#1", + "PandoraClusters#0", + "MergedRecoParticles", + ] + ) calohit_links = arrs.arrays(["CalohitMCTruthLink", "CalohitMCTruthLink#0", "CalohitMCTruthLink#1"]) sitrack_links = arrs.arrays(["SiTracksMCTruthLink", "SiTracksMCTruthLink#0", "SiTracksMCTruthLink#1"]) - #maps the recoparticle track/cluster index (in tracks_begin,end and clusters_begin,end) to the index in the track/cluster collection + # maps the recoparticle track/cluster index (in tracks_begin,end and clusters_begin,end) + # to the index in the track/cluster collection idx_rp_to_cluster = arrs["MergedRecoParticles#0/MergedRecoParticles#0.index"].array() idx_rp_to_track = arrs["MergedRecoParticles#1/MergedRecoParticles#1.index"].array() @@ -539,107 +203,86 @@ def process_one_file(fn, ofn): } ret = [] - ret_unused_pt = [] for iev in range(arrs.num_entries): - #get the reco particles + # get the reco particles reco_arr = get_reco_properties(prop_data, iev) reco_type = 
np.abs(reco_arr["type"]) n_rps = len(reco_type) - reco_features = awkward.Record({ - "PDG": np.abs(reco_type), - "charge": reco_arr["charge"], - "pt": reco_arr["pt"], - "eta": reco_arr["eta"], - "sin_phi": np.sin(reco_arr["phi"]), - "cos_phi": np.cos(reco_arr["phi"]), - "energy": reco_arr["energy"] - }) - - #get the genparticles and the links between genparticles and tracks/clusters + reco_features = awkward.Record( + { + "PDG": np.abs(reco_type), + "charge": reco_arr["charge"], + "pt": reco_arr["pt"], + "eta": reco_arr["eta"], + "sin_phi": np.sin(reco_arr["phi"]), + "cos_phi": np.cos(reco_arr["phi"]), + "energy": reco_arr["energy"], + } + ) + + # get the genparticles and the links between genparticles and tracks/clusters gpdata = get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack_links, iev, collectionIDs) - #find the reconstructable genparticles and associate them to the best track/cluster - gp_to_obj = assign_genparticles_to_obj_and_merge(gpdata) + # find the reconstructable genparticles and associate them to the best track/cluster + gp_to_obj = assign_genparticles_to_obj(gpdata) n_tracks = len(gpdata.track_features["type"]) n_hits = len(gpdata.hit_features["type"]) n_gps = len(gpdata.gen_features["PDG"]) + print("hits={} tracks={} gps={}".format(n_hits, n_tracks, n_gps)) + + assert len(gp_to_obj) == len(gpdata.gen_features["PDG"]) + assert gp_to_obj.shape[1] == 2 - assert(len(gp_to_obj) == len(gpdata.gen_features["PDG"])) - assert(gp_to_obj.shape[1] == 2) - - #for each reco particle, find the tracks and clusters associated with it - #construct track/cluster -> recoparticle maps + # for each reco particle, find the tracks and clusters associated with it + # construct track/cluster -> recoparticle maps track_to_rp, hit_to_rp = get_recoptcl_to_obj(n_rps, reco_arr, gpdata, idx_rp_to_track[iev], idx_rp_to_cluster[iev]) - #get the track/cluster -> genparticle map + # get the track/cluster -> genparticle map track_to_gp = {itrk: igp for igp, itrk in enumerate(gp_to_obj[:, 0]) if itrk != -1} hit_to_gp = {ihit: igp for igp, ihit in enumerate(gp_to_obj[:, 1]) if ihit != -1} + # keep track if all genparticles were used used_gps = np.zeros(n_gps, dtype=np.int64) + + # assign all track-associated genparticles to a track track_to_gp_all = assign_to_recoobj(n_tracks, track_to_gp, used_gps) + + # assign all calohit-associated genparticles to a calohit hit_to_gp_all = assign_to_recoobj(n_hits, hit_to_gp, used_gps) - if not np.all(used_gps==1): - print("unmatched gen", gpdata.gen_features["energy"][used_gps==0]) - #assert(np.all(used_gps == 1)) + if not np.all(used_gps == 1): + print("unmatched gen", gpdata.gen_features["energy"][used_gps == 0]) used_rps = np.zeros(n_rps, dtype=np.int64) track_to_rp_all = assign_to_recoobj(n_tracks, track_to_rp, used_rps) hit_to_rp_all = assign_to_recoobj(n_hits, hit_to_rp, used_rps) - if not np.all(used_rps==1): - print("unmatched reco", reco_features["energy"][used_rps==0]) - #assert(np.all(used_rps == 1)) - - gps_track = get_particle_feature_matrix( - track_to_gp_all, - gpdata.gen_features, - particle_feature_order - ) - gps_track[:, 0] = np.array([ - map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])] - ) - gps_hit = get_particle_feature_matrix( - hit_to_gp_all, - gpdata.gen_features, - particle_feature_order + if not np.all(used_rps == 1): + print("unmatched reco", reco_features["energy"][used_rps == 0]) + + gps_track = get_particle_feature_matrix(track_to_gp_all, gpdata.gen_features, 
particle_feature_order) + gps_track[:, 0] = np.array( + [map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])] ) - gps_hit[:, 0] = np.array([ - map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_hit[:, 0], gps_hit[:, 1])] + gps_hit = get_particle_feature_matrix(hit_to_gp_all, gpdata.gen_features, particle_feature_order) + gps_hit[:, 0] = np.array( + [map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_hit[:, 0], gps_hit[:, 1])] ) gps_hit[:, 1] = 0 - rps_track = get_particle_feature_matrix( - track_to_rp_all, - reco_features, - particle_feature_order - ) - rps_track[:, 0] = np.array([ - map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(rps_track[:, 0], rps_track[:, 1])] - ) - rps_hit = get_particle_feature_matrix( - hit_to_rp_all, - reco_features, - particle_feature_order + rps_track = get_particle_feature_matrix(track_to_rp_all, reco_features, particle_feature_order) + rps_track[:, 0] = np.array( + [map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(rps_track[:, 0], rps_track[:, 1])] ) - rps_hit[:, 0] = np.array([ - map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_hit[:, 0], rps_hit[:, 1])] + rps_hit = get_particle_feature_matrix(hit_to_rp_all, reco_features, particle_feature_order) + rps_hit[:, 0] = np.array( + [map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_hit[:, 0], rps_hit[:, 1])] ) rps_hit[:, 1] = 0 - #all initial gen/reco particle energy must be reconstructable - #assert(abs( - # np.sum(gps_track[:, 6]) + np.sum(gps_hit[:, 6]) - np.sum(gpdata.gen_features["energy"]) - # ) < 1e-2) - - #assert(abs( - # np.sum(rps_track[:, 6]) + np.sum(rps_hit[:, 6]) - np.sum(reco_features["energy"]) - # ) < 1e-2) - - - #we don't want to try to reconstruct charged particles from primary clusters, make sure the charge is 0 - assert(np.all(gps_hit[:, 1] == 0)) - assert(np.all(rps_hit[:, 1] == 0)) + # we don't want to try to reconstruct charged particles from primary clusters, make sure the charge is 0 + assert np.all(gps_hit[:, 1] == 0) + assert np.all(rps_hit[:, 1] == 0) X_track = get_feature_matrix(gpdata.track_features, track_feature_order) X_hit = get_feature_matrix(gpdata.hit_features, hit_feature_order) @@ -663,34 +306,26 @@ def process_one_file(fn, ofn): "ycand_track": ycand_track, "ycand_hit": ycand_hit, } - if np.sum(used_gps==0)>0: - ret_unused_pt.append(awkward.to_numpy(gpdata.gen_features["pt"][used_gps==0])) - else: - ret_unused_pt.append(np.array([], dtype=np.float32)) this_ev = awkward.Record(this_ev) - ret.append(this_ev) ret = {k: awkward.from_iter([r[k] for r in ret]) for k in ret[0].fields} - - ntot = sum([len(x) for x in ret_unused_pt]) - if ntot>0: - ret["ygen_unused_pt"] = awkward.from_iter(ret_unused_pt) - else: - ret["ygen_unused_pt"] = build_dummy_array(len(ret_unused_pt), dtype=np.float32) + for k in ret.keys(): + if len(awkward.flatten(ret[k])) == 0: + ret[k] = build_dummy_array(len(ret[k]), np.float32) ret = awkward.Record(ret) - awkward.to_parquet(ret, ofn) + def process_sample(samp): - inp = "/media/joosep/data/clic_edm4hep_2023_02_27/" - outp = "/media/joosep/data/mlpf_hits/clic_edm4hep_2023_02_27/" + inp = "/local/joosep/clic_edm4hep/" + outp = "/local/joosep/mlpf_hits/clic_edm4hep/" - pool = multiprocessing.Pool(15) + pool = multiprocessing.Pool(8) inpath_samp = inp + samp outpath_samp = outp + samp - infiles = list(glob.glob(inpath_samp + "/*.root"))[:10000] + infiles = list(glob.glob(inpath_samp + "/*.root")) if 
not os.path.isdir(outpath_samp): os.makedirs(outpath_samp) @@ -700,6 +335,7 @@ def process_sample(samp): args.append((inf, of)) pool.starmap(process_one_file, args) + if __name__ == "__main__": if len(sys.argv) == 2: process_sample(sys.argv[1]) diff --git a/fcc/run_pandora_timing.sh b/fcc/run_pandora_timing.sh new file mode 100755 index 000000000..670ce52cc --- /dev/null +++ b/fcc/run_pandora_timing.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +SLURM_JOB_ID=1 ./run_sim_gun_np.sh 1 pi- 100 &> gun_np_100_1.txt +for iseed in 6 7; do + for nptcl in 25 50 100 200; do + SLURM_JOB_ID=$iseed ./run_sim_gun_np.sh $iseed pi- $nptcl &> gun_np_${nptcl}_${iseed}.txt + done +done diff --git a/fcc/run_sim.sh b/fcc/run_sim.sh index 321c3374c..913f2866b 100755 --- a/fcc/run_sim.sh +++ b/fcc/run_sim.sh @@ -41,8 +41,8 @@ source /cvmfs/sw.hsf.org/spackages6/key4hep-stack/2023-01-15/x86_64-centos7-gcc1 k4run $PFDIR/fcc/pythia.py -n $NEV --Dumper.Filename out.hepmc --Pythia8.PythiaInterface.pythiacard card.cmd ddsim --compactFile $LCGEO/CLIC/compact/CLIC_o3_v14/CLIC_o3_v14.xml \ - --outputFile out_sim_edm4hep.root \ --steeringFile clic_steer.py \ + --outputFile out_sim_edm4hep.root \ --inputFiles out.hepmc \ --numberOfEvents $NEV \ --random.seed $NUM diff --git a/fcc/run_sim_gun.sh b/fcc/run_sim_gun.sh new file mode 100755 index 000000000..986a39721 --- /dev/null +++ b/fcc/run_sim_gun.sh @@ -0,0 +1,54 @@ +#!/bin/bash +#SBATCH -p main +#SBATCH --mem-per-cpu=35G +#SBATCH --cpus-per-task=1 +#SBATCH -o logs/slurm-%x-%j-%N.out +#SBATCH --no-requeue +set -e +set -x + +env +df -h + +OUTDIR=/local/joosep/clic_edm4hep_gun/ +PFDIR=/home/joosep/particleflow +NEV=100 + +NUM=$1 #random seed +SAMPLE=$2 #main card + + +WORKDIR=/scratch/local/$USER/${SAMPLE}_${SLURM_JOB_ID} +FULLOUTDIR=${OUTDIR}/${SAMPLE} + +mkdir -p $FULLOUTDIR + +mkdir -p $WORKDIR +cd $WORKDIR + +#cp $PFDIR/fcc/main ./ +cp $PFDIR/fcc/pythia.py ./ +cp $PFDIR/fcc/clic_steer.py ./ +cp -R $PFDIR/fcc/PandoraSettings ./ +cp -R $PFDIR/fcc/clicRec_e4h_input.py ./ + +#without PU +source /cvmfs/sw.hsf.org/spackages6/key4hep-stack/2023-01-15/x86_64-centos7-gcc11.2.0-opt/csapx/setup.sh + +ddsim --compactFile $LCGEO/CLIC/compact/CLIC_o3_v14/CLIC_o3_v14.xml \ + --steeringFile clic_steer.py \ + --enableGun \ + --gun.distribution uniform \ + --gun.particle $SAMPLE \ + --gun.momentumMin 1*GeV \ + --gun.momentumMax 100*GeV \ + --outputFile out_sim_edm4hep.root \ + --numberOfEvents $NEV \ + --random.seed $NUM +cp out_sim_edm4hep.root $FULLOUTDIR/sim_${SAMPLE}_${NUM}.root + +k4run clicRec_e4h_input.py -n $NEV --EventDataSvc.input out_sim_edm4hep.root --PodioOutput.filename out_reco_edm4hep.root +cp out_reco_edm4hep.root $FULLOUTDIR/reco_${SAMPLE}_${NUM}.root +cp timing_histos.root $FULLOUTDIR/timing_${SAMPLE}_${NUM}.root + +rm -Rf $WORKDIR diff --git a/fcc/run_sim_gun_np.sh b/fcc/run_sim_gun_np.sh new file mode 100755 index 000000000..f26aaf750 --- /dev/null +++ b/fcc/run_sim_gun_np.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH -p main +#SBATCH --mem-per-cpu=35G +#SBATCH --cpus-per-task=1 +#SBATCH -o logs/slurm-%x-%j-%N.out +#SBATCH --no-requeue +set -e +set -x + +env +df -h + +PFDIR=/home/joosep/particleflow +NEV=10 + +NUM=$1 #random seed +SAMPLE=$2 #main card +NUMPART=$3 #number of particles + +OUTDIR=/home/joosep/clic_edm4hep_gun_np$NUMPART/ + +WORKDIR=/scratch/$USER/${SAMPLE}_${SLURM_JOB_ID} +FULLOUTDIR=${OUTDIR}/${SAMPLE} + +mkdir -p $FULLOUTDIR + +mkdir -p $WORKDIR +cd $WORKDIR + +#cp $PFDIR/fcc/main ./ +cp $PFDIR/fcc/pythia.py ./ +cp $PFDIR/fcc/clic_steer.py ./ +cp -R 
$PFDIR/fcc/PandoraSettings ./ +cp -R $PFDIR/fcc/clicRec_e4h_input.py ./ + +#without PU +source /cvmfs/sw.hsf.org/spackages6/key4hep-stack/2023-01-15/x86_64-centos7-gcc11.2.0-opt/csapx/setup.sh + +ddsim --compactFile $LCGEO/CLIC/compact/CLIC_o3_v14/CLIC_o3_v14.xml \ + --steeringFile clic_steer.py \ + --enableGun \ + --gun.distribution uniform \ + --gun.multiplicity $NUMPART \ + --gun.particle $SAMPLE \ + --gun.momentumMin 1*GeV \ + --gun.momentumMax 100*GeV \ + --outputFile out_sim_edm4hep.root \ + --numberOfEvents $NEV \ + --random.seed $NUM +cp out_sim_edm4hep.root $FULLOUTDIR/sim_${SAMPLE}_${NUM}.root + +k4run clicRec_e4h_input.py -n $NEV --EventDataSvc.input out_sim_edm4hep.root --PodioOutput.filename out_reco_edm4hep.root +cp out_reco_edm4hep.root $FULLOUTDIR/reco_${SAMPLE}_${NUM}.root +cp timing_histos.root $FULLOUTDIR/timing_${SAMPLE}_${NUM}.root + +rm -Rf $WORKDIR diff --git a/mlpf/customizations.py b/mlpf/customizations.py index 9e9188af1..b0234ea95 100644 --- a/mlpf/customizations.py +++ b/mlpf/customizations.py @@ -19,10 +19,18 @@ def customize_pipeline_test(config): if "clic_edm_ttbar_pf" in config["datasets"]: config["train_test_datasets"]["physical"]["datasets"] = ["clic_edm_ttbar_pf"] config["train_test_datasets"] = {"physical": config["train_test_datasets"]["physical"]} - config["train_test_datasets"]["physical"]["batch_per_gpu"] = 50 + config["train_test_datasets"]["physical"]["batch_per_gpu"] = 5 config["validation_dataset"] = "clic_edm_ttbar_pf" - config["validation_batch_size"] = 50 - config["evaluation_datasets"] = {"clic_edm_ttbar_pf": {"batch_size": 50, "num_events": -1}} + config["validation_batch_size"] = 5 + config["evaluation_datasets"] = {"clic_edm_ttbar_pf": {"batch_size": 5, "num_events": -1}} + + if "clic_edm_ttbar_hits_pf" in config["datasets"]: + config["train_test_datasets"]["physical"]["datasets"] = ["clic_edm_ttbar_hits_pf"] + config["train_test_datasets"] = {"physical": config["train_test_datasets"]["physical"]} + config["train_test_datasets"]["physical"]["batch_per_gpu"] = 1 + config["validation_dataset"] = "clic_edm_ttbar_hits_pf" + config["validation_batch_size"] = 1 + config["evaluation_datasets"] = {"clic_edm_ttbar_hits_pf": {"batch_size": 1, "num_events": -1}} # validate only on a small number of events config["validation_num_events"] = config["validation_batch_size"] * 2 diff --git a/mlpf/heptfds/clic_pf_edm4hep/qq.py b/mlpf/heptfds/clic_pf_edm4hep/qq.py index c074df8f9..fbe2aa345 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/qq.py +++ b/mlpf/heptfds/clic_pf_edm4hep/qq.py @@ -20,13 +20,14 @@ class ClicEdmQqPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.3.1") + VERSION = tfds.core.Version("1.4.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "update stats, move to 380 GeV", "1.2.0": "sin cos as separate features", "1.3.0": "Update stats to ~1M events", "1.3.1": "Update stats to ~2M events", + "1.4.0": "Fix ycand matching", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep_2023_02_27/ ./ diff --git a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py index 0e0fad827..09fb0c9fb 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py @@ -20,12 +20,13 @@ class ClicEdmTtbarPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.3.0") + VERSION = tfds.core.Version("1.4.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "update stats, move to 380 GeV", "1.2.0": "sin/cos phi 
separately", "1.3.0": "Update stats to ~1M events", + "1.4.0": "Fix ycand matching", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep_2023_02_27/ ./ diff --git a/mlpf/heptfds/clic_pf_edm4hep/ttbar_pu10.py b/mlpf/heptfds/clic_pf_edm4hep/ttbar_pu10.py index 215873e58..b5993434a 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ttbar_pu10.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ttbar_pu10.py @@ -20,9 +20,10 @@ class ClicEdmTtbarPu10Pf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.3.0") + VERSION = tfds.core.Version("1.4.0") RELEASE_NOTES = { "1.3.0": "Update stats to ~1M events", + "1.4.0": "Fix ycand matching", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep_2023_02_27/ ./ diff --git a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py index 39f625f1c..2d07aace0 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py +++ b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py @@ -8,7 +8,7 @@ # from fcc/postprocessing.py X_FEATURES_TRK = [ - "type", + "elemtype", "pt", "eta", "sin_phi", @@ -26,7 +26,7 @@ "time", ] X_FEATURES_CL = [ - "type", + "elemtype", "et", "eta", "sin_phi", @@ -64,6 +64,27 @@ def split_sample(path, test_frac=0.8): } +def split_sample_several(paths, test_frac=0.8): + files_train_tot = [] + files_test_tot = [] + for path in paths: + files = sorted(list(path.glob("*.parquet"))) + print("Found {} files in {}".format(files, path)) + assert len(files) > 0 + idx_split = int(test_frac * len(files)) + files_train = files[:idx_split] + files_test = files[idx_split:] + assert len(files_train) > 0 + assert len(files_test) > 0 + files_train_tot.append(files_train) + files_test_tot.append(files_test) + + return { + "train": generate_examples(files_train_tot), + "test": generate_examples(files_test_tot), + } + + def prepare_data_clic(fn, with_jet_idx=True): ret = ak.from_parquet(fn) diff --git a/mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py b/mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py index 2e16d860d..b4db98f30 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py @@ -20,9 +20,10 @@ class ClicEdmWwFullhadPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.3.0") + VERSION = tfds.core.Version("1.4.0") RELEASE_NOTES = { "1.3.0": "Update stats to ~1M events", + "1.4.0": "Fix ycand matching", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep_2023_02_27/ ./ diff --git a/mlpf/heptfds/clic_pf_edm4hep/zh.py b/mlpf/heptfds/clic_pf_edm4hep/zh.py index 436e1e1cc..a97ec64ca 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/zh.py +++ b/mlpf/heptfds/clic_pf_edm4hep/zh.py @@ -20,9 +20,10 @@ class ClicEdmZhTautauPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.3.0") + VERSION = tfds.core.Version("1.4.0") RELEASE_NOTES = { "1.3.0": "First version", + "1.4.0": "Fix ycand matching", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep_2023_02_27/ ./ diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/qq.py b/mlpf/heptfds/clic_pf_edm4hep_hits/qq.py index c1215996e..126da7cff 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/qq.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/qq.py @@ -20,10 +20,12 @@ class ClicEdmQqHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("0.9.0") + VERSION = tfds.core.Version("1.2.0") RELEASE_NOTES = { "0.9.0": "Small stats", "1.0.0": 
"Initial release", + "1.1.0": "Remove track referencepoint feature", + "1.2.0": "Keep all interacting genparticles", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ FIXME diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py b/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py index 0ba4fd564..21f9be93e 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py @@ -20,10 +20,12 @@ class ClicEdmTtbarHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("0.9.0") + VERSION = tfds.core.Version("1.2.0") RELEASE_NOTES = { "0.9.0": "Small stats", "1.0.0": "Initial release", + "1.1.0": "Remove track referencepoint feature", + "1.2.0": "Keep all interacting genparticles", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ FIXME diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep_hits/utils_edm.py index 3ecad14b4..93e1acc73 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/utils_edm.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/utils_edm.py @@ -1,6 +1,7 @@ import awkward as ak import numpy as np import tqdm +import random # from fcc/postprocessing_hits.py X_FEATURES_TRK = [ @@ -16,9 +17,6 @@ "tanLambda", "D0", "omega", - "referencePoint.x", - "referencePoint.y", - "referencePoint.z", "Z0", "time", "type", @@ -37,13 +35,14 @@ "subdetector", "type", ] +X_FEAT_NUM = max(len(X_FEATURES_TRK), len(X_FEATURES_CH)) Y_FEATURES = ["PDG", "charge", "pt", "eta", "sin_phi", "cos_phi", "energy"] labels = [0, 211, 130, 22, 11, 13] def split_sample(path, test_frac=0.8): - files = sorted(list(path.glob("*.parquet")))[:1000] + files = sorted(list(path.glob("*.parquet"))) print("Found {} files in {}".format(len(files), path)) assert len(files) > 0 idx_split = int(test_frac * len(files)) @@ -57,8 +56,23 @@ def split_sample(path, test_frac=0.8): } +def split_sample_several(paths, test_frac=0.8): + files = sum([list(path.glob("*.parquet")) for path in paths], []) + random.shuffle(files) + print("Found {} files".format(len(files))) + assert len(files) > 0 + idx_split = int(test_frac * len(files)) + files_train = files[:idx_split] + files_test = files[idx_split:] + assert len(files_train) > 0 + assert len(files_test) > 0 + return { + "train": generate_examples(files_train), + "test": generate_examples(files_test), + } + + def prepare_data_clic(fn): - print(fn) ret = ak.from_parquet(fn) X_track = ret["X_track"] @@ -75,24 +89,30 @@ def prepare_data_clic(fn): X1 = ak.to_numpy(X_track[iev]) X2 = ak.to_numpy(X_hit[iev]) - if len(X1) == 0 or len(X2) == 0: + if len(X1) == 0 and len(X2) == 0: continue ygen_track = ak.to_numpy(ret["ygen_track"][iev]) ygen_hit = ak.to_numpy(ret["ygen_hit"][iev]) ycand_track = ak.to_numpy(ret["ycand_track"][iev]) ycand_hit = ak.to_numpy(ret["ycand_hit"][iev]) - - if len(ygen_track) == 0 or len(ygen_hit) == 0: + if ygen_track.shape[0] == 0: + ygen_track = np.zeros((0, 7), dtype=np.float32) + if ycand_track.shape[0] == 0: + ycand_track = np.zeros((0, 7), dtype=np.float32) + if ygen_hit.shape[0] == 0: + ygen_hit = np.zeros((0, 7), dtype=np.float32) + if ycand_hit.shape[0] == 0: + ycand_hit = np.zeros((0, 7), dtype=np.float32) + + if len(ygen_track) == 0 and len(ygen_hit) == 0: continue - if len(ycand_track) == 0 or len(ycand_hit) == 0: + if len(ycand_track) == 0 and len(ycand_hit) == 0: continue # pad feature dim between tracks and hits to the same size - if X1.shape[1] < X2.shape[1]: - X1 = np.pad(X1, [[0, 0], [0, X2.shape[1] - X1.shape[1]]]) - if X2.shape[1] < X1.shape[1]: - X2 = np.pad(X2, [[0, 0], [0, X1.shape[1] - X2.shape[1]]]) + X1 
= np.pad(X1, [[0, 0], [0, X_FEAT_NUM - X1.shape[1]]]) + X2 = np.pad(X2, [[0, 0], [0, X_FEAT_NUM - X2.shape[1]]]) # concatenate tracks and hits in features and targets X = np.concatenate([X1, X2]) @@ -106,7 +126,6 @@ def prepare_data_clic(fn): ygen[:, 0][:] = arr[:] arr = np.array([labels.index(p) for p in ycand[:, 0]]) ycand[:, 0][:] = arr[:] - Xs.append(X) ygens.append(ygen) ycands.append(ycand) @@ -115,16 +134,13 @@ def prepare_data_clic(fn): def generate_examples(files): for fi in tqdm.tqdm(files): - try: - Xs, ygens, ycands = prepare_data_clic(fi) - for iev in range(len(Xs)): - yield str(fi) + "_" + str(iev), { - "X": Xs[iev].astype(np.float32), - "ygen": ygens[iev].astype(np.float32), - "ycand": ycands[iev].astype(np.float32), - } - except Exception as e: - print("could not process {}: {}".format(fi, e)) + Xs, ygens, ycands = prepare_data_clic(fi) + for iev in range(len(Xs)): + yield str(fi) + "_" + str(iev), { + "X": Xs[iev].astype(np.float32), + "ygen": ygens[iev].astype(np.float32), + "ycand": ycands[iev].astype(np.float32), + } if __name__ == "__main__": diff --git a/mlpf/lumi/train-gpu-1.sh b/mlpf/lumi/train-gpu-1.sh deleted file mode 100755 index b9bae9c4b..000000000 --- a/mlpf/lumi/train-gpu-1.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-cms-gen -#SBATCH --account=project_465000301 -#SBATCH --time=24:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=16 -#SBATCH --mem=120G -#SBATCH --gres=gpu:mi250:1 -#SBATCH --partition=eap -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -IMG=/users/patajoos/tf-rocm.simg -cd ~/particleflow - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - --env PYTHONPATH=hep_tfds \ - --env TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/cms-gen.yaml --plot-freq 1 --num-cpus 16 \ - --batch-multiplier 10 diff --git a/mlpf/lumi/train-gpu-2.sh b/mlpf/lumi/train-gpu-2.sh deleted file mode 100755 index 65fa4c67a..000000000 --- a/mlpf/lumi/train-gpu-2.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-cms-gen -#SBATCH --account=project_465000301 -#SBATCH --time=24:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=16 -#SBATCH --mem=120G -#SBATCH --gres=gpu:mi250:2 -#SBATCH --partition=eap -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -IMG=/users/patajoos/tf-rocm.simg -cd ~/particleflow - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - --env PYTHONPATH=hep_tfds \ - --env TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/cms-gen.yaml --plot-freq 1 --num-cpus 16 \ - --batch-multiplier 10 diff --git a/mlpf/lumi/train-gpu-4.sh b/mlpf/lumi/train-gpu-4.sh deleted file mode 100755 index 6f0c08edc..000000000 --- a/mlpf/lumi/train-gpu-4.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-cms-gen -#SBATCH --account=project_465000301 -#SBATCH --time=24:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=16 -#SBATCH --mem=120G -#SBATCH --gres=gpu:mi250:4 -#SBATCH --partition=eap -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -IMG=/users/patajoos/tf-rocm.simg -cd ~/particleflow - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - --env PYTHONPATH=hep_tfds \ - --env TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets \ - $IMG python3 mlpf/pipeline.py 
train \ - --config parameters/cms-gen.yaml --plot-freq 1 --num-cpus 16 \ - --batch-multiplier 10 diff --git a/mlpf/lumi/train-gpu-clic.sh b/mlpf/lumi/train-gpu-clic.sh deleted file mode 100755 index 6966fd313..000000000 --- a/mlpf/lumi/train-gpu-clic.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=mlpf-train-clic -#SBATCH --account=project_465000301 -#SBATCH --time=24:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=16 -#SBATCH --mem=120G -#SBATCH --gres=gpu:mi250:1 -#SBATCH --partition=eap -#SBATCH --no-requeue -#SBATCH -o logs/slurm-%x-%j-%N.out - -IMG=/users/patajoos/tf-rocm.simg -cd ~/particleflow - -#TF training -singularity exec \ - --rocm \ - -B /scratch/project_465000301 \ - --env PYTHONPATH=hep_tfds \ - --env TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets \ - $IMG python3 mlpf/pipeline.py train \ - --config parameters/clic.yaml --plot-freq 1 --num-cpus 16 \ - --batch-multiplier 10 diff --git a/mlpf/lumi/train-gpu-ln-full.sh b/mlpf/lumi/train-gpu-ln-full.sh new file mode 100755 index 000000000..e94eb0fcc --- /dev/null +++ b/mlpf/lumi/train-gpu-ln-full.sh @@ -0,0 +1,47 @@ +#!/bin/bash +#SBATCH --job-name=mlpf-train-clic-hits-ln-full +#SBATCH --account=project_465000301 +#SBATCH --time=3-00:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=130G +#SBATCH --gpus-per-task=8 +#SBATCH --partition=small-g +#SBATCH --no-requeue +#SBATCH -o logs/slurm-%x-%j-%N.out + +cd /scratch/project_465000301/particleflow + +module load LUMI/22.08 partition/G + +export IMG=/scratch/project_465000301/tf-rocm.simg +export PYTHONPATH=hep_tfds +export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets +#export MIOPEN_DISABLE_CACHE=true +export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache +export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} +export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages +#export MIOPEN_ENABLE_LOGGING=1 +#export MIOPEN_ENABLE_LOGGING_CMD=1 +#export MIOPEN_LOG_LEVEL=4 + +#TF training +singularity exec \ + --rocm \ + -B /scratch/project_465000301 \ + -B /tmp \ + --env LD_LIBRARY_PATH=/opt/rocm-5.4.0/lib/ \ + $IMG python3 mlpf/pipeline.py train \ + --config parameters/clic-hits-ln.yaml --plot-freq 1 --num-cpus 8 \ + --batch-multiplier 2 \ + --weights experiments/clic-hits-ln_20230623_090308_368360.nid007329/weights/weights-10-0.163285.hdf5 + +# --env MIOPEN_USER_DB_PATH=$MIPEN_USER_DB_PATH \ +# --env MIOPEN_CUSTOM_CACHE_DIR=$MIOPEN_CUSTOM_CACHE_DIR \ +# --env MIOPEN_ENABLE_LOGGING=1 \ +# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ +# --env MIOPEN_LOG_LEVEL=7 \ +# --env MIOPEN_ENABLE_LOGGING=1 \ +# --env MIOPEN_ENABLE_LOGGING_CMD=1 \ +# --env MIOPEN_LOG_LEVEL=5 \ diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 52fcf0c07..3ab0e2e32 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -241,6 +241,9 @@ def train( ds_train, ds_test, ds_val = get_train_test_val_datasets(config, num_batches_multiplier, ntrain, ntest) + ds_train.tensorflow_dataset = ds_train.tensorflow_dataset.prefetch(tf.data.AUTOTUNE) + ds_test.tensorflow_dataset = ds_test.tensorflow_dataset.prefetch(tf.data.AUTOTUNE) + epochs = config["setup"]["num_epochs"] total_steps = ds_train.num_steps() * epochs logging.info("num_train_steps: {}".format(ds_train.num_steps())) @@ -283,9 +286,22 @@ def train( callbacks.append(optim_callbacks) - model.normalizer.adapt(ds_train.tensorflow_dataset.map(lambda X, y, w: X[:, :, 1:])) - print(model.normalizer.mean) - 
print(model.normalizer.variance) + if not os.path.isfile(config["setup"]["normalizer_cache"] + ".npz"): + logging.info( + "Could not find normalizer cache in {}, recreating".format(config["setup"]["normalizer_cache"] + ".npz") + ) + model.normalizer.adapt(ds_train.tensorflow_dataset.map(lambda X, y, w: X[:, :, 1:])) + print(model.normalizer.mean) + print(model.normalizer.variance) + np.savez( + config["setup"]["normalizer_cache"], + mean=model.normalizer.mean.numpy(), + variance=model.normalizer.variance.numpy(), + ) + + cache = np.load(config["setup"]["normalizer_cache"] + ".npz") + model.normalizer.mean = tf.convert_to_tensor(cache["mean"]) + model.normalizer.variance = tf.convert_to_tensor(cache["variance"]) model.fit( ds_train.tensorflow_dataset.repeat(), @@ -336,6 +352,17 @@ def evaluate(config, train_dir, weights, customize, nevents): model, _, initial_epoch = model_scope(config, 1, weights=weights) + print("before loading") + print(model.normalizer.mean) + print(model.normalizer.variance) + + cache = np.load(config["setup"]["normalizer_cache"] + ".npz") + model.normalizer.mean = tf.convert_to_tensor(cache["mean"]) + model.normalizer.variance = tf.convert_to_tensor(cache["variance"]) + print("after loading") + print(model.normalizer.mean) + print(model.normalizer.variance) + for dsname in config["evaluation_datasets"]: val_ds = config["evaluation_datasets"][dsname] ds_test = mlpf_dataset_from_config( diff --git a/mlpf/plotting/plot_utils.py b/mlpf/plotting/plot_utils.py index d0896e1fb..caaa28235 100644 --- a/mlpf/plotting/plot_utils.py +++ b/mlpf/plotting/plot_utils.py @@ -621,7 +621,7 @@ def plot_sum_energy(yvals, class_names, epoch=None, cp_dir=None, comet_experimen plt.xlabel("total energy / event [GeV]") plt.ylabel("events / bin") if title: - plt.title(title + " " + clname) + plt.title(title + ", " + clname) save_img( "sum_energy_cls{}.png".format(cls_id), epoch, @@ -636,7 +636,7 @@ def plot_sum_energy(yvals, class_names, epoch=None, cp_dir=None, comet_experimen plt.xlabel("total true energy / event [GeV]") plt.ylabel("total PF energy / event [GeV]") if title: - plt.title(title + " " + clname) + plt.title(title + ", " + clname) save_img( "sum_gen_cand_energy_cls{}.png".format(cls_id), epoch, @@ -651,7 +651,7 @@ def plot_sum_energy(yvals, class_names, epoch=None, cp_dir=None, comet_experimen plt.xlabel("total true energy / event [GeV]") plt.ylabel("total MLPF energy / event [GeV]") if title: - plt.title(title + " " + clname) + plt.title(title + ", " + clname) save_img( "sum_gen_pred_energy_cls{}.png".format(cls_id), epoch, @@ -676,7 +676,7 @@ def plot_sum_energy(yvals, class_names, epoch=None, cp_dir=None, comet_experimen plt.xlabel("total true energy / event [GeV]") plt.ylabel("total reconstructed energy / event [GeV]") if title: - plt.title(title + ", PF") + plt.title(title + ", " + clname + ", PF") save_img( "sum_gen_cand_energy_log_cls{}.png".format(cls_id), epoch, @@ -698,7 +698,7 @@ def plot_sum_energy(yvals, class_names, epoch=None, cp_dir=None, comet_experimen plt.xlabel("total true energy / event [GeV]") plt.ylabel("total reconstructed energy / event [GeV]") if title: - plt.title(title + ", MLPF") + plt.title(title + ", " + clname + ", MLPF") save_img( "sum_gen_pred_energy_log_cls{}.png".format(cls_id), epoch, @@ -729,7 +729,7 @@ def plot_particle_multiplicity(X, yvals, class_names, epoch=None, cp_dir=None, c plt.xlim(0, max_val) plt.ylim(0, max_val) if title: - plt.title(title + " " + clname) + plt.title(title + ", " + clname) save_img( 
"particle_multiplicity_{}.png".format(cls_id), @@ -957,6 +957,8 @@ def plot_jet_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=No plt.ylim(0.75, 1.25) plt.axhline(1.0, color="black", ls="--") plt.ylabel("Response median") + if title: + plt.title(title) plt.legend() plt.sca(axs[1]) @@ -964,6 +966,8 @@ def plot_jet_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=No plt.plot(x_vals, mlpf_vals[:, 2] - mlpf_vals[:, 0], marker="o", label="MLPF") plt.ylabel("Response IQR") plt.legend() + if title: + plt.title(title) plt.xlabel("gen-jet $p_T$ [GeV]") plt.tight_layout() @@ -1051,6 +1055,8 @@ def plot_met_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=No plt.ylim(0.75, 1.25) plt.axhline(1.0, color="black", ls="--") plt.ylabel("Response median") + if title: + plt.title(title) plt.legend() plt.sca(axs[1]) @@ -1058,6 +1064,8 @@ def plot_met_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=No plt.plot(x_vals, mlpf_vals[:, 2] - mlpf_vals[:, 0], marker="o", label="MLPF") plt.ylabel("Response IQR") plt.legend() + if title: + plt.title(title) plt.xlabel("gen MET [GeV]") plt.tight_layout() diff --git a/mlpf/tallinn/eval.sh b/mlpf/tallinn/eval.sh index c9f98205b..c4811fa60 100755 --- a/mlpf/tallinn/eval.sh +++ b/mlpf/tallinn/eval.sh @@ -7,12 +7,13 @@ IMG=/home/software/singularity/tf-2.11.0.simg cd ~/particleflow -EXPDIR=experiments/clic-hits_20230421_213012_921390.gpu1.local +EXPDIR=experiments/clic-hits_20230512_161010_875811.gpu1.local +WEIGHTS=experiments/clic-hits_20230512_161010_875811.gpu1.local/weights/weights-06-0.076698.hdf5 singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ $IMG python mlpf/pipeline.py evaluate \ - --train-dir $EXPDIR + --train-dir $EXPDIR --weights $WEIGHTS singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ diff --git a/mlpf/tallinn/mlpf-train-a100.sh b/mlpf/tallinn/mlpf-train-a100.sh index 71aa5305c..b5128c13c 100755 --- a/mlpf/tallinn/mlpf-train-a100.sh +++ b/mlpf/tallinn/mlpf-train-a100.sh @@ -4,11 +4,17 @@ #SBATCH --mem-per-gpu 40G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/tf-2.11.0.simg +IMG=/home/software/singularity/tf-2.12.0-nvidia.simg cd ~/particleflow #TF training singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python mlpf/pipeline.py train -c $1 --plot-freq 1 --num-cpus 16 --batch-multiplier $2 + $IMG python mlpf/pipeline.py train -c parameters/clic-hits-ln.yaml \ + --plot-freq 1 --num-cpus 32 --batch-multiplier 2 \ + --weights experiments/clic-hits-ln_20230626_123309_931116.gpu1.local/weights/weights-12-0.172574.hdf5 + +# --env TF_GPU_THREAD_MODE=gpu_private \ +# --env TF_GPU_THREAD_COUNT=8 \ +# --env TF_XLA_FLAGS="--tf_xla_auto_jit=2" \ diff --git a/mlpf/tallinn/mlpf-train.sh b/mlpf/tallinn/mlpf-train.sh index a27aff070..4d890a83e 100755 --- a/mlpf/tallinn/mlpf-train.sh +++ b/mlpf/tallinn/mlpf-train.sh @@ -4,11 +4,12 @@ #SBATCH --mem-per-gpu=8G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/tf-2.11.0.simg +IMG=docker://nvcr.io/nvidia/tensorflow:23.05-tf2-py3 cd ~/particleflow #TF training -singularity exec -B /scratch/persistent --nv \ +singularity exec -B /scratch/persistent -B /local --nv \ --env PYTHONPATH=hep_tfds \ - --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python mlpf/pipeline.py train -c $1 
--plot-freq 1 --num-cpus 16 --batch-multiplier $2 --weights experiments/clic_20230412_155159_717751.gpu1.local/weights/weights-100-9.948204.hdf5 + --env TFDS_DATA_DIR=/local/joosep/mlpf/tensorflow_datasets \ + --env TF_XLA_FLAGS="--tf_xla_auto_jit=2 --tf_xla_cpu_global_jit" \ + $IMG python mlpf/pipeline.py train -c parameters/clic-hits.yaml --plot-freq 1 --num-cpus 16 --batch-multiplier 1 --ntrain 100000 --ntest 100000 diff --git a/mlpf/tallinn/postprocessing.sh b/mlpf/tallinn/postprocessing.sh index f3e614b14..730f3e7b4 100755 --- a/mlpf/tallinn/postprocessing.sh +++ b/mlpf/tallinn/postprocessing.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH -p main -#SBATCH --cpus-per-task 30 +#SBATCH --cpus-per-task 16 #SBATCH --mem-per-cpu=1G #SBATCH -o logs/slurm-%x-%j-%N.out diff --git a/mlpf/tallinn/submit_postprocessing.sh b/mlpf/tallinn/submit_postprocessing.sh new file mode 100755 index 000000000..c1686562e --- /dev/null +++ b/mlpf/tallinn/submit_postprocessing.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#sbatch mlpf/tallinn/postprocessing.sh p8_ee_tt_ecm380 +#sbatch mlpf/tallinn/postprocessing.sh p8_ee_qq_ecm380 +#sbatch mlpf/tallinn/postprocessing.sh p8_ee_tt_ecm380_PU10 +#sbatch mlpf/tallinn/postprocessing.sh p8_ee_WW_fullhad_ecm380 +#sbatch mlpf/tallinn/postprocessing.sh p8_ee_ZH_Htautau_ecm380 + +#sbatch mlpf/tallinn/postprocessing_hits.sh p8_ee_tt_ecm380 +#sbatch mlpf/tallinn/postprocessing_hits.sh p8_ee_qq_ecm380 +#sbatch mlpf/tallinn/postprocessing_hits.sh kaon0L +#sbatch mlpf/tallinn/postprocessing_hits.sh pi- +#sbatch mlpf/tallinn/postprocessing_hits.sh pi+ +sbatch mlpf/tallinn/postprocessing_hits.sh pi0 +sbatch mlpf/tallinn/postprocessing_hits.sh e- +sbatch mlpf/tallinn/postprocessing_hits.sh e+ +#sbatch mlpf/tallinn/postprocessing_hits.sh mu- +#sbatch mlpf/tallinn/postprocessing_hits.sh mu+ +sbatch mlpf/tallinn/postprocessing_hits.sh gamma +sbatch mlpf/tallinn/postprocessing_hits.sh neutron diff --git a/mlpf/tfmodel/callbacks.py b/mlpf/tfmodel/callbacks.py index f290460b0..03b4c4b16 100644 --- a/mlpf/tfmodel/callbacks.py +++ b/mlpf/tfmodel/callbacks.py @@ -2,6 +2,7 @@ import pickle from datetime import datetime from pathlib import Path +import time import matplotlib.pyplot as plt import numpy as np @@ -58,6 +59,7 @@ def _collect_learning_rate(self, logs): def on_epoch_end(self, epoch, logs): logs = logs or {} logs.update(self._collect_learning_rate(logs)) + logs["time"] = time.time() if self.dump_history: history_path = Path(self.log_dir) / "history" history_path.mkdir(parents=True, exist_ok=True) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 05970b800..7fd7cbdb5 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -16,16 +16,30 @@ def debugging_train_step(self, data): print("data", data[0].shape, [(k, v.shape) for (k, v) in data[1].items()]) with tf.GradientTape() as tape: - y_pred = self(x, training=True) # Forward pass - loss = self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) + y_pred = self(x, training=True) # Forward pass + + tf.print("predictions and targets") + for k in y_pred.keys(): + tf.print(k, y_pred[k].shape, y[k].shape) + + tf.print("loss shapes") + for k in self.compiled_loss._user_losses.keys(): + tf.print(k, self.compiled_loss._user_losses[k]) + tf.print(self.compiled_loss._user_losses[k](y[k], y_pred[k]).shape) + + tf.print("sample weights") + for k in sample_weights.keys(): + tf.print(k, sample_weights[k].shape) + + loss = self.compiled_loss(y, y_pred, sample_weights) # , regularization_losses=self.losses)
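# Aside, not part of the patch: with the loss-construction changes further below in mlpf/tfmodel/utils.py and
# mlpf/tfmodel/tfa.py, the per-output classification term entering compiled_loss above can be the vendored
# sigmoid focal cross-entropy. A minimal, self-contained sketch of how that loss down-weights well-classified
# examples relative to plain binary cross-entropy; the function name and the example probabilities are
# illustrative only, not code from this repository.
import tensorflow as tf

def demo_focal_vs_bce(y_true, p_pred, alpha=0.25, gamma=2.0):
    y_true = tf.constant(y_true, dtype=tf.float32)
    p_pred = tf.constant(p_pred, dtype=tf.float32)
    ce = tf.keras.backend.binary_crossentropy(y_true, p_pred)   # plain per-entry cross-entropy
    p_t = y_true * p_pred + (1.0 - y_true) * (1.0 - p_pred)     # probability assigned to the true class
    alpha_factor = y_true * alpha + (1.0 - y_true) * (1.0 - alpha)
    focal = alpha_factor * tf.pow(1.0 - p_t, gamma) * ce        # (1 - p_t)^gamma suppresses easy examples
    return ce.numpy(), focal.numpy()

# demo_focal_vs_bce([1.0, 1.0], [0.97, 0.03]): the easy positive (p=0.97) is suppressed by more than three
# orders of magnitude compared to its cross-entropy value, while the hard positive (p=0.03) is reduced
# essentially only by the alpha=0.25 balancing factor, so hard examples dominate the training signal.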
trainable_vars = self.trainable_variables gradients = tape.gradient(loss, trainable_vars) - print("Max of Gradients[0]: %.4f" % tf.reduce_max(gradients[0])) - print("Min of Gradients[0]: %.4f" % tf.reduce_min(gradients[0])) - print("Mean of Gradients[0]: %.4f" % tf.reduce_mean(gradients[0])) - print("Loss: %.4f" % loss) + print("Max of Gradients[0]: {:.4f}".format(tf.reduce_max(gradients[0]))) + print("Min of Gradients[0]: {:.4f}".format(tf.reduce_min(gradients[0]))) + print("Mean of Gradients[0]: {:.4f}".format(tf.reduce_mean(gradients[0]))) + print("Loss: {}".format(loss)) self.optimizer.apply_gradients(zip(gradients, trainable_vars)) self.compiled_metrics.update_state(y, y_pred) @@ -51,12 +65,16 @@ def debugging_test_step(self, data): return {m.name: m.result() for m in self.metrics} +# @tf.function(jit_compile=True) +@tf.function def split_indices_to_bins_batch(cmul, nbins, bin_size, msk): bin_idx = tf.argmax(cmul, axis=-1) + tf.cast(tf.where(~msk, nbins - 1, 0), tf.int64) bins_split = tf.reshape(tf.argsort(bin_idx), (tf.shape(cmul)[0], nbins, bin_size)) return bins_split +# @tf.function(jit_compile=True) +@tf.function def pairwise_l2_dist(A, B): na = tf.reduce_sum(tf.square(A), -1) nb = tf.reduce_sum(tf.square(B), -1) @@ -115,17 +133,18 @@ def pairwise_sigmoid_dist(A, B): """ +# @tf.function(jit_compile=True) @tf.function def reverse_lsh(bins_split, points_binned_enc, small_graph_opt=False): - tf.debugging.assert_shapes( - [ - (bins_split, ("n_batch", "n_bins", "n_points_bin")), - ( - points_binned_enc, - ("n_batch", "n_bins", "n_points_bin", "n_features"), - ), - ] - ) + # tf.debugging.assert_shapes( + # [ + # (bins_split, ("n_batch", "n_bins", "n_points_bin")), + # ( + # points_binned_enc, + # ("n_batch", "n_bins", "n_points_bin", "n_features"), + # ), + # ] + # ) shp = tf.shape(points_binned_enc) n_bins = shp[1] @@ -158,11 +177,11 @@ def single_bin(): else: ret = multiple_bins() - tf.debugging.assert_shapes( - [ - (ret, ("n_batch", "n_elems", "n_features")), - ] - ) + # tf.debugging.assert_shapes( + # [ + # (ret, ("n_batch", "n_elems", "n_features")), + # ] + # ) return ret @@ -175,7 +194,6 @@ def __init__(self, num_input_classes): X: [Nbatch, Nelem, Nfeat] array of all the input detector element feature data """ - @tf.function def call(self, X): # X[:, :, 0] - categorical index of the element type @@ -198,7 +216,6 @@ def __init__(self, num_input_classes): X: [Nbatch, Nelem, Nfeat] array of all the input detector element feature data """ - @tf.function def call(self, X): # X[:, :, 0] - categorical index of the element type @@ -290,6 +307,7 @@ def __init__(self, *args, **kwargs): self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) self.output_dim = kwargs.pop("output_dim") self.normalize_degrees = kwargs.pop("normalize_degrees", True) + self.initializer = kwargs.pop("initializer", "random_normal") super(GHConvDense, self).__init__(*args, **kwargs) @@ -299,28 +317,28 @@ def build(self, input_shape): self.W_t = self.add_weight( shape=(self.hidden_dim, self.output_dim), name="w_t", - initializer="random_normal", + initializer=self.initializer, trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight), ) self.b_t = self.add_weight( shape=(self.output_dim,), name="b_t", - initializer="random_normal", + initializer=self.initializer, trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight), ) self.W_h = self.add_weight( shape=(self.hidden_dim, self.output_dim), name="w_h", - initializer="random_normal", + initializer=self.initializer, 
trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight), ) self.theta = self.add_weight( shape=(self.hidden_dim, self.output_dim), name="theta", - initializer="random_normal", + initializer=self.initializer, trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight), ) @@ -332,7 +350,7 @@ def call(self, inputs): # tf.print("GHConvDense.call:msk", msk.shape) # remove last dim from distance/adjacency matrix - tf.debugging.assert_equal(tf.shape(adj)[-1], 1) + # tf.debugging.assert_equal(tf.shape(adj)[-1], 1) adj = tf.squeeze(adj, axis=-1) # compute the normalization of the adjacency matrix @@ -352,20 +370,20 @@ def call(self, inputs): gate = tf.nn.sigmoid(tf.linalg.matmul(x, self.W_t) + self.b_t) out = gate * f_hom + (1.0 - gate) * f_het - tf.debugging.assert_shapes( - [ - (x, ("n_batch", "n_bins", "n_points_bin", "num_features")), - ( - adj, - ("n_batch", "n_bins", "n_points_bin", "n_points_bin"), - ), - (msk, ("n_batch", "n_bins", "n_points_bin", 1)), - ( - out, - ("n_batch", "n_bins", "n_points_bin", self.output_dim), - ), - ] - ) + # tf.debugging.assert_shapes( + # [ + # (x, ("n_batch", "n_bins", "n_points_bin", "num_features")), + # ( + # adj, + # ("n_batch", "n_bins", "n_points_bin", "n_points_bin"), + # ), + # (msk, ("n_batch", "n_bins", "n_points_bin", 1)), + # ( + # out, + # ("n_batch", "n_bins", "n_points_bin", self.output_dim), + # ), + # ] + # ) # tf.print("GHConvDense.call:out", out.shape) return self.activation(out) * msk @@ -558,6 +576,7 @@ def __init__( self.bin_size = bin_size self.kernel = kernel self.small_graph_opt = small_graph_opt + self.initializer = kwargs.pop("initializer", "random_normal") super(MessageBuildingLayerLSH, self).__init__(**kwargs) @@ -567,7 +586,7 @@ def build(self, input_shape): # generate the LSH codebook for random rotations (num_features, max_num_bins/2) self.codebook_random_rotations = self.add_weight( shape=(self.distance_dim, self.max_num_bins // 2), - initializer="random_normal", + initializer=self.initializer, trainable=False, name="lsh_projections", ) @@ -580,13 +599,13 @@ def build(self, input_shape): def call(self, x_msg, x_node, msk, training=False): msk_f = tf.expand_dims(tf.cast(msk, x_msg.dtype), -1) - tf.debugging.assert_shapes( - [ - (x_msg, ("n_batch", "n_points", "n_msg_features")), - (x_node, ("n_batch", "n_points", "n_node_features")), - (msk_f, ("n_batch", "n_points", 1)), - ] - ) + # tf.debugging.assert_shapes( + # [ + # (x_msg, ("n_batch", "n_points", "n_msg_features")), + # (x_node, ("n_batch", "n_points", "n_node_features")), + # (msk_f, ("n_batch", "n_points", 1)), + # ] + # ) shp = tf.shape(x_msg) n_points = shp[1] @@ -605,16 +624,16 @@ def dobin(): # n_points must be divisible by bin_size exactly due to the use of reshape n_bins = tf.math.floordiv(n_points, self.bin_size) - tf.debugging.assert_greater( - n_bins, - 0, - "number of points (dim 1) must be greater than bin_size={}".format(self.bin_size), - ) - tf.debugging.assert_equal( - tf.math.floormod(n_points, self.bin_size), - 0, - "number of points (dim 1) must be an integer multiple of bin_size={}".format(self.bin_size), - ) + # tf.debugging.assert_greater( + # n_bins, + # 0, + # "number of points (dim 1) must be greater than bin_size={}".format(self.bin_size), + # ) + # tf.debugging.assert_equal( + # tf.math.floormod(n_points, self.bin_size), + # 0, + # "number of points (dim 1) must be an integer multiple of bin_size={}".format(self.bin_size), + # ) mul = tf.linalg.matmul( x_msg, self.codebook_random_rotations[:, : tf.math.maximum(1, 
n_bins // 2)], @@ -674,39 +693,39 @@ def nobin(): msk_col = tf.cast(tf.reshape(msk_f_binned_squeeze, rshp_col), dm.dtype) dm = tf.math.multiply(dm, msk_row) dm = tf.math.multiply(dm, msk_col) - tf.debugging.assert_shapes( - [ - ( - x_msg_binned, - ( - "n_batch", - "n_bins", - "n_points_bin", - "n_msg_features", - ), - ), - ( - x_features_binned, - ( - "n_batch", - "n_bins", - "n_points_bin", - "n_node_features", - ), - ), - (msk_f_binned, ("n_batch", "n_bins", "n_points_bin", 1)), - ( - dm, - ( - "n_batch", - "n_bins", - "n_points_bin", - "n_points_bin", - 1, - ), - ), - ] - ) + # tf.debugging.assert_shapes( + # [ + # ( + # x_msg_binned, + # ( + # "n_batch", + # "n_bins", + # "n_points_bin", + # "n_msg_features", + # ), + # ), + # ( + # x_features_binned, + # ( + # "n_batch", + # "n_bins", + # "n_points_bin", + # "n_node_features", + # ), + # ), + # (msk_f_binned, ("n_batch", "n_bins", "n_points_bin", 1)), + # ( + # dm, + # ( + # "n_batch", + # "n_bins", + # "n_points_bin", + # "n_points_bin", + # 1, + # ), + # ), + # ] + # ) return bins_split, x_features_binned, dm, msk_f_binned @@ -1057,44 +1076,44 @@ def call(self, x, msk, training=False): # tf.print("CombinedGraphLayer.call:dm", dm.shape) # tf.print("CombinedGraphLayer.call:msk_f", msk_f.shape) - tf.debugging.assert_shapes( - [ - (bins_split, ("n_batch", "n_bins", "n_points_bin")), - ( - x, - ( - "n_batch", - "n_bins", - "n_points_bin", - "n_node_features", - ), - ), - ( - dm, - ( - "n_batch", - "n_bins", - "n_points_bin", - "n_points_bin", - 1, - ), - ), - (msk_f, ("n_batch", "n_bins", "n_points_bin", 1)), - ] - ) + # tf.debugging.assert_shapes( + # [ + # (bins_split, ("n_batch", "n_bins", "n_points_bin")), + # ( + # x, + # ( + # "n_batch", + # "n_bins", + # "n_points_bin", + # "n_node_features", + # ), + # ), + # ( + # dm, + # ( + # "n_batch", + # "n_bins", + # "n_points_bin", + # "n_points_bin", + # 1, + # ), + # ), + # (msk_f, ("n_batch", "n_bins", "n_points_bin", 1)), + # ] + # ) # run the node update with message passing for msg in self.message_passing_layers: x_out = msg((x, dm, msk_f)) - tf.debugging.assert_shapes( - [ - (x, ("n_batch", "n_bins", "n_points_bin", "feat_in")), - ( - x_out, - ("n_batch", "n_bins", "n_points_bin", "feat_out"), - ), - ] - ) + # tf.debugging.assert_shapes( + # [ + # (x, ("n_batch", "n_bins", "n_points_bin", "feat_in")), + # ( + # x_out, + # ("n_batch", "n_bins", "n_points_bin", "feat_out"), + # ), + # ] + # ) x = x_out if self.dropout_layer: x = self.dropout_layer(x, training=training) @@ -1194,7 +1213,10 @@ def __init__( def call(self, inputs, training=False): Xorig = inputs + # tf.print(tf.shape(Xorig)) + # normalize all features except the PFElement type (feature 0) + # X = Xorig X = tf.concat([Xorig[:, :, 0:1], tf.cast(self.normalizer(Xorig[:, :, 1:]), dtype=Xorig.dtype)], axis=-1) X = tf.where(tf.math.is_inf(X), tf.zeros_like(X), X) @@ -1418,12 +1440,12 @@ def __init__( event_set_output=False, met_output=False, cls_output_as_logits=False, - num_layers_encoder=2, - num_layers_decoder_reg=2, - num_layers_decoder_cls=2, + num_layers_encoder=4, + num_layers_decoder_reg=4, + num_layers_decoder_cls=4, hidden_dim=256, num_heads=8, - num_random_features=128, + num_random_features=256, ): super(PFNetTransformer, self).__init__() @@ -1492,6 +1514,7 @@ def __init__( def call(self, inputs, training=False): Xorig = inputs + # X = Xorig X = tf.concat([Xorig[:, :, 0:1], self.normalizer(Xorig[:, :, 1:])], axis=-1) # tf.print("\nX.shape=", tf.shape(X), "\n") @@ -1513,6 +1536,7 @@ def call(self, inputs, 
training=False): for enc in self.encoders: X_enc = enc([X_enc, X_enc, msk], training=training) * msk_input + # initialize the classification and regression latent state with identity X_cls = tf.identity(X_enc) X_reg = tf.identity(X_enc) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 0a4c5c2a3..95ddbc437 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -11,11 +11,11 @@ import pickle from pathlib import Path +import time import awkward import fastjet import numpy as np import tensorflow as tf -import tensorflow_addons as tfa import vector from plotting.plot_utils import ( compute_distances, @@ -223,15 +223,16 @@ def get_checkpoint_history_callback(outdir, config, dataset, comet_experiment, h is_hpo_run=is_hpo_run, ) - callbacks += [cb] + if config.get("do_validation_callback", True): + callbacks += [cb] + tb = CustomTensorBoard( log_dir=outdir + "/logs", histogram_freq=config["callbacks"]["tensorboard"]["hist_freq"], write_graph=False, write_images=False, update_freq="batch", - # profile_batch=(10,200), - profile_batch=0, + # profile_batch=(50,100), dump_history=config["callbacks"]["tensorboard"]["dump_history"], ) # Change the class name of CustomTensorBoard TensorBoard to make keras_tuner recognise it @@ -470,6 +471,27 @@ def model_output(ret): full_model = tf.function(lambda x: model_output(model(x, training=False))) + niter = 10 + nfeat = config["dataset"]["num_input_features"] + + if "combined_graph_layer" in config["parameters"]: + bin_size = config["parameters"]["combined_graph_layer"]["bin_size"] + elem_range = list(range(bin_size, 5 * bin_size, bin_size)) + else: + elem_range = range(100, 1000, 200) + + for ibatch in [1, 2, 4]: + for nptcl in elem_range: + X = np.random.rand(ibatch, nptcl, nfeat) + full_model(X) + + t0 = time.time() + for i in range(niter): + full_model(X) + t1 = time.time() + + print(ibatch, nptcl, (t1 - t0) / niter) + # we need to use opset 12 for the version of ONNXRuntime in CMSSW # the warnings "RuntimeError: Opset (12) must be >= 13 for operator 'batch_dot'." do not seem to be critical model_proto, _ = tf2onnx.convert.from_function( @@ -523,7 +545,10 @@ def configure_model_weights(model, trainable_layers): def make_focal_loss(config): def loss(x, y): - return tfa.losses.sigmoid_focal_crossentropy( + + from .tfa import sigmoid_focal_crossentropy + + return sigmoid_focal_crossentropy( x, y, alpha=float(config["setup"].get("focal_loss_alpha", 0.25)), diff --git a/mlpf/tfmodel/tfa.py b/mlpf/tfmodel/tfa.py new file mode 100644 index 000000000..7f45399eb --- /dev/null +++ b/mlpf/tfmodel/tfa.py @@ -0,0 +1,168 @@ +import tensorflow as tf +import tensorflow.keras.backend as K + + +def is_tensor_or_variable(x): + return tf.is_tensor(x) or isinstance(x, tf.Variable) + + +class LossFunctionWrapper(tf.keras.losses.Loss): + """Wraps a loss function in the `Loss` class.""" + + def __init__(self, fn, reduction=tf.keras.losses.Reduction.AUTO, name=None, **kwargs): + """Initializes `LossFunctionWrapper` class. + + Args: + fn: The loss function to wrap, with signature `fn(y_true, y_pred, + **kwargs)`. + reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the reduction + option will be determined by the usage context. For almost all cases + this defaults to `SUM_OVER_BATCH_SIZE`. 
When used with + `tf.distribute.Strategy`, outside of built-in training loops such as + `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` + will raise an error. Please see this custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: (Optional) name for the loss. + **kwargs: The keyword arguments that are passed on to `fn`. + """ + super().__init__(reduction=reduction, name=name) + self.fn = fn + self._fn_kwargs = kwargs + + def call(self, y_true, y_pred): + """Invokes the `LossFunctionWrapper` instance. + + Args: + y_true: Ground truth values. + y_pred: The predicted values. + + Returns: + Loss values per sample. + """ + return self.fn(y_true, y_pred, **self._fn_kwargs) + + def get_config(self): + config = {} + for k, v in iter(self._fn_kwargs.items()): + config[k] = tf.keras.backend.eval(v) if is_tensor_or_variable(v) else v + base_config = super().get_config() + return {**base_config, **config} + + +class SigmoidFocalCrossEntropy(LossFunctionWrapper): + """Implements the focal loss function. + + Focal loss was first introduced in the RetinaNet paper + (https://arxiv.org/pdf/1708.02002.pdf). Focal loss is extremely useful for + classification when you have highly imbalanced classes. It down-weights + well-classified examples and focuses on hard examples. The loss value is + much higher for a sample which is misclassified by the classifier as compared + to the loss value corresponding to a well-classified example. One of the + best use-cases of focal loss is its usage in object detection where the + imbalance between the background class and other classes is extremely high. + + Usage: + + >>> fl = tfa.losses.SigmoidFocalCrossEntropy() + >>> loss = fl( + ... y_true = [[1.0], [1.0], [0.0]],y_pred = [[0.97], [0.91], [0.03]]) + >>> loss + + + Usage with `tf.keras` API: + + >>> model = tf.keras.Model() + >>> model.compile('sgd', loss=tfa.losses.SigmoidFocalCrossEntropy()) + + Args: + alpha: balancing factor, default value is 0.25. + gamma: modulating factor, default value is 2.0. + + Returns: + Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same + shape as `y_true`; otherwise, it is scalar. + + Raises: + ValueError: If the shape of `sample_weight` is invalid or value of + `gamma` is less than zero. + """ + + def __init__( + self, + from_logits: bool = False, + alpha=0.25, + gamma=2.0, + reduction: str = tf.keras.losses.Reduction.NONE, + name: str = "sigmoid_focal_crossentropy", + ): + super().__init__( + sigmoid_focal_crossentropy, + name=name, + reduction=reduction, + from_logits=from_logits, + alpha=alpha, + gamma=gamma, + ) + + +@tf.function +def sigmoid_focal_crossentropy( + y_true, + y_pred, + alpha=0.25, + gamma=2.0, + from_logits: bool = False, +) -> tf.Tensor: + """Implements the focal loss function. + + Focal loss was first introduced in the RetinaNet paper + (https://arxiv.org/pdf/1708.02002.pdf). Focal loss is extremely useful for + classification when you have highly imbalanced classes. It down-weights + well-classified examples and focuses on hard examples. The loss value is + much higher for a sample which is misclassified by the classifier as compared + to the loss value corresponding to a well-classified example. One of the + best use-cases of focal loss is its usage in object detection where the + imbalance between the background class and other classes is extremely high. + + Args: + y_true: true targets tensor. + y_pred: predictions tensor. + alpha: balancing factor. 
+ gamma: modulating factor. + + Returns: + Weighted loss float `Tensor`. If `reduction` is `NONE`,this has the + same shape as `y_true`; otherwise, it is scalar. + """ + if gamma and gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") + + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, dtype=y_pred.dtype) + + # Get the cross_entropy for each entry + ce = K.binary_crossentropy(y_true, y_pred, from_logits=from_logits) + + # If logits are provided then convert the predictions into probabilities + if from_logits: + pred_prob = tf.sigmoid(y_pred) + else: + pred_prob = y_pred + + p_t = (y_true * pred_prob) + ((1 - y_true) * (1 - pred_prob)) + alpha_factor = 1.0 + modulating_factor = 1.0 + + if alpha: + alpha = tf.cast(alpha, dtype=y_true.dtype) + alpha_factor = y_true * alpha + (1 - y_true) * (1 - alpha) + + if gamma: + gamma = tf.cast(gamma, dtype=y_true.dtype) + modulating_factor = tf.pow((1.0 - p_t), gamma) + + # compute the final loss and return + return tf.reduce_sum(alpha_factor * modulating_factor * ce, axis=-1) diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 602d2c963..aa382a7e1 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -15,7 +15,6 @@ import tensorflow as tf -import tensorflow_addons as tfa import yaml from tensorflow.keras import mixed_precision @@ -196,13 +195,10 @@ def get_strategy(num_cpus=None): tf.config.threading.set_inter_op_parallelism_threads(num_cpus) tf.config.threading.set_intra_op_parallelism_threads(num_cpus) - device = "cpu" if "CUDA_VISIBLE_DEVICES" in os.environ: num_gpus, gpus = get_num_gpus("CUDA_VISIBLE_DEVICES") - device = "cuda" elif "ROCR_VISIBLE_DEVICES" in os.environ: num_gpus, gpus = get_num_gpus("ROCR_VISIBLE_DEVICES") - device = "roc" else: logging.warning( "CUDA/ROC variable is empty. 
\ @@ -213,13 +209,7 @@ def get_strategy(num_cpus=None): if num_gpus > 1: # multiple GPUs selected logging.info("Attempting to use multiple GPUs with tf.distribute.MirroredStrategy()...") - - # For ROCM devices, I was getting errors from Adam/NcclAllReduce on multiple GPUs - cross_device_ops = tf.distribute.NcclAllReduce() - if device == "roc": - cross_device_ops = tf.distribute.HierarchicalCopyAllReduce() - - strategy = tf.distribute.MirroredStrategy(cross_device_ops=cross_device_ops) + strategy = tf.distribute.MirroredStrategy() elif num_gpus == 1: # single GPU logging.info("Using a single GPU with tf.distribute.OneDeviceStrategy()") @@ -292,13 +282,6 @@ def get_optimizer(config, lr_schedule=None): cfg_adam = config["optimizer"]["adam"] opt = tf.keras.optimizers.legacy.Adam(learning_rate=lr, amsgrad=cfg_adam["amsgrad"]) return opt - elif config["setup"]["optimizer"] == "adamw": - cfg_adamw = config["optimizer"]["adamw"] - return tfa.optimizers.AdamW( - learning_rate=lr, - weight_decay=cfg_adamw["weight_decay"], - amsgrad=cfg_adamw["amsgrad"], - ) elif config["setup"]["optimizer"] == "sgd": cfg_sgd = config["optimizer"]["sgd"] return tf.keras.optimizers.legacy.SGD( @@ -389,9 +372,22 @@ def load_and_interleave( # use dynamic batching depending on the sequence length if config["batching"]["bucket_by_sequence_length"]: - bucket_batch_sizes = [[float(v) for v in x.split(",")] for x in config["batching"]["bucket_batch_sizes"]] + if config["batching"]["bucket_batch_sizes"] == "auto": + if "combined_graph_layer" in config["parameters"]: + bin_size = config["parameters"]["combined_graph_layer"]["bin_size"] + else: + bin_size = 256 + + # generate (max_elems, batch_size) pairs + # scale from bin_size to max_elems in steps of bin_size + max_elems = 75 * bin_size + max_n = 75 + reduction_factor = 125 + bucket_batch_sizes = [(bin_size * (n + 1) + 1, (max_elems) / (n + 1) // reduction_factor) for n in range(max_n)] + else: + bucket_batch_sizes = [[float(v) for v in x.split(",")] for x in config["batching"]["bucket_batch_sizes"]] - assert bucket_batch_sizes[-1][0] == float("inf") + # assert bucket_batch_sizes[-1][0] == float("inf") bucket_boundaries = [int(x[0]) for x in bucket_batch_sizes[:-1]] bucket_batch_sizes = [ @@ -408,6 +404,7 @@ def load_and_interleave( bucket_boundaries=bucket_boundaries, # for multi-GPU, we need to multiply the batch size by the number of GPUs bucket_batch_sizes=bucket_batch_sizes, + pad_to_bucket_boundary=True, drop_remainder=True, ) # use fixed-size batching @@ -479,27 +476,16 @@ def set_config_loss(config, trainable): return config -def get_class_loss(config): - if config["setup"]["classification_loss_type"] == "categorical_cross_entropy": - cls_loss = tf.keras.losses.CategoricalCrossentropy( - from_logits=False, - label_smoothing=config["setup"].get("classification_label_smoothing", 0.0), - ) - elif config["setup"]["classification_loss_type"] == "sigmoid_focal_crossentropy": - cls_loss = tfa.losses.sigmoid_focal_crossentropy - else: - raise KeyError("Unknown classification loss type: {}".format(config["setup"]["classification_loss_type"])) - return cls_loss - - def get_loss_from_params(input_dict): input_dict = input_dict.copy() loss_type = input_dict.pop("type") - if loss_type == "PinballLoss": - loss_cls = getattr(tfa.losses, loss_type) + if loss_type == "SigmoidFocalCrossEntropy": + from .tfa import SigmoidFocalCrossEntropy + + loss_cls = SigmoidFocalCrossEntropy else: loss_cls = getattr(tf.keras.losses, loss_type) - return loss_cls(**input_dict) + return 
loss_cls(**input_dict, reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE) # batched version of https://github.com/VinAIResearch/DSW/blob/master/gsw.py#L19 @@ -677,7 +663,7 @@ def gen_jet_logcosh_loss(y_true, y_pred): def get_loss_dict(config): - cls_loss = get_class_loss(config) + cls_loss = get_loss_from_params(config["loss"].get("cls_loss")) default_loss = {"type": "MeanSquaredError"} loss_dict = { @@ -759,6 +745,11 @@ def model_scope(config, total_steps, weights=None, horovod_enabled=False): policy = mixed_precision.Policy("mixed_float16") mixed_precision.set_global_policy(policy) opt = mixed_precision.LossScaleOptimizer(opt) + elif config["setup"]["dtype"] == "bfloat16": + model_dtype = tf.dtypes.bfloat16 + policy = mixed_precision.Policy("mixed_bfloat16") + mixed_precision.set_global_policy(policy) + opt = mixed_precision.LossScaleOptimizer(opt) else: model_dtype = tf.dtypes.float32 diff --git a/notebooks/clic-visualize.ipynb b/notebooks/clic-visualize.ipynb index c734655f5..2be2ca68c 100644 --- a/notebooks/clic-visualize.ipynb +++ b/notebooks/clic-visualize.ipynb @@ -225,7 +225,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -239,7 +239,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/notebooks/clic.ipynb b/notebooks/clic.ipynb index 59278b2c9..7d8ea0724 100644 --- a/notebooks/clic.ipynb +++ b/notebooks/clic.ipynb @@ -135,6 +135,24 @@ " \"X\": X[msk],\n", " \"ygen\": ygen[msk],\n", " \"ycand\": ycand[msk]\n", + " }\n", + "\n", + "def load_data_hits(path, num_files):\n", + " ret = []\n", + " filelist = list(glob.glob(path))[:num_files]\n", + " print(len(filelist))\n", + "\n", + " X_hit = []\n", + "\n", + " for fn in tqdm.tqdm(filelist):\n", + " dd = ak.from_parquet(fn)\n", + "\n", + " X_hit.append(dd[\"X_hit\"])\n", + " \n", + " X_hit = ak.concatenate(X_hit)\n", + "\n", + " return {\n", + " \"X_hit\": X_hit,\n", " }" ] }, @@ -151,6 +169,27 @@ "data_ww = load_data(\"/media/joosep/data/mlpf/clic_edm4hep_2023_02_27/p8_ee_WW_fullhad_ecm380/*.parquet\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "8622c8fc", + "metadata": {}, + "outputs": [], + "source": [ + "data_tt_pu10 = load_data(\"/media/joosep/data/mlpf/clic_edm4hep_2023_03_03/p8_ee_tt_ecm380_PU10/*.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af6ce420", + "metadata": {}, + "outputs": [], + "source": [ + "data_tt_hits = load_data_hits(\"/media/joosep/data/mlpf_hits/clic_edm4hep/p8_ee_tt_ecm380/*.parquet\", 100)\n", + "data_qq_hits = load_data_hits(\"/media/joosep/data/mlpf_hits/clic_edm4hep/p8_ee_qq_ecm380/*.parquet\", 100)" + ] + }, { "cell_type": "markdown", "id": "1a5ecc39", @@ -168,21 +207,26 @@ }, "outputs": [], "source": [ - "b = np.linspace(0, 100, 51)\n", + "b = np.linspace(0, 200, 101)\n", "\n", "h1 = to_bh(ak.num(data_tt[\"X_track\"]), b)\n", "h2 = to_bh(ak.num(data_qcd[\"X_track\"]), b)\n", "h3 = to_bh(ak.num(data_zh[\"X_track\"]), b)\n", "h4 = to_bh(ak.num(data_ww[\"X_track\"]), b)\n", + "h5 = to_bh(ak.num(data_tt_pu10[\"X_track\"]), b)\n", + "\n", + "fig = plt.figure()\n", + "ax = plt.axes()\n", "\n", "mplhep.histplot(h1, histtype=\"step\", lw=2, label=label_tt)\n", "mplhep.histplot(h2, histtype=\"step\", lw=2, label=label_qq)\n", "mplhep.histplot(h3, histtype=\"step\", lw=2, label=label_zh)\n", "mplhep.histplot(h4, 
histtype=\"step\", lw=2, label=label_ww)\n", + "mplhep.histplot(h5*10, histtype=\"step\", lw=2, label=label_tt + \" PU10\")\n", "plt.xlabel(\"Number of tracks / event\")\n", "plt.ylabel(\"Number of events\")\n", "plt.legend()\n", - "plt.ylim(0, 15*num_files)\n", + "plt.ylim(0, 10*num_files)\n", "plt.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0,0))\n", "plt.savefig(\"plots/clic/num_tracks.pdf\")" ] @@ -194,17 +238,20 @@ "metadata": {}, "outputs": [], "source": [ - "b = np.linspace(0, 200, 51)\n", + "b = np.linspace(0, 500, 101)\n", "\n", "h1 = to_bh(ak.num(data_tt[\"X_cluster\"]), b)\n", "h2 = to_bh(ak.num(data_qcd[\"X_cluster\"]), b)\n", "h3 = to_bh(ak.num(data_zh[\"X_cluster\"]), b)\n", "h4 = to_bh(ak.num(data_ww[\"X_cluster\"]), b)\n", + "h5 = to_bh(ak.num(data_tt_pu10[\"X_cluster\"]), b)\n", "\n", "mplhep.histplot(h1, histtype=\"step\", lw=2, label=label_tt)\n", "mplhep.histplot(h2, histtype=\"step\", lw=2, label=label_qq)\n", "mplhep.histplot(h3, histtype=\"step\", lw=2, label=label_zh)\n", "mplhep.histplot(h4, histtype=\"step\", lw=2, label=label_ww)\n", + "mplhep.histplot(h5*10, histtype=\"step\", lw=2, label=label_tt + \" PU10\")\n", + "\n", "plt.xlabel(\"Number of clusters / event\")\n", "plt.ylabel(\"Number of events\")\n", "plt.legend()\n", @@ -216,71 +263,87 @@ { "cell_type": "code", "execution_count": null, - "id": "4ea58410", + "id": "6d0bad8b", "metadata": {}, "outputs": [], "source": [ - "gen_pt1 = ak.flatten(data_tt[\"ygen\"][data_tt[\"ygen\"][:, :, 0]!=0][:, :, 2])\n", - "gen_pt2 = ak.flatten(data_qcd[\"ygen\"][data_qcd[\"ygen\"][:, :, 0]!=0][:, :, 2])\n", - "gen_pt3 = ak.flatten(data_zh[\"ygen\"][data_zh[\"ygen\"][:, :, 0]!=0][:, :, 2])\n", - "gen_pt4 = ak.flatten(data_ww[\"ygen\"][data_ww[\"ygen\"][:, :, 0]!=0][:, :, 2])\n", - "\n", - "b = np.logspace(-2,3,100)\n", - "h1 = to_bh(gen_pt1, b)\n", - "h2 = to_bh(gen_pt2, b)\n", - "h3 = to_bh(gen_pt3, b)\n", - "h4 = to_bh(gen_pt4, b)\n", + "b = np.linspace(0, 15000, 101)\n", "\n", - "fig = plt.figure()\n", - "ax = plt.axes()\n", + "h1 = to_bh(ak.num(data_tt_hits[\"X_hit\"]), b)\n", + "h2 = to_bh(ak.num(data_qq_hits[\"X_hit\"]), b)\n", "\n", "mplhep.histplot(h1, histtype=\"step\", lw=2, label=label_tt)\n", "mplhep.histplot(h2, histtype=\"step\", lw=2, label=label_qq)\n", - "mplhep.histplot(h3, histtype=\"step\", lw=2, label=label_zh)\n", - "mplhep.histplot(h4, histtype=\"step\", lw=2, label=label_ww)\n", - "plt.xscale(\"log\")\n", - "plt.xlabel(\"particle $p_T$ [GeV]\")\n", - "plt.ylabel(\"Number of particles / bin\")\n", - "plt.text(0.03, 0.97, \"stable generator particles\", transform=ax.transAxes, va=\"top\", ha=\"left\")\n", + "\n", + "plt.xlabel(\"Number of calorimeter hits / event\")\n", + "plt.ylabel(\"Number of events\")\n", "plt.legend()\n", - "plt.ylim(0,500*num_files)\n", + "plt.ylim(0,500)\n", "plt.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0,0))\n", - "plt.savefig(\"plots/clic/gen_particle_pt.pdf\")" + "plt.savefig(\"plots/clic/num_hits.pdf\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "f0a55d22", - "metadata": {}, + "id": "4ea58410", + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ - "gen_pt1 = ak.flatten(data_tt[\"ycand\"][data_tt[\"ycand\"][:, :, 0]!=0][:, :, 2])\n", - "gen_pt2 = ak.flatten(data_qcd[\"ycand\"][data_qcd[\"ycand\"][:, :, 0]!=0][:, :, 2])\n", - "gen_pt3 = ak.flatten(data_zh[\"ycand\"][data_zh[\"ycand\"][:, :, 0]!=0][:, :, 2])\n", - "gen_pt4 = ak.flatten(data_ww[\"ycand\"][data_ww[\"ycand\"][:, :, 0]!=0][:, :, 2])\n", + "gen_pt1 = 
ak.flatten(data_tt[\"ygen\"][data_tt[\"ygen\"][:, :, 0]!=0][:, :, 2])\n", + "gen_pt2 = ak.flatten(data_qcd[\"ygen\"][data_qcd[\"ygen\"][:, :, 0]!=0][:, :, 2])\n", + "gen_pt3 = ak.flatten(data_zh[\"ygen\"][data_zh[\"ygen\"][:, :, 0]!=0][:, :, 2])\n", + "gen_pt4 = ak.flatten(data_ww[\"ygen\"][data_ww[\"ygen\"][:, :, 0]!=0][:, :, 2])\n", + "gen_pt5 = ak.flatten(data_tt_pu10[\"ygen\"][data_tt_pu10[\"ygen\"][:, :, 0]!=0][:, :, 2])\n", + "\n", + "cand_pt1 = ak.flatten(data_tt[\"ycand\"][data_tt[\"ycand\"][:, :, 0]!=0][:, :, 2])\n", + "cand_pt2 = ak.flatten(data_qcd[\"ycand\"][data_qcd[\"ycand\"][:, :, 0]!=0][:, :, 2])\n", + "cand_pt3 = ak.flatten(data_zh[\"ycand\"][data_zh[\"ycand\"][:, :, 0]!=0][:, :, 2])\n", + "cand_pt4 = ak.flatten(data_ww[\"ycand\"][data_ww[\"ycand\"][:, :, 0]!=0][:, :, 2])\n", + "cand_pt5 = ak.flatten(data_tt_pu10[\"ycand\"][data_tt_pu10[\"ycand\"][:, :, 0]!=0][:, :, 2])\n", + "\n", "\n", "b = np.logspace(-2,3,100)\n", "h1 = to_bh(gen_pt1, b)\n", "h2 = to_bh(gen_pt2, b)\n", "h3 = to_bh(gen_pt3, b)\n", - "# h2 = to_bh(cand_pt, b)\n", + "h4 = to_bh(gen_pt4, b)\n", + "h5 = to_bh(gen_pt5, b)\n", + "\n", + "h1c = to_bh(cand_pt1, b)\n", + "h2c = to_bh(cand_pt2, b)\n", + "h3c = to_bh(cand_pt3, b)\n", + "h4c = to_bh(cand_pt4, b)\n", + "h5c = to_bh(cand_pt5, b)\n", "\n", "fig = plt.figure()\n", "ax = plt.axes()\n", "\n", - "mplhep.histplot(h1, histtype=\"step\", lw=2, label=label_tt)\n", - "mplhep.histplot(h2, histtype=\"step\", lw=2, label=label_qq)\n", - "mplhep.histplot(h3, histtype=\"step\", lw=2, label=label_zh)\n", - "mplhep.histplot(h4, histtype=\"step\", lw=2, label=label_ww)\n", + "prev = mplhep.histplot(h1, histtype=\"step\", lw=1, label=label_tt, ls=\"--\")\n", + "mplhep.histplot(h1c, histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", + "\n", + "prev = mplhep.histplot(h2, histtype=\"step\", lw=1, label=label_qq, ls=\"--\")\n", + "mplhep.histplot(h2c, histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", + "\n", + "prev = mplhep.histplot(h3, histtype=\"step\", lw=1, label=label_zh, ls=\"--\")\n", + "mplhep.histplot(h3c, histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", + "\n", + "prev = mplhep.histplot(h4, histtype=\"step\", lw=1, label=label_ww, ls=\"--\")\n", + "mplhep.histplot(h4c, histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", + "\n", + "prev = mplhep.histplot(h5, histtype=\"step\", lw=1, label=label_tt + \" PU10\", ls=\"--\")\n", + "mplhep.histplot(h5c, histtype=\"step\", lw=2, color=prev[0].errorbar.get_children()[0].get_color())\n", + "\n", "plt.xscale(\"log\")\n", "plt.xlabel(\"particle $p_T$ [GeV]\")\n", "plt.ylabel(\"Number of particles / bin\")\n", - "plt.text(0.03, 0.97, \"Pandora PF particles\", transform=ax.transAxes, va=\"top\", ha=\"left\")\n", "plt.legend()\n", + "plt.text(0.03, 0.97, \"dashed - stable generator particles\\nsolid - reconstructed Pandora PF particles\", transform=ax.transAxes, va=\"top\", ha=\"left\", fontsize=16)\n", "plt.ylim(0,500*num_files)\n", "plt.ticklabel_format(axis=\"y\", style=\"sci\", scilimits=(0,0))\n", - "plt.savefig(\"plots/clic/pf_particle_pt.pdf\")" + "plt.savefig(\"plots/clic/gen_cand_particle_pt.pdf\")" ] }, { diff --git a/notebooks/paper_plots_2023_ml_training.ipynb b/notebooks/paper_plots_2023_ml_training.ipynb new file mode 100644 index 000000000..0989c597c --- /dev/null +++ b/notebooks/paper_plots_2023_ml_training.ipynb @@ -0,0 +1,700 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "import numpy as np\n", + "import json\n", + "from uncertainties import ufloat\n", + "import glob\n", + "import pandas\n", + "import json\n", + "\n", + "import mplhep\n", + "mplhep.style.use(mplhep.style.CMS)\n", + "\n", + "import sys\n", + "sys.path.append(\"../mlpf/\")\n", + "from plotting.plot_utils import pid_to_text, format_dataset_name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -f plots_mlpf_clic_2023" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_combined_array(histories, key):\n", + " combined_array = np.array(histories[0][key])\n", + " for ii in range(1, len(histories)):\n", + " combined_array = np.vstack([combined_array, np.array(histories[ii][key])])\n", + " return combined_array\n", + "\n", + "\n", + "def get_full_history(hist_dir, verbose=False):\n", + " jsons = list(hist_dir.glob(\"history*.json\"))\n", + " if verbose:\n", + " print(f\"{hist_dir.parent} has {len(jsons)} hisotries\")\n", + " if len(jsons) == 0:\n", + " return {}, 0\n", + " jsons.sort(key=lambda x: int(x.name.split(\"_\")[1].split(\".\")[0])) # sort according to epoch number\n", + "\n", + " # initialize a dict with correct keys and empty lists as values\n", + " with open(jsons[0]) as h:\n", + " keys = json.load(h).keys()\n", + " full_history = {key: [] for key in keys}\n", + "\n", + " # join epoch values to a full history\n", + " for path in jsons:\n", + " with open(path) as h:\n", + " epoch = json.load(h)\n", + " for key in epoch.keys():\n", + " full_history[key].append(epoch[key])\n", + "\n", + " reg_loss = np.sum(\n", + " np.array([full_history[\"{}_loss\".format(l)] for l in [\"energy\", \"pt\", \"eta\", \"sin_phi\", \"cos_phi\", \"charge\"]]),\n", + " axis=0,\n", + " )\n", + " val_reg_loss = np.sum(\n", + " np.array(\n", + " [full_history[\"val_{}_loss\".format(l)] for l in [\"energy\", \"pt\", \"eta\", \"sin_phi\", \"cos_phi\", \"charge\"]]\n", + " ),\n", + " axis=0,\n", + " )\n", + " full_history.update({\"reg_loss\": reg_loss})\n", + " full_history.update({\"val_reg_loss\": val_reg_loss})\n", + "\n", + " return full_history, len(jsons)\n", + "\n", + "\n", + "def get_histories(train_dirs):\n", + " train_dirs = [Path(train_dir) for train_dir in train_dirs]\n", + " histories = []\n", + "\n", + " for train_dir in train_dirs:\n", + " hist, N = get_full_history(hist_dir=train_dir / \"logs/history\")\n", + " if N > 0:\n", + " histories.append(hist)\n", + "\n", + " return histories" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "histories_gnn_before = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments-archive/hypertuning/clic_gnn_beforeHPO/*\")))\n", + "histories_gnn_after = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments-archive/hypertuning//clic_gnn_afterHPO/*\")))\n", + "\n", + "histories_tf_before = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments-archive/hypertuning//clic_transformer_beforeHPO/*\")))\n", + "histories_tf_after = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments-archive/hypertuning//clic_transformer_afterHPO/*\")))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "ret = {\n", + " \"gnn\": {\n", + " \"before\": {\n", + " \"val_loss\": get_combined_array(histories_gnn_before,\"val_loss\"),\n", + " \"jet_iqr\": get_combined_array(histories_gnn_before,\"val_jet_iqr\"),\n", + " \"met_iqr\": get_combined_array(histories_gnn_before,\"val_met_iqr\"),\n", + " },\n", + " \"after\": {\n", + " \"val_loss\": get_combined_array(histories_gnn_after,\"val_loss\"),\n", + " \"jet_iqr\": get_combined_array(histories_gnn_after,\"val_jet_iqr\"),\n", + " \"met_iqr\": get_combined_array(histories_gnn_after,\"val_met_iqr\"),\n", + " }\n", + " },\n", + " \"transformer\": {\n", + " \"before\": {\n", + " \"val_loss\": get_combined_array(histories_tf_before,\"val_loss\"),\n", + " \"jet_iqr\": get_combined_array(histories_tf_before,\"val_jet_iqr\"),\n", + " \"met_iqr\": get_combined_array(histories_tf_before,\"val_met_iqr\"),\n", + " },\n", + " \"after\": {\n", + " \"val_loss\": get_combined_array(histories_tf_after,\"val_loss\"),\n", + " \"jet_iqr\": get_combined_array(histories_tf_after,\"val_jet_iqr\"),\n", + " \"met_iqr\": get_combined_array(histories_tf_after,\"val_met_iqr\"),\n", + " }\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def sigdigits(mean, std):\n", + " return \"{:L}\".format(ufloat(mean, std))\n", + "\n", + "\n", + "def run_label(x=0.67, y=0.90, fz=12):\n", + " plt.figtext(x, y, r'tt+qq', wrap=False, horizontalalignment='right', fontsize=fz)\n", + "\n", + "\n", + "def cms_label(x0=0.12, y=0.90, s=None, fz=22):\n", + " # plt.figtext(x0, y,'CMS',fontweight='bold', wrap=True, horizontalalignment='left', fontsize=fz)\n", + " # plt.figtext(x0+0.09, y,'Simulation Preliminary', style='italic', wrap=True, horizontalalignment='left', fontsize=fz-3)\n", + " if s is not None:\n", + " t = plt.figtext(x=x0, y=y-0.15, s=s[:-1], fontsize=fz-6)\n", + "\n", + "\n", + "def plot_variance_curve(array_list,\n", + " labels,\n", + " colors_styles,\n", + " skip=0,\n", + " ylim=None,\n", + " save_path=None,\n", + " x=0.45,\n", + " y=0.53,\n", + " loc=None,\n", + " ylabel=None,\n", + " custom_info=None,\n", + " threshold=None\n", + " ):\n", + " \n", + " fig = plt.figure()\n", + " final_means = []\n", + " final_stds = []\n", + " for ii, array in enumerate(array_list):\n", + " print(f\"{labels[ii]} is averaged over {array.shape[0]} trainings.\")\n", + " xx = np.array(range(array.shape[1])) + 1 # Epochs\n", + "\n", + " xx = xx[skip:]\n", + " array = array[:, skip:]\n", + "\n", + " std = np.std(array, axis=0)\n", + " mean = np.mean(array, axis=0)\n", + "\n", + " col, sty = colors_styles[ii]\n", + " plt.plot(xx, mean, label=labels[ii], color=col, ls=sty)\n", + " plt.fill_between(xx, mean - std, mean + std, alpha=0.4, facecolor=col)\n", + "\n", + " # Add individual loss curves\n", + " # plt.plot(np.tile(xx, reps=[10,1]).transpose(), array.transpose(), linewidth=0.2)\n", + "\n", + " print(labels[ii] + \": {:s}\".format(sigdigits(mean[-1], std[-1])))\n", + " final_means.append(mean[-1])\n", + " final_stds.append(std[-1])\n", + "\n", + " if threshold:\n", + " plt.axhline(threshold, ls=\"--\", color=\"black\", label=\"baseline PF\") \n", + " \n", + "# plt.legend(bbox_to_anchor=(0.98, 0.78), loc=\"center right\")\n", + " if loc is not None:\n", + " plt.legend(loc=loc)\n", + " else:\n", + " plt.legend()\n", + " plt.xlabel(\"Epochs\")\n", + " if ylabel:\n", + " plt.ylabel(ylabel)\n", + "\n", + " s=\"Mean and stddev of {:d} 
trainings\\n\".format(array.shape[0])\n", + " for ii, label in enumerate(labels):\n", + " if custom_info:\n", + " s += \"Final {}:${:s}$\\n\".format(label, sigdigits(custom_info[ii]['mean'], custom_info[ii][\"std\"]))\n", + " else:\n", + " s += \"Final {}:${:s}$\\n\".format(label, sigdigits(final_means[ii], final_stds[ii]))\n", + "\n", + " if ylim:\n", + " plt.ylim(top=ylim[1], bottom=ylim[0])\n", + "\n", + " plt.subplots_adjust(left=0.14)\n", + " \n", + " cms_label(x0=x, y=y, s=s, fz=24)\n", + " run_label(x=0.9, y=0.89, fz=22)\n", + " if save_path:\n", + " plt.savefig(Path(save_path).with_suffix('.png'))\n", + " plt.savefig(Path(save_path).with_suffix('.pdf'))\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Figure 4: hypertuning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "plot_variance_curve([ret[\"gnn\"][\"before\"][\"val_loss\"], ret[\"gnn\"][\"after\"][\"val_loss\"],\n", + " ret[\"transformer\"][\"before\"][\"val_loss\"], ret[\"transformer\"][\"after\"][\"val_loss\"]],\n", + " [\"GNN\", \"GNN-HPO\",\"TF\", \"TF-HPO\"],\n", + " [(\"red\", \"--\"), (\"red\", \"-\"), (\"blue\", \"--\"), (\"blue\", \"-\")],\n", + " skip=1,\n", + " ylim=[0, 20],\n", + " save_path=\"plots_mlpf_clic_2023/loss.png\",\n", + " x=0.25,\n", + " y=0.85,\n", + " ylabel=\"Total validation loss (a.u.)\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_variance_curve([ret[\"gnn\"][\"before\"][\"jet_iqr\"], ret[\"gnn\"][\"after\"][\"jet_iqr\"],\n", + " ret[\"transformer\"][\"before\"][\"jet_iqr\"], ret[\"transformer\"][\"after\"][\"jet_iqr\"]],\n", + " [\"GNN\", \"GNN-HPO\",\"TF\", \"TF-HPO\"],\n", + " [(\"red\", \"--\"), (\"red\", \"-\"), (\"blue\", \"--\"), (\"blue\", \"-\")],\n", + " skip=1,\n", + " save_path=\"plots_mlpf_clic_2023/jet_iqr.png\",\n", + " x=0.25,\n", + " y=0.85,\n", + " ylim=(0, 0.3),\n", + " ylabel=r\"jet response IQR\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_variance_curve([ret[\"gnn\"][\"before\"][\"met_iqr\"], ret[\"gnn\"][\"after\"][\"met_iqr\"],\n", + " ret[\"transformer\"][\"before\"][\"met_iqr\"], ret[\"transformer\"][\"after\"][\"met_iqr\"]],\n", + " [\"GNN\", \"GNN-HPO\",\"TF\", \"TF-HPO\"],\n", + " [(\"red\", \"--\"), (\"red\", \"-\"), (\"blue\", \"--\"), (\"blue\", \"-\")],\n", + " skip=1,\n", + " save_path=\"plots_mlpf_clic_2023/met_iqr.png\",\n", + " x=0.25,\n", + " y=0.85,\n", + " ylim=(0, 2),\n", + " ylabel=r\"MET response IQR\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Figure 5: scaling of timing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "timing_data_gpu_1 = open(\"/home/joosep/particleflow/experiments-archive/timing/mlpf-gnn/gpu_timing_1.txt\").read()\n", + "timing_data_gpu_2 = open(\"/home/joosep/particleflow/experiments-archive/timing/mlpf-gnn/gpu_timing_2.txt\").read()\n", + "timing_data_gpu_3 = open(\"/home/joosep/particleflow/experiments-archive/timing/mlpf-gnn/gpu_timing_2.txt\").read()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batches = []\n", + "nptcls = []\n", + "ts = []\n", + "\n", + "for line in timing_data_gpu_1.strip().split(\"\\n\") + timing_data_gpu_2.strip().split(\"\\n\") + 
timing_data_gpu_3.strip().split(\"\\n\"):\n", + " batch, nptcl, t = line.split()\n", + " batches.append(int(batch))\n", + " nptcls.append(int(nptcl))\n", + " ts.append(float(t))\n", + " \n", + "df = pandas.DataFrame()\n", + "df[\"batch\"] = batches\n", + "df[\"nptcl\"] = nptcls\n", + "df[\"t\"] = ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_agg = df.groupby(['batch', 'nptcl'], as_index=False).agg({'t':['mean','std']})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sf = df_agg[(df_agg[\"batch\"]==16) & (df_agg[\"nptcl\"]==256)][\"t\"][\"mean\"].values[0]/16\n", + "\n", + "plt.plot([256,20*256], [1,20], color=\"black\", ls=\"--\", lw=2, label=\"linear scaling\")\n", + "\n", + "markers = [\"o\", \"^\", \"v\", \"s\", \".\"]\n", + "for batch, elem in df_agg.groupby(\"batch\"):\n", + " m = markers.pop(0)\n", + " plt.errorbar(\n", + " elem[\"nptcl\"],\n", + " elem[\"t\"][\"mean\"]/sf/batch,\n", + " elem[\"t\"][\"std\"]/sf/batch,\n", + " label=\"B={}\".format(batch),\n", + " marker=m)\n", + "plt.legend(loc=\"best\")\n", + "\n", + "plt.legend(loc=\"best\")\n", + "plt.ylabel(\"relative time per event\\nT(N,B) / T(256,16)\")\n", + "plt.xlabel(\"number of input elements per event, N\")\n", + "plt.title(\"MLPF-GNN on 8GB GPU\")\n", + "plt.savefig(\"plots_mlpf_clic_2023/mlpf_gnn.png\")\n", + "plt.savefig(\"plots_mlpf_clic_2023/mlpf_gnn.pdf\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#cd experiments-archive/timing/pandora\n", + "#grep TIMER gun_np* | grep MyDDMarlinPandora\n", + "\n", + "timing_data_cpu = \"\"\"\n", + "gun_np_100_1.txt:TIMER.TIMER INFO MyDDMarlinPandora | 13836.000 | 13836.583 | 11041.204 18211.2 2452.37 | 10 | 138.366 |\n", + "gun_np_100_2.txt:TIMER.TIMER INFO MyDDMarlinPandora | 13863.000 | 13861.665 | 11451.365 17552.7 1725.47 | 10 | 138.617 |\n", + "gun_np_100_3.txt:TIMER.TIMER INFO MyDDMarlinPandora | 12829.000 | 12828.634 | 8948.882 15546.3 2064.03 | 10 | 128.286 |\n", + "gun_np_100_4.txt:TIMER.TIMER INFO MyDDMarlinPandora | 14907.000 | 14908.701 | 10741.859 18102.9 2524.65 | 10 | 149.087 |\n", + "gun_np_100_5.txt:TIMER.TIMER INFO MyDDMarlinPandora | 13174.000 | 13173.000 | 9291.507 23493.5 4326.01 | 10 | 131.730 |\n", + "gun_np_100_6.txt:TIMER.TIMER INFO MyDDMarlinPandora | 12383.000 | 12383.906 | 10438.694 14086.3 1404.91 | 10 | 123.839 |\n", + "gun_np_100_7.txt:TIMER.TIMER INFO MyDDMarlinPandora | 12956.000 | 12955.911 | 10072.747 16893.5 2782.78 | 10 | 129.559 |\n", + "gun_np_200_1.txt:TIMER.TIMER INFO MyDDMarlinPandora | 47395.000 | 47397.637 | 35578.270 55634.0 5259.41 | 10 | 473.976 |\n", + "gun_np_200_2.txt:TIMER.TIMER INFO MyDDMarlinPandora | 49098.000 | 49099.168 | 43919.848 55204.7 3801.20 | 10 | 490.992 |\n", + "gun_np_200_3.txt:TIMER.TIMER INFO MyDDMarlinPandora | 47285.000 | 47283.594 | 34430.031 52283.8 5227.59 | 10 | 472.836 |\n", + "gun_np_200_4.txt:TIMER.TIMER INFO MyDDMarlinPandora | 45921.000 | 45919.754 | 31380.205 56530.5 8713.73 | 10 | 459.198 |\n", + "gun_np_200_5.txt:TIMER.TIMER INFO MyDDMarlinPandora | 46047.000 | 46047.980 | 37939.055 57653.4 5632.91 | 10 | 460.480 |\n", + "gun_np_200_6.txt:TIMER.TIMER INFO MyDDMarlinPandora | 46928.000 | 46930.746 | 36946.914 62604.0 8045.00 | 10 | 469.307 |\n", + "gun_np_200_7.txt:TIMER.TIMER INFO MyDDMarlinPandora | 44988.000 | 44989.551 | 39393.648 48307.4 2795.09 | 10 | 449.896 
|\n", + "gun_np_25_1.txt:TIMER.TIMER INFO MyDDMarlinPandora | 1278.000 | 1275.958 | 890.380 1750.9 258.73 | 10 | 12.760 |\n", + "gun_np_25_2.txt:TIMER.TIMER INFO MyDDMarlinPandora | 1611.000 | 1611.889 | 1061.231 2250.4 371.55 | 10 | 16.119 |\n", + "gun_np_25_3.txt:TIMER.TIMER INFO MyDDMarlinPandora | 1511.000 | 1513.813 | 821.452 2323.8 449.85 | 10 | 15.138 |\n", + "gun_np_25_4.txt:TIMER.TIMER INFO MyDDMarlinPandora | 1391.000 | 1393.518 | 884.898 2606.6 475.22 | 10 | 13.935 |\n", + "gun_np_25_5.txt:TIMER.TIMER INFO MyDDMarlinPandora | 1458.000 | 1457.076 | 843.642 2644.6 584.55 | 10 | 14.571 |\n", + "gun_np_25_6.txt:TIMER.TIMER INFO MyDDMarlinPandora | 1705.000 | 1706.138 | 906.869 3667.7 810.81 | 10 | 17.061 |\n", + "gun_np_25_7.txt:TIMER.TIMER INFO MyDDMarlinPandora | 1598.000 | 1598.356 | 1074.817 1955.7 286.26 | 10 | 15.984 |\n", + "gun_np_50_1.txt:TIMER.TIMER INFO MyDDMarlinPandora | 3962.000 | 3962.514 | 2568.144 6292.2 1138.24 | 10 | 39.625 |\n", + "gun_np_50_2.txt:TIMER.TIMER INFO MyDDMarlinPandora | 3771.000 | 3771.321 | 3111.184 4891.2 606.17 | 10 | 37.713 |\n", + "gun_np_50_3.txt:TIMER.TIMER INFO MyDDMarlinPandora | 4266.000 | 4266.345 | 3128.854 5726.2 918.44 | 10 | 42.663 |\n", + "gun_np_50_4.txt:TIMER.TIMER INFO MyDDMarlinPandora | 4008.000 | 4007.004 | 3067.614 6363.1 935.73 | 10 | 40.070 |\n", + "gun_np_50_5.txt:TIMER.TIMER INFO MyDDMarlinPandora | 3833.000 | 3834.250 | 2535.937 4735.0 781.68 | 10 | 38.343 |\n", + "gun_np_50_6.txt:TIMER.TIMER INFO MyDDMarlinPandora | 3658.000 | 3662.497 | 2667.986 5553.0 1050.26 | 10 | 36.625 |\n", + "gun_np_50_7.txt:TIMER.TIMER INFO MyDDMarlinPandora | 3844.000 | 3845.911 | 2562.266 6607.6 1196.24 | 10 | 38.459 |\n", + "\"\"\"\n", + "\n", + "timing_data = {}\n", + "for line in timing_data_cpu.strip().split(\"\\n\"):\n", + " lspl = line.split()\n", + " nptcl = int(lspl[0].split(\":\")[0].split(\"_\")[2])\n", + " dt = float(lspl[4])\n", + " if not (nptcl in timing_data):\n", + " timing_data[nptcl] = []\n", + " timing_data[nptcl].append(dt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "means = []\n", + "stds = []\n", + "xs = []\n", + "for k in sorted(timing_data.keys()):\n", + " means.append(np.mean(timing_data[k]))\n", + " stds.append(np.std(timing_data[k]))\n", + " xs.append(k)\n", + "xs = np.array(xs)\n", + "means = np.array(means)\n", + "stds = np.array(stds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.errorbar(xs, means/means[0], stds/means[0], marker=\"o\", label=\"baseline PF\")\n", + "plt.plot([25,200], [1,8], color=\"black\", label=\"linear scaling\", ls=\"--\")\n", + "plt.legend()\n", + "plt.ylabel(\"relative time per event, $T(N)/T(25)$\")\n", + "plt.xlabel(\"number of $\\pi^-$ particles per event, $N$\")\n", + "#plt.xlim(0,100)\n", + "#plt.ylim(0,10)\n", + "plt.title(\"Baseline PF on CPU\")\n", + "plt.savefig(\"plots_mlpf_clic_2023/baseline_pf.png\")\n", + "plt.savefig(\"plots_mlpf_clic_2023/baseline_pf.pdf\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# gpu_scaling_x = np.array([1,2,4,8])\n", + "# gpu_scaling_y = np.array([443.72, 209.913, 92.476, 34.263])\n", + "# gpu_scaling_y /= gpu_scaling_y[0]\n", + "# plt.plot(gpu_scaling_x, 1.0/gpu_scaling_y, lw=0, marker=\"o\")\n", + "# plt.plot([1,8],[1,8], color=\"black\", ls=\"--\", label=\"linear scaling\")\n", + "# plt.xlabel(\"Number of GPUs\")\n", 
+ "# plt.ylabel(\"Training epoch throughput\\nincrease over 1 GPU\")\n", + "# plt.title(\"Scaling test on LUMI HPC: MI250X\")\n", + "# plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Figure 6: mixed precision" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hist_fp32 = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments/clic_fp32_bs1_*\")))\n", + "hist_bf16 = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments/clic_bf16_bs1_*\")))\n", + "hist_bf16_bs2 = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments/clic_bf16_bs2_*\")))\n", + "hist_fp16 = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments/clic_fp16_bs1_*\")))\n", + "hist_fp16_bs2 = get_histories(list(glob.glob(\"/home/joosep/particleflow/experiments/clic_fp16_bs2_*\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_losses(hist, label, color, ls, marker, scaler=1.0):\n", + " vals = np.stack([hist[i][\"loss\"] for i in range(len(hist))])\n", + " m = np.mean(vals, axis=0)\n", + " s = np.std(vals, axis=0)\n", + " xs = np.arange(len(m))\n", + " plt.errorbar(xs+1, m/scaler, s/scaler, label=label, marker=marker, color=color, ls=ls)\n", + " \n", + "def plot_time(hists, colors):\n", + " ms = []\n", + " ss = []\n", + " for hist in hists:\n", + " vals = np.stack([hist[i][\"time\"] for i in range(len(hist))])\n", + " dt = vals[:, -1] - vals[:, 0]\n", + " m = np.mean(dt, axis=0)\n", + " s = np.std(dt, axis=0)\n", + " ms.append(m)\n", + " ss.append(s)\n", + " ms = np.array(ms)\n", + " ss = np.array(ss)\n", + " ms0 = ms[0]\n", + " ms /= ms0\n", + " ss /= ms0\n", + " plt.bar(range(len(ms)), ms, color=colors)\n", + " plt.errorbar(range(len(ms)), ms, ss, linewidth=0, elinewidth=2.0, color=\"black\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loss_scaler = np.mean([hist_fp32[i][\"loss\"][-1] for i in range(len(hist_fp32))])\n", + "\n", + "plot_losses(hist_fp32, \"FP32\", \"green\", \"-\", \"o\", loss_scaler)\n", + "plot_losses(hist_bf16, \"BF16\", \"blue\", \"-\", \"^\", loss_scaler)\n", + "plot_losses(hist_bf16_bs2, \"BF16, Bx2\", \"blue\", \"--\", \"^\", loss_scaler)\n", + "#plot_losses(hist_fp16, \"FP16\", \"red\", \"-\", \"v\", loss_scaler)\n", + "#plot_losses(hist_fp16_bs2, \"FP16, Bx2\", \"red\", \"--\", \"v\", loss_scaler)\n", + "\n", + "#plt.yscale(\"log\")\n", + "plt.axhline(1.0, color=\"black\", ls=\"--\")\n", + "plt.ylabel(\"Relative loss wrt. FP32 @ epoch 10\")\n", + "plt.xlabel(\"Training epoch\")\n", + "plt.ylim(0.5,1.5)\n", + "plt.xlim(2,10)\n", + "plt.legend()\n", + "plt.savefig(\"plots_mlpf_clic_2023/mixed_precision_loss_scaling.png\")\n", + "plt.savefig(\"plots_mlpf_clic_2023/mixed_precision_loss_scaling.pdf\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_time([hist_fp32, hist_fp16, hist_fp16_bs2, hist_bf16, hist_bf16_bs2], [\"green\", \"red\", \"red\", \"blue\", \"blue\"])\n", + "plt.xticks([0,1,2,3,4], [\"FP32\", \"FP16\", \"FP16, Bx2\", \"BF16\", \"BF16, Bx2\"])\n", + "plt.axvline(2.5, color=\"black\", ls=\"--\")\n", + "plt.text(2.75, 2.15, \"BF16 ops not yet fully\\nsupported on GPU by TF\", fontsize=16)\n", + "plt.ylim(0,2.5)\n", + "plt.ylabel(\"Relative training time wrt. 
FP32\")\n", + "plt.savefig(\"plots_mlpf_clic_2023/mixed_precision_timing.png\")\n", + "plt.savefig(\"plots_mlpf_clic_2023/mixed_precision_timing.pdf\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hists = sorted(glob.glob(\"/home/joosep/particleflow/experiments-archive/hits/clic-hits-ln_*/logs/history/history_*.json\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# loss = []\n", + "# val_loss = []\n", + "\n", + "# for hist in hists:\n", + "# val_loss.append(json.load(open(hist))[\"val_loss\"])\n", + "# loss.append(json.load(open(hist))[\"loss\"])\n", + "\n", + "# loss = np.array(loss)\n", + "# val_loss = np.array(val_loss)\n", + "\n", + "# plt.plot(loss, label=\"train\", marker=\"o\")\n", + "# plt.plot(val_loss, label=\"val\", marker=\"o\")\n", + "# plt.legend(title=format_dataset_name(\"clic_edm_ttbar_pf\"))\n", + "# plt.ylim(0.0,0.3)\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jet_iqr_vals = []\n", + "met_iqr_vals = []\n", + "\n", + "for hist in hists:\n", + " jet_iqr_vals.append(json.load(open(hist))[\"val_jet_iqr\"])\n", + " met_iqr_vals.append(json.load(open(hist))[\"val_met_iqr\"])\n", + "\n", + "jet_iqr_vals = np.array(jet_iqr_vals)\n", + "met_iqr_vals = np.array(met_iqr_vals)\n", + "\n", + "plt.plot(jet_iqr_vals, label=\"jet response IQR\", marker=\"o\")\n", + "plt.plot(met_iqr_vals/5, label=\"MET response IQR / 5\", marker=\"o\")\n", + "plt.legend(title=format_dataset_name(\"clic_edm_ttbar_pf\"))\n", + "plt.ylim(0,0.2)\n", + "plt.xlim(0, 12)\n", + "plt.ylabel(\"Response IQR (a.u.)\")\n", + "plt.xlabel(\"Training epoch\")\n", + "#plt.title(\"Training on tracks and calorimeter hits\")\n", + "plt.savefig(\"plots_mlpf_clic_2023/hitbased_res_iqr.png\")\n", + "plt.savefig(\"plots_mlpf_clic_2023/hitbased_res_iqr.pdf\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jet_med_vals = []\n", + "met_med_vals = []\n", + "\n", + "for hist in hists:\n", + " jet_med_vals.append(json.load(open(hist))[\"val_jet_med\"])\n", + " met_med_vals.append(json.load(open(hist))[\"val_met_med\"])\n", + "\n", + "jet_med_vals = np.array(jet_med_vals)\n", + "met_med_vals = np.array(met_med_vals)\n", + "\n", + "plt.plot(jet_med_vals, label=\"jet response median\", marker=\"o\")\n", + "plt.plot(met_med_vals, label=\"MET response median\", marker=\"o\")\n", + "plt.legend(title=format_dataset_name(\"clic_edm_ttbar_pf\"))\n", + "plt.axhline(1.0, color=\"black\", ls=\"--\")\n", + "plt.ylim(0.8,1.2)\n", + "plt.xlim(0, 12)\n", + "\n", + "plt.ylabel(\"Response median (a.u.)\")\n", + "plt.xlabel(\"Training epoch\")\n", + "#plt.title(\"Training on tracks and calorimeter hits\")\n", + "plt.savefig(\"plots_mlpf_clic_2023/hitbased_res_med.png\")\n", + "plt.savefig(\"plots_mlpf_clic_2023/hitbased_res_med.pdf\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/parameters/clic-hits-ln.yaml b/parameters/clic-hits-ln.yaml new file mode 100644 index 000000000..c515b7993 --- /dev/null +++ b/parameters/clic-hits-ln.yaml @@ -0,0 +1,285 @@ +backend: tensorflow + +dataset: + schema: clic + target_particles: gen + num_input_features: 15 + #(none=0, track=1, hit=2) + num_input_classes: 3 + #(none=0, ch.had=1, n.had=2, gamma=3, e=4, mu=5) + num_output_classes: 6 + cls_weight_by_pt: no + reg_weight_by_pt: no + +loss: + classification_loss_coef: 100.0 + charge_loss_coef: 1.0 + pt_loss_coef: 10.0 + eta_loss_coef: 10.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 + charge_loss: + type: CategoricalCrossentropy + from_logits: yes + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + cos_phi_loss: + type: Huber + eta_loss: + type: Huber + event_loss: none #none, sliced_wasserstein, gen_jet_logcosh, gen_jet_mse, hist_2d + event_loss_coef: 0.0 + met_loss: none + met_loss_coef: 1.0 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 0.0001 + num_epochs: 20 + dtype: float32 + trainable: + lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none + optimizer: adam # adam, adamw, sgd + horovod_enabled: no + cls_output_as_logits: yes + small_graph_opt: no + normalizer_cache: parameters/clic_hits_normalizations + +batching: + # if enabled, use dynamic batching instead of the fixed-size batches configured in batch_per_gpu + bucket_by_sequence_length: yes + # these sizes were sort of tuned for an 8GB GPU + # - max_sequence_length, batch_size_per_gpu + + bucket_batch_sizes: auto + # use this batch multiplier to increase all batch sizes by a constant factor + batch_multiplier: 1 + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 + +# LR Schedules +exponentialdecay: + decay_steps: 2000 + decay_rate: 0.99 + staircase: yes +onecycle: + mom_min: 0.85 + mom_max: 0.95 + warmup_ratio: 0.3 + div_factor: 25.0 + final_div: 100000.0 + +parameters: + model: gnn_dense + input_encoding: clic + node_update_mode: additive + do_node_encoding: yes + node_encoding_hidden_dim: 512 + + combined_graph_layer: + bin_size: 256 + max_num_bins: 200 + distance_dim: 128 + layernorm: yes + dropout: 0.2 + dist_activation: elu + ffn_dist_num_layers: 2 + ffn_dist_hidden_dim: 128 + + # MPNN + #kernel: + # type: NodePairTrainableKernel + # activation: elu + #num_node_messages: 1 + #node_message: + # type: NodeMessageLearnable + # output_dim: 64 + # hidden_dim: 128 + # num_layers: 2 + # activation: elu + #activation: elu + + # GCN + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + dist_norm: l2 + num_node_messages: 2 + node_message: + type: GHConvDense + output_dim: 512 + activation: elu + #if this is enabled, it will break float16 training + normalize_degrees: no + activation: elu + + num_graph_layers_id: 6 + num_graph_layers_reg: 6 + output_decoding: + activation: elu + regression_use_classification: yes + dropout: 0.2 + + pt_as_correction: no + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + 
id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: yes + mask_reg_cls0: yes + + skip_connection: no + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +callbacks: + checkpoint: + monitor: "val_loss" + plot_freq: 1 + tensorboard: + dump_history: yes + hist_freq: 1 + +hypertune: + algorithm: hyperband # random, bayesian, hyperband + random: + objective: val_loss + max_trials: 100 + bayesian: + objective: val_loss + max_trials: 100 + num_initial_points: 2 + hyperband: + objective: val_loss + max_epochs: 10 + factor: 3 + iterations: 1 + executions_per_trial: 1 + +raytune: + local_dir: # Note: please specify an absolute path + sched: asha # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_test_datasets: + physical: + batch_per_gpu: 1 + datasets: + - clic_edm_ttbar_hits_pf + - clic_edm_qq_hits_pf + gun: + batch_per_gpu: 5 + datasets: + - clic_edm_single_kaon0l_hits_pf + - clic_edm_single_pi_hits_pf + - clic_edm_single_pi0_hits_pf + - clic_edm_single_neutron_hits_pf + - clic_edm_single_electron_hits_pf + - clic_edm_single_muon_hits_pf + +validation_dataset: clic_edm_ttbar_hits_pf +validation_batch_size: 20 +validation_num_events: 2000 + +evaluation_datasets: + clic_edm_ttbar_hits_pf: + batch_size: 10 + num_events: 10000 + +evaluation_jet_algo: ee_genkt_algorithm + +datasets: + clic_edm_ttbar_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_qq_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_kaon0l_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_gamma_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_pi_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_pi0_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_neutron_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_electron_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_muon_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: diff --git a/parameters/clic-hits.yaml b/parameters/clic-hits.yaml index 4dfe43f0c..88ca88362 100644 --- a/parameters/clic-hits.yaml +++ b/parameters/clic-hits.yaml @@ -3,7 +3,7 @@ backend: tensorflow dataset: schema: clic target_particles: gen - num_input_features: 18 + num_input_features: 15 #(none=0, track=1, hit=2) num_input_classes: 3 #(none=0, ch.had=1, n.had=2, gamma=3, e=4, mu=5) @@ -19,8 +19,12 @@ loss: sin_phi_loss_coef: 10.0 cos_phi_loss_coef: 10.0 energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 charge_loss: - type: BinaryCrossentropy + type: CategoricalCrossentropy from_logits: yes energy_loss: type: Huber @@ -44,37 +48,21 @@ setup: train: yes weights: weights_config: - lr: 0.001 + lr: 0.0005 num_epochs: 20 dtype: float32 trainable: - classification_loss_type: sigmoid_focal_crossentropy lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none optimizer: adam # adam, adamw, sgd horovod_enabled: no cls_output_as_logits: yes small_graph_opt: no + normalizer_cache: parameters/clic_hits_normalizations batching: # if enabled, use dynamic batching instead of the fixed-size batches configured in batch_per_gpu 
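+  # "auto" derives the bucket list from the model bin_size (see load_and_interleave):
+  # bucket boundaries at bin_size*(n+1)+1 for n in 0..74, with a per-bucket batch size
+  # of roughly (75*bin_size)/(n+1)//125, so buckets holding longer events use smaller batches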
- bucket_by_sequence_length: no - # these sizes were sort of tuned for an 8GB GPU - # - max_sequence_length, batch_size_per_gpu - -#on 8GB GPU - bucket_batch_sizes: - - 25, 200 - - 50, 100 - - 100, 50 - - 200, 20 - - 500, 10 - - 1000, 5 - - 2000, 3 - - 3000, 2 - - 4000, 2 - - 5000, 1 - - 6000, 1 - - inf, 1 + bucket_by_sequence_length: yes + bucket_batch_sizes: auto # use this batch multiplier to increase all batch sizes by a constant factor batch_multiplier: 1 @@ -105,7 +93,7 @@ parameters: input_encoding: clic node_update_mode: additive do_node_encoding: yes - node_encoding_hidden_dim: 256 + node_encoding_hidden_dim: 512 combined_graph_layer: bin_size: 256 @@ -114,8 +102,8 @@ parameters: layernorm: yes dropout: 0.0 dist_activation: elu - ffn_dist_num_layers: 3 - ffn_dist_hidden_dim: 64 + ffn_dist_num_layers: 2 + ffn_dist_hidden_dim: 128 # MPNN #kernel: @@ -139,7 +127,7 @@ parameters: num_node_messages: 2 node_message: type: GHConvDense - output_dim: 256 + output_dim: 512 activation: elu #if this is enabled, it will break float16 training normalize_degrees: no @@ -161,19 +149,19 @@ parameters: phi_dim_decrease: yes energy_dim_decrease: yes - id_hidden_dim: 128 - charge_hidden_dim: 128 - pt_hidden_dim: 128 - eta_hidden_dim: 128 - phi_hidden_dim: 128 - energy_hidden_dim: 128 + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 - id_num_layers: 1 - charge_num_layers: 1 - pt_num_layers: 1 - eta_num_layers: 1 - phi_num_layers: 1 - energy_num_layers: 1 + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 layernorm: yes mask_reg_cls0: yes @@ -234,9 +222,18 @@ train_test_datasets: datasets: - clic_edm_ttbar_hits_pf - clic_edm_qq_hits_pf + gun: + batch_per_gpu: 5 + datasets: + - clic_edm_single_kaon0l_hits_pf + - clic_edm_single_pi_hits_pf + - clic_edm_single_pi0_hits_pf + - clic_edm_single_neutron_hits_pf + - clic_edm_single_electron_hits_pf + - clic_edm_single_muon_hits_pf validation_dataset: clic_edm_ttbar_hits_pf -validation_batch_size: 10 +validation_batch_size: 20 validation_num_events: 2000 evaluation_datasets: @@ -248,10 +245,38 @@ evaluation_jet_algo: ee_genkt_algorithm datasets: clic_edm_ttbar_hits_pf: - version: 1.0.0 + version: 1.2.0 data_dir: manual_dir: clic_edm_qq_hits_pf: - version: 1.0.0 + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_kaon0l_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_gamma_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_pi_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_pi0_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_neutron_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_electron_hits_pf: + version: 1.2.0 + data_dir: + manual_dir: + clic_edm_single_muon_hits_pf: + version: 1.2.0 data_dir: manual_dir: diff --git a/parameters/clic.yaml b/parameters/clic.yaml index 9fab7e31f..58f0ecd2f 100644 --- a/parameters/clic.yaml +++ b/parameters/clic.yaml @@ -19,8 +19,12 @@ loss: sin_phi_loss_coef: 10.0 cos_phi_loss_coef: 10.0 energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 charge_loss: - type: BinaryCrossentropy + type: CategoricalCrossentropy from_logits: yes energy_loss: type: Huber @@ -48,12 +52,12 @@ setup: num_epochs: 100 dtype: float32 trainable: - classification_loss_type: sigmoid_focal_crossentropy - lr_schedule: cosinedecay # 
cosinedecay, exponentialdecay, onecycle, none + lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none optimizer: adam # adam, adamw, sgd horovod_enabled: no cls_output_as_logits: yes - small_graph_opt: yes + small_graph_opt: no + normalizer_cache: parameters/clic_normalizations batching: # if enabled, use dynamic batching instead of the fixed-size batches configured in batch_per_gpu @@ -260,22 +264,22 @@ evaluation_jet_algo: ee_genkt_algorithm datasets: clic_edm_ttbar_pf: - version: 1.3.0 + version: 1.4.0 data_dir: manual_dir: clic_edm_ttbar_pu10_pf: - version: 1.3.0 + version: 1.4.0 data_dir: manual_dir: clic_edm_qq_pf: - version: 1.3.1 + version: 1.4.0 data_dir: manual_dir: clic_edm_ww_fullhad_pf: - version: 1.3.0 + version: 1.4.0 data_dir: manual_dir: clic_edm_zh_tautau_pf: - version: 1.3.0 + version: 1.4.0 data_dir: manual_dir: diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml index c27e2f4ec..382e30a34 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/cms-gen.yaml @@ -31,8 +31,12 @@ loss: sin_phi_loss_coef: 1.0 cos_phi_loss_coef: 1.0 energy_loss_coef: 1.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 charge_loss: - type: BinaryCrossentropy + type: CategoricalCrossentropy from_logits: yes energy_loss: type: MeanSquaredLogarithmicError @@ -60,16 +64,17 @@ setup: weights: weights_config: lr: 0.0001 - num_events_validation: 500 num_epochs: 50 dtype: float32 trainable: - classification_loss_type: sigmoid_focal_crossentropy lr_schedule: none # cosinedecay, exponentialdecay, onecycle, none optimizer: adam # adam, adamw, sgd horovod_enabled: no cls_output_as_logits: yes - small_graph_opt: yes + #if enabled, do not create LSH bins for small graphs (less than one bin size) + #enabling results in some speedup for gun samples, but must be disabled for XLA + small_graph_opt: no + normalizer_cache: parameters/cms_normalizations batching: # if enabled, use dynamic batching instead of the fixed-size batches configured in batch_per_gpu diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index 919710900..0ebf8e673 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -19,8 +19,12 @@ loss: sin_phi_loss_coef: 1.0 cos_phi_loss_coef: 1.0 energy_loss_coef: 1.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 charge_loss: - type: BinaryCrossentropy + type: CategoricalCrossentropy from_logits: yes energy_loss: type: Huber @@ -50,19 +54,15 @@ setup: weights: weights_config: lr: 1e-5 - num_events_train: 45000 - num_events_test: 5000 - num_events_validation: 10 num_epochs: 50 - num_val_files: 5 dtype: float32 trainable: - classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle optimizer: adam # adam, adamw, sgd horovod_enabled: False - cls_output_as_logits: False + cls_output_as_logits: yes small_graph_opt: no + normalizer_cache: parameters/delphes_normalizations batching: # if enabled, use dynamic batching instead of the fixed-size batches configured in batch_per_gpu diff --git a/parameters/clic-finetune.yaml b/parameters/mixedprecision/clic_bf16_bs1.yaml similarity index 72% rename from parameters/clic-finetune.yaml rename to parameters/mixedprecision/clic_bf16_bs1.yaml index 2efe15121..de6563abb 100644 --- a/parameters/clic-finetune.yaml +++ b/parameters/mixedprecision/clic_bf16_bs1.yaml @@ -13,14 +13,18 @@ dataset: loss: classification_loss_coef: 200.0 - charge_loss_coef: 0.0001 - pt_loss_coef: 0.0001 - eta_loss_coef: 0.0001 - 
sin_phi_loss_coef: 0.0001 - cos_phi_loss_coef: 0.0001 - energy_loss_coef: 0.0001 + charge_loss_coef: 1.0 + pt_loss_coef: 10.0 + eta_loss_coef: 10.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 charge_loss: - type: BinaryCrossentropy + type: CategoricalCrossentropy from_logits: yes energy_loss: type: Huber @@ -42,41 +46,21 @@ tensorflow: setup: train: yes - weights: experiments/clic_20230412_155159_717751.gpu1.local/weights/weights-100-9.948204.hdf5 + weights: weights_config: lr: 0.0005 - num_epochs: 200 - dtype: float32 + num_epochs: 100 + dtype: bfloat16 trainable: - classification_loss_type: sigmoid_focal_crossentropy - lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none + lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none optimizer: adam # adam, adamw, sgd horovod_enabled: no cls_output_as_logits: yes - small_graph_opt: yes + small_graph_opt: no + normalizer_cache: parameters/clic_normalizations batching: - # if enabled, use dynamic batching instead of the fixed-size batches configured in batch_per_gpu - bucket_by_sequence_length: yes - # these sizes were sort of tuned for an 8GB GPU - # - max_sequence_length, batch_size_per_gpu - -#on 8GB GPU - bucket_batch_sizes: - - 25, 200 - - 50, 100 - - 100, 50 - - 200, 20 - - 500, 10 - - 1000, 5 - - 2000, 3 - - 3000, 2 - - 4000, 2 - - 5000, 1 - - 6000, 1 - - inf, 1 - # use this batch multiplier to increase all batch sizes by a constant factor - batch_multiplier: 1 + bucket_by_sequence_length: no optimizer: adam: @@ -108,14 +92,14 @@ parameters: node_encoding_hidden_dim: 256 combined_graph_layer: - bin_size: 640 + bin_size: 256 max_num_bins: 200 distance_dim: 128 layernorm: yes dropout: 0.0 dist_activation: elu - ffn_dist_num_layers: 2 - ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 3 + ffn_dist_hidden_dim: 64 # MPNN #kernel: @@ -145,8 +129,8 @@ parameters: normalize_degrees: no activation: elu - num_graph_layers_id: 5 - num_graph_layers_reg: 5 + num_graph_layers_id: 6 + num_graph_layers_reg: 6 output_decoding: activation: elu regression_use_classification: yes @@ -161,19 +145,19 @@ parameters: phi_dim_decrease: yes energy_dim_decrease: yes - id_hidden_dim: 512 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 + id_hidden_dim: 128 + charge_hidden_dim: 128 + pt_hidden_dim: 128 + eta_hidden_dim: 128 + phi_hidden_dim: 128 + energy_hidden_dim: 128 - id_num_layers: 3 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 + id_num_layers: 1 + charge_num_layers: 1 + pt_num_layers: 1 + eta_num_layers: 1 + phi_num_layers: 1 + energy_num_layers: 1 layernorm: yes mask_reg_cls0: yes @@ -230,11 +214,11 @@ raytune: train_test_datasets: physical: - batch_per_gpu: 1 + batch_per_gpu: 20 datasets: - clic_edm_ttbar_pf - - clic_edm_qq_pf +do_validation_callback: false validation_dataset: clic_edm_ttbar_pf validation_batch_size: 100 validation_num_events: 2000 @@ -260,22 +244,22 @@ evaluation_jet_algo: ee_genkt_algorithm datasets: clic_edm_ttbar_pf: - version: 1.3.0 + version: 1.4.0 data_dir: manual_dir: clic_edm_ttbar_pu10_pf: - version: 1.3.0 + version: 1.4.0 data_dir: manual_dir: clic_edm_qq_pf: - version: 1.3.1 + version: 1.4.0 data_dir: manual_dir: clic_edm_ww_fullhad_pf: - version: 1.3.0 + version: 1.4.0 data_dir: manual_dir: clic_edm_zh_tautau_pf: - version: 1.3.0 + version: 1.4.0 data_dir: 
manual_dir: diff --git a/parameters/mixedprecision/clic_bf16_bs2.yaml b/parameters/mixedprecision/clic_bf16_bs2.yaml new file mode 100644 index 000000000..81024750b --- /dev/null +++ b/parameters/mixedprecision/clic_bf16_bs2.yaml @@ -0,0 +1,265 @@ +backend: tensorflow + +dataset: + schema: clic + target_particles: gen + num_input_features: 17 + #(none=0, track=1, cluster=2) + num_input_classes: 3 + #(none=0, ch.had=1, n.had=2, gamma=3, e=4, mu=5) + num_output_classes: 6 + cls_weight_by_pt: no + reg_weight_by_pt: no + +loss: + classification_loss_coef: 200.0 + charge_loss_coef: 1.0 + pt_loss_coef: 10.0 + eta_loss_coef: 10.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 + charge_loss: + type: CategoricalCrossentropy + from_logits: yes + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + cos_phi_loss: + type: Huber + eta_loss: + type: Huber + event_loss: none #none, sliced_wasserstein, gen_jet_logcosh, gen_jet_mse, hist_2d + event_loss_coef: 0.0 + met_loss: none + met_loss_coef: 1.0 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 0.001 + num_epochs: 100 + dtype: bfloat16 + trainable: + lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none + optimizer: adam # adam, adamw, sgd + horovod_enabled: no + cls_output_as_logits: yes + small_graph_opt: no + normalizer_cache: parameters/clic_normalizations + +batching: + bucket_by_sequence_length: no + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 + +# LR Schedules +exponentialdecay: + decay_steps: 2000 + decay_rate: 0.99 + staircase: yes +onecycle: + mom_min: 0.85 + mom_max: 0.95 + warmup_ratio: 0.3 + div_factor: 25.0 + final_div: 100000.0 + +parameters: + model: gnn_dense + input_encoding: clic + node_update_mode: additive + do_node_encoding: yes + node_encoding_hidden_dim: 256 + + combined_graph_layer: + bin_size: 256 + max_num_bins: 200 + distance_dim: 128 + layernorm: yes + dropout: 0.0 + dist_activation: elu + ffn_dist_num_layers: 3 + ffn_dist_hidden_dim: 64 + + # MPNN + #kernel: + # type: NodePairTrainableKernel + # activation: elu + #num_node_messages: 1 + #node_message: + # type: NodeMessageLearnable + # output_dim: 64 + # hidden_dim: 128 + # num_layers: 2 + # activation: elu + #activation: elu + + # GCN + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + dist_norm: l2 + num_node_messages: 2 + node_message: + type: GHConvDense + output_dim: 256 + activation: elu + #if this is enabled, it will break float16 training + normalize_degrees: no + activation: elu + + num_graph_layers_id: 6 + num_graph_layers_reg: 6 + output_decoding: + activation: elu + regression_use_classification: yes + dropout: 0.1 + + pt_as_correction: no + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 128 + charge_hidden_dim: 128 + pt_hidden_dim: 128 + eta_hidden_dim: 128 + phi_hidden_dim: 128 + energy_hidden_dim: 128 + + id_num_layers: 1 + charge_num_layers: 1 + pt_num_layers: 1 + eta_num_layers: 1 + phi_num_layers: 1 + energy_num_layers: 1 + layernorm: yes + mask_reg_cls0: yes + + skip_connection: no + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +callbacks: + checkpoint: + monitor: "val_loss" + plot_freq: 1 + tensorboard: + dump_history: yes + hist_freq: 1 + 
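+# NOTE: the hypertune/raytune sections below mirror the base CLIC config and are
+# presumably not exercised by these fixed-configuration mixed-precision runs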
+hypertune: + algorithm: hyperband # random, bayesian, hyperband + random: + objective: val_loss + max_trials: 100 + bayesian: + objective: val_loss + max_trials: 100 + num_initial_points: 2 + hyperband: + objective: val_loss + max_epochs: 10 + factor: 3 + iterations: 1 + executions_per_trial: 1 + +raytune: + local_dir: # Note: please specify an absolute path + sched: asha # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_test_datasets: + physical: + batch_per_gpu: 40 + datasets: + - clic_edm_ttbar_pf + +do_validation_callback: false +validation_dataset: clic_edm_ttbar_pf +validation_batch_size: 100 +validation_num_events: 2000 + +evaluation_datasets: + clic_edm_qq_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pu10_pf: + batch_size: 50 + num_events: -1 + clic_edm_zh_tautau_pf: + batch_size: 50 + num_events: -1 + clic_edm_ww_fullhad_pf: + batch_size: 50 + num_events: -1 + +evaluation_jet_algo: ee_genkt_algorithm + +datasets: + clic_edm_ttbar_pf: + version: 1.4.0 + data_dir: + manual_dir: + clic_edm_ttbar_pu10_pf: + version: 1.4.0 + data_dir: + manual_dir: + clic_edm_qq_pf: + version: 1.4.0 + data_dir: + manual_dir: + clic_edm_ww_fullhad_pf: + version: 1.4.0 + data_dir: + manual_dir: + clic_edm_zh_tautau_pf: + version: 1.4.0 + data_dir: + manual_dir: diff --git a/parameters/test-eventloss/met.yaml b/parameters/mixedprecision/clic_fp16_bs1.yaml similarity index 56% rename from parameters/test-eventloss/met.yaml rename to parameters/mixedprecision/clic_fp16_bs1.yaml index 92142f65c..2fbee5413 100644 --- a/parameters/test-eventloss/met.yaml +++ b/parameters/mixedprecision/clic_fp16_bs1.yaml @@ -1,56 +1,44 @@ backend: tensorflow dataset: - schema: cms + schema: clic target_particles: gen - num_input_features: 41 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7, tau=8) - num_output_classes: 9 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 + num_input_features: 17 + #(none=0, track=1, cluster=2) + num_input_classes: 3 + #(none=0, ch.had=1, n.had=2, gamma=3, e=4, mu=5) + num_output_classes: 6 cls_weight_by_pt: no + reg_weight_by_pt: no loss: - classification_loss_coef: 1.0 + classification_loss_coef: 200.0 charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 + pt_loss_coef: 10.0 + eta_loss_coef: 10.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 + charge_loss: + type: CategoricalCrossentropy + from_logits: yes energy_loss: type: Huber pt_loss: type: Huber sin_phi_loss: type: Huber - delta: 0.1 cos_phi_loss: type: Huber - delta: 0.1 eta_loss: type: Huber - delta: 0.1 - event_loss: none + event_loss: none #none, sliced_wasserstein, gen_jet_logcosh, gen_jet_mse, hist_2d event_loss_coef: 0.0 - met_loss: - type: Huber - delta: 10.0 + met_loss: none met_loss_coef: 1.0 tensorflow: @@ 
-61,20 +49,22 @@ setup: weights: weights_config: lr: 0.0005 - num_events_validation: 200 - num_epochs: 50 - dtype: float32 + num_epochs: 100 + dtype: float16 trainable: - classification_loss_type: sigmoid_focal_crossentropy - lr_schedule: none # exponentialdecay, onecycle, none + lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none optimizer: adam # adam, adamw, sgd - horovod_enabled: False + horovod_enabled: no + cls_output_as_logits: yes + small_graph_opt: no + normalizer_cache: parameters/clic_normalizations + +batching: + bucket_by_sequence_length: no optimizer: adam: amsgrad: no - #pcgrad does not work with LossScaleOptimizer, so it must be disabled for float16 - pcgrad: yes adamw: amsgrad: yes weight_decay: 0.001 @@ -96,20 +86,20 @@ onecycle: parameters: model: gnn_dense - input_encoding: cms - node_update_mode: concat - do_node_encoding: no - node_encoding_hidden_dim: 128 + input_encoding: clic + node_update_mode: additive + do_node_encoding: yes + node_encoding_hidden_dim: 256 combined_graph_layer: - bin_size: 100 + bin_size: 256 max_num_bins: 200 - distance_dim: 64 + distance_dim: 128 layernorm: yes dropout: 0.0 dist_activation: elu - ffn_dist_num_layers: 2 - ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 3 + ffn_dist_hidden_dim: 64 # MPNN #kernel: @@ -133,18 +123,20 @@ parameters: num_node_messages: 2 node_message: type: GHConvDense - output_dim: 128 + output_dim: 256 activation: elu #if this is enabled, it will break float16 training - normalize_degrees: yes + normalize_degrees: no activation: elu - num_graph_layers_id: 2 - num_graph_layers_reg: 2 + num_graph_layers_id: 6 + num_graph_layers_reg: 6 output_decoding: activation: elu regression_use_classification: yes - dropout: 0.0 + dropout: 0.1 + + pt_as_correction: no id_dim_decrease: yes charge_dim_decrease: yes @@ -153,23 +145,23 @@ parameters: phi_dim_decrease: yes energy_dim_decrease: yes - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 + id_hidden_dim: 128 + charge_hidden_dim: 128 + pt_hidden_dim: 128 + eta_hidden_dim: 128 + phi_hidden_dim: 128 + energy_hidden_dim: 128 + + id_num_layers: 1 + charge_num_layers: 1 + pt_num_layers: 1 + eta_num_layers: 1 + phi_num_layers: 1 + energy_num_layers: 1 layernorm: yes - mask_reg_cls0: no + mask_reg_cls0: yes - skip_connection: yes + skip_connection: no debug: no timing: @@ -222,30 +214,52 @@ raytune: train_test_datasets: physical: - batch_per_gpu: 5 + batch_per_gpu: 20 datasets: - - cms_pf_ttbar - - cms_pf_ztt - - cms_pf_qcd - - cms_pf_qcd_high_pt + - clic_edm_ttbar_pf -validation_datasets: - - cms_pf_qcd_high_pt +do_validation_callback: false +validation_dataset: clic_edm_ttbar_pf +validation_batch_size: 100 +validation_num_events: 2000 + +evaluation_datasets: + clic_edm_qq_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pu10_pf: + batch_size: 50 + num_events: -1 + clic_edm_zh_tautau_pf: + batch_size: 50 + num_events: -1 + clic_edm_ww_fullhad_pf: + batch_size: 50 + num_events: -1 + +evaluation_jet_algo: ee_genkt_algorithm datasets: - cms_pf_ttbar: + clic_edm_ttbar_pf: + version: 1.4.0 + data_dir: + manual_dir: + clic_edm_ttbar_pu10_pf: version: 1.4.0 data_dir: manual_dir: - cms_pf_ztt: + clic_edm_qq_pf: version: 1.4.0 data_dir: manual_dir: - cms_pf_qcd: + clic_edm_ww_fullhad_pf: version: 1.4.0 
data_dir: manual_dir: - cms_pf_qcd_high_pt: + clic_edm_zh_tautau_pf: version: 1.4.0 data_dir: manual_dir: diff --git a/parameters/test-eventloss/baseline.yaml b/parameters/mixedprecision/clic_fp16_bs2.yaml similarity index 56% rename from parameters/test-eventloss/baseline.yaml rename to parameters/mixedprecision/clic_fp16_bs2.yaml index 5edbebb65..31d8fd7fa 100644 --- a/parameters/test-eventloss/baseline.yaml +++ b/parameters/mixedprecision/clic_fp16_bs2.yaml @@ -1,55 +1,45 @@ backend: tensorflow dataset: - schema: cms + schema: clic target_particles: gen - num_input_features: 41 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7, tau=8) - num_output_classes: 9 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 + num_input_features: 17 + #(none=0, track=1, cluster=2) + num_input_classes: 3 + #(none=0, ch.had=1, n.had=2, gamma=3, e=4, mu=5) + num_output_classes: 6 cls_weight_by_pt: no + reg_weight_by_pt: no loss: - classification_loss_coef: 1.0 + classification_loss_coef: 200.0 charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 + pt_loss_coef: 10.0 + eta_loss_coef: 10.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 + charge_loss: + type: CategoricalCrossentropy + from_logits: yes energy_loss: type: Huber pt_loss: type: Huber sin_phi_loss: type: Huber - delta: 0.1 cos_phi_loss: type: Huber - delta: 0.1 eta_loss: type: Huber - delta: 0.1 - event_loss: none + event_loss: none #none, sliced_wasserstein, gen_jet_logcosh, gen_jet_mse, hist_2d event_loss_coef: 0.0 met_loss: none - met_loss_coef: 0.0 + met_loss_coef: 1.0 tensorflow: eager: no @@ -58,21 +48,23 @@ setup: train: yes weights: weights_config: - lr: 0.0005 - num_events_validation: 200 - num_epochs: 50 - dtype: float32 + lr: 0.001 + num_epochs: 100 + dtype: float16 trainable: - classification_loss_type: sigmoid_focal_crossentropy - lr_schedule: none # exponentialdecay, onecycle, none + lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none optimizer: adam # adam, adamw, sgd - horovod_enabled: False + horovod_enabled: no + cls_output_as_logits: yes + small_graph_opt: no + normalizer_cache: parameters/clic_normalizations + +batching: + bucket_by_sequence_length: no optimizer: adam: amsgrad: no - #pcgrad does not work with LossScaleOptimizer, so it must be disabled for float16 - pcgrad: yes adamw: amsgrad: yes weight_decay: 0.001 @@ -94,20 +86,20 @@ onecycle: parameters: model: gnn_dense - input_encoding: cms - node_update_mode: concat - do_node_encoding: no - node_encoding_hidden_dim: 128 + input_encoding: clic + node_update_mode: additive + do_node_encoding: yes + node_encoding_hidden_dim: 256 combined_graph_layer: - bin_size: 100 + bin_size: 256 max_num_bins: 200 - distance_dim: 64 + distance_dim: 128 layernorm: yes dropout: 0.0 dist_activation: elu - ffn_dist_num_layers: 2 - ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 3 + ffn_dist_hidden_dim: 64 # MPNN #kernel: @@ -131,18 +123,20 @@ parameters: num_node_messages: 2 node_message: type: GHConvDense - output_dim: 128 + output_dim: 256 activation: elu #if this is enabled, it will break float16 training - normalize_degrees: yes 
+ normalize_degrees: no activation: elu - num_graph_layers_id: 2 - num_graph_layers_reg: 2 + num_graph_layers_id: 6 + num_graph_layers_reg: 6 output_decoding: activation: elu regression_use_classification: yes - dropout: 0.0 + dropout: 0.1 + + pt_as_correction: no id_dim_decrease: yes charge_dim_decrease: yes @@ -151,23 +145,23 @@ parameters: phi_dim_decrease: yes energy_dim_decrease: yes - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 + id_hidden_dim: 128 + charge_hidden_dim: 128 + pt_hidden_dim: 128 + eta_hidden_dim: 128 + phi_hidden_dim: 128 + energy_hidden_dim: 128 + + id_num_layers: 1 + charge_num_layers: 1 + pt_num_layers: 1 + eta_num_layers: 1 + phi_num_layers: 1 + energy_num_layers: 1 layernorm: yes - mask_reg_cls0: no + mask_reg_cls0: yes - skip_connection: yes + skip_connection: no debug: no timing: @@ -220,30 +214,52 @@ raytune: train_test_datasets: physical: - batch_per_gpu: 5 + batch_per_gpu: 40 datasets: - - cms_pf_ttbar - - cms_pf_ztt - - cms_pf_qcd - - cms_pf_qcd_high_pt + - clic_edm_ttbar_pf + +do_validation_callback: false +validation_dataset: clic_edm_ttbar_pf +validation_batch_size: 100 +validation_num_events: 2000 -validation_datasets: - - cms_pf_qcd_high_pt +evaluation_datasets: + clic_edm_qq_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pu10_pf: + batch_size: 50 + num_events: -1 + clic_edm_zh_tautau_pf: + batch_size: 50 + num_events: -1 + clic_edm_ww_fullhad_pf: + batch_size: 50 + num_events: -1 + +evaluation_jet_algo: ee_genkt_algorithm datasets: - cms_pf_ttbar: + clic_edm_ttbar_pf: + version: 1.4.0 + data_dir: + manual_dir: + clic_edm_ttbar_pu10_pf: version: 1.4.0 data_dir: manual_dir: - cms_pf_ztt: + clic_edm_qq_pf: version: 1.4.0 data_dir: manual_dir: - cms_pf_qcd: + clic_edm_ww_fullhad_pf: version: 1.4.0 data_dir: manual_dir: - cms_pf_qcd_high_pt: + clic_edm_zh_tautau_pf: version: 1.4.0 data_dir: manual_dir: diff --git a/parameters/clic-fp16.yaml b/parameters/mixedprecision/clic_fp32_bs1.yaml similarity index 77% rename from parameters/clic-fp16.yaml rename to parameters/mixedprecision/clic_fp32_bs1.yaml index 66ae4250f..53622e9ef 100644 --- a/parameters/clic-fp16.yaml +++ b/parameters/mixedprecision/clic_fp32_bs1.yaml @@ -12,15 +12,19 @@ dataset: reg_weight_by_pt: no loss: - classification_loss_coef: 100.0 + classification_loss_coef: 200.0 charge_loss_coef: 1.0 pt_loss_coef: 10.0 eta_loss_coef: 10.0 sin_phi_loss_coef: 10.0 cos_phi_loss_coef: 10.0 energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 charge_loss: - type: BinaryCrossentropy + type: CategoricalCrossentropy from_logits: yes energy_loss: type: Huber @@ -44,39 +48,20 @@ setup: train: yes weights: weights_config: - lr: 0.0002 - num_epochs: 200 - dtype: float16 + lr: 0.0005 + num_epochs: 100 + dtype: float32 trainable: - classification_loss_type: sigmoid_focal_crossentropy - lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none + lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none optimizer: adam # adam, adamw, sgd horovod_enabled: no cls_output_as_logits: yes - small_graph_opt: yes + small_graph_opt: no + normalizer_cache: parameters/clic_normalizations batching: # if enabled, use dynamic batching instead of the fixed-size batches configured in 
batch_per_gpu - bucket_by_sequence_length: yes - # these sizes were sort of tuned for an 8GB GPU - # - max_sequence_length, batch_size_per_gpu - -#on 8GB GPU - bucket_batch_sizes: - - 25, 200 - - 50, 100 - - 100, 50 - - 200, 20 - - 500, 10 - - 1000, 5 - - 2000, 3 - - 3000, 2 - - 4000, 2 - - 5000, 1 - - 6000, 1 - - inf, 1 - # use this batch multiplier to increase all batch sizes by a constant factor - batch_multiplier: 1 + bucket_by_sequence_length: no optimizer: adam: @@ -108,14 +93,14 @@ parameters: node_encoding_hidden_dim: 256 combined_graph_layer: - bin_size: 640 + bin_size: 256 max_num_bins: 200 distance_dim: 128 layernorm: yes dropout: 0.0 dist_activation: elu - ffn_dist_num_layers: 2 - ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 3 + ffn_dist_hidden_dim: 64 # MPNN #kernel: @@ -145,8 +130,8 @@ parameters: normalize_degrees: no activation: elu - num_graph_layers_id: 5 - num_graph_layers_reg: 5 + num_graph_layers_id: 6 + num_graph_layers_reg: 6 output_decoding: activation: elu regression_use_classification: yes @@ -161,19 +146,19 @@ parameters: phi_dim_decrease: yes energy_dim_decrease: yes - id_hidden_dim: 512 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 + id_hidden_dim: 128 + charge_hidden_dim: 128 + pt_hidden_dim: 128 + eta_hidden_dim: 128 + phi_hidden_dim: 128 + energy_hidden_dim: 128 - id_num_layers: 3 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 + id_num_layers: 1 + charge_num_layers: 1 + pt_num_layers: 1 + eta_num_layers: 1 + phi_num_layers: 1 + energy_num_layers: 1 layernorm: yes mask_reg_cls0: yes @@ -230,11 +215,11 @@ raytune: train_test_datasets: physical: - batch_per_gpu: 1 + batch_per_gpu: 20 datasets: - clic_edm_ttbar_pf - - clic_edm_qq_pf +do_validation_callback: false validation_dataset: clic_edm_ttbar_pf validation_batch_size: 100 validation_num_events: 2000 @@ -246,6 +231,9 @@ evaluation_datasets: clic_edm_ttbar_pf: batch_size: 50 num_events: -1 + clic_edm_ttbar_pu10_pf: + batch_size: 50 + num_events: -1 clic_edm_zh_tautau_pf: batch_size: 50 num_events: -1 @@ -257,18 +245,22 @@ evaluation_jet_algo: ee_genkt_algorithm datasets: clic_edm_ttbar_pf: - version: 1.3.0 + version: 1.4.0 + data_dir: + manual_dir: + clic_edm_ttbar_pu10_pf: + version: 1.4.0 data_dir: manual_dir: clic_edm_qq_pf: - version: 1.3.1 + version: 1.4.0 data_dir: manual_dir: clic_edm_ww_fullhad_pf: - version: 1.3.0 + version: 1.4.0 data_dir: manual_dir: clic_edm_zh_tautau_pf: - version: 1.3.0 + version: 1.4.0 data_dir: manual_dir: diff --git a/parameters/test-eventloss/genjet_logcosh.yaml b/parameters/test-eventloss/genjet_logcosh.yaml deleted file mode 100644 index 5ba5adb13..000000000 --- a/parameters/test-eventloss/genjet_logcosh.yaml +++ /dev/null @@ -1,249 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: gen - num_input_features: 41 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7, tau=8) - num_output_classes: 9 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - cls_weight_by_pt: no - -loss: - classification_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - energy_loss: - type: Huber 
- pt_loss: - type: Huber - sin_phi_loss: - type: Huber - delta: 0.1 - cos_phi_loss: - type: Huber - delta: 0.1 - eta_loss: - type: Huber - delta: 0.1 - event_loss: gen_jet_logcosh - event_loss_coef: 1.0 - met_loss: none - met_loss_coef: 0.0 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 0.0005 - num_events_validation: 200 - num_epochs: 50 - dtype: float32 - trainable: - classification_loss_type: sigmoid_focal_crossentropy - lr_schedule: none # exponentialdecay, onecycle, none - optimizer: adam # adam, adamw, sgd - horovod_enabled: False - -optimizer: - adam: - amsgrad: no - #pcgrad does not work with LossScaleOptimizer, so it must be disabled for float16 - pcgrad: yes - adamw: - amsgrad: yes - weight_decay: 0.001 - sgd: - nesterov: no - momentum: 0.9 - -# LR Schedules -exponentialdecay: - decay_steps: 2000 - decay_rate: 0.99 - staircase: yes -onecycle: - mom_min: 0.85 - mom_max: 0.95 - warmup_ratio: 0.3 - div_factor: 25.0 - final_div: 100000.0 - -parameters: - model: gnn_dense - input_encoding: cms - node_update_mode: concat - do_node_encoding: no - node_encoding_hidden_dim: 128 - - combined_graph_layer: - bin_size: 100 - max_num_bins: 200 - distance_dim: 64 - layernorm: yes - dropout: 0.0 - dist_activation: elu - ffn_dist_num_layers: 2 - ffn_dist_hidden_dim: 128 - - # MPNN - #kernel: - # type: NodePairTrainableKernel - # activation: elu - #num_node_messages: 1 - #node_message: - # type: NodeMessageLearnable - # output_dim: 64 - # hidden_dim: 128 - # num_layers: 2 - # activation: elu - #activation: elu - - # GCN - kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 - dist_norm: l2 - num_node_messages: 2 - node_message: - type: GHConvDense - output_dim: 128 - activation: elu - #if this is enabled, it will break float16 training - normalize_degrees: yes - activation: elu - - num_graph_layers_id: 2 - num_graph_layers_reg: 2 - output_decoding: - activation: elu - regression_use_classification: yes - dropout: 0.0 - - id_dim_decrease: yes - charge_dim_decrease: yes - pt_dim_decrease: yes - eta_dim_decrease: yes - phi_dim_decrease: yes - energy_dim_decrease: yes - - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 - layernorm: yes - mask_reg_cls0: no - - skip_connection: yes - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -callbacks: - checkpoint: - monitor: "val_loss" - plot_freq: 1 - tensorboard: - dump_history: yes - hist_freq: 1 - -hypertune: - algorithm: hyperband # random, bayesian, hyperband - random: - objective: val_loss - max_trials: 100 - bayesian: - objective: val_loss - max_trials: 100 - num_initial_points: 2 - hyperband: - objective: val_loss - max_epochs: 10 - factor: 3 - iterations: 1 - executions_per_trial: 1 - -raytune: - local_dir: # Note: please specify an absolute path - sched: asha # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_test_datasets: - physical: - batch_per_gpu: 5 - datasets: - - cms_pf_ttbar - - cms_pf_ztt - - cms_pf_qcd - - cms_pf_qcd_high_pt - -validation_datasets: - - 
cms_pf_qcd_high_pt - -datasets: - cms_pf_ttbar: - version: 1.4.0 - data_dir: - manual_dir: - cms_pf_ztt: - version: 1.4.0 - data_dir: - manual_dir: - cms_pf_qcd: - version: 1.4.0 - data_dir: - manual_dir: - cms_pf_qcd_high_pt: - version: 1.4.0 - data_dir: - manual_dir: diff --git a/parameters/test-eventloss/h2d.yaml b/parameters/test-eventloss/h2d.yaml deleted file mode 100644 index 71a0e9487..000000000 --- a/parameters/test-eventloss/h2d.yaml +++ /dev/null @@ -1,249 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: gen - num_input_features: 41 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7, tau=8) - num_output_classes: 9 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - cls_weight_by_pt: no - -loss: - classification_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - energy_loss: - type: Huber - pt_loss: - type: Huber - sin_phi_loss: - type: Huber - delta: 0.1 - cos_phi_loss: - type: Huber - delta: 0.1 - eta_loss: - type: Huber - delta: 0.1 - event_loss: hist_2d - event_loss_coef: 1.0 - met_loss: none - met_loss_coef: 0.0 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 0.0005 - num_events_validation: 200 - num_epochs: 50 - dtype: float32 - trainable: - classification_loss_type: sigmoid_focal_crossentropy - lr_schedule: none # exponentialdecay, onecycle, none - optimizer: adam # adam, adamw, sgd - horovod_enabled: False - -optimizer: - adam: - amsgrad: no - #pcgrad does not work with LossScaleOptimizer, so it must be disabled for float16 - pcgrad: yes - adamw: - amsgrad: yes - weight_decay: 0.001 - sgd: - nesterov: no - momentum: 0.9 - -# LR Schedules -exponentialdecay: - decay_steps: 2000 - decay_rate: 0.99 - staircase: yes -onecycle: - mom_min: 0.85 - mom_max: 0.95 - warmup_ratio: 0.3 - div_factor: 25.0 - final_div: 100000.0 - -parameters: - model: gnn_dense - input_encoding: cms - node_update_mode: concat - do_node_encoding: no - node_encoding_hidden_dim: 128 - - combined_graph_layer: - bin_size: 100 - max_num_bins: 200 - distance_dim: 64 - layernorm: yes - dropout: 0.0 - dist_activation: elu - ffn_dist_num_layers: 2 - ffn_dist_hidden_dim: 128 - - # MPNN - #kernel: - # type: NodePairTrainableKernel - # activation: elu - #num_node_messages: 1 - #node_message: - # type: NodeMessageLearnable - # output_dim: 64 - # hidden_dim: 128 - # num_layers: 2 - # activation: elu - #activation: elu - - # GCN - kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 - dist_norm: l2 - num_node_messages: 2 - node_message: - type: GHConvDense - output_dim: 128 - activation: elu - #if this is enabled, it will break float16 training - normalize_degrees: yes - activation: elu - - num_graph_layers_id: 2 - num_graph_layers_reg: 2 - output_decoding: - activation: elu - regression_use_classification: yes - dropout: 0.0 - - id_dim_decrease: yes - charge_dim_decrease: yes - pt_dim_decrease: yes - eta_dim_decrease: yes - phi_dim_decrease: yes - energy_dim_decrease: yes - - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - 
eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 - layernorm: yes - mask_reg_cls0: no - - skip_connection: yes - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -callbacks: - checkpoint: - monitor: "val_loss" - plot_freq: 1 - tensorboard: - dump_history: yes - hist_freq: 1 - -hypertune: - algorithm: hyperband # random, bayesian, hyperband - random: - objective: val_loss - max_trials: 100 - bayesian: - objective: val_loss - max_trials: 100 - num_initial_points: 2 - hyperband: - objective: val_loss - max_epochs: 10 - factor: 3 - iterations: 1 - executions_per_trial: 1 - -raytune: - local_dir: # Note: please specify an absolute path - sched: asha # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_test_datasets: - physical: - batch_per_gpu: 5 - datasets: - - cms_pf_ttbar - - cms_pf_ztt - - cms_pf_qcd - - cms_pf_qcd_high_pt - -validation_datasets: - - cms_pf_qcd_high_pt - -datasets: - cms_pf_ttbar: - version: 1.4.0 - data_dir: - manual_dir: - cms_pf_ztt: - version: 1.4.0 - data_dir: - manual_dir: - cms_pf_qcd: - version: 1.4.0 - data_dir: - manual_dir: - cms_pf_qcd_high_pt: - version: 1.4.0 - data_dir: - manual_dir: diff --git a/parameters/test-eventloss/swd.yaml b/parameters/test-eventloss/swd.yaml deleted file mode 100644 index 8be1dc3d4..000000000 --- a/parameters/test-eventloss/swd.yaml +++ /dev/null @@ -1,249 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: gen - num_input_features: 41 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7, tau=8) - num_output_classes: 9 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - cls_weight_by_pt: no - -loss: - classification_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - energy_loss: - type: Huber - pt_loss: - type: Huber - sin_phi_loss: - type: Huber - delta: 0.1 - cos_phi_loss: - type: Huber - delta: 0.1 - eta_loss: - type: Huber - delta: 0.1 - event_loss: sliced_wasserstein - event_loss_coef: 1.0 - met_loss: none - met_loss_coef: 0.0 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 0.0005 - num_events_validation: 200 - num_epochs: 50 - dtype: float32 - trainable: - classification_loss_type: sigmoid_focal_crossentropy - lr_schedule: none # exponentialdecay, onecycle, none - optimizer: adam # adam, adamw, sgd - horovod_enabled: False - -optimizer: - adam: - amsgrad: no - #pcgrad does not work with LossScaleOptimizer, so it must be disabled for float16 - pcgrad: yes - adamw: - amsgrad: yes - weight_decay: 0.001 - sgd: - nesterov: no - momentum: 0.9 - -# LR Schedules -exponentialdecay: - decay_steps: 2000 - decay_rate: 0.99 - staircase: yes -onecycle: - mom_min: 0.85 - mom_max: 0.95 - warmup_ratio: 0.3 - div_factor: 25.0 - final_div: 100000.0 - -parameters: - model: gnn_dense - input_encoding: cms - node_update_mode: concat - do_node_encoding: no - node_encoding_hidden_dim: 128 - - combined_graph_layer: - 
bin_size: 100 - max_num_bins: 200 - distance_dim: 64 - layernorm: yes - dropout: 0.0 - dist_activation: elu - ffn_dist_num_layers: 2 - ffn_dist_hidden_dim: 128 - - # MPNN - #kernel: - # type: NodePairTrainableKernel - # activation: elu - #num_node_messages: 1 - #node_message: - # type: NodeMessageLearnable - # output_dim: 64 - # hidden_dim: 128 - # num_layers: 2 - # activation: elu - #activation: elu - - # GCN - kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 - dist_norm: l2 - num_node_messages: 2 - node_message: - type: GHConvDense - output_dim: 128 - activation: elu - #if this is enabled, it will break float16 training - normalize_degrees: yes - activation: elu - - num_graph_layers_id: 2 - num_graph_layers_reg: 2 - output_decoding: - activation: elu - regression_use_classification: yes - dropout: 0.0 - - id_dim_decrease: yes - charge_dim_decrease: yes - pt_dim_decrease: yes - eta_dim_decrease: yes - phi_dim_decrease: yes - energy_dim_decrease: yes - - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 - layernorm: yes - mask_reg_cls0: no - - skip_connection: yes - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -callbacks: - checkpoint: - monitor: "val_loss" - plot_freq: 1 - tensorboard: - dump_history: yes - hist_freq: 1 - -hypertune: - algorithm: hyperband # random, bayesian, hyperband - random: - objective: val_loss - max_trials: 100 - bayesian: - objective: val_loss - max_trials: 100 - num_initial_points: 2 - hyperband: - objective: val_loss - max_epochs: 10 - factor: 3 - iterations: 1 - executions_per_trial: 1 - -raytune: - local_dir: # Note: please specify an absolute path - sched: asha # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_test_datasets: - physical: - batch_per_gpu: 5 - datasets: - - cms_pf_ttbar - - cms_pf_ztt - - cms_pf_qcd - - cms_pf_qcd_high_pt - -validation_datasets: - - cms_pf_qcd_high_pt - -datasets: - cms_pf_ttbar: - version: 1.4.0 - data_dir: - manual_dir: - cms_pf_ztt: - version: 1.4.0 - data_dir: - manual_dir: - cms_pf_qcd: - version: 1.4.0 - data_dir: - manual_dir: - cms_pf_qcd_high_pt: - version: 1.4.0 - data_dir: - manual_dir: diff --git a/parameters/test-gnn/cms-0l.yaml b/parameters/test-gnn/cms-0l.yaml deleted file mode 100644 index 5977abbc6..000000000 --- a/parameters/test-gnn/cms-0l.yaml +++ /dev/null @@ -1,149 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.01 - pt_loss_coef: 0.0001 - eta_loss_coef: 100.0 - sin_phi_loss_coef: 10.0 - cos_phi_loss_coef: 10.0 - energy_loss_coef: 0.0001 - raw_path: 
data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* - energy_loss: - type: Huber - pt_loss: - type: Huber - sin_phi_loss: - type: Huber - delta: 0.1 - cos_phi_loss: - type: Huber - delta: 0.1 - eta_loss: - type: Huber - delta: 0.1 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 1e-3 - batch_size: 20 - num_events_train: 1000 - num_events_test: 1000 - num_epochs: 50 - num_val_files: 20 - dtype: float32 - trainable: classification - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: inverse_sqrt - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - input_encoding: cms - do_node_encoding: no - hidden_dim: 128 - dropout: 0.0 - activation: gelu - combined_graph_layer: - do_lsh: no - bin_size: 160 - max_num_bins: 100 - distance_dim: 128 - layernorm: no - dropout: 0.0 - dist_activation: gelu - kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 - num_node_messages: 1 - node_message: - type: GHConvDense - output_dim: 128 - activation: gelu - normalize_degrees: yes - hidden_dim: 128 - activation: gelu - num_graph_layers_common: 0 - num_graph_layers_energy: 0 - output_decoding: - activation: gelu - regression_use_classification: yes - dropout: 0.0 - - pt_skip_gate: no - eta_skip_gate: yes - phi_skip_gate: yes - - id_dim_decrease: yes - charge_dim_decrease: yes - pt_dim_decrease: yes - eta_dim_decrease: yes - phi_dim_decrease: yes - energy_dim_decrease: yes - - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 - layernorm: no - mask_reg_cls0: no - - skip_connection: yes - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 1000 - decay_rate: 0.98 - staircase: yes diff --git a/parameters/test-gnn/cms-lsh-1l.yaml b/parameters/test-gnn/cms-lsh-1l.yaml deleted file mode 100644 index c8c4dfb7e..000000000 --- a/parameters/test-gnn/cms-lsh-1l.yaml +++ /dev/null @@ -1,149 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.01 - pt_loss_coef: 0.0001 - eta_loss_coef: 100.0 - sin_phi_loss_coef: 10.0 - cos_phi_loss_coef: 10.0 - energy_loss_coef: 0.0001 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* - energy_loss: - type: Huber - pt_loss: - type: Huber - sin_phi_loss: - type: Huber - delta: 0.1 - cos_phi_loss: - type: Huber - delta: 0.1 - eta_loss: - type: Huber - delta: 0.1 - -tensorflow: - eager: no - 
-setup: - train: yes - weights: - weights_config: - lr: 1e-3 - batch_size: 10 - num_events_train: 1000 - num_events_test: 1000 - num_epochs: 50 - num_val_files: 20 - dtype: float32 - trainable: classification - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: inverse_sqrt - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - input_encoding: cms - do_node_encoding: no - hidden_dim: 128 - dropout: 0.0 - activation: gelu - combined_graph_layer: - do_lsh: yes - bin_size: 160 - max_num_bins: 100 - distance_dim: 128 - layernorm: no - dropout: 0.0 - dist_activation: gelu - kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 - num_node_messages: 1 - node_message: - type: GHConvDense - output_dim: 128 - activation: gelu - normalize_degrees: yes - hidden_dim: 128 - activation: gelu - num_graph_layers_common: 1 - num_graph_layers_energy: 1 - output_decoding: - activation: gelu - regression_use_classification: yes - dropout: 0.0 - - pt_skip_gate: no - eta_skip_gate: yes - phi_skip_gate: yes - - id_dim_decrease: yes - charge_dim_decrease: yes - pt_dim_decrease: yes - eta_dim_decrease: yes - phi_dim_decrease: yes - energy_dim_decrease: yes - - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 - layernorm: no - mask_reg_cls0: no - - skip_connection: yes - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 1000 - decay_rate: 0.98 - staircase: yes diff --git a/parameters/test-gnn/cms-lsh-2l.yaml b/parameters/test-gnn/cms-lsh-2l.yaml deleted file mode 100644 index 5eb0a83f2..000000000 --- a/parameters/test-gnn/cms-lsh-2l.yaml +++ /dev/null @@ -1,149 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.01 - pt_loss_coef: 0.0001 - eta_loss_coef: 100.0 - sin_phi_loss_coef: 10.0 - cos_phi_loss_coef: 10.0 - energy_loss_coef: 0.0001 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* - energy_loss: - type: Huber - pt_loss: - type: Huber - sin_phi_loss: - type: Huber - delta: 0.1 - cos_phi_loss: - type: Huber - delta: 0.1 - eta_loss: - type: Huber - delta: 0.1 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 1e-3 - batch_size: 5 - num_events_train: 1000 - num_events_test: 1000 - num_epochs: 50 - num_val_files: 20 - dtype: float32 - trainable: classification - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: inverse_sqrt - charge: signal_only - pt: signal_only - eta: signal_only - 
sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - input_encoding: cms - do_node_encoding: no - hidden_dim: 128 - dropout: 0.0 - activation: gelu - combined_graph_layer: - do_lsh: yes - bin_size: 160 - max_num_bins: 100 - distance_dim: 128 - layernorm: no - dropout: 0.0 - dist_activation: gelu - kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 - num_node_messages: 1 - node_message: - type: GHConvDense - output_dim: 128 - activation: gelu - normalize_degrees: yes - hidden_dim: 128 - activation: gelu - num_graph_layers_common: 2 - num_graph_layers_energy: 2 - output_decoding: - activation: gelu - regression_use_classification: yes - dropout: 0.0 - - pt_skip_gate: no - eta_skip_gate: yes - phi_skip_gate: yes - - id_dim_decrease: yes - charge_dim_decrease: yes - pt_dim_decrease: yes - eta_dim_decrease: yes - phi_dim_decrease: yes - energy_dim_decrease: yes - - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 - layernorm: no - mask_reg_cls0: no - - skip_connection: yes - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 1000 - decay_rate: 0.98 - staircase: yes diff --git a/parameters/test-gnn/cms-lsh-3l.yaml b/parameters/test-gnn/cms-lsh-3l.yaml deleted file mode 100644 index 6ac8b76c7..000000000 --- a/parameters/test-gnn/cms-lsh-3l.yaml +++ /dev/null @@ -1,149 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.01 - pt_loss_coef: 0.0001 - eta_loss_coef: 100.0 - sin_phi_loss_coef: 10.0 - cos_phi_loss_coef: 10.0 - energy_loss_coef: 0.0001 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* - energy_loss: - type: Huber - pt_loss: - type: Huber - sin_phi_loss: - type: Huber - delta: 0.1 - cos_phi_loss: - type: Huber - delta: 0.1 - eta_loss: - type: Huber - delta: 0.1 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 1e-3 - batch_size: 5 - num_events_train: 1000 - num_events_test: 1000 - num_epochs: 50 - num_val_files: 20 - dtype: float32 - trainable: classification - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: inverse_sqrt - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - input_encoding: cms - do_node_encoding: no - hidden_dim: 128 - dropout: 0.0 - activation: gelu - combined_graph_layer: - do_lsh: yes - bin_size: 160 - max_num_bins: 100 - distance_dim: 128 - layernorm: no - dropout: 0.0 - dist_activation: gelu - kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 
- num_node_messages: 1 - node_message: - type: GHConvDense - output_dim: 128 - activation: gelu - normalize_degrees: yes - hidden_dim: 128 - activation: gelu - num_graph_layers_common: 3 - num_graph_layers_energy: 3 - output_decoding: - activation: gelu - regression_use_classification: yes - dropout: 0.0 - - pt_skip_gate: no - eta_skip_gate: yes - phi_skip_gate: yes - - id_dim_decrease: yes - charge_dim_decrease: yes - pt_dim_decrease: yes - eta_dim_decrease: yes - phi_dim_decrease: yes - energy_dim_decrease: yes - - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 - layernorm: no - mask_reg_cls0: no - - skip_connection: yes - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 1000 - decay_rate: 0.98 - staircase: yes diff --git a/parameters/test-gnn/cms-lsh-mpnn.yaml b/parameters/test-gnn/cms-lsh-mpnn.yaml deleted file mode 100644 index 291cd98a5..000000000 --- a/parameters/test-gnn/cms-lsh-mpnn.yaml +++ /dev/null @@ -1,153 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.01 - pt_loss_coef: 0.0001 - eta_loss_coef: 100.0 - sin_phi_loss_coef: 10.0 - cos_phi_loss_coef: 10.0 - energy_loss_coef: 0.0001 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* - energy_loss: - type: Huber - pt_loss: - type: Huber - sin_phi_loss: - type: Huber - delta: 0.1 - cos_phi_loss: - type: Huber - delta: 0.1 - eta_loss: - type: Huber - delta: 0.1 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 1e-3 - batch_size: 4 - num_events_train: 1000 - num_events_test: 1000 - num_epochs: 50 - num_val_files: 20 - dtype: float32 - trainable: classification - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: inverse_sqrt - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - input_encoding: cms - do_node_encoding: no - hidden_dim: 128 - dropout: 0.0 - activation: gelu - combined_graph_layer: - do_lsh: yes - bin_size: 32 - max_num_bins: 500 - distance_dim: 128 - layernorm: no - dropout: 0.0 - dist_activation: linear - kernel: - type: NodePairTrainableKernel - output_dim: 8 - hidden_dim: 32 - num_layers: 2 - activation: gelu - node_message: - type: NodeMessageLearnable - output_dim: 256 - hidden_dim: 128 - num_layers: 2 - activation: gelu - aggregation_direction: src - num_node_messages: 1 - hidden_dim: 256 - activation: gelu - num_graph_layers_common: 2 - num_graph_layers_energy: 2 - output_decoding: - activation: gelu - regression_use_classification: yes - dropout: 0.0 - - pt_skip_gate: no - 
eta_skip_gate: yes - phi_skip_gate: yes - - id_dim_decrease: yes - charge_dim_decrease: yes - pt_dim_decrease: yes - eta_dim_decrease: yes - phi_dim_decrease: yes - energy_dim_decrease: yes - - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 - layernorm: no - mask_reg_cls0: no - - skip_connection: yes - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 1000 - decay_rate: 0.98 - staircase: yes diff --git a/parameters/test-gnn/cms-nolsh-1l.yaml b/parameters/test-gnn/cms-nolsh-1l.yaml deleted file mode 100644 index 697aac9ed..000000000 --- a/parameters/test-gnn/cms-nolsh-1l.yaml +++ /dev/null @@ -1,149 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.01 - pt_loss_coef: 0.0001 - eta_loss_coef: 100.0 - sin_phi_loss_coef: 10.0 - cos_phi_loss_coef: 10.0 - energy_loss_coef: 0.0001 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* - energy_loss: - type: Huber - pt_loss: - type: Huber - sin_phi_loss: - type: Huber - delta: 0.1 - cos_phi_loss: - type: Huber - delta: 0.1 - eta_loss: - type: Huber - delta: 0.1 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 1e-3 - batch_size: 2 - num_events_train: 1000 - num_events_test: 1000 - num_epochs: 50 - num_val_files: 20 - dtype: float32 - trainable: classification - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: inverse_sqrt - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - input_encoding: cms - do_node_encoding: no - hidden_dim: 128 - dropout: 0.0 - activation: gelu - combined_graph_layer: - do_lsh: no - bin_size: 160 - max_num_bins: 100 - distance_dim: 128 - layernorm: no - dropout: 0.0 - dist_activation: gelu - kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 - num_node_messages: 1 - node_message: - type: GHConvDense - output_dim: 128 - activation: gelu - normalize_degrees: yes - hidden_dim: 128 - activation: gelu - num_graph_layers_common: 1 - num_graph_layers_energy: 1 - output_decoding: - activation: gelu - regression_use_classification: yes - dropout: 0.0 - - pt_skip_gate: no - eta_skip_gate: yes - phi_skip_gate: yes - - id_dim_decrease: yes - charge_dim_decrease: yes - pt_dim_decrease: yes - eta_dim_decrease: yes - phi_dim_decrease: yes - energy_dim_decrease: yes - - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - 
phi_num_layers: 2 - energy_num_layers: 2 - layernorm: no - mask_reg_cls0: no - - skip_connection: yes - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 1000 - decay_rate: 0.98 - staircase: yes diff --git a/requirements.txt b/requirements.txt index 739c0fc2f..b3d2f1257 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ pandas papermill plotly pre-commit +protobuf==3.20.3 pyarrow ray[default] ray[tune] @@ -29,11 +30,9 @@ scipy seaborn setGPU tensorflow -tensorflow-addons tensorflow-datasets==4.8.0 tensorflow-estimator tensorflow-probability -tensorflow-text tf-models-official tf2onnx tqdm diff --git a/scripts/generate_tfds.sh b/scripts/generate_tfds.sh index cdd747a74..b642fbe28 100755 --- a/scripts/generate_tfds.sh +++ b/scripts/generate_tfds.sh @@ -30,7 +30,7 @@ export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$ # wait # CLIC cluster-based -# export MANUAL_DIR=/local/joosep/mlpf/clic_edm4hep_2023_02_27 +# export MANUAL_DIR=/local/joosep/mlpf/clic_edm4hep_2023_05_09/ # $CMD mlpf/heptfds/clic_pf_edm4hep/qq --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qq.log & # $CMD mlpf/heptfds/clic_pf_edm4hep/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar.log & # $CMD mlpf/heptfds/clic_pf_edm4hep/zh --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_zh.log & @@ -39,9 +39,16 @@ export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$ # wait # CLIC hit-based -# export MANUAL_DIR=/local/joosep/mlpf_hits/clic_edm4hep_2023_02_27 -# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/qq --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qq.log & -# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar.log & +# export MANUAL_DIR=/local/joosep/mlpf_hits/clic_edm4hep/ +# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/qq --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qq_hits.log & +# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar_hits.log & +# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_kaon0L --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_kaon0L_hits.log & +# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_ele --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_ele_hits.log & +# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_pi0 --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_pi0_hits.log & +# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_pi --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_pi_hits.log & +# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_neutron --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_neutron_hits.log & +# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_gamma --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_gamma_hits.log & +# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_mu --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_mu_hits.log & # wait # Delphes diff --git a/scripts/local_test_clic_hits_pipeline.sh b/scripts/local_test_clic_hits_pipeline.sh new file mode 100755 index 000000000..48bb2f9e1 --- /dev/null +++ b/scripts/local_test_clic_hits_pipeline.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -e +export TFDS_DATA_DIR=`pwd`/tensorflow_datasets 
+export PYTHONPATH=`pwd`/mlpf:$PYTHONPATH + +rm -Rf data/p8_ee_tt_ecm380 +mkdir -p data/p8_ee_tt_ecm380 +cd data/p8_ee_tt_ecm380 + +#download some test data +wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/clic_edm4hep_2023_02_27/p8_ee_tt_ecm380/reco_p8_ee_tt_ecm380_1.root +wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/clic_edm4hep_2023_02_27/p8_ee_tt_ecm380/reco_p8_ee_tt_ecm380_2.root + +cd ../.. + +python3 fcc/postprocessing_hits.py data/p8_ee_tt_ecm380/reco_p8_ee_tt_ecm380_1.root data/p8_ee_tt_ecm380/reco_p8_ee_tt_ecm380_1.parquet +python3 fcc/postprocessing_hits.py data/p8_ee_tt_ecm380/reco_p8_ee_tt_ecm380_2.root data/p8_ee_tt_ecm380/reco_p8_ee_tt_ecm380_2.parquet + +tfds build mlpf/heptfds/clic_pf_edm4hep_hits/ttbar --manual_dir data + +# #Train, evaluate and make plots +python mlpf/pipeline.py train --config parameters/clic-hits.yaml --nepochs 1 --customize pipeline_test --ntrain 2 --ntest 2 +python mlpf/pipeline.py evaluate --nevents 10 --customize pipeline_test --train-dir ./experiments/clic* --weights ./experiments/clic*/weights/weights-01-*.hdf5 +python mlpf/pipeline.py plots --train-dir ./experiments/clic* diff --git a/scripts/local_test_clic_pipeline.sh b/scripts/local_test_clic_pipeline.sh index f6e25eb7d..5e825e9b5 100755 --- a/scripts/local_test_clic_pipeline.sh +++ b/scripts/local_test_clic_pipeline.sh @@ -19,9 +19,9 @@ python3 fcc/postprocessing.py data/p8_ee_tt_ecm380/reco_p8_ee_tt_ecm380_2.root d tfds build mlpf/heptfds/clic_pf_edm4hep/ttbar --manual_dir data # #Train, evaluate and make plots -python mlpf/pipeline.py train --config parameters/clic.yaml --nepochs 1 --customize pipeline_test -python mlpf/pipeline.py evaluate --nevents 100 --customize pipeline_test --train-dir ./experiments/clic* --weights ./experiments/clic*/weights/weights-01-*.hdf5 +python mlpf/pipeline.py train --config parameters/clic.yaml --nepochs 1 --customize pipeline_test --ntrain 10 --ntest 10 +python mlpf/pipeline.py evaluate --nevents 10 --customize pipeline_test --train-dir ./experiments/clic* --weights ./experiments/clic*/weights/weights-01-*.hdf5 python mlpf/pipeline.py plots --train-dir ./experiments/clic* #try to train a fp16 model -python mlpf/pipeline.py train --config parameters/clic-fp16.yaml --nepochs 1 --customize pipeline_test +python mlpf/pipeline.py train --config parameters/mixedprecision/clic_fp16_bs1.yaml --nepochs 1 --customize pipeline_test --ntrain 10 --ntest 10
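
Note on the float16 configurations introduced above (parameters/mixedprecision/*.yaml): the combination of dtype: float16, cls_output_as_logits: yes, and the removal of the pcgrad option follows the usual TensorFlow mixed-precision recipe. The sketch below is illustrative only and is not code from this patch; the toy model and its dimensions are placeholders chosen to mirror the CLIC settings (17 input features per element, 6 output classes), not the repository's actual model factory.

    import tensorflow as tf
    from tensorflow.keras import mixed_precision

    # compute in float16, keep variables in float32
    mixed_precision.set_global_policy("mixed_float16")

    # hypothetical toy model: (num_elements, 17) inputs per event, 6-class output
    inputs = tf.keras.Input(shape=(None, 17))
    x = tf.keras.layers.Dense(128, activation="elu")(inputs)
    # force the classification head to emit float32 logits (cls_output_as_logits: yes),
    # which keeps the focal/cross-entropy loss numerically stable under float16 compute
    logits = tf.keras.layers.Dense(6, dtype="float32")(x)
    model = tf.keras.Model(inputs, logits)

    opt = tf.keras.optimizers.Adam(learning_rate=5e-4)
    # dynamic loss scaling is what makes fp16 gradients usable; gradient-surgery
    # wrappers such as pcgrad do not work with LossScaleOptimizer, which is why
    # the pcgrad flag is dropped from these configs
    opt = mixed_precision.LossScaleOptimizer(opt)
    model.compile(optimizer=opt,
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True))

Under the mixed_float16 policy, Keras wraps any unwrapped optimizer in a LossScaleOptimizer at compile time anyway, so the explicit wrap above only makes the incompatibility with pcgrad visible.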
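Note on the batching block: several configs now set bucket_by_sequence_length: no, and the former clic-fp16.yaml drops the (max_sequence_length, batch_size) table that had been tuned for an 8 GB GPU. For context, a minimal sketch of how such length-bucketed batching is typically expressed with tf.data is given below; the generator and feature count are hypothetical stand-ins, not this repository's dataset code, while the boundaries and batch sizes reproduce the table removed in this patch.

    import tensorflow as tf

    # toy ragged "events": each element is a (num_elements, 17) float tensor
    def gen():
        for n in [30, 120, 700, 2500]:
            yield tf.zeros([n, 17], tf.float32)

    ds = tf.data.Dataset.from_generator(
        gen, output_signature=tf.TensorSpec(shape=[None, 17], dtype=tf.float32))

    # small events are packed into large batches, large events into small ones,
    # so peak GPU memory per batch stays roughly constant
    boundaries = [25, 50, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 6000]
    batch_sizes = [200, 100, 50, 20, 10, 5, 3, 2, 2, 1, 1, 1]  # len(boundaries) + 1

    ds = ds.bucket_by_sequence_length(
        element_length_func=lambda ev: tf.shape(ev)[0],
        bucket_boundaries=boundaries,
        bucket_batch_sizes=batch_sizes,
        pad_to_bucket_boundary=False,
    )

With bucketing disabled, training falls back to the fixed batch_per_gpu values set in these configs (e.g. 20 or 40 in the mixedprecision variants).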