From d335cd3f345be0fa4b1872edd7e9d5d85ab81812 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Wed, 31 Jan 2024 10:40:07 +0200 Subject: [PATCH] towards v1.7: new CMS datasets, CLIC hit-based datasets, TF backward-compat optimizations (#285) * training with cms 1.7.0 * fix postprocessing for new uproot * track memory usage of pandora * readd dim decrease options * optimizer save/restore * hypertune * update track feats * add pytorch training on clic hits --------- Co-authored-by: Joosep Pata --- .github/workflows/pre-commit.yml | 4 +- .github/workflows/test.yml | 18 +- mlpf/data_cms/genjob_nopu.sh | 4 +- mlpf/data_cms/genjob_pu55to75.sh | 4 +- mlpf/data_cms/prepare_args.py | 18 +- mlpf/heptfds/clic_pf_edm4hep/single_gamma.py | 78 +++++ mlpf/heptfds/clic_pf_edm4hep/single_kaon0L.py | 78 +++++ mlpf/heptfds/clic_pf_edm4hep/single_pi.py | 78 +++++ mlpf/heptfds/clic_pf_edm4hep/utils_edm.py | 71 +++-- mlpf/heptfds/clic_pf_edm4hep_hits/qq.py | 3 +- mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k.py | 3 +- .../clic_pf_edm4hep_hits/single_ele.py | 3 +- .../clic_pf_edm4hep_hits/single_gamma.py | 3 +- .../clic_pf_edm4hep_hits/single_kaon0L.py | 3 +- .../heptfds/clic_pf_edm4hep_hits/single_mu.py | 3 +- .../clic_pf_edm4hep_hits/single_neutron.py | 3 +- .../heptfds/clic_pf_edm4hep_hits/single_pi.py | 3 +- .../clic_pf_edm4hep_hits/single_pi0.py | 3 +- mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py | 3 +- .../heptfds/clic_pf_edm4hep_hits/ttbar_10k.py | 3 +- .../heptfds/clic_pf_edm4hep_hits/utils_edm.py | 3 +- mlpf/heptfds/cms_pf/cms_utils.py | 13 + mlpf/heptfds/cms_pf/multiparticlegun.py | 3 +- mlpf/heptfds/cms_pf/qcd.py | 3 +- mlpf/heptfds/cms_pf/qcd_high_pt.py | 3 +- mlpf/heptfds/cms_pf/singleele.py | 3 +- mlpf/heptfds/cms_pf/singlegamma.py | 3 +- mlpf/heptfds/cms_pf/singlemu.py | 5 +- mlpf/heptfds/cms_pf/singleneutron.py | 3 +- mlpf/heptfds/cms_pf/singlepi.py | 3 +- mlpf/heptfds/cms_pf/singlepi0.py | 3 +- mlpf/heptfds/cms_pf/singleproton.py | 3 +- mlpf/heptfds/cms_pf/singletau.py | 3 +- mlpf/heptfds/cms_pf/smst1tttt.py | 3 +- mlpf/heptfds/cms_pf/ttbar.py | 4 +- mlpf/heptfds/cms_pf/vbf.py | 61 ++++ mlpf/heptfds/cms_pf/ztt.py | 3 +- mlpf/pipeline.py | 8 +- mlpf/pyg/PFDataset.py | 24 +- mlpf/pyg/training.py | 25 +- mlpf/pyg/utils.py | 39 ++- mlpf/pyg_pipeline.py | 19 +- mlpf/tfmodel/hypertuning.py | 10 +- mlpf/tfmodel/model.py | 4 + mlpf/tfmodel/model_setup.py | 33 ++- mlpf/tfmodel/utils.py | 10 +- .../paper_plots_2023_bin_size_ablation.ipynb | 182 ++++++++++++ parameters/cms-transformer.yaml | 280 ------------------ parameters/pyg-cms-physical.yaml | 125 -------- parameters/pyg-cms-small.yaml | 119 -------- parameters/pyg-cms-test-qcdhighpt.yaml | 101 ------- parameters/pyg-workflow-test.yaml | 97 ------ .../pyg-clic-hits.yaml} | 48 ++- parameters/{ => pytorch}/pyg-clic.yaml | 1 + parameters/{ => pytorch}/pyg-cms.yaml | 116 +++----- parameters/{ => pytorch}/pyg-delphes.yaml | 1 + .../bench/clic-hits-bench.yaml | 16 +- .../{ => tensorflow}/bench/delphes-bench.yaml | 0 parameters/{ => tensorflow}/clic-hits.yaml | 54 ++-- parameters/{ => tensorflow}/clic.yaml | 2 + .../binsize/clic_bin_size_128.yaml | 260 ++++++++++++++++ .../binsize/clic_bin_size_256.yaml | 260 ++++++++++++++++ .../binsize/clic_bin_size_32.yaml | 260 ++++++++++++++++ .../binsize/clic_bin_size_512.yaml} | 60 +++- .../binsize/clic_bin_size_64.yaml | 260 ++++++++++++++++ .../{cms-gen.yaml => tensorflow/cms.yaml} | 55 ++-- parameters/{ => tensorflow}/delphes.yaml | 0 scripts/fcc/clicRec_e4h_input.py | 60 ++-- scripts/fcc/postprocessing.py | 53 +++- 
scripts/fcc/postprocessing_hits.py | 28 +- scripts/fcc/run_pandora_timing.sh | 15 +- scripts/fcc/run_sim_gun_np.sh | 2 +- scripts/generate_tfds.sh | 36 +-- scripts/local_test_clic_hits_pipeline.sh | 2 +- scripts/local_test_clic_pipeline.sh | 2 +- scripts/local_test_cms_pipeline.sh | 4 +- scripts/local_test_delphes_pipeline.sh | 2 +- scripts/local_test_delphes_pytorch.sh | 63 ---- scripts/local_test_pyg.sh | 9 +- scripts/lumi/clic_bin_size_128.sh | 43 +++ scripts/lumi/clic_bin_size_256.sh | 43 +++ scripts/lumi/clic_bin_size_32.sh | 43 +++ scripts/lumi/clic_bin_size_512.sh | 43 +++ scripts/lumi/clic_bin_size_64.sh | 43 +++ scripts/lumi/pytorch.sh | 12 +- scripts/lumi/train-gpu-1.sh | 13 +- scripts/lumi/train-gpu-4.sh | 9 +- scripts/lumi/train-gpu-8.sh | 18 +- scripts/tallinn/a100/clic-hits-train.sh | 4 +- scripts/tallinn/a100/clic-train-hvd.sh | 20 -- scripts/tallinn/a100/clic-train.sh | 6 +- scripts/tallinn/a100/cms-train.sh | 2 +- scripts/tallinn/a100/eval.sh | 14 +- scripts/tallinn/a100/pytorch.sh | 4 +- scripts/tallinn/rtx/clic-train.sh | 2 +- scripts/tallinn/rtx/clic.sh | 16 + scripts/tallinn/rtx/delphes-train.sh | 2 +- scripts/tallinn/rtx/eval.sh | 4 +- scripts/tallinn/rtx/pytorch.sh | 53 ++-- 99 files changed, 2418 insertions(+), 1229 deletions(-) create mode 100644 mlpf/heptfds/clic_pf_edm4hep/single_gamma.py create mode 100644 mlpf/heptfds/clic_pf_edm4hep/single_kaon0L.py create mode 100644 mlpf/heptfds/clic_pf_edm4hep/single_pi.py create mode 100644 mlpf/heptfds/cms_pf/vbf.py create mode 100644 notebooks/clic/paper_plots_2023_bin_size_ablation.ipynb delete mode 100644 parameters/cms-transformer.yaml delete mode 100644 parameters/pyg-cms-physical.yaml delete mode 100644 parameters/pyg-cms-small.yaml delete mode 100644 parameters/pyg-cms-test-qcdhighpt.yaml delete mode 100644 parameters/pyg-workflow-test.yaml rename parameters/{pyg-cms-small-highqcd.yaml => pytorch/pyg-clic-hits.yaml} (77%) rename parameters/{ => pytorch}/pyg-clic.yaml (99%) rename parameters/{ => pytorch}/pyg-cms.yaml (60%) rename parameters/{ => pytorch}/pyg-delphes.yaml (99%) rename parameters/{ => tensorflow}/bench/clic-hits-bench.yaml (96%) rename parameters/{ => tensorflow}/bench/delphes-bench.yaml (100%) rename parameters/{ => tensorflow}/clic-hits.yaml (88%) rename parameters/{ => tensorflow}/clic.yaml (98%) create mode 100644 parameters/tensorflow/clic_studies/binsize/clic_bin_size_128.yaml create mode 100644 parameters/tensorflow/clic_studies/binsize/clic_bin_size_256.yaml create mode 100644 parameters/tensorflow/clic_studies/binsize/clic_bin_size_32.yaml rename parameters/{clic-test.yaml => tensorflow/clic_studies/binsize/clic_bin_size_512.yaml} (80%) create mode 100644 parameters/tensorflow/clic_studies/binsize/clic_bin_size_64.yaml rename parameters/{cms-gen.yaml => tensorflow/cms.yaml} (90%) rename parameters/{ => tensorflow}/delphes.yaml (100%) delete mode 100755 scripts/local_test_delphes_pytorch.sh create mode 100644 scripts/lumi/clic_bin_size_128.sh create mode 100644 scripts/lumi/clic_bin_size_256.sh create mode 100755 scripts/lumi/clic_bin_size_32.sh create mode 100644 scripts/lumi/clic_bin_size_512.sh create mode 100755 scripts/lumi/clic_bin_size_64.sh delete mode 100755 scripts/tallinn/a100/clic-train-hvd.sh create mode 100755 scripts/tallinn/rtx/clic.sh diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 7b4a8b148..1d46f188d 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -9,10 +9,10 @@ on: jobs: lint: name: Lint PR or 
Push to main - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: - python-version: [3.8.10] + python-version: [3.10.12] steps: - name: Checkout diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6e7723eb9..056f30201 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,7 +15,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.10.12" cache: "pip" - run: pip install -r requirements.txt deps-pyg: @@ -25,7 +25,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.10.12" cache: "pip" - run: pip install -r requirements.txt - run: pip3 install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu @@ -38,7 +38,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.10.12" cache: "pip" - run: pip install -r requirements.txt - run: PYTHONPATH=. python3 -m unittest tests/test_tf.py @@ -50,7 +50,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.10.12" cache: "pip" - run: pip install -r requirements.txt - run: ./scripts/local_test_clic_pipeline.sh @@ -62,7 +62,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.10.12" cache: "pip" - run: pip install -r requirements.txt - run: ./scripts/local_test_clic_hits_pipeline.sh @@ -74,7 +74,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.10.12" cache: "pip" - run: pip install -r requirements.txt - run: ./scripts/local_test_delphes_pipeline.sh @@ -86,7 +86,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.10.12" cache: "pip" - run: pip install -r requirements.txt - run: ./scripts/local_test_cms_pipeline.sh @@ -98,7 +98,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.10.12" cache: "pip" - run: pip install -r requirements.txt - run: pip3 install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu @@ -112,7 +112,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.10.12" cache: "pip" - run: pip install -r requirements.txt - run: pip3 install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu diff --git a/mlpf/data_cms/genjob_nopu.sh b/mlpf/data_cms/genjob_nopu.sh index 351e0d022..7e2c445fa 100755 --- a/mlpf/data_cms/genjob_nopu.sh +++ b/mlpf/data_cms/genjob_nopu.sh @@ -71,8 +71,8 @@ pwd ls -lrt echo "process.RandomNumberGeneratorService.generator.initialSeed = $SEED" >> step2_phase1_new.py -cmsRun step2_phase1_new.py -cmsRun step3_phase1_new.py +cmsRun step2_phase1_new.py > /dev/null +cmsRun step3_phase1_new.py > /dev/null #cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py mv pfntuple.root pfntuple_${SEED}.root python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table diff --git a/mlpf/data_cms/genjob_pu55to75.sh b/mlpf/data_cms/genjob_pu55to75.sh index 957999fdb..d035f0122 100755 --- a/mlpf/data_cms/genjob_pu55to75.sh +++ b/mlpf/data_cms/genjob_pu55to75.sh @@ -72,8 +72,8 @@ pwd ls -lrt 
echo "process.RandomNumberGeneratorService.generator.initialSeed = $SEED" >> step2_phase1_new.py -cmsRun step2_phase1_new.py -cmsRun step3_phase1_new.py +cmsRun step2_phase1_new.py > /dev/null +cmsRun step3_phase1_new.py > /dev/null #cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py mv pfntuple.root pfntuple_${SEED}.root python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table diff --git a/mlpf/data_cms/prepare_args.py b/mlpf/data_cms/prepare_args.py index 28b83feb7..626ced610 100644 --- a/mlpf/data_cms/prepare_args.py +++ b/mlpf/data_cms/prepare_args.py @@ -16,14 +16,16 @@ ("TTbar_14TeV_TuneCUETP8M1_cfi", 700000, 701000, "genjob_nopu.sh", outdir + "/nopu"), ("MultiParticlePFGun50_cfi", 800000, 810000, "genjob_nopu.sh", outdir + "/nopu"), - ("SingleElectronFlatPt1To1000_pythia8_cfi", 900000, 901000, "genjob_nopu.sh", outdir + "/nopu"), - ("SingleGammaFlatPt1To1000_pythia8_cfi", 1000000,1001000, "genjob_nopu.sh", outdir + "/nopu"), - ("SingleMuFlatPt1To1000_pythia8_cfi", 1100000,1101000, "genjob_nopu.sh", outdir + "/nopu"), - ("SingleNeutronFlatPt0p7To1000_cfi", 1200000,1201000, "genjob_nopu.sh", outdir + "/nopu"), - ("SinglePi0Pt1To1000_pythia8_cfi", 1300000,1301000, "genjob_nopu.sh", outdir + "/nopu"), - ("SinglePiMinusFlatPt0p7To1000_cfi", 1400000,1401000, "genjob_nopu.sh", outdir + "/nopu"), - ("SingleProtonMinusFlatPt0p7To1000_cfi", 1500000,1501000, "genjob_nopu.sh", outdir + "/nopu"), - ("SingleTauFlatPt1To1000_cfi", 1600000,1601000, "genjob_nopu.sh", outdir + "/nopu"), + ("SingleElectronFlatPt1To1000_pythia8_cfi", 900000, 910000, "genjob_nopu.sh", outdir + "/nopu"), + ("SingleGammaFlatPt1To1000_pythia8_cfi", 1000000,1010000, "genjob_nopu.sh", outdir + "/nopu"), + ("SingleMuFlatPt1To1000_pythia8_cfi", 1100000,1110000, "genjob_nopu.sh", outdir + "/nopu"), + ("SingleNeutronFlatPt0p7To1000_cfi", 1200000,1210000, "genjob_nopu.sh", outdir + "/nopu"), + ("SinglePi0Pt1To1000_pythia8_cfi", 1300000,1310000, "genjob_nopu.sh", outdir + "/nopu"), + ("SinglePiMinusFlatPt0p7To1000_cfi", 1400000,1410000, "genjob_nopu.sh", outdir + "/nopu"), + ("SingleProtonMinusFlatPt0p7To1000_cfi", 1500000,1510000, "genjob_nopu.sh", outdir + "/nopu"), + ("SingleTauFlatPt1To1000_cfi", 1600000,1610000, "genjob_nopu.sh", outdir + "/nopu"), + + ("VBF_TuneCP5_14TeV_pythia8_cfi", 1700000,1705010, "genjob_pu55to75.sh", outdir + "/pu55to75"), ] if __name__ == "__main__": diff --git a/mlpf/heptfds/clic_pf_edm4hep/single_gamma.py b/mlpf/heptfds/clic_pf_edm4hep/single_gamma.py new file mode 100644 index 000000000..885d371e4 --- /dev/null +++ b/mlpf/heptfds/clic_pf_edm4hep/single_gamma.py @@ -0,0 +1,78 @@ +from pathlib import Path + +import tensorflow as tf +from utils_edm import ( + X_FEATURES_CL, + X_FEATURES_TRK, + Y_FEATURES, + generate_examples, + split_sample, +) + +import tensorflow_datasets as tfds +import numpy as np + +_DESCRIPTION = """ +CLIC EDM4HEP dataset with single gamma particle gun + - X: reconstructed tracks and clusters, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event +""" + +_CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. 
https://doi.org/10.5281/zenodo.8260741 +""" + + +class ClicEdmSingleGammaPf(tfds.core.GeneratorBasedBuilder): + VERSION = tfds.core.Version("1.5.0") + RELEASE_NOTES = { + "1.5.0": "Regenerate with ARRAY_RECORD", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ + """ + + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmSingleGammaPf, self).__init__(*args, **kwargs) + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor( + shape=( + None, + max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), + ), + dtype=tf.float32, + ), + "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), + } + ), + supervised_keys=None, + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict( + x_features_track=X_FEATURES_TRK, + x_features_cluster=X_FEATURES_CL, + y_features=Y_FEATURES, + ), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + path = dl_manager.manual_dir + return split_sample(Path(path / "gamma/")) + + def _generate_examples(self, files): + return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/single_kaon0L.py b/mlpf/heptfds/clic_pf_edm4hep/single_kaon0L.py new file mode 100644 index 000000000..dcac642bf --- /dev/null +++ b/mlpf/heptfds/clic_pf_edm4hep/single_kaon0L.py @@ -0,0 +1,78 @@ +from pathlib import Path + +import tensorflow as tf +from utils_edm import ( + X_FEATURES_CL, + X_FEATURES_TRK, + Y_FEATURES, + generate_examples, + split_sample, +) + +import tensorflow_datasets as tfds +import numpy as np + +_DESCRIPTION = """ +CLIC EDM4HEP dataset with single kaon0L particle gun + - X: reconstructed tracks and clusters, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event +""" + +_CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 +""" + + +class ClicEdmSingleKaon0lPf(tfds.core.GeneratorBasedBuilder): + VERSION = tfds.core.Version("1.5.0") + RELEASE_NOTES = { + "1.5.0": "Regenerate with ARRAY_RECORD", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. 
+ + The processed tensorflow_dataset can also be downloaded from: + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ + """ + + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmSingleKaon0lPf, self).__init__(*args, **kwargs) + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor( + shape=( + None, + max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), + ), + dtype=tf.float32, + ), + "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), + } + ), + supervised_keys=None, + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict( + x_features_track=X_FEATURES_TRK, + x_features_cluster=X_FEATURES_CL, + y_features=Y_FEATURES, + ), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + path = dl_manager.manual_dir + return split_sample(Path(path / "kaon0L/")) + + def _generate_examples(self, files): + return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/single_pi.py b/mlpf/heptfds/clic_pf_edm4hep/single_pi.py new file mode 100644 index 000000000..b97c356b3 --- /dev/null +++ b/mlpf/heptfds/clic_pf_edm4hep/single_pi.py @@ -0,0 +1,78 @@ +from pathlib import Path + +import tensorflow as tf +from utils_edm import ( + X_FEATURES_CL, + X_FEATURES_TRK, + Y_FEATURES, + generate_examples, + split_sample_several, +) + +import tensorflow_datasets as tfds +import numpy as np + +_DESCRIPTION = """ +CLIC EDM4HEP dataset with single-pion particle gun + - X: reconstructed tracks and clusters, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event +""" + +_CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 +""" + + +class ClicEdmSinglePiPf(tfds.core.GeneratorBasedBuilder): + VERSION = tfds.core.Version("1.5.0") + RELEASE_NOTES = { + "1.5.0": "Regenerate with ARRAY_RECORD", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. 
+ + The processed tensorflow_dataset can also be downloaded from: + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ + """ + + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmSinglePiPf, self).__init__(*args, **kwargs) + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor( + shape=( + None, + max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), + ), + dtype=tf.float32, + ), + "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32), + } + ), + supervised_keys=None, + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict( + x_features_track=X_FEATURES_TRK, + x_features_cluster=X_FEATURES_CL, + y_features=Y_FEATURES, + ), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + path = dl_manager.manual_dir + return split_sample_several([Path(path / "pi-/"), Path(path / "pi+/")]) + + def _generate_examples(self, files): + return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py index ef5baff85..b7d66c0d9 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py +++ b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py @@ -2,6 +2,7 @@ import fastjet import numpy as np import vector +import random jetdef = fastjet.JetDefinition(fastjet.ee_genkt_algorithm, 0.7, -1.0) min_jet_pt = 5.0 # GeV @@ -48,6 +49,9 @@ Y_FEATURES = ["PDG", "charge", "pt", "eta", "sin_phi", "cos_phi", "energy", "jet_idx"] labels = [0, 211, 130, 22, 11, 13] +N_X_FEATURES = max(len(X_FEATURES_CL), len(X_FEATURES_TRK)) +N_Y_FEATURES = len(Y_FEATURES) + def split_sample(path, test_frac=0.8): files = sorted(list(path.glob("*.parquet"))) @@ -65,29 +69,23 @@ def split_sample(path, test_frac=0.8): def split_sample_several(paths, test_frac=0.8): - files_train_tot = [] - files_test_tot = [] - for path in paths: - files = sorted(list(path.glob("*.parquet"))) - print("Found {} files in {}".format(files, path)) - assert len(files) > 0 - idx_split = int(test_frac * len(files)) - files_train = files[:idx_split] - files_test = files[idx_split:] - assert len(files_train) > 0 - assert len(files_test) > 0 - files_train_tot.append(files_train) - files_test_tot.append(files_test) - + files = sum([list(path.glob("*.parquet")) for path in paths], []) + random.shuffle(files) + print("Found {} files".format(len(files))) + assert len(files) > 0 + idx_split = int(test_frac * len(files)) + files_train = files[:idx_split] + files_test = files[idx_split:] + assert len(files_train) > 0 + assert len(files_test) > 0 return { - "train": generate_examples(files_train_tot), - "test": generate_examples(files_test_tot), + "train": generate_examples(files_train), + "test": generate_examples(files_test), } def prepare_data_clic(fn, with_jet_idx=True): ret = ak.from_parquet(fn) - X_track = ret["X_track"] X_cluster = ret["X_cluster"] @@ -102,30 +100,45 @@ def prepare_data_clic(fn, with_jet_idx=True): X1 = ak.to_numpy(X_track[iev]) X2 = ak.to_numpy(X_cluster[iev]) - if len(X1) == 0 or len(X2) == 0: + if len(X1) == 0 and len(X2) == 0: continue + if len(X1) == 0: + X1 = np.zeros((0, N_X_FEATURES)) + if len(X2) == 0: + X2 = np.zeros((0, N_X_FEATURES)) + ygen_track = ak.to_numpy(ret["ygen_track"][iev]) ygen_cluster 
= ak.to_numpy(ret["ygen_cluster"][iev]) ycand_track = ak.to_numpy(ret["ycand_track"][iev]) ycand_cluster = ak.to_numpy(ret["ycand_cluster"][iev]) - if len(ygen_track) == 0 or len(ygen_cluster) == 0: + if len(ygen_track) == 0 and len(ygen_cluster) == 0: continue + if len(ygen_track) == 0: + ygen_track = np.zeros((0, N_Y_FEATURES - 1)) + if len(ygen_cluster) == 0: + ygen_cluster = np.zeros((0, N_Y_FEATURES - 1)) + if len(ycand_track) == 0: + ycand_track = np.zeros((0, N_Y_FEATURES - 1)) + if len(ycand_cluster) == 0: + ycand_cluster = np.zeros((0, N_Y_FEATURES - 1)) + # pad feature dim between tracks and clusters to the same size - if X1.shape[1] < X2.shape[1]: - X1 = np.pad(X1, [[0, 0], [0, X2.shape[1] - X1.shape[1]]]) - if X2.shape[1] < X1.shape[1]: - X2 = np.pad(X2, [[0, 0], [0, X1.shape[1] - X2.shape[1]]]) + if X1.shape[1] < N_X_FEATURES: + X1 = np.pad(X1, [[0, 0], [0, N_X_FEATURES - X1.shape[1]]]) + if X2.shape[1] < N_X_FEATURES: + X2 = np.pad(X2, [[0, 0], [0, N_X_FEATURES - X2.shape[1]]]) # concatenate tracks and clusters in features and targets X = np.concatenate([X1, X2]) ygen = np.concatenate([ygen_track, ygen_cluster]) ycand = np.concatenate([ycand_track, ycand_cluster]) - assert ygen.shape[0] == X.shape[0] - assert ycand.shape[0] == X.shape[0] + if (ygen.shape[0] != X.shape[0]) or (ycand.shape[0] != X.shape[0]): + print(X.shape, ygen.shape, ycand.shape) + continue # add jet_idx column if with_jet_idx: @@ -194,6 +207,7 @@ def prepare_data_clic(fn, with_jet_idx=True): def generate_examples(files, with_jet_idx=True): for fi in files: + print(fi) Xs, ygens, ycands = prepare_data_clic(fi, with_jet_idx=with_jet_idx) for iev in range(len(Xs)): yield str(fi) + "_" + str(iev), { @@ -204,5 +218,10 @@ def generate_examples(files, with_jet_idx=True): if __name__ == "__main__": - for ex in generate_examples(["data/p8_ee_ZZ_fullhad_ecm365/reco_p8_ee_ZZ_fullhad_ecm365_1.parquet"]): + for ex in generate_examples( + [ + "/local/joosep/mlpf/clic_edm4hep/pi+/reco_pi+_98.parquet", + "/local/joosep/mlpf/clic_edm4hep/pi-/reco_pi-_11.parquet", + ] + ): print(ex[0], ex[1]["X"].shape, ex[1]["ygen"].shape, ex[1]["ycand"].shape) diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/qq.py b/mlpf/heptfds/clic_pf_edm4hep_hits/qq.py index fdf43fd57..adbaf782d 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/qq.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/qq.py @@ -26,13 +26,14 @@ class ClicEdmQqHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "0.9.0": "Small stats", "1.0.0": "Initial release", "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", "1.5.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Update track features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k.py b/mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k.py index e4be36050..d0a21ee60 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k.py @@ -15,9 +15,10 @@ class ClicEdmQqHitsPf10k(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.5.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Update track features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. 
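Note on consuming the rebuilt datasets: the hit-based CLIC builders below are all bumped to 1.7.0 and must be regenerated before training. A minimal sketch of loading one at the pinned version, mirroring the `builder.as_data_source(split=...)` call that `PFDataset` uses later in this patch — the snake_case dataset name is assumed from the TFDS convention for the `ClicEdmQqHitsPf10k` class and is not spelled out in the patch itself:

import tensorflow_datasets as tfds

# Assumed name: TFDS derives "clic_edm_qq_hits_pf_10k" from ClicEdmQqHitsPf10k.
builder = tfds.builder("clic_edm_qq_hits_pf_10k:1.7.0", data_dir="./tensorflow_datasets/")
ds = builder.as_data_source(split="train")  # ARRAY_RECORD files support random access
print(len(ds), ds[0]["X"].shape, ds[0]["ygen"].shape)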
diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_ele.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_ele.py index 22ff25578..9d6978724 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_ele.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_ele.py @@ -26,11 +26,12 @@ class ClicEdmSingleElectronHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticels", "1.5.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Update track features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_gamma.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_gamma.py index ab4493370..ad4a0e018 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_gamma.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_gamma.py @@ -26,11 +26,12 @@ class ClicEdmSingleGammaHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", "1.5.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Update track features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_kaon0L.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_kaon0L.py index a5cc947f3..1edda0adc 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_kaon0L.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_kaon0L.py @@ -26,11 +26,12 @@ class ClicEdmSingleKaon0lHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", "1.5.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Update track features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_mu.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_mu.py index a40cc466d..37abbb3fc 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_mu.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_mu.py @@ -26,11 +26,12 @@ class ClicEdmSingleMuonHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", "1.5.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Update track features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. 
diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_neutron.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_neutron.py index ef9569259..6ad530258 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_neutron.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_neutron.py @@ -26,11 +26,12 @@ class ClicEdmSingleNeutronHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", "1.5.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Update track features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi.py index 8b4ca7b0e..33d498b51 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi.py @@ -26,11 +26,12 @@ class ClicEdmSinglePiHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", "1.5.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Update track features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi0.py b/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi0.py index 6570fd8a6..6a9694007 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi0.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/single_pi0.py @@ -26,11 +26,12 @@ class ClicEdmSinglePi0HitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", "1.5.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Update track features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py b/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py index 0c9cb3d4e..0d7832737 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py @@ -26,13 +26,14 @@ class ClicEdmTtbarHitsPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "0.9.0": "Small stats", "1.0.0": "Initial release", "1.1.0": "Remove track referencepoint feature", "1.2.0": "Keep all interacting genparticles", "1.5.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Update track features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar_10k.py b/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar_10k.py index 528ba57b6..6ffad1784 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar_10k.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar_10k.py @@ -15,9 +15,10 @@ class ClicEdmTtbarHitsPf10k(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("1.5.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.5.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Update track features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. 
diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep_hits/utils_edm.py index 7f8cc19c8..84309c462 100644 --- a/mlpf/heptfds/clic_pf_edm4hep_hits/utils_edm.py +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/utils_edm.py @@ -13,13 +13,14 @@ "p", "chi2", "ndf", + "dEdx", + "dEdxError", "radiusOfInnermostHit", "tanLambda", "D0", "omega", "Z0", "time", - "type", ] X_FEATURES_CH = [ "elemtype", diff --git a/mlpf/heptfds/cms_pf/cms_utils.py b/mlpf/heptfds/cms_pf/cms_utils.py index 46dee72eb..ebd592bb4 100644 --- a/mlpf/heptfds/cms_pf/cms_utils.py +++ b/mlpf/heptfds/cms_pf/cms_utils.py @@ -90,6 +90,19 @@ "lambdaerror", "theta", "thetaerror", + "time", + "timeerror", + "etaerror1", + "etaerror2", + "etaerror3", + "etaerror4", + "phierror1", + "phierror2", + "phierror3", + "phierror4", + "sigma_x", + "sigma_y", + "sigma_z", ] Y_FEATURES = [ diff --git a/mlpf/heptfds/cms_pf/multiparticlegun.py b/mlpf/heptfds/cms_pf/multiparticlegun.py index 57b67e57a..7f737425e 100644 --- a/mlpf/heptfds/cms_pf/multiparticlegun.py +++ b/mlpf/heptfds/cms_pf/multiparticlegun.py @@ -21,10 +21,11 @@ class CmsPfMultiParticleGun(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_multi_particle_gun dataset.""" - VERSION = tfds.core.Version("1.6.1") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.6.0": "Initial release", "1.6.1": "Additional stats", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress \ diff --git a/mlpf/heptfds/cms_pf/qcd.py b/mlpf/heptfds/cms_pf/qcd.py index e0b450581..4d5a200d5 100644 --- a/mlpf/heptfds/cms_pf/qcd.py +++ b/mlpf/heptfds/cms_pf/qcd.py @@ -21,7 +21,7 @@ class CmsPfQcd(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_qcd dataset.""" - VERSION = tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", "1.3.1": "Remove PS again", @@ -29,6 +29,7 @@ class CmsPfQcd(tfds.core.GeneratorBasedBuilder): "1.5.0": "No padding", "1.5.1": "Remove outlier caps", "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_qcd ~/tensorflow_datasets/ diff --git a/mlpf/heptfds/cms_pf/qcd_high_pt.py b/mlpf/heptfds/cms_pf/qcd_high_pt.py index fe038f489..d88bd3514 100644 --- a/mlpf/heptfds/cms_pf/qcd_high_pt.py +++ b/mlpf/heptfds/cms_pf/qcd_high_pt.py @@ -21,7 +21,7 @@ class CmsPfQcdHighPt(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_qcd_high_pt dataset.""" - VERSION = tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", "1.3.1": "Remove PS again", @@ -29,6 +29,7 @@ class CmsPfQcdHighPt(tfds.core.GeneratorBasedBuilder): "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress \ diff --git a/mlpf/heptfds/cms_pf/singleele.py b/mlpf/heptfds/cms_pf/singleele.py index b90a645d4..0cf50e192 100644 --- a/mlpf/heptfds/cms_pf/singleele.py +++ b/mlpf/heptfds/cms_pf/singleele.py @@ -21,7 +21,7 @@ class CmsPfSingleElectron(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singleele dataset.""" - VERSION = tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.0.0": 
"Initial release.", "1.1.0": "Initial release.", @@ -30,6 +30,7 @@ class CmsPfSingleElectron(tfds.core.GeneratorBasedBuilder): "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress \ diff --git a/mlpf/heptfds/cms_pf/singlegamma.py b/mlpf/heptfds/cms_pf/singlegamma.py index fdda8c8bf..2200a8ea0 100644 --- a/mlpf/heptfds/cms_pf/singlegamma.py +++ b/mlpf/heptfds/cms_pf/singlegamma.py @@ -21,7 +21,7 @@ class CmsPfSingleGamma(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlegamma dataset.""" - VERSION = tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.1.0": "Initial release", "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", @@ -29,6 +29,7 @@ class CmsPfSingleGamma(tfds.core.GeneratorBasedBuilder): "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress \ diff --git a/mlpf/heptfds/cms_pf/singlemu.py b/mlpf/heptfds/cms_pf/singlemu.py index 59ccda34f..4a8adddc5 100644 --- a/mlpf/heptfds/cms_pf/singlemu.py +++ b/mlpf/heptfds/cms_pf/singlemu.py @@ -21,7 +21,7 @@ class CmsPfSingleMu(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlemu dataset.""" - VERSION = tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Add muon type, fix electron GSF association", @@ -29,6 +29,7 @@ class CmsPfSingleMu(tfds.core.GeneratorBasedBuilder): "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_single_mu ~/tensorflow_datasets/ @@ -60,7 +61,7 @@ def _info(self) -> tfds.core.DatasetInfo: def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.manual_dir - sample_dir = "SingleMuFlatLogPt_100MeVto2TeV_cfi" + sample_dir = "SingleMuFlatPt1To1000_pythia8_cfi" return cms_utils.split_sample(path / sample_dir / "raw") def _generate_examples(self, files): diff --git a/mlpf/heptfds/cms_pf/singleneutron.py b/mlpf/heptfds/cms_pf/singleneutron.py index 8f9d5168a..e2c0debb4 100644 --- a/mlpf/heptfds/cms_pf/singleneutron.py +++ b/mlpf/heptfds/cms_pf/singleneutron.py @@ -21,7 +21,7 @@ class CmsPfSingleNeutron(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singleneutron dataset.""" - VERSION = tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.1.0": "Initial release", "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", @@ -29,6 +29,7 @@ class CmsPfSingleNeutron(tfds.core.GeneratorBasedBuilder): "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress \ diff --git a/mlpf/heptfds/cms_pf/singlepi.py b/mlpf/heptfds/cms_pf/singlepi.py index ba74aa853..e587cabeb 100644 --- a/mlpf/heptfds/cms_pf/singlepi.py +++ b/mlpf/heptfds/cms_pf/singlepi.py @@ -21,7 +21,7 @@ class CmsPfSinglePi(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlepi dataset.""" - VERSION = 
tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Add muon type, fix electron GSF association", @@ -30,6 +30,7 @@ class CmsPfSinglePi(tfds.core.GeneratorBasedBuilder): "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_single_pi ~/tensorflow_datasets/ diff --git a/mlpf/heptfds/cms_pf/singlepi0.py b/mlpf/heptfds/cms_pf/singlepi0.py index 69b97b120..df997621f 100644 --- a/mlpf/heptfds/cms_pf/singlepi0.py +++ b/mlpf/heptfds/cms_pf/singlepi0.py @@ -21,7 +21,7 @@ class CmsPfSinglePi0(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlepi0 dataset.""" - VERSION = tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.1.0": "Initial release", "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", @@ -29,6 +29,7 @@ class CmsPfSinglePi0(tfds.core.GeneratorBasedBuilder): "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress \ diff --git a/mlpf/heptfds/cms_pf/singleproton.py b/mlpf/heptfds/cms_pf/singleproton.py index fc52efbaa..65e72668e 100644 --- a/mlpf/heptfds/cms_pf/singleproton.py +++ b/mlpf/heptfds/cms_pf/singleproton.py @@ -23,7 +23,7 @@ class CmsPfSingleProton(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singleproton dataset.""" - VERSION = tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.1.0": "Initial release", "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", @@ -31,6 +31,7 @@ class CmsPfSingleProton(tfds.core.GeneratorBasedBuilder): "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress \ diff --git a/mlpf/heptfds/cms_pf/singletau.py b/mlpf/heptfds/cms_pf/singletau.py index f5dbe99ef..4231fff62 100644 --- a/mlpf/heptfds/cms_pf/singletau.py +++ b/mlpf/heptfds/cms_pf/singletau.py @@ -23,7 +23,7 @@ class CmsPfSingleTau(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singletau dataset.""" - VERSION = tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.1.0": "Add muon type, fix electron GSF association", "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", @@ -31,6 +31,7 @@ class CmsPfSingleTau(tfds.core.GeneratorBasedBuilder): "1.5.0": "Without padding", "1.5.1": "Remove outlier caps", "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress \ diff --git a/mlpf/heptfds/cms_pf/smst1tttt.py b/mlpf/heptfds/cms_pf/smst1tttt.py index 2201a9d78..05c4cb830 100644 --- a/mlpf/heptfds/cms_pf/smst1tttt.py +++ b/mlpf/heptfds/cms_pf/smst1tttt.py @@ -21,9 +21,10 @@ class CmsPfSmsT1tttt(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf dataset.""" - VERSION = tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress \ diff --git a/mlpf/heptfds/cms_pf/ttbar.py 
b/mlpf/heptfds/cms_pf/ttbar.py index 51e436144..34d67895a 100644 --- a/mlpf/heptfds/cms_pf/ttbar.py +++ b/mlpf/heptfds/cms_pf/ttbar.py @@ -21,7 +21,7 @@ class CmsPfTtbar(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf dataset.""" - VERSION = tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Add muon type, fix electron GSF association", @@ -32,6 +32,7 @@ class CmsPfTtbar(tfds.core.GeneratorBasedBuilder): "1.5.0": "No padding", "1.5.1": "Remove outlier caps", "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ttbar ~/tensorflow_datasets/ @@ -43,7 +44,6 @@ def __init__(self, *args, **kwargs): def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" - # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object return tfds.core.DatasetInfo( builder=self, description=_DESCRIPTION, diff --git a/mlpf/heptfds/cms_pf/vbf.py b/mlpf/heptfds/cms_pf/vbf.py new file mode 100644 index 000000000..e91cab2fe --- /dev/null +++ b/mlpf/heptfds/cms_pf/vbf.py @@ -0,0 +1,61 @@ +"""CMS PF VBF dataset.""" +import cms_utils +import tensorflow as tf + +import tensorflow_datasets as tfds + +X_FEATURES = cms_utils.X_FEATURES +Y_FEATURES = cms_utils.Y_FEATURES + +_DESCRIPTION = """ +Dataset generated with CMSSW and full detector sim. + +VBF events with PU~55 in a Run3 setup. +""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + + +class CmsPfVbf(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf dataset.""" + + VERSION = tfds.core.Version("1.7.0") + RELEASE_NOTES = { + "1.7.0": "Add cluster shape vars", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_vbf ~/tensorflow_datasets/ + """ + + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfVbf, self).__init__(*args, **kwargs) + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(None, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "VBF_TuneCP5_14TeV_pythia8_cfi" + return cms_utils.split_sample(path / sample_dir / "raw") + + def _generate_examples(self, files): + return cms_utils.generate_examples(files) diff --git a/mlpf/heptfds/cms_pf/ztt.py b/mlpf/heptfds/cms_pf/ztt.py index 60d984bc2..0e6ad7fa6 100644 --- a/mlpf/heptfds/cms_pf/ztt.py +++ b/mlpf/heptfds/cms_pf/ztt.py @@ -21,7 +21,7 @@ class CmsPfZtt(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_ztt dataset.""" - VERSION = tfds.core.Version("1.6.0") + VERSION = tfds.core.Version("1.7.0") RELEASE_NOTES = { "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", "1.3.1": "Remove PS again", @@ -29,6 +29,7 @@ class
CmsPfZtt(tfds.core.GeneratorBasedBuilder): "1.5.0": "No padding", "1.5.1": "Remove outlier caps", "1.6.0": "Regenerate with ARRAY_RECORD", + "1.7.0": "Add cluster shape vars", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ztt ~/tensorflow_datasets/ diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 4b08894fd..4b4097869 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -686,7 +686,7 @@ def find_lr(config, outdir, figname, logscale): ) with strategy.scope(): - model, _, _ = model_scope(config, 1) + model, _, initial_epoch = model_scope(config, 1) max_steps = 200 lr_finder = LRFinder(max_steps=max_steps) callbacks = [lr_finder] @@ -773,20 +773,22 @@ def hypertune(config, outdir, ntrain, ntest, recreate, num_cpus): tuner = get_tuner(config["hypertune"], model_builder, outdir, recreate, strategy) tuner.search_space_summary() + from tensorflow.keras.callbacks import TensorBoard + tuner.search( ds_train.tensorflow_dataset.repeat(), epochs=config["setup"]["num_epochs"], validation_data=ds_test.tensorflow_dataset.repeat(), steps_per_epoch=ds_train.num_steps(), validation_steps=ds_test.num_steps(), - callbacks=[], + callbacks=[TensorBoard(log_dir=outdir + "/logs")], ) logging.info("Hyperparameter search complete.") shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference tuner.results_summary() for trial in tuner.oracle.get_best_trials(num_trials=10): - logging.info(trial.hyperparameters.values, trial.score) + print(trial.hyperparameters.values, trial.score) # diff --git a/mlpf/pyg/PFDataset.py b/mlpf/pyg/PFDataset.py index 6e632f594..f6651fd0f 100644 --- a/mlpf/pyg/PFDataset.py +++ b/mlpf/pyg/PFDataset.py @@ -10,14 +10,17 @@ from pyg.logger import _logger +import numpy as np + class TFDSDataSource: - def __init__(self, ds): + def __init__(self, ds, sort): self.ds = ds tmp = self.ds.dataset_info self.ds.dataset_info = SimpleNamespace() self.ds.dataset_info.name = tmp.name self.ds.dataset_info.features = tmp.features + self.sort = sort self.rep = self.ds.__repr__() def __getitem__(self, item): @@ -28,6 +31,13 @@ def __getitem__(self, item): if len(item) == 1: ret = ret[0] + # sorting the elements in pT descending order for the Mamba-based model + if self.sort: + sortidx = np.argsort(ret["X"][:, 1])[::-1] + ret["X"] = ret["X"][sortidx] + ret["ycand"] = ret["ycand"][sortidx] + ret["ygen"] = ret["ygen"][sortidx] + return ret def __len__(self): @@ -40,7 +50,7 @@ def __repr__(self): class PFDataset: """Builds a DataSource from tensorflow datasets.""" - def __init__(self, data_dir, name, split, num_samples=None): + def __init__(self, data_dir, name, split, num_samples=None, sort=False): """ Args data_dir: path to tensorflow_datasets (e.g. 
`../data/tensorflow_datasets/`) @@ -53,7 +63,7 @@ def __init__(self, data_dir, name, split, num_samples=None): builder = tfds.builder(name, data_dir=data_dir) - self.ds = TFDSDataSource(builder.as_data_source(split=split)) + self.ds = TFDSDataSource(builder.as_data_source(split=split), sort=sort) if num_samples: self.ds = torch.utils.data.Subset(self.ds, range(num_samples)) @@ -185,7 +195,13 @@ def get_interleaved_dataloaders(world_size, rank, config, use_cuda, pad_3d, use_ for sample in config[f"{split}_dataset"][config["dataset"]][type_]["samples"]: version = config[f"{split}_dataset"][config["dataset"]][type_]["samples"][sample]["version"] - ds = PFDataset(config["data_dir"], f"{sample}:{version}", split, num_samples=config[f"n{split}"]).ds + ds = PFDataset( + config["data_dir"], + f"{sample}:{version}", + split, + num_samples=config[f"n{split}"], + sort=config["sort_data"], + ).ds if (rank == 0) or (rank == "cpu"): _logger.info(f"{split}_dataset: {sample}, {len(ds)}", color="blue") diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 3d1842736..1e551f72e 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -317,9 +317,11 @@ def train_mlpf( """ if (rank == 0) or (rank == "cpu"): - tensorboard_writer = SummaryWriter(f"{outdir}/runs/") + tensorboard_writer_train = SummaryWriter(f"{outdir}/runs/train") + tensorboard_writer_valid = SummaryWriter(f"{outdir}/runs/valid") else: - tensorboard_writer = None + tensorboard_writer_train = None + tensorboard_writer_valid = None t0_initial = time.time() @@ -414,14 +416,14 @@ def train_mlpf( if (rank == 0) or (rank == "cpu"): for k, v in losses_t.items(): - tensorboard_writer.add_scalar("epoch/train_loss_" + k, v, epoch) + tensorboard_writer_train.add_scalar("epoch/loss_" + k, v, epoch) for loss in losses_of_interest: losses["train"][loss].append(losses_t[loss]) losses["valid"][loss].append(losses_v[loss]) for k, v in losses_v.items(): - tensorboard_writer.add_scalar("epoch/valid_loss_" + k, v, epoch) + tensorboard_writer_valid.add_scalar("epoch/loss_" + k, v, epoch) t1 = time.time() @@ -464,8 +466,10 @@ def train_mlpf( with open(f"{outdir}/mlpf_losses.pkl", "wb") as f: pkl.dump(losses, f) - if tensorboard_writer: - tensorboard_writer.flush() + if tensorboard_writer_train: + tensorboard_writer_train.flush() + if tensorboard_writer_valid: + tensorboard_writer_valid.flush() if world_size > 1: dist.barrier() @@ -499,11 +503,18 @@ def run(rank, world_size, config, args, outdir, logfile): with open(f"{loaddir}/model_kwargs.pkl", "rb") as f: model_kwargs = pkl.load(f) - + _logger.info("model_kwargs: {}".format(model_kwargs)) model = MLPF(**model_kwargs).to(torch.device(rank)) optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"]) checkpoint = torch.load(config["load"], map_location=torch.device(rank)) + + for k in model.state_dict().keys(): + shp0 = model.state_dict()[k].shape + shp1 = checkpoint["model_state_dict"][k].shape + if shp0 != shp1: + raise Exception("shape mismatch in {}, {}!={}".format(k, shp0, shp1)) + testdir_name = "_" + Path(config["load"]).name if (rank == 0) or (rank == "cpu"): _logger.info("Loaded model weights from {}".format(config["load"]), color="bold") diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index a08facb75..9917846ed 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -12,22 +12,26 @@ "cms": [0, 211, 130, 1, 2, 22, 11, 13, 15], "delphes": [0, 211, 130, 22, 11, 13], "clic": [0, 211, 130, 22, 11, 13], + "clic_hits": [0, 211, 130, 22, 11, 13], } CLASS_NAMES_LATEX = { "cms": ["none", 
"Charged Hadron", "Neutral Hadron", "HFEM", "HFHAD", r"$\gamma$", r"$e^\pm$", r"$\mu^\pm$", r"$\tau$"], "delphes": ["none", "Charged Hadron", "Neutral Hadron", r"$\gamma$", r"$e^\pm$", r"$\mu^\pm$"], "clic": ["none", "Charged Hadron", "Neutral Hadron", r"$\gamma$", r"$e^\pm$", r"$\mu^\pm$"], + "clic_hits": ["none", "Charged Hadron", "Neutral Hadron", r"$\gamma$", r"$e^\pm$", r"$\mu^\pm$"], } CLASS_NAMES = { "cms": ["none", "chhad", "nhad", "HFEM", "HFHAD", "gamma", "ele", "mu", "tau"], "delphes": ["none", "chhad", "nhad", "gamma", "ele", "mu"], "clic": ["none", "chhad", "nhad", "gamma", "ele", "mu"], + "clic_hits": ["none", "chhad", "nhad", "gamma", "ele", "mu"], } CLASS_NAMES_CAPITALIZED = { "cms": ["none", "Charged hadron", "Neutral hadron", "HFEM", "HFHAD", "Photon", "Electron", "Muon", "Tau"], "delphes": ["none", "Charged hadron", "Neutral hadron", "Photon", "Electron", "Muon"], "clic": ["none", "Charged hadron", "Neutral hadron", "Photon", "Electron", "Muon"], + "clic_hits": ["none", "Charged hadron", "Neutral hadron", "Photon", "Electron", "Muon"], } X_FEATURES = { @@ -74,6 +78,19 @@ "lambdaerror", "theta", "thetaerror", + "time", + "timeerror", + "etaerror1", + "etaerror2", + "etaerror3", + "etaerror4", + "phierror1", + "phierror2", + "phierror3", + "phierror4", + "sigma_x", + "sigma_y", + "sigma_z", ], "delphes": [ "Track|cluster", @@ -108,9 +125,27 @@ "time | sigma_y", "Null | sigma_z", ], + "clic_hits": [ + "elemtype", + "pt | et", + "eta", + "sin_phi", + "cos_phi", + "p | energy", + "chi2 | position.x", + "ndf | position.y", + "dEdx | position.z", + "dEdxError | time", + "radiusOfInnermostHit | subdetector", + "tanLambda | type", + "D0 | Null", + "omega | Null", + "Z0 | Null", + "time | Null", + ], } -Y_FEATURES = ["cls_id", "charge", "pt", "eta", "sin_phi", "cos_phi", "energy", "jet_idx"] +Y_FEATURES = ["cls_id", "charge", "pt", "eta", "sin_phi", "cos_phi", "energy"] def unpack_target(y): @@ -130,7 +165,7 @@ def unpack_target(y): # assert torch.all(ret["energy"] >= 0.0) # energy # note ~ momentum = ["pt", "eta", "sin_phi", "cos_phi", "energy"] - ret["momentum"] = y[..., 2:-1].to(dtype=torch.float32) + ret["momentum"] = y[..., 2:7].to(dtype=torch.float32) ret["p4"] = torch.cat( [ret["pt"].unsqueeze(1), ret["eta"].unsqueeze(1), ret["phi"].unsqueeze(1), ret["energy"].unsqueeze(1)], axis=1 ) diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index 0ec9fc08d..6feddcc6d 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -29,7 +29,12 @@ "--gpu-batch-multiplier", type=int, default=None, help="Increase batch size per GPU by this constant factor" ) parser.add_argument( - "--dataset", type=str, default=None, choices=["clic", "cms", "delphes"], required=False, help="which dataset?" 
+ "--dataset", + type=str, + default=None, + choices=["clic", "cms", "delphes", "clic_hits"], + required=False, + help="which dataset?", ) parser.add_argument("--num-workers", type=int, default=None, help="number of processes to load the data") parser.add_argument("--prefetch-factor", type=int, default=None, help="number of samples to fetch & prefetch at every call") @@ -65,6 +70,7 @@ parser.add_argument("--comet-offline", action="store_true", help="save comet logs locally") parser.add_argument("--comet-step-freq", type=int, default=None, help="step frequency for saving comet metrics") parser.add_argument("--experiments-dir", type=str, default=None, help="base directory within which trainings are stored") +parser.add_argument("--pipeline", action="store_true", default=None, help="test is running in pipeline") def main(): @@ -76,6 +82,17 @@ def main(): with open(args.config, "r") as stream: # load config (includes: which physics samples, model params) config = yaml.safe_load(stream) + # override some options for the pipeline test + if args.pipeline: + if config["dataset"] == "cms": + for ds in ["train_dataset", "test_dataset", "valid_dataset"]: + config[ds]["cms"] = { + "physical": { + "batch_size": config[ds]["cms"]["physical"]["batch_size"], + "samples": {"cms_pf_ttbar": config[ds]["cms"]["physical"]["samples"]["cms_pf_ttbar"]}, + } + } + # override loaded config with values from command line args config = override_config(config, args) diff --git a/mlpf/tfmodel/hypertuning.py b/mlpf/tfmodel/hypertuning.py index f0e2fdf1c..3ace72d82 100644 --- a/mlpf/tfmodel/hypertuning.py +++ b/mlpf/tfmodel/hypertuning.py @@ -6,12 +6,12 @@ def get_model_builder(config, total_steps): lr_schedule, optim_callbacks, lr = get_lr_schedule(config, steps=total_steps) def model_builder(hp): - node_encoding_hidden_dim = hp.Choice("node_dim", values=[128, 256, 512]) + node_encoding_hidden_dim = hp.Choice("node_dim", values=[128, 256, 512, 1024]) config["parameters"]["node_encoding_hidden_dim"] = node_encoding_hidden_dim - config["parameters"]["num_graph_layers_id"] = hp.Choice("num_graph_layers_id", [1, 2, 3]) - config["parameters"]["num_graph_layers_reg"] = hp.Choice("num_graph_layers_reg", [1, 2, 3]) + config["parameters"]["num_graph_layers_id"] = hp.Choice("num_graph_layers_id", [1, 2, 3, 4, 5]) + config["parameters"]["num_graph_layers_reg"] = hp.Choice("num_graph_layers_reg", [1, 2, 3, 4, 5]) config["parameters"]["combined_graph_layer"]["dropout"] = hp.Choice("cg_dropout", values=[0.0, 0.1, 0.2]) config["parameters"]["combined_graph_layer"]["num_node_messages"] = hp.Choice("num_node_messages", [1, 2]) @@ -29,12 +29,14 @@ def model_builder(hp): config["parameters"]["output_decoding"]["dropout"] = hp.Choice("output_dropout", values=[0.0, 0.1, 0.2]) config["parameters"]["output_decoding"]["layernorm"] = hp.Choice("output_layernorm", values=[True, False]) config["parameters"]["output_decoding"]["mask_reg_cls0"] = hp.Choice("output_mask_reg_cls0", values=[True, False]) + config["parameters"]["skip_connection"] = hp.Choice("skip_connection", values=[True, False]) + config["parameters"]["node_update_mode"] = hp.Choice("node_update_mode", values=["additive", "concat"]) model = make_model(config, dtype="float32") model.build( ( 1, - config["dataset"]["padded_num_elem_size"], + None, config["dataset"]["num_input_features"], ) ) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index f4cfa8a78..aa0eac923 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -767,6 +767,8 @@ def __init__( 
charge_dim_decrease=True, eta_dim_decrease=False, phi_dim_decrease=False, + pt_dim_decrease=False, + energy_dim_decrease=False, pt_as_correction=True, id_hidden_dim=128, charge_hidden_dim=128, @@ -830,6 +832,7 @@ def __init__( num_layers=pt_num_layers, activation=activation, dropout=dropout, + dim_decrease=pt_dim_decrease, ) self.ffn_eta = point_wise_feed_forward_network( @@ -860,6 +863,7 @@ def __init__( num_layers=energy_num_layers, activation=activation, dropout=dropout, + dim_decrease=energy_dim_decrease, ) """ diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 6d31ffcf6..03f5b8546 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -35,12 +35,22 @@ class ModelOptimizerCheckpoint(tf.keras.callbacks.ModelCheckpoint): - def on_epoch_end(self, epoch, logs=None): - super(ModelOptimizerCheckpoint, self).on_epoch_end(epoch, logs=logs) - weightfile_path = self.opt_path.format(epoch=epoch + 1, **logs) + def save_opt_weights(self, logs): + weightfile_path = self.opt_path.format(epoch=self._current_epoch + 1, **logs) weights = {} - self.model.optimizer.save_own_variables(weights) + + try: + self.model.optimizer.save_own_variables(weights) + except Exception as e: + print("could not save optimizer weights with save_own_variables: {}".format(e)) + + # TF 2.12 compatibility + if len(weights) == 0: + for i, variable in enumerate(self.model.optimizer.variables): + weights[str(i)] = variable.numpy() + with open(weightfile_path, "wb") as fi: + print("saving {} optimizer weights to {}".format(len(weights), weightfile_path)) pickle.dump( { # "lr": lr, @@ -49,6 +59,15 @@ def on_epoch_end(self, epoch, logs=None): fi, ) + def on_epoch_end(self, epoch, logs=None): + super(ModelOptimizerCheckpoint, self).on_epoch_end(epoch, logs=logs) + self.save_opt_weights(logs) + + def on_train_batch_end(self, batch, logs=None): + super(ModelOptimizerCheckpoint, self).on_train_batch_end(batch, logs=logs) + if isinstance(self.save_freq, int) and batch > 0 and batch % self.save_freq == 0: + self.save_opt_weights(logs) + class CustomCallback(tf.keras.callbacks.Callback): def __init__( @@ -212,6 +231,12 @@ def get_checkpoint_history_callback(outdir, config, dataset, comet_experiment, h if config.get("do_checkpoint_callback", True): callbacks += [cp_callback] + cp_callback = ModelOptimizerCheckpoint( + filepath=str(cp_dir / "weights-{epoch:02d}-step.hdf5"), save_weights_only=True, verbose=1, save_freq=100 + ) + cp_callback.opt_path = str(cp_dir / "opt-{epoch:02d}-step.pkl") + callbacks += [cp_callback] + if not horovod_enabled: history_path = Path(outdir) / "history" history_path.mkdir(parents=True, exist_ok=True) diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 4161107a9..0501ee301 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -456,10 +456,11 @@ def load_and_interleave( ds._num_steps = num_steps statefile = f"{cachedir}/{ds.name}_{ds.split}.json" - if os.path.isfile(statefile): + if config["batching"]["bucket_by_sequence_length"] and os.path.isfile(statefile): ds.load_state(statefile) logging.info("Dataset {} after batching, {} steps, {} samples".format(ds.name, ds.num_steps(), ds.num_samples)) - ds.save_state(statefile) + if config["batching"]["bucket_by_sequence_length"]: + ds.save_state(statefile) return ds @@ -821,7 +822,10 @@ def model_weight_setting(): try: opt.load_own_variables(loaded_opt["weights"]) except Exception as e: - logging.error("could not restore optimizer: {}".format(e)) + logging.error("could not restore optimizer with 
load_own_variables: {}".format(e)) + # TF 2.12 compatibility + for i, variable in enumerate(opt.variables): + variable.assign(loaded_opt["weights"][str(i)]) logging.info("distributing optimizer state") strategy = tf.distribute.get_strategy() diff --git a/notebooks/clic/paper_plots_2023_bin_size_ablation.ipynb b/notebooks/clic/paper_plots_2023_bin_size_ablation.ipynb new file mode 100644 index 000000000..7d314ac5a --- /dev/null +++ b/notebooks/clic/paper_plots_2023_bin_size_ablation.ipynb @@ -0,0 +1,182 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "487144bf-29fe-4cac-bfd0-9c5d16bed2d7", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import glob\n", + "import pandas\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f2a5152-89a0-4a36-864e-11f6f471cc11", + "metadata": {}, + "outputs": [], + "source": [ + "paths = glob.glob(\"/home/joosep/particleflow/experiments-binsize-ablation/clic_bin_size_*/logs/history/history_49.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90cdf7ba-8de7-4e2d-bb95-10a2ac5109a6", + "metadata": {}, + "outputs": [], + "source": [ + "bin_sizes = []\n", + "val_jet_matched_frac = []\n", + "val_met_iqr = []\n", + "val_jet_iqr = []\n", + "val_loss = []\n", + "\n", + "for path in paths:\n", + " folder = path.split(\"/\")[5]\n", + " binsize = folder.split(\"_\")[3]\n", + " bin_sizes.append(int(binsize))\n", + " data = json.load(open(path))\n", + " val_jet_matched_frac.append(data[\"val_jet_matched_frac\"])\n", + " val_met_iqr.append(data[\"val_met_iqr\"])\n", + " val_jet_iqr.append(data[\"val_jet_iqr\"])\n", + " val_loss.append(data[\"val_loss\"])\n", + "df = pandas.DataFrame()\n", + "df[\"bin_size\"] = bin_sizes\n", + "df[\"val_jet_matched_frac\"] = val_jet_matched_frac\n", + "df[\"val_met_iqr\"] = val_met_iqr\n", + "df[\"val_jet_iqr\"] = val_jet_iqr\n", + "df[\"val_loss\"] = val_loss" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "623aa091-96b9-4bda-b609-09cc7c8cf3bc", + "metadata": {}, + "outputs": [], + "source": [ + "means = df.groupby(\"bin_size\").mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3cfe11d-b77e-47e5-b23c-a4b290a9f6c9", + "metadata": {}, + "outputs": [], + "source": [ + "stds = df.groupby(\"bin_size\").std()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fec504f-1a18-4093-b366-5c79f6470a46", + "metadata": {}, + "outputs": [], + "source": [ + "plt.errorbar(\n", + " means[\"val_loss\"].keys(),\n", + " means[\"val_loss\"].values,\n", + " stds[\"val_loss\"].values,\n", + " marker=\"o\"\n", + ")\n", + "plt.ylim(2, 4)\n", + "plt.xlabel(\"bin size\")\n", + "plt.ylabel(\"validation loss\")\n", + "plt.xticks([32,64,128,256,512]);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73a7b3ed-9dbd-430e-8581-35730cb5e5c6", + "metadata": {}, + "outputs": [], + "source": [ + "plt.errorbar(\n", + " means[\"val_jet_matched_frac\"].keys(),\n", + " means[\"val_jet_matched_frac\"].values,\n", + " stds[\"val_jet_matched_frac\"].values,\n", + " marker=\"o\"\n", + ")\n", + "plt.ylim(0.8,1)\n", + "plt.xlabel(\"bin size\")\n", + "plt.ylabel(\"jet matched fraction\")\n", + "plt.xticks([32,64,128,256,512]);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "291fab34-ae80-4aa5-91fc-e7f4024606e4", + "metadata": {}, + "outputs": [], + "source": [ + "plt.errorbar(\n", + " means[\"val_met_iqr\"].keys(),\n", + " 
means[\"val_met_iqr\"].values,\n", + " stds[\"val_met_iqr\"].values,\n", + " marker=\"o\"\n", + ")\n", + "plt.ylim(0,0.5)\n", + "plt.xlabel(\"bin size\")\n", + "plt.ylabel(\"MET response IQR\")\n", + "plt.xticks([32,64,128,256,512]);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8726510-c82e-4340-9d35-a33a2119f793", + "metadata": {}, + "outputs": [], + "source": [ + "plt.errorbar(\n", + " means[\"val_jet_iqr\"].keys(),\n", + " means[\"val_jet_iqr\"].values,\n", + " stds[\"val_jet_iqr\"].values,\n", + " marker=\"o\"\n", + ")\n", + "plt.ylim(0,0.1)\n", + "plt.xlabel(\"bin size\")\n", + "plt.ylabel(\"jet response IQR\")\n", + "plt.xticks([32,64,128,256,512]);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a69788d9-4cb8-47a6-a768-3c85892f0b04", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/parameters/cms-transformer.yaml b/parameters/cms-transformer.yaml deleted file mode 100644 index 22149d175..000000000 --- a/parameters/cms-transformer.yaml +++ /dev/null @@ -1,280 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: gen - num_input_features: 41 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - cls_weight_by_pt: no - reg_weight_by_pt: no - -loss: - classification_loss_coef: 100.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - energy_loss: - type: MeanSquaredLogarithmicError - pt_loss: - type: MeanSquaredLogarithmicError - sin_phi_loss: - type: Huber - delta: 0.1 - cos_phi_loss: - type: Huber - delta: 0.1 - eta_loss: - type: Huber - delta: 0.1 - event_loss: none #none, sliced_wasserstein, gen_jet_logcosh, gen_jet_mse, hist_2d - event_loss_coef: 1.0 - met_loss: none - met_loss_coef: 1.0 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 0.00001 - num_events_validation: 500 - num_epochs: 50 - dtype: float32 - trainable: - classification_loss_type: sigmoid_focal_crossentropy - lr_schedule: none # cosinedecay, exponentialdecay, onecycle, none - optimizer: adam # adam, adamw, sgd - horovod_enabled: no - cls_output_as_logits: yes - small_graph_opt: yes - -batching: - # if enabled, use dynamic batching instead of the fixed-size batches configured in batch_per_gpu - bucket_by_sequence_length: yes - # these sizes were sort of tuned for an 8GB GPU - # - max_sequence_length, batch_size_per_gpu - -#on 8GB GPU - bucket_batch_sizes: - - 25, 160 - - 50, 80 - - 100, 40 - - 200, 20 - - 500, 10 - - 1000, 5 - - 2000, 3 - - 3000, 2 - - 4000, 2 - - 5000, 1 - - 6000, 1 - - inf, 1 - # use this batch multiplier to increase all batch sizes by a constant factor - batch_multiplier: 1 - -optimizer: - adam: - amsgrad: no - adamw: - amsgrad: yes - 
weight_decay: 0.001 - sgd: - nesterov: no - momentum: 0.9 - -# LR Schedules -exponentialdecay: - decay_steps: 2000 - decay_rate: 0.99 - staircase: yes -onecycle: - mom_min: 0.85 - mom_max: 0.95 - warmup_ratio: 0.3 - div_factor: 25.0 - final_div: 100000.0 - -parameters: - model: transformer - input_encoding: cms - output_decoding: - activation: elu - regression_use_classification: yes - dropout: 0.0 - - pt_as_correction: yes - - id_dim_decrease: yes - charge_dim_decrease: yes - pt_dim_decrease: yes - eta_dim_decrease: yes - phi_dim_decrease: yes - energy_dim_decrease: yes - - id_hidden_dim: 512 - charge_hidden_dim: 256 - pt_hidden_dim: 512 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 512 - - id_num_layers: 2 - charge_num_layers: 2 - pt_num_layers: 2 - eta_num_layers: 2 - phi_num_layers: 2 - energy_num_layers: 2 - layernorm: yes - mask_reg_cls0: no - -timing: - num_ev: 100 - num_iter: 3 - -callbacks: - checkpoint: - monitor: "val_loss" - plot_freq: 1 - tensorboard: - dump_history: yes - hist_freq: 1 - -hypertune: - algorithm: hyperband # random, bayesian, hyperband - random: - objective: val_loss - max_trials: 100 - bayesian: - objective: val_loss - max_trials: 100 - num_initial_points: 2 - hyperband: - objective: val_loss - max_epochs: 10 - factor: 3 - iterations: 1 - executions_per_trial: 1 - -raytune: - local_dir: # Note: please specify an absolute path - sched: asha # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_test_datasets: - physical: - batch_per_gpu: 1 - datasets: - - cms_pf_ttbar - - cms_pf_ztt - - cms_pf_qcd - - cms_pf_qcd_high_pt - gun: - batch_per_gpu: 50 - datasets: - - cms_pf_single_electron - - cms_pf_single_gamma - - cms_pf_single_neutron - - cms_pf_single_pi0 - - cms_pf_single_pi - - cms_pf_single_tau - - cms_pf_single_mu - - cms_pf_single_proton - -evaluation_datasets: - cms_pf_qcd_high_pt: - batch_size: 5 - num_events: -1 - cms_pf_single_neutron: - batch_size: 100 - num_events: -1 - -validation_dataset: cms_pf_qcd_high_pt -validation_batch_size: 5 -validation_num_events: 500 - -datasets: - cms_pf_ttbar: - version: 1.5.1 - data_dir: - manual_dir: - cms_pf_ztt: - version: 1.5.1 - data_dir: - manual_dir: - cms_pf_qcd: - version: 1.5.1 - data_dir: - manual_dir: - cms_pf_qcd_high_pt: - version: 1.5.1 - data_dir: - manual_dir: - cms_pf_single_electron: - version: 1.5.1 - data_dir: - manual_dir: - cms_pf_single_gamma: - version: 1.5.1 - data_dir: - manual_dir: - cms_pf_single_pi0: - version: 1.5.1 - data_dir: - manual_dir: - cms_pf_single_neutron: - version: 1.5.1 - data_dir: - manual_dir: - cms_pf_single_pi: - version: 1.5.1 - data_dir: - manual_dir: - cms_pf_single_tau: - version: 1.5.1 - data_dir: - manual_dir: - cms_pf_single_mu: - version: 1.5.1 - data_dir: - manual_dir: - cms_pf_single_proton: - version: 1.5.1 - data_dir: - manual_dir: diff --git a/parameters/pyg-cms-physical.yaml b/parameters/pyg-cms-physical.yaml deleted file mode 100644 index c40f392e3..000000000 --- a/parameters/pyg-cms-physical.yaml +++ /dev/null @@ -1,125 +0,0 @@ -backend: pytorch - -dataset: cms -data_dir: -gpus: 1 -gpu_batch_multiplier: 1 -load: -num_epochs: 2 -patience: 20 -lr: 0.0001 -lr_schedule: constant # constant, cosinedecay, onecycle -conv_type: gnn_lsh 
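For context on the bucket_by_sequence_length option that these configs toggle (the deleted cms-transformer.yaml above spelled out its max_sequence_length/batch_size_per_gpu pairs): the idea is to group events of similar size and give short events larger batches, so GPU memory use stays roughly constant per step. A self-contained sketch using the tf.data API, not code from this repo; the boundaries and batch sizes loosely mirror the deleted table:

    import tensorflow as tf

    # toy "events", each with a variable number of elements
    ds = tf.data.Dataset.from_generator(
        lambda: (tf.zeros((n, 4)) for n in [20, 40, 90, 300, 900]),
        output_signature=tf.TensorSpec(shape=(None, 4), dtype=tf.float32),
    )
    ds = ds.bucket_by_sequence_length(
        element_length_func=lambda x: tf.shape(x)[0],
        bucket_boundaries=[25, 50, 100, 200, 500],
        bucket_batch_sizes=[160, 80, 40, 20, 10, 5],  # one more than boundaries
    )
    for batch in ds:
        # each batch holds similarly-sized events, padded to the longest one
        print(batch.shape)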
-ntrain: -ntest: -nvalid: 500 -num_workers: 0 -prefetch_factor: -checkpoint_freq: -comet_name: particleflow-pt -comet_offline: False -comet_step_freq: 100 - -model: - gnn_lsh: - conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 3 - dropout: 0.0 - activation: "elu" - # gnn-lsh specific parameters - bin_size: 640 - max_num_bins: 200 - distance_dim: 128 - layernorm: True - num_node_messages: 2 - ffn_dist_hidden_dim: 128 - - gravnet: - conv_type: gravnet - embedding_dim: 512 - width: 512 - num_convs: 3 - dropout: 0.0 - activation: "elu" - # gravnet specific parameters - k: 16 - propagate_dimensions: 32 - space_dimensions: 4 - - attention: - conv_type: attention - embedding_dim: 256 - width: 256 - num_convs: 3 - dropout: 0.0 - activation: "elu" - # attention specific paramters - num_heads: 2 - - mamba: - conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 - activation: "elu" - # transformer specific paramters - num_heads: 2 - # mamba specific paramters - d_state: 16 - d_conv: 4 - expand: 2 - -raytune: - local_dir: # Note: please specify an absolute path - sched: asha # asha, hyperband - search_alg: hyperopt # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_dataset: - cms: - physical: - batch_size: 1 - samples: - cms_pf_ttbar: - version: 1.6.0 - cms_pf_qcd: - version: 1.6.0 - cms_pf_ztt: - version: 1.6.0 - cms_pf_qcd_high_pt: - version: 1.6.0 - cms_pf_sms_t1tttt: - version: 1.6.0 - -valid_dataset: - cms: - physical: - batch_size: 1 - samples: - cms_pf_qcd_high_pt: - version: 1.6.0 - -test_dataset: - cms: - physical: - batch_size: 1 - samples: - cms_pf_qcd_high_pt: - version: 1.6.0 diff --git a/parameters/pyg-cms-small.yaml b/parameters/pyg-cms-small.yaml deleted file mode 100644 index d7677f906..000000000 --- a/parameters/pyg-cms-small.yaml +++ /dev/null @@ -1,119 +0,0 @@ -backend: pytorch - -dataset: cms -data_dir: -gpus: 1 -gpu_batch_multiplier: 1 -load: -num_epochs: 10 -patience: 20 -lr: 0.0001 -lr_schedule: constant # constant, cosinedecay, onecycle -conv_type: gravnet -ntrain: 500 -ntest: 500 -nvalid: 500 -num_workers: 0 -prefetch_factor: -checkpoint_freq: -comet_name: particleflow-pt -comet_offline: False -comet_step_freq: 10 - -model: - gnn_lsh: - conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 3 - dropout: 0.0 - activation: "elu" - # gnn-lsh specific parameters - bin_size: 640 - max_num_bins: 200 - distance_dim: 128 - layernorm: True - num_node_messages: 2 - ffn_dist_hidden_dim: 128 - - gravnet: - conv_type: gravnet - embedding_dim: 512 - width: 512 - num_convs: 3 - dropout: 0.0 - activation: "elu" - # gravnet specific parameters - k: 16 - propagate_dimensions: 32 - space_dimensions: 4 - - attention: - conv_type: attention - embedding_dim: 256 - width: 256 - num_convs: 3 - dropout: 0.0 - activation: "elu" - # attention specific paramters - num_heads: 2 - - mamba: - conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 - activation: "elu" - # transformer specific paramters - num_heads: 2 - # mamba specific paramters - d_state: 16 - d_conv: 4 - expand: 2 - -raytune: - local_dir: # Note: please specify an absolute path - sched: asha # asha, hyperband - search_alg: hyperopt # bayes, bohb, hyperopt, nevergrad, scikit - 
default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -train_dataset: - cms: - physical: - batch_size: 1 - samples: - cms_pf_ttbar: - version: 1.6.0 - cms_pf_qcd: - version: 1.6.0 - -valid_dataset: - cms: - physical: - batch_size: 1 - samples: - cms_pf_qcd_high_pt: - version: 1.6.0 - -test_dataset: - cms: - physical: - batch_size: 1 - samples: - cms_pf_qcd_high_pt: - version: 1.6.0 diff --git a/parameters/pyg-cms-test-qcdhighpt.yaml b/parameters/pyg-cms-test-qcdhighpt.yaml deleted file mode 100644 index ae8eb5680..000000000 --- a/parameters/pyg-cms-test-qcdhighpt.yaml +++ /dev/null @@ -1,101 +0,0 @@ -backend: pytorch - -dataset: cms -data_dir: -gpus: 1 -gpu_batch_multiplier: 1 -load: -num_epochs: 2 -patience: 20 -lr: 0.0001 -lr_schedule: constant # constant, cosinedecay, onecycle -conv_type: gnn_lsh -ntrain: -ntest: -nvalid: 500 -num_workers: 0 -prefetch_factor: -checkpoint_freq: -comet_name: particleflow-pt -comet_offline: False -comet_step_freq: 10 - -model: - gnn_lsh: - conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 3 - dropout: 0.0 - activation: "elu" - # gnn-lsh specific parameters - bin_size: 640 - max_num_bins: 200 - distance_dim: 128 - layernorm: True - num_node_messages: 2 - ffn_dist_hidden_dim: 128 - - gravnet: - conv_type: gravnet - embedding_dim: 512 - width: 512 - num_convs: 3 - dropout: 0.0 - activation: "elu" - # gravnet specific parameters - k: 16 - propagate_dimensions: 32 - space_dimensions: 4 - - attention: - conv_type: attention - embedding_dim: 256 - width: 256 - num_convs: 3 - dropout: 0.0 - activation: "elu" - # attention specific paramters - num_heads: 2 - - mamba: - conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 - activation: "elu" - # transformer specific paramters - num_heads: 2 - # mamba specific paramters - d_state: 16 - d_conv: 4 - expand: 2 - -raytune: - local_dir: # Note: please specify an absolute path - sched: asha # asha, hyperband - search_alg: hyperopt # bayes, bohb, hyperopt, nevergrad, scikit - default_metric: "val_loss" - default_mode: "min" - # Tune schedule specific parameters - asha: - max_t: 200 - reduction_factor: 4 - brackets: 1 - grace_period: 10 - hyperband: - max_t: 200 - reduction_factor: 4 - hyperopt: - n_random_steps: 10 - nevergrad: - n_random_steps: 10 - -test_dataset: - cms: - physical: - batch_size: 1 - samples: - cms_pf_qcd_high_pt: - version: 1.6.0 diff --git a/parameters/pyg-workflow-test.yaml b/parameters/pyg-workflow-test.yaml deleted file mode 100644 index 197c7c96c..000000000 --- a/parameters/pyg-workflow-test.yaml +++ /dev/null @@ -1,97 +0,0 @@ -backend: pytorch - -dataset: cms -data_dir: -gpus: -gpu_batch_multiplier: 1 -load: -num_epochs: 2 -patience: 20 -lr: 0.0001 -lr_schedule: constant # constant, cosinedecay, onecycle -conv_type: gravnet -ntrain: -ntest: -nvalid: 500 -num_workers: 0 -prefetch_factor: -checkpoint_freq: -comet_name: particleflow-pt -comet_offline: False -comet_step_freq: 10 - -model: - gnn_lsh: - conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 3 - dropout: 0.0 - activation: "elu" - # gnn-lsh specific parameters - bin_size: 640 - max_num_bins: 200 - distance_dim: 128 - layernorm: True - num_node_messages: 2 - ffn_dist_hidden_dim: 128 - - gravnet: - conv_type: gravnet - embedding_dim: 512 - width: 512 - num_convs: 3 - 
dropout: 0.0 - activation: "elu" - # gravnet specific parameters - k: 16 - propagate_dimensions: 32 - space_dimensions: 4 - - attention: - conv_type: attention - embedding_dim: 256 - width: 256 - num_convs: 3 - dropout: 0.0 - activation: "elu" - # attention specific paramters - num_heads: 2 - - mamba: - conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 - activation: "elu" - # transformer specific paramters - num_heads: 2 - # mamba specific paramters - d_state: 16 - d_conv: 4 - expand: 2 - -train_dataset: - cms: - physical: - batch_size: 2 - samples: - cms_pf_ttbar: - version: 1.6.0 - -valid_dataset: - cms: - physical: - batch_size: 2 - samples: - cms_pf_ttbar: - version: 1.6.0 - -test_dataset: - cms: - physical: - batch_size: 2 - samples: - cms_pf_ttbar: - version: 1.6.0 diff --git a/parameters/pyg-cms-small-highqcd.yaml b/parameters/pytorch/pyg-clic-hits.yaml similarity index 77% rename from parameters/pyg-cms-small-highqcd.yaml rename to parameters/pytorch/pyg-clic-hits.yaml index c6f1c3af8..a999f46a3 100644 --- a/parameters/pyg-cms-small-highqcd.yaml +++ b/parameters/pytorch/pyg-clic-hits.yaml @@ -1,13 +1,13 @@ backend: pytorch -dataset: cms +dataset: clic data_dir: gpus: 1 gpu_batch_multiplier: 1 load: -num_epochs: 2 +num_epochs: 20 patience: 20 -lr: 0.0001 +lr: 0.001 lr_schedule: constant # constant, cosinedecay, onecycle conv_type: gnn_lsh ntrain: @@ -25,15 +25,15 @@ model: conv_type: gnn_lsh embedding_dim: 512 width: 512 - num_convs: 3 + num_convs: 2 dropout: 0.0 activation: "elu" # gnn-lsh specific parameters - bin_size: 640 + bin_size: 256 max_num_bins: 200 distance_dim: 128 layernorm: True - num_node_messages: 2 + num_node_messages: 1 ffn_dist_hidden_dim: 128 gravnet: @@ -60,17 +60,15 @@ model: mamba: conv_type: mamba - embedding_dim: 128 - width: 128 + embedding_dim: 512 + width: 512 num_convs: 2 dropout: 0.0 activation: "elu" - # transformer specific paramters - num_heads: 2 # mamba specific paramters - d_state: 16 + d_state: 64 d_conv: 4 - expand: 2 + expand: 4 raytune: local_dir: # Note: please specify an absolute path @@ -93,29 +91,29 @@ raytune: n_random_steps: 10 train_dataset: - cms: + clic_hits: physical: batch_size: 1 samples: - cms_pf_ttbar: - version: 1.6.0 - cms_pf_qcd: - version: 1.6.0 - cms_pf_qcd_high_pt: - version: 1.6.0 + clic_edm_qq_hits_pf10k: + version: 1.7.0 + clic_edm_ttbar_hits_pf10k: + version: 1.7.0 valid_dataset: - cms: + clic_hits: physical: batch_size: 1 samples: - cms_pf_qcd_high_pt: - version: 1.6.0 + clic_edm_ttbar_hits_pf10k: + version: 1.7.0 test_dataset: - cms: + clic_hits: physical: batch_size: 1 samples: - cms_pf_qcd_high_pt: - version: 1.6.0 + clic_edm_qq_hits_pf10k: + version: 1.7.0 + clic_edm_ttbar_hits_pf10k: + version: 1.7.0 diff --git a/parameters/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml similarity index 99% rename from parameters/pyg-clic.yaml rename to parameters/pytorch/pyg-clic.yaml index e2eb3598a..544f3cc66 100644 --- a/parameters/pyg-clic.yaml +++ b/parameters/pytorch/pyg-clic.yaml @@ -1,6 +1,7 @@ backend: pytorch dataset: clic +sort_data: no data_dir: gpus: 1 gpu_batch_multiplier: 1 diff --git a/parameters/pyg-cms.yaml b/parameters/pytorch/pyg-cms.yaml similarity index 60% rename from parameters/pyg-cms.yaml rename to parameters/pytorch/pyg-cms.yaml index f9573c483..ce17e6ca1 100644 --- a/parameters/pyg-cms.yaml +++ b/parameters/pytorch/pyg-cms.yaml @@ -1,13 +1,14 @@ backend: pytorch dataset: cms +sort_data: yes data_dir: gpus: 1 gpu_batch_multiplier: 1 load: num_epochs: 50 patience: 20 -lr: 
0.0001 +lr: 0.001 lr_schedule: cosinedecay # constant, cosinedecay, onecycle conv_type: gnn_lsh ntrain: @@ -23,10 +24,10 @@ comet_step_freq: 10 model: gnn_lsh: conv_type: gnn_lsh - embedding_dim: 512 - width: 512 - num_convs: 3 - dropout: 0.0 + embedding_dim: 1024 + width: 1024 + num_convs: 5 + dropout: 0.1 activation: "elu" # gnn-lsh specific parameters bin_size: 640 @@ -60,13 +61,13 @@ model: mamba: conv_type: mamba - embedding_dim: 128 - width: 128 - num_convs: 2 - dropout: 0.0 + embedding_dim: 1024 + width: 1024 + num_convs: 4 + dropout: 0.1 activation: "elu" # mamba specific paramters - d_state: 16 + d_state: 32 d_conv: 4 expand: 2 @@ -96,39 +97,39 @@ train_dataset: batch_size: 1 samples: cms_pf_ttbar: - version: 1.6.0 + version: 1.7.0 cms_pf_qcd: - version: 1.6.0 + version: 1.7.0 cms_pf_ztt: - version: 1.6.0 + version: 1.7.0 cms_pf_qcd_high_pt: - version: 1.6.0 + version: 1.7.0 cms_pf_sms_t1tttt: - version: 1.6.0 + version: 1.7.0 gun: batch_size: 20 samples: cms_pf_single_electron: - version: 1.6.0 + version: 1.7.0 cms_pf_single_gamma: - version: 1.6.0 + version: 1.7.0 cms_pf_single_pi0: - version: 1.6.0 + version: 1.7.0 cms_pf_single_neutron: - version: 1.6.0 + version: 1.7.0 cms_pf_single_pi: - version: 1.6.0 + version: 1.7.0 cms_pf_single_tau: - version: 1.6.0 + version: 1.7.0 cms_pf_single_mu: - version: 1.6.0 + version: 1.7.0 cms_pf_single_proton: - version: 1.6.0 + version: 1.7.0 multiparticlegun: batch_size: 4 samples: cms_pf_multi_particle_gun: - version: 1.6.0 + version: 1.7.0 valid_dataset: cms: @@ -136,39 +137,41 @@ valid_dataset: batch_size: 1 samples: cms_pf_ttbar: - version: 1.6.0 + version: 1.7.0 cms_pf_qcd: - version: 1.6.0 + version: 1.7.0 cms_pf_ztt: - version: 1.6.0 + version: 1.7.0 cms_pf_qcd_high_pt: - version: 1.6.0 + version: 1.7.0 cms_pf_sms_t1tttt: - version: 1.6.0 + version: 1.7.0 + cms_pf_vbf: + version: 1.7.0 gun: batch_size: 20 samples: cms_pf_single_electron: - version: 1.6.0 + version: 1.7.0 cms_pf_single_gamma: - version: 1.6.0 + version: 1.7.0 cms_pf_single_pi0: - version: 1.6.0 + version: 1.7.0 cms_pf_single_neutron: - version: 1.6.0 + version: 1.7.0 cms_pf_single_pi: - version: 1.6.0 + version: 1.7.0 cms_pf_single_tau: - version: 1.6.0 + version: 1.7.0 cms_pf_single_mu: - version: 1.6.0 + version: 1.7.0 cms_pf_single_proton: - version: 1.6.0 + version: 1.7.0 multiparticlegun: batch_size: 4 samples: cms_pf_multi_particle_gun: - version: 1.6.0 + version: 1.7.0 test_dataset: @@ -176,38 +179,7 @@ test_dataset: physical: batch_size: 1 samples: - # cms_pf_ttbar: - # version: 1.6.0 - # cms_pf_qcd: - # version: 1.6.0 - # cms_pf_ztt: - # version: 1.6.0 + cms_pf_ttbar: + version: 1.7.0 cms_pf_qcd_high_pt: - version: 1.6.0 - # cms_pf_sms_t1tttt: - # version: 1.6.0 - # gun: - # batch_size: 20 - # samples: - # cms_pf_single_electron: - # version: 1.6.0 - # cms_pf_single_gamma: - # version: 1.6.0 - # batch_size: 20 - # cms_pf_single_pi0: - # version: 1.6.0 - # cms_pf_single_neutron: - # version: 1.6.0 - # cms_pf_single_pi: - # version: 1.6.0 - # cms_pf_single_tau: - # version: 1.6.0 - # cms_pf_single_mu: - # version: 1.6.0 - # cms_pf_single_proton: - # version: 1.6.0 - # multiparticlegun: - # batch_size: 4 - # samples: - # cms_pf_multi_particle_gun: - # version: 1.6.0 + version: 1.7.0 diff --git a/parameters/pyg-delphes.yaml b/parameters/pytorch/pyg-delphes.yaml similarity index 99% rename from parameters/pyg-delphes.yaml rename to parameters/pytorch/pyg-delphes.yaml index cdd19ca0b..a4299cd78 100644 --- a/parameters/pyg-delphes.yaml +++ 
b/parameters/pytorch/pyg-delphes.yaml @@ -2,6 +2,7 @@ backend: pytorch dataset: delphes data_dir: +sort_data: no gpus: 1 gpu_batch_multiplier: 1 load: diff --git a/parameters/bench/clic-hits-bench.yaml b/parameters/tensorflow/bench/clic-hits-bench.yaml similarity index 96% rename from parameters/bench/clic-hits-bench.yaml rename to parameters/tensorflow/bench/clic-hits-bench.yaml index 0a7eb8ae6..9942f7840 100644 --- a/parameters/bench/clic-hits-bench.yaml +++ b/parameters/tensorflow/bench/clic-hits-bench.yaml @@ -51,7 +51,7 @@ setup: train: yes weights: weights_config: - lr: 0.0005 + lr: 0.001 num_epochs: 20 dtype: float32 trainable: @@ -60,11 +60,11 @@ setup: horovod_enabled: no cls_output_as_logits: yes small_graph_opt: no - use_normalizer: yes + use_normalizer: no batching: # if enabled, use dynamic batching instead of the fixed-size batches configured in batch_per_gpu - bucket_by_sequence_length: yes + bucket_by_sequence_length: no bucket_batch_sizes: auto batch_multiplier: 1 @@ -95,7 +95,7 @@ parameters: input_encoding: clic node_update_mode: additive do_node_encoding: yes - node_encoding_hidden_dim: 512 + node_encoding_hidden_dim: 256 combined_graph_layer: bin_size: 256 @@ -129,7 +129,7 @@ parameters: num_node_messages: 2 node_message: type: GHConvDense - output_dim: 512 + output_dim: 256 activation: elu #if this is enabled, it will break float16 training normalize_degrees: no @@ -221,7 +221,7 @@ raytune: train_test_datasets: physical: batch_per_gpu: 1 - event_pad_size: 15360 + event_pad_size: -1 datasets: - clic_edm_ttbar_hits_pf10k - clic_edm_qq_hits_pf10k @@ -239,10 +239,10 @@ evaluation_jet_algo: ee_genkt_algorithm datasets: clic_edm_ttbar_hits_pf10k: - version: 1.5.0 + version: 1.7.0 data_dir: manual_dir: clic_edm_qq_hits_pf10k: - version: 1.5.0 + version: 1.7.0 data_dir: manual_dir: diff --git a/parameters/bench/delphes-bench.yaml b/parameters/tensorflow/bench/delphes-bench.yaml similarity index 100% rename from parameters/bench/delphes-bench.yaml rename to parameters/tensorflow/bench/delphes-bench.yaml diff --git a/parameters/clic-hits.yaml b/parameters/tensorflow/clic-hits.yaml similarity index 88% rename from parameters/clic-hits.yaml rename to parameters/tensorflow/clic-hits.yaml index a299d587f..88c4ebe9e 100644 --- a/parameters/clic-hits.yaml +++ b/parameters/tensorflow/clic-hits.yaml @@ -5,7 +5,7 @@ cache: caches/clic_hits dataset: schema: clic target_particles: gen - num_input_features: 15 + num_input_features: 16 #(none=0, track=1, hit=2) num_input_classes: 3 #(none=0, ch.had=1, n.had=2, gamma=3, e=4, mu=5) @@ -51,7 +51,7 @@ setup: train: yes weights: weights_config: - lr: 0.0005 + lr: 0.001 num_epochs: 20 dtype: float32 trainable: @@ -60,13 +60,12 @@ setup: horovod_enabled: no cls_output_as_logits: yes small_graph_opt: no - use_normalizer: yes + use_normalizer: no batching: # if enabled, use dynamic batching instead of the fixed-size batches configured in batch_per_gpu - bucket_by_sequence_length: yes + bucket_by_sequence_length: no bucket_batch_sizes: auto - # use this batch multiplier to increase all batch sizes by a constant factor batch_multiplier: 1 optimizer: @@ -103,10 +102,10 @@ parameters: max_num_bins: 200 distance_dim: 128 layernorm: yes - dropout: 0.0 + dropout: 0.1 dist_activation: elu - ffn_dist_num_layers: 2 - ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 1 + ffn_dist_hidden_dim: 256 # MPNN #kernel: @@ -127,24 +126,22 @@ parameters: dist_mult: 0.1 clip_value_low: 0.0 dist_norm: l2 - num_node_messages: 2 + num_node_messages: 1 node_message: type: 
GHConvDense output_dim: 512 activation: elu #if this is enabled, it will break float16 training - normalize_degrees: no + normalize_degrees: yes activation: elu - num_graph_layers_id: 6 - num_graph_layers_reg: 6 + num_graph_layers_id: 1 + num_graph_layers_reg: 3 output_decoding: activation: elu regression_use_classification: yes dropout: 0.1 - pt_as_correction: no - id_dim_decrease: yes charge_dim_decrease: yes eta_dim_decrease: yes @@ -163,10 +160,10 @@ parameters: eta_num_layers: 2 phi_num_layers: 2 energy_num_layers: 2 - layernorm: yes - mask_reg_cls0: yes + layernorm: no + mask_reg_cls0: no - skip_connection: no + skip_connection: yes debug: no timing: @@ -220,15 +217,16 @@ raytune: train_test_datasets: physical: batch_per_gpu: 1 - event_pad_size: 15360 + event_pad_size: -1 datasets: - clic_edm_ttbar_hits_pf - clic_edm_qq_hits_pf gun: - batch_per_gpu: 5 + batch_per_gpu: 1 event_pad_size: 3840 datasets: - clic_edm_single_kaon0l_hits_pf + - clic_edm_single_gamma_hits_pf - clic_edm_single_pi_hits_pf - clic_edm_single_pi0_hits_pf - clic_edm_single_neutron_hits_pf @@ -248,38 +246,38 @@ evaluation_jet_algo: ee_genkt_algorithm datasets: clic_edm_ttbar_hits_pf: - version: 1.5.0 + version: 1.7.0 data_dir: manual_dir: clic_edm_qq_hits_pf: - version: 1.5.0 + version: 1.7.0 data_dir: manual_dir: clic_edm_single_kaon0l_hits_pf: - version: 1.5.0 + version: 1.7.0 data_dir: manual_dir: clic_edm_single_gamma_hits_pf: - version: 1.5.0 + version: 1.7.0 data_dir: manual_dir: clic_edm_single_pi_hits_pf: - version: 1.5.0 + version: 1.7.0 data_dir: manual_dir: clic_edm_single_pi0_hits_pf: - version: 1.5.0 + version: 1.7.0 data_dir: manual_dir: clic_edm_single_neutron_hits_pf: - version: 1.5.0 + version: 1.7.0 data_dir: manual_dir: clic_edm_single_electron_hits_pf: - version: 1.5.0 + version: 1.7.0 data_dir: manual_dir: clic_edm_single_muon_hits_pf: - version: 1.5.0 + version: 1.7.0 data_dir: manual_dir: diff --git a/parameters/clic.yaml b/parameters/tensorflow/clic.yaml similarity index 98% rename from parameters/clic.yaml rename to parameters/tensorflow/clic.yaml index 5c1e2e5ee..90901f3f2 100644 --- a/parameters/clic.yaml +++ b/parameters/tensorflow/clic.yaml @@ -133,8 +133,10 @@ parameters: id_dim_decrease: yes charge_dim_decrease: yes + pt_dim_decrease: yes eta_dim_decrease: yes phi_dim_decrease: yes + energy_dim_decrease: yes id_hidden_dim: 128 charge_hidden_dim: 128 diff --git a/parameters/tensorflow/clic_studies/binsize/clic_bin_size_128.yaml b/parameters/tensorflow/clic_studies/binsize/clic_bin_size_128.yaml new file mode 100644 index 000000000..3dc28c35e --- /dev/null +++ b/parameters/tensorflow/clic_studies/binsize/clic_bin_size_128.yaml @@ -0,0 +1,260 @@ +backend: tensorflow + +cache: caches/clic + +dataset: + schema: clic + target_particles: gen + num_input_features: 17 + #(none=0, track=1, cluster=2) + num_input_classes: 3 + #(none=0, ch.had=1, n.had=2, gamma=3, e=4, mu=5) + num_output_classes: 6 + cls_weight_by_pt: no + reg_weight_by_pt: no + enable_tfds_caching: no + +loss: + classification_loss_coef: 100.0 + charge_loss_coef: 1.0 + pt_loss_coef: 10.0 + eta_loss_coef: 10.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 + charge_loss: + type: CategoricalCrossentropy + from_logits: yes + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + cos_phi_loss: + type: Huber + eta_loss: + type: Huber + event_loss: none #none, sliced_wasserstein, gen_jet_logcosh, 
gen_jet_mse, hist_2d + event_loss_coef: 0.0 + met_loss: none + met_loss_coef: 1.0 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 0.001 + num_epochs: 50 + dtype: float32 + trainable: + lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none + optimizer: adam # adam, adamw, sgd + horovod_enabled: no + cls_output_as_logits: yes + small_graph_opt: no + use_normalizer: no + +batching: + # if enabled, use variable-size batching instead of the fixed-size batches configured per-sample in batch_per_gpu + bucket_by_sequence_length: no + bucket_batch_sizes: auto + batch_multiplier: 1.0 + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 + +# LR Schedules +exponentialdecay: + decay_steps: 2000 + decay_rate: 0.99 + staircase: yes +onecycle: + mom_min: 0.85 + mom_max: 0.95 + warmup_ratio: 0.3 + div_factor: 25.0 + final_div: 100000.0 + +parameters: + model: gnn_dense + input_encoding: clic + node_update_mode: additive + do_node_encoding: yes + node_encoding_hidden_dim: 256 + + combined_graph_layer: + bin_size: 128 + max_num_bins: 200 + distance_dim: 128 + layernorm: yes + dropout: 0.0 + dist_activation: elu + ffn_dist_num_layers: 3 + ffn_dist_hidden_dim: 64 + + # GCN + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + dist_norm: l2 + num_node_messages: 2 + node_message: + type: GHConvDense + output_dim: 256 + activation: elu + #if this is enabled, it will break float16 training + normalize_degrees: no + activation: elu + + num_graph_layers_id: 6 + num_graph_layers_reg: 6 + output_decoding: + activation: elu + regression_use_classification: yes + dropout: 0.1 + + pt_as_correction: no + + id_dim_decrease: yes + charge_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + pt_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 128 + charge_hidden_dim: 128 + pt_hidden_dim: 128 + eta_hidden_dim: 128 + phi_hidden_dim: 128 + energy_hidden_dim: 128 + + id_num_layers: 1 + charge_num_layers: 1 + pt_num_layers: 1 + eta_num_layers: 1 + phi_num_layers: 1 + energy_num_layers: 1 + layernorm: yes + mask_reg_cls0: yes + + skip_connection: no + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +callbacks: + checkpoint: + monitor: "val_loss" + plot_freq: 1 + tensorboard: + dump_history: yes + hist_freq: 1 + +hypertune: + algorithm: hyperband # random, bayesian, hyperband + random: + objective: val_loss + max_trials: 100 + bayesian: + objective: val_loss + max_trials: 100 + num_initial_points: 2 + hyperband: + objective: val_loss + max_epochs: 10 + factor: 3 + iterations: 1 + executions_per_trial: 1 + +raytune: + local_dir: # Note: please specify an absolute path + sched: asha # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_test_datasets: + physical: + batch_per_gpu: 50 + event_pad_size: -1 + datasets: + - clic_edm_ttbar_pf + - clic_edm_qq_pf + +do_validation_callback: yes +validation_dataset: clic_edm_ttbar_pf +validation_batch_size: 100 +validation_num_events: 2000 + +evaluation_datasets: + clic_edm_qq_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pu10_pf: + batch_size: 50 
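The five clic_bin_size_*.yaml files in this directory are identical except for combined_graph_layer.bin_size, swept over 32-512 for the ablation plotted in the notebook above. As a rough cost model (an assumption about how dense message passing inside LSH bins scales, not code from this repo): splitting N elements into N/B bins of size B gives a per-layer pairwise cost of (N/B)*B^2 = N*B, so doubling the bin size roughly doubles compute while widening each element's receptive field:

    # illustrative cost model for LSH-binned dense message passing
    def binned_pairwise_cost(num_elements: int, bin_size: int) -> int:
        num_bins = -(-num_elements // bin_size)  # ceil division
        return num_bins * bin_size**2  # dense pairwise ops inside each bin

    for b in [32, 64, 128, 256, 512]:
        print(b, binned_pairwise_cost(10_000, b))  # grows ~linearly in bin size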
+ num_events: -1 + clic_edm_zh_tautau_pf: + batch_size: 50 + num_events: -1 + clic_edm_ww_fullhad_pf: + batch_size: 50 + num_events: -1 + +evaluation_jet_algo: ee_genkt_algorithm + +datasets: + clic_edm_ttbar_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_ttbar_pu10_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_qq_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_ww_fullhad_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_zh_tautau_pf: + version: 1.5.0 + data_dir: + manual_dir: diff --git a/parameters/tensorflow/clic_studies/binsize/clic_bin_size_256.yaml b/parameters/tensorflow/clic_studies/binsize/clic_bin_size_256.yaml new file mode 100644 index 000000000..443485220 --- /dev/null +++ b/parameters/tensorflow/clic_studies/binsize/clic_bin_size_256.yaml @@ -0,0 +1,260 @@ +backend: tensorflow + +cache: caches/clic + +dataset: + schema: clic + target_particles: gen + num_input_features: 17 + #(none=0, track=1, cluster=2) + num_input_classes: 3 + #(none=0, ch.had=1, n.had=2, gamma=3, e=4, mu=5) + num_output_classes: 6 + cls_weight_by_pt: no + reg_weight_by_pt: no + enable_tfds_caching: no + +loss: + classification_loss_coef: 100.0 + charge_loss_coef: 1.0 + pt_loss_coef: 10.0 + eta_loss_coef: 10.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 + charge_loss: + type: CategoricalCrossentropy + from_logits: yes + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + cos_phi_loss: + type: Huber + eta_loss: + type: Huber + event_loss: none #none, sliced_wasserstein, gen_jet_logcosh, gen_jet_mse, hist_2d + event_loss_coef: 0.0 + met_loss: none + met_loss_coef: 1.0 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 0.001 + num_epochs: 50 + dtype: float32 + trainable: + lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none + optimizer: adam # adam, adamw, sgd + horovod_enabled: no + cls_output_as_logits: yes + small_graph_opt: no + use_normalizer: no + +batching: + # if enabled, use variable-size batching instead of the fixed-size batches configured per-sample in batch_per_gpu + bucket_by_sequence_length: no + bucket_batch_sizes: auto + batch_multiplier: 1.0 + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 + +# LR Schedules +exponentialdecay: + decay_steps: 2000 + decay_rate: 0.99 + staircase: yes +onecycle: + mom_min: 0.85 + mom_max: 0.95 + warmup_ratio: 0.3 + div_factor: 25.0 + final_div: 100000.0 + +parameters: + model: gnn_dense + input_encoding: clic + node_update_mode: additive + do_node_encoding: yes + node_encoding_hidden_dim: 256 + + combined_graph_layer: + bin_size: 256 + max_num_bins: 200 + distance_dim: 128 + layernorm: yes + dropout: 0.0 + dist_activation: elu + ffn_dist_num_layers: 3 + ffn_dist_hidden_dim: 64 + + # GCN + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + dist_norm: l2 + num_node_messages: 2 + node_message: + type: GHConvDense + output_dim: 256 + activation: elu + #if this is enabled, it will break float16 training + normalize_degrees: no + activation: elu + + num_graph_layers_id: 6 + num_graph_layers_reg: 6 + output_decoding: + activation: elu + regression_use_classification: yes + dropout: 0.1 + + pt_as_correction: no + + id_dim_decrease: yes + charge_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + pt_dim_decrease: yes + 
energy_dim_decrease: yes + + id_hidden_dim: 128 + charge_hidden_dim: 128 + pt_hidden_dim: 128 + eta_hidden_dim: 128 + phi_hidden_dim: 128 + energy_hidden_dim: 128 + + id_num_layers: 1 + charge_num_layers: 1 + pt_num_layers: 1 + eta_num_layers: 1 + phi_num_layers: 1 + energy_num_layers: 1 + layernorm: yes + mask_reg_cls0: yes + + skip_connection: no + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +callbacks: + checkpoint: + monitor: "val_loss" + plot_freq: 1 + tensorboard: + dump_history: yes + hist_freq: 1 + +hypertune: + algorithm: hyperband # random, bayesian, hyperband + random: + objective: val_loss + max_trials: 100 + bayesian: + objective: val_loss + max_trials: 100 + num_initial_points: 2 + hyperband: + objective: val_loss + max_epochs: 10 + factor: 3 + iterations: 1 + executions_per_trial: 1 + +raytune: + local_dir: # Note: please specify an absolute path + sched: asha # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_test_datasets: + physical: + batch_per_gpu: 50 + event_pad_size: -1 + datasets: + - clic_edm_ttbar_pf + - clic_edm_qq_pf + +do_validation_callback: yes +validation_dataset: clic_edm_ttbar_pf +validation_batch_size: 100 +validation_num_events: 2000 + +evaluation_datasets: + clic_edm_qq_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pu10_pf: + batch_size: 50 + num_events: -1 + clic_edm_zh_tautau_pf: + batch_size: 50 + num_events: -1 + clic_edm_ww_fullhad_pf: + batch_size: 50 + num_events: -1 + +evaluation_jet_algo: ee_genkt_algorithm + +datasets: + clic_edm_ttbar_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_ttbar_pu10_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_qq_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_ww_fullhad_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_zh_tautau_pf: + version: 1.5.0 + data_dir: + manual_dir: diff --git a/parameters/tensorflow/clic_studies/binsize/clic_bin_size_32.yaml b/parameters/tensorflow/clic_studies/binsize/clic_bin_size_32.yaml new file mode 100644 index 000000000..5fc10d5ad --- /dev/null +++ b/parameters/tensorflow/clic_studies/binsize/clic_bin_size_32.yaml @@ -0,0 +1,260 @@ +backend: tensorflow + +cache: caches/clic + +dataset: + schema: clic + target_particles: gen + num_input_features: 17 + #(none=0, track=1, cluster=2) + num_input_classes: 3 + #(none=0, ch.had=1, n.had=2, gamma=3, e=4, mu=5) + num_output_classes: 6 + cls_weight_by_pt: no + reg_weight_by_pt: no + enable_tfds_caching: no + +loss: + classification_loss_coef: 100.0 + charge_loss_coef: 1.0 + pt_loss_coef: 10.0 + eta_loss_coef: 10.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 + charge_loss: + type: CategoricalCrossentropy + from_logits: yes + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + cos_phi_loss: + type: Huber + eta_loss: + type: Huber + event_loss: none #none, sliced_wasserstein, gen_jet_logcosh, gen_jet_mse, hist_2d + event_loss_coef: 0.0 + met_loss: none + met_loss_coef: 1.0 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 0.001 + num_epochs: 50 + dtype: 
float32 + trainable: + lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none + optimizer: adam # adam, adamw, sgd + horovod_enabled: no + cls_output_as_logits: yes + small_graph_opt: no + use_normalizer: no + +batching: + # if enabled, use variable-size batching instead of the fixed-size batches configured per-sample in batch_per_gpu + bucket_by_sequence_length: no + bucket_batch_sizes: auto + batch_multiplier: 1.0 + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 + +# LR Schedules +exponentialdecay: + decay_steps: 2000 + decay_rate: 0.99 + staircase: yes +onecycle: + mom_min: 0.85 + mom_max: 0.95 + warmup_ratio: 0.3 + div_factor: 25.0 + final_div: 100000.0 + +parameters: + model: gnn_dense + input_encoding: clic + node_update_mode: additive + do_node_encoding: yes + node_encoding_hidden_dim: 256 + + combined_graph_layer: + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: yes + dropout: 0.0 + dist_activation: elu + ffn_dist_num_layers: 3 + ffn_dist_hidden_dim: 64 + + # GCN + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + dist_norm: l2 + num_node_messages: 2 + node_message: + type: GHConvDense + output_dim: 256 + activation: elu + #if this is enabled, it will break float16 training + normalize_degrees: no + activation: elu + + num_graph_layers_id: 6 + num_graph_layers_reg: 6 + output_decoding: + activation: elu + regression_use_classification: yes + dropout: 0.1 + + pt_as_correction: no + + id_dim_decrease: yes + charge_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + pt_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 128 + charge_hidden_dim: 128 + pt_hidden_dim: 128 + eta_hidden_dim: 128 + phi_hidden_dim: 128 + energy_hidden_dim: 128 + + id_num_layers: 1 + charge_num_layers: 1 + pt_num_layers: 1 + eta_num_layers: 1 + phi_num_layers: 1 + energy_num_layers: 1 + layernorm: yes + mask_reg_cls0: yes + + skip_connection: no + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +callbacks: + checkpoint: + monitor: "val_loss" + plot_freq: 1 + tensorboard: + dump_history: yes + hist_freq: 1 + +hypertune: + algorithm: hyperband # random, bayesian, hyperband + random: + objective: val_loss + max_trials: 100 + bayesian: + objective: val_loss + max_trials: 100 + num_initial_points: 2 + hyperband: + objective: val_loss + max_epochs: 10 + factor: 3 + iterations: 1 + executions_per_trial: 1 + +raytune: + local_dir: # Note: please specify an absolute path + sched: asha # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_test_datasets: + physical: + batch_per_gpu: 50 + event_pad_size: -1 + datasets: + - clic_edm_ttbar_pf + - clic_edm_qq_pf + +do_validation_callback: yes +validation_dataset: clic_edm_ttbar_pf +validation_batch_size: 100 +validation_num_events: 2000 + +evaluation_datasets: + clic_edm_qq_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pu10_pf: + batch_size: 50 + num_events: -1 + clic_edm_zh_tautau_pf: + batch_size: 50 + num_events: -1 + clic_edm_ww_fullhad_pf: + batch_size: 50 + num_events: -1 + +evaluation_jet_algo: ee_genkt_algorithm + +datasets: + 
clic_edm_ttbar_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_ttbar_pu10_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_qq_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_ww_fullhad_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_zh_tautau_pf: + version: 1.5.0 + data_dir: + manual_dir: diff --git a/parameters/clic-test.yaml b/parameters/tensorflow/clic_studies/binsize/clic_bin_size_512.yaml similarity index 80% rename from parameters/clic-test.yaml rename to parameters/tensorflow/clic_studies/binsize/clic_bin_size_512.yaml index d4321174d..072072a9d 100644 --- a/parameters/clic-test.yaml +++ b/parameters/tensorflow/clic_studies/binsize/clic_bin_size_512.yaml @@ -1,6 +1,6 @@ backend: tensorflow -cache: caches/clic_test +cache: caches/clic dataset: schema: clic @@ -12,10 +12,10 @@ dataset: num_output_classes: 6 cls_weight_by_pt: no reg_weight_by_pt: no - enable_tfds_caching: yes + enable_tfds_caching: no loss: - classification_loss_coef: 200.0 + classification_loss_coef: 100.0 charge_loss_coef: 1.0 pt_loss_coef: 10.0 eta_loss_coef: 10.0 @@ -51,12 +51,12 @@ setup: train: yes weights: weights_config: - lr: 0.0005 - num_epochs: 11 + lr: 0.001 + num_epochs: 50 dtype: float32 trainable: lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none - optimizer: adam # adam, adamw, sgd + optimizer: adam # adam, adamw, sgd horovod_enabled: no cls_output_as_logits: yes small_graph_opt: yes @@ -66,7 +66,7 @@ batching: # if enabled, use variable-size batching instead of the fixed-size batches configured per-sample in batch_per_gpu bucket_by_sequence_length: no bucket_batch_sizes: auto - batch_multiplier: 1 + batch_multiplier: 1.0 optimizer: adam: @@ -98,7 +98,7 @@ parameters: node_encoding_hidden_dim: 256 combined_graph_layer: - bin_size: 256 + bin_size: 512 max_num_bins: 200 distance_dim: 128 layernorm: yes @@ -133,9 +133,9 @@ parameters: id_dim_decrease: yes charge_dim_decrease: yes - pt_dim_decrease: yes eta_dim_decrease: yes phi_dim_decrease: yes + pt_dim_decrease: yes energy_dim_decrease: yes id_hidden_dim: 128 @@ -170,7 +170,7 @@ callbacks: hist_freq: 1 hypertune: - algorithm: hyperband # random, bayesian, hyperband + algorithm: hyperband # random, bayesian, hyperband random: objective: val_loss max_trials: 100 @@ -186,9 +186,9 @@ hypertune: executions_per_trial: 1 raytune: - local_dir: # Note: please specify an absolute path - sched: asha # asha, hyperband - search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + local_dir: # Note: please specify an absolute path + sched: asha # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit default_metric: "val_loss" default_mode: "min" # Tune schedule specific parameters @@ -208,20 +208,32 @@ raytune: train_test_datasets: physical: batch_per_gpu: 50 - event_pad_size: 512 + event_pad_size: -1 datasets: - clic_edm_ttbar_pf + - clic_edm_qq_pf -do_checkpoint_callback: no -do_validation_callback: no +do_validation_callback: yes validation_dataset: clic_edm_ttbar_pf validation_batch_size: 100 validation_num_events: 2000 evaluation_datasets: + clic_edm_qq_pf: + batch_size: 50 + num_events: -1 clic_edm_ttbar_pf: batch_size: 50 num_events: -1 + clic_edm_ttbar_pu10_pf: + batch_size: 50 + num_events: -1 + clic_edm_zh_tautau_pf: + batch_size: 50 + num_events: -1 + clic_edm_ww_fullhad_pf: + batch_size: 50 + num_events: -1 evaluation_jet_algo: ee_genkt_algorithm @@ -230,3 +242,19 @@ datasets: version: 1.5.0 data_dir: manual_dir: + clic_edm_ttbar_pu10_pf: + version: 1.5.0 + data_dir: + 
manual_dir: + clic_edm_qq_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_ww_fullhad_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_zh_tautau_pf: + version: 1.5.0 + data_dir: + manual_dir: diff --git a/parameters/tensorflow/clic_studies/binsize/clic_bin_size_64.yaml b/parameters/tensorflow/clic_studies/binsize/clic_bin_size_64.yaml new file mode 100644 index 000000000..225634cc5 --- /dev/null +++ b/parameters/tensorflow/clic_studies/binsize/clic_bin_size_64.yaml @@ -0,0 +1,260 @@ +backend: tensorflow + +cache: caches/clic + +dataset: + schema: clic + target_particles: gen + num_input_features: 17 + #(none=0, track=1, cluster=2) + num_input_classes: 3 + #(none=0, ch.had=1, n.had=2, gamma=3, e=4, mu=5) + num_output_classes: 6 + cls_weight_by_pt: no + reg_weight_by_pt: no + enable_tfds_caching: no + +loss: + classification_loss_coef: 100.0 + charge_loss_coef: 1.0 + pt_loss_coef: 10.0 + eta_loss_coef: 10.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 + charge_loss: + type: CategoricalCrossentropy + from_logits: yes + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + cos_phi_loss: + type: Huber + eta_loss: + type: Huber + event_loss: none #none, sliced_wasserstein, gen_jet_logcosh, gen_jet_mse, hist_2d + event_loss_coef: 0.0 + met_loss: none + met_loss_coef: 1.0 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 0.001 + num_epochs: 50 + dtype: float32 + trainable: + lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none + optimizer: adam # adam, adamw, sgd + horovod_enabled: no + cls_output_as_logits: yes + small_graph_opt: no + use_normalizer: no + +batching: + # if enabled, use variable-size batching instead of the fixed-size batches configured per-sample in batch_per_gpu + bucket_by_sequence_length: no + bucket_batch_sizes: auto + batch_multiplier: 1.0 + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 + +# LR Schedules +exponentialdecay: + decay_steps: 2000 + decay_rate: 0.99 + staircase: yes +onecycle: + mom_min: 0.85 + mom_max: 0.95 + warmup_ratio: 0.3 + div_factor: 25.0 + final_div: 100000.0 + +parameters: + model: gnn_dense + input_encoding: clic + node_update_mode: additive + do_node_encoding: yes + node_encoding_hidden_dim: 256 + + combined_graph_layer: + bin_size: 64 + max_num_bins: 200 + distance_dim: 128 + layernorm: yes + dropout: 0.0 + dist_activation: elu + ffn_dist_num_layers: 3 + ffn_dist_hidden_dim: 64 + + # GCN + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + dist_norm: l2 + num_node_messages: 2 + node_message: + type: GHConvDense + output_dim: 256 + activation: elu + #if this is enabled, it will break float16 training + normalize_degrees: no + activation: elu + + num_graph_layers_id: 6 + num_graph_layers_reg: 6 + output_decoding: + activation: elu + regression_use_classification: yes + dropout: 0.1 + + pt_as_correction: no + + id_dim_decrease: yes + charge_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + pt_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 128 + charge_hidden_dim: 128 + pt_hidden_dim: 128 + eta_hidden_dim: 128 + phi_hidden_dim: 128 + energy_hidden_dim: 128 + + id_num_layers: 1 + charge_num_layers: 1 + pt_num_layers: 1 + eta_num_layers: 1 + phi_num_layers: 1 + energy_num_layers: 1 + layernorm: yes + mask_reg_cls0: 
yes + + skip_connection: no + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +callbacks: + checkpoint: + monitor: "val_loss" + plot_freq: 1 + tensorboard: + dump_history: yes + hist_freq: 1 + +hypertune: + algorithm: hyperband # random, bayesian, hyperband + random: + objective: val_loss + max_trials: 100 + bayesian: + objective: val_loss + max_trials: 100 + num_initial_points: 2 + hyperband: + objective: val_loss + max_epochs: 10 + factor: 3 + iterations: 1 + executions_per_trial: 1 + +raytune: + local_dir: # Note: please specify an absolute path + sched: asha # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_test_datasets: + physical: + batch_per_gpu: 50 + event_pad_size: -1 + datasets: + - clic_edm_ttbar_pf + - clic_edm_qq_pf + +do_validation_callback: yes +validation_dataset: clic_edm_ttbar_pf +validation_batch_size: 100 +validation_num_events: 2000 + +evaluation_datasets: + clic_edm_qq_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pf: + batch_size: 50 + num_events: -1 + clic_edm_ttbar_pu10_pf: + batch_size: 50 + num_events: -1 + clic_edm_zh_tautau_pf: + batch_size: 50 + num_events: -1 + clic_edm_ww_fullhad_pf: + batch_size: 50 + num_events: -1 + +evaluation_jet_algo: ee_genkt_algorithm + +datasets: + clic_edm_ttbar_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_ttbar_pu10_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_qq_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_ww_fullhad_pf: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_zh_tautau_pf: + version: 1.5.0 + data_dir: + manual_dir: diff --git a/parameters/cms-gen.yaml b/parameters/tensorflow/cms.yaml similarity index 90% rename from parameters/cms-gen.yaml rename to parameters/tensorflow/cms.yaml index 05de2aba9..fc201511e 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/tensorflow/cms.yaml @@ -5,7 +5,7 @@ cache: caches/cms_gen dataset: schema: cms target_particles: gen - num_input_features: 42 + num_input_features: 55 # NONE = 0, # TRACK = 1, # PS1 = 2, @@ -66,8 +66,8 @@ setup: train: yes weights: weights_config: - lr: 0.0002 - num_epochs: 40 + lr: 0.0005 + num_epochs: 10 dtype: float32 trainable: lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none @@ -76,7 +76,7 @@ setup: cls_output_as_logits: yes #if enabled, do not create LSH bins for small graphs (less than one bin size) #enabling results in some speedup for gun samples, but must be disabled for XLA - small_graph_opt: yes + small_graph_opt: no use_normalizer: no batching: @@ -111,7 +111,7 @@ parameters: input_encoding: cms node_update_mode: additive do_node_encoding: yes - node_encoding_hidden_dim: 1024 + node_encoding_hidden_dim: 512 combined_graph_layer: bin_size: 320 @@ -132,7 +132,7 @@ parameters: num_node_messages: 2 node_message: type: GHConvDense - output_dim: 1024 + output_dim: 512 activation: elu normalize_degrees: yes activation: elu @@ -231,6 +231,7 @@ train_test_datasets: - cms_pf_qcd - cms_pf_qcd_high_pt - cms_pf_sms_t1tttt + - cms_pf_vbf gun: batch_per_gpu: 50 event_pad_size: -1 @@ -245,11 +246,17 @@ train_test_datasets: - cms_pf_single_proton evaluation_datasets: + cms_pf_ttbar: + batch_size: 5 + num_events: 5000 cms_pf_qcd_high_pt: batch_size: 5 num_events: 5000 - 
cms_pf_single_neutron: - batch_size: 100 + cms_pf_ztt: + batch_size: 5 + num_events: 5000 + cms_pf_qcd: + batch_size: 5 num_events: 5000 validation_dataset: cms_pf_qcd_high_pt @@ -260,58 +267,62 @@ evaluation_jet_algo: antikt_algorithm datasets: cms_pf_ttbar: - version: 1.6.0 + version: 1.7.0 + data_dir: + manual_dir: + cms_pf_vbf: + version: 1.7.0 data_dir: manual_dir: cms_pf_ztt: - version: 1.6.0 + version: 1.7.0 data_dir: manual_dir: cms_pf_qcd: - version: 1.6.0 + version: 1.7.0 data_dir: manual_dir: cms_pf_qcd_high_pt: - version: 1.6.0 + version: 1.7.0 data_dir: manual_dir: cms_pf_single_electron: - version: 1.6.0 + version: 1.7.0 data_dir: manual_dir: cms_pf_single_gamma: - version: 1.6.0 + version: 1.7.0 data_dir: manual_dir: cms_pf_single_pi0: - version: 1.6.0 + version: 1.7.0 data_dir: manual_dir: cms_pf_single_neutron: - version: 1.6.0 + version: 1.7.0 data_dir: manual_dir: cms_pf_single_pi: - version: 1.6.0 + version: 1.7.0 data_dir: manual_dir: cms_pf_single_tau: - version: 1.6.0 + version: 1.7.0 data_dir: manual_dir: cms_pf_single_mu: - version: 1.6.0 + version: 1.7.0 data_dir: manual_dir: cms_pf_single_proton: - version: 1.6.0 + version: 1.7.0 data_dir: manual_dir: cms_pf_multi_particle_gun: - version: 1.6.1 + version: 1.7.0 data_dir: manual_dir: cms_pf_sms_t1tttt: - version: 1.6.0 + version: 1.7.0 data_dir: manual_dir: diff --git a/parameters/delphes.yaml b/parameters/tensorflow/delphes.yaml similarity index 100% rename from parameters/delphes.yaml rename to parameters/tensorflow/delphes.yaml diff --git a/scripts/fcc/clicRec_e4h_input.py b/scripts/fcc/clicRec_e4h_input.py index 2d8fccecf..818b3ec50 100644 --- a/scripts/fcc/clicRec_e4h_input.py +++ b/scripts/fcc/clicRec_e4h_input.py @@ -61,7 +61,7 @@ "BeamCalCollection", "BeamCalCollectionContributions", ] -inp.OutputLevel = DEBUG +inp.OutputLevel = INFO MyAIDAProcessor = MarlinProcessorWrapper("MyAIDAProcessor") @@ -72,7 +72,7 @@ # EDM4hep to LCIO converter edmConvTool = EDM4hep2LcioTool("EDM4hep2lcio") edmConvTool.convertAll = True -edmConvTool.OutputLevel = DEBUG +edmConvTool.OutputLevel = INFO MyAIDAProcessor.EDM4hep2LcioTool = edmConvTool @@ -132,7 +132,7 @@ "VXDTrackerHits": "VXDTrackerHits", "VXDTrackerHitRelations": "VXDTrackerHitRelations", } -VXDBarrelDigitiserLCIOConv.OutputLevel = DEBUG +VXDBarrelDigitiserLCIOConv.OutputLevel = INFO # Add it to VXDBarrelDigitiser Algorithm VXDBarrelDigitiser.Lcio2EDM4hepTool = VXDBarrelDigitiserLCIOConv @@ -158,7 +158,7 @@ "VXDEndcapTrackerHits": "VXDEndcapTrackerHits", "VXDEndcapTrackerHitRelations": "VXDEndcapTrackerHitRelations", } -VXDEndcapDigitiserLCIOConv.OutputLevel = DEBUG +VXDEndcapDigitiserLCIOConv.OutputLevel = INFO # Add it to VXDEndcapDigitiser Algorithm VXDEndcapDigitiser.Lcio2EDM4hepTool = VXDEndcapDigitiserLCIOConv @@ -184,7 +184,7 @@ "ITrackerHits": "ITrackerHits", "InnerTrackerBarrelHitsRelations": "InnerTrackerBarrelHitsRelations", } -InnerPlanarDigiProcessorLCIOConv.OutputLevel = DEBUG +InnerPlanarDigiProcessorLCIOConv.OutputLevel = INFO # Add it to InnerPlanarDigiProcessor Algorithm InnerPlanarDigiProcessor.Lcio2EDM4hepTool = InnerPlanarDigiProcessorLCIOConv @@ -209,7 +209,7 @@ "ITrackerEndcapHits": "ITrackerEndcapHits", "InnerTrackerEndcapHitsRelations": "InnerTrackerEndcapHitsRelations", } -InnerEndcapPlanarDigiProcessorLCIOConv.OutputLevel = DEBUG +InnerEndcapPlanarDigiProcessorLCIOConv.OutputLevel = INFO # Add it to InnerEndcapPlanarDigiProcessor Algorithm InnerEndcapPlanarDigiProcessor.Lcio2EDM4hepTool = InnerEndcapPlanarDigiProcessorLCIOConv @@ -234,7 
+234,7 @@ "OTrackerHits": "OTrackerHits", "OuterTrackerBarrelHitsRelations": "OuterTrackerBarrelHitsRelations", } -OuterPlanarDigiProcessorLCIOConv.OutputLevel = DEBUG +OuterPlanarDigiProcessorLCIOConv.OutputLevel = INFO # Add it to OuterPlanarDigiProcessor Algorithm OuterPlanarDigiProcessor.Lcio2EDM4hepTool = OuterPlanarDigiProcessorLCIOConv @@ -259,7 +259,7 @@ "OTrackerEndcapHits": "OTrackerEndcapHits", "OuterTrackerEndcapHitsRelations": "OuterTrackerEndcapHitsRelations", } -OuterEndcapPlanarDigiProcessorLCIOConv.OutputLevel = DEBUG +OuterEndcapPlanarDigiProcessorLCIOConv.OutputLevel = INFO # Add it to OuterEndcapPlanarDigiProcessor Algorithm OuterEndcapPlanarDigiProcessor.Lcio2EDM4hepTool = OuterEndcapPlanarDigiProcessorLCIOConv @@ -543,7 +543,7 @@ "DebugHits": "DebugHits", "SiTracksCT": "SiTracksCT", } -MyConformalTrackingLCIOConv.OutputLevel = DEBUG +MyConformalTrackingLCIOConv.OutputLevel = INFO # Add it to MyConformalTracking Algorithm MyConformalTracking.Lcio2EDM4hepTool = MyConformalTrackingLCIOConv @@ -568,7 +568,7 @@ ClonesAndSplitTracksFinderLCIOConv = Lcio2EDM4hepTool("ClonesAndSplitTracksFinderLCIOConv") ClonesAndSplitTracksFinderLCIOConv.convertAll = False ClonesAndSplitTracksFinderLCIOConv.collNameMapping = {"SiTracks": "SiTracks"} -ClonesAndSplitTracksFinderLCIOConv.OutputLevel = DEBUG +ClonesAndSplitTracksFinderLCIOConv.OutputLevel = INFO # Add it to ClonesAndSplitTracksFinder Algorithm ClonesAndSplitTracksFinder.Lcio2EDM4hepTool = ClonesAndSplitTracksFinderLCIOConv @@ -593,7 +593,7 @@ RefitLCIOConv = Lcio2EDM4hepTool("Refit") RefitLCIOConv.convertAll = False RefitLCIOConv.collNameMapping = {"SiTracks_Refitted": "SiTracks_Refitted"} -RefitLCIOConv.OutputLevel = DEBUG +RefitLCIOConv.OutputLevel = INFO # Add it to RefitLCIOConv Algorithm Refit.Lcio2EDM4hepTool = RefitLCIOConv @@ -633,7 +633,7 @@ MyClicEfficiencyCalculatorLCIOConv = Lcio2EDM4hepTool("MyClicEfficiencyCalculator") MyClicEfficiencyCalculatorLCIOConv.convertAll = False MyClicEfficiencyCalculatorLCIOConv.collNameMapping = {"MCParticleNotReco": "MCParticleNotReco"} -MyClicEfficiencyCalculatorLCIOConv.OutputLevel = DEBUG +MyClicEfficiencyCalculatorLCIOConv.OutputLevel = INFO # Add it to MyClicEfficiencyCalculatorLCIOConv Algorithm MyClicEfficiencyCalculator.Lcio2EDM4hepTool = MyClicEfficiencyCalculatorLCIOConv @@ -749,7 +749,7 @@ "HCALOther": "HCALOther", "RelationCaloHit": "RelationCaloHit", } -MyDDCaloDigiLCIOConv.OutputLevel = DEBUG +MyDDCaloDigiLCIOConv.OutputLevel = INFO # Add it to MyDDCaloDigi Algorithm MyDDCaloDigi.Lcio2EDM4hepTool = MyDDCaloDigiLCIOConv @@ -889,7 +889,7 @@ "PandoraPFOs": "PandoraPFOs", "PandoraStartVertices": "PandoraStartVertices", } -MyDDMarlinPandoraLCIOConv.OutputLevel = DEBUG +MyDDMarlinPandoraLCIOConv.OutputLevel = INFO # Add it to MyDDMarlinPandora Algorithm MyDDMarlinPandora.Lcio2EDM4hepTool = MyDDMarlinPandoraLCIOConv @@ -909,7 +909,7 @@ MyDDSimpleMuonDigiLCIOConv = Lcio2EDM4hepTool("MyDDSimpleMuonDigiLCIOConv") MyDDSimpleMuonDigiLCIOConv.convertAll = False MyDDSimpleMuonDigiLCIOConv.collNameMapping = {"MUON": "MUON", "RelationMuonHit": "RelationMuonHit"} -MyDDSimpleMuonDigiLCIOConv.OutputLevel = DEBUG +MyDDSimpleMuonDigiLCIOConv.OutputLevel = INFO # Add it to MyDDSimpleMuonDigi Algorithm MyDDSimpleMuonDigi.Lcio2EDM4hepTool = MyDDSimpleMuonDigiLCIOConv @@ -978,7 +978,7 @@ "RecoMCTruthLink": "RecoMCTruthLink", "SiTracksMCTruthLink": "SiTracksMCTruthLink", } -MyRecoMCTruthLinkerLCIOConv.OutputLevel = DEBUG +MyRecoMCTruthLinkerLCIOConv.OutputLevel = INFO # Add it to 
MyRecoMCTruthLinker Algorithm MyRecoMCTruthLinker.Lcio2EDM4hepTool = MyRecoMCTruthLinkerLCIOConv @@ -1023,7 +1023,7 @@ } LumiCalReco = MarlinProcessorWrapper("LumiCalReco") -LumiCalReco.OutputLevel = DEBUG +LumiCalReco.OutputLevel = INFO LumiCalReco.ProcessorType = "BeamCalClusterReco" LumiCalReco.Parameters = { "BackgroundMethod": ["Empty"], @@ -1062,7 +1062,7 @@ "LumiCalClusters": "LumiCalClusters", "LumiCalRecoParticles": "LumiCalRecoParticles", } -LumiCalRecoLCIOConv.OutputLevel = DEBUG +LumiCalRecoLCIOConv.OutputLevel = INFO # Add it to LumiCalReco Algorithm LumiCalReco.Lcio2EDM4hepTool = LumiCalRecoLCIOConv @@ -1082,7 +1082,7 @@ RenameCollectionLCIOConv.collNameMapping = { "PFOsFromJets": "PFOsFromJets", } -RenameCollectionLCIOConv.OutputLevel = DEBUG +RenameCollectionLCIOConv.OutputLevel = INFO # Add it to RenameCollection Algorithm RenameCollection.Lcio2EDM4hepTool = RenameCollectionLCIOConv @@ -1101,7 +1101,7 @@ } OverlayFalse = MarlinProcessorWrapper("OverlayFalse") -OverlayFalse.OutputLevel = DEBUG +OverlayFalse.OutputLevel = INFO OverlayFalse.ProcessorType = "OverlayTimingGeneric" OverlayFalse.Parameters = { "BackgroundFileNames": [], @@ -1153,7 +1153,7 @@ OverlayFalseLCIOConv = Lcio2EDM4hepTool("OverlayFalseLCIOConv") OverlayFalseLCIOConv.convertAll = False OverlayFalseLCIOConv.collNameMapping = {"MCPhysicsParticles": "MCPhysicsParticles"} -OverlayFalseLCIOConv.OutputLevel = DEBUG +OverlayFalseLCIOConv.OutputLevel = INFO # Add it to OverlayFalse Algorithm OverlayFalse.Lcio2EDM4hepTool = OverlayFalseLCIOConv @@ -1721,7 +1721,7 @@ MergeRPLCIOConv = Lcio2EDM4hepTool("MergeRPLCIOConv") MergeRPLCIOConv.convertAll = False MergeRPLCIOConv.collNameMapping = {"MergedRecoParticles": "MergedRecoParticles"} -MergeRPLCIOConv.OutputLevel = DEBUG +MergeRPLCIOConv.OutputLevel = INFO # Add it to MergeRP Algorithm MergeRP.Lcio2EDM4hepTool = MergeRPLCIOConv @@ -1739,7 +1739,7 @@ MergeClustersLCIOConv = Lcio2EDM4hepTool("MergeClustersLCIOConv") MergeClustersLCIOConv.convertAll = False MergeClustersLCIOConv.collNameMapping = {"MergedClusters": "MergedClusters"} -MergeClustersLCIOConv.OutputLevel = DEBUG +MergeClustersLCIOConv.OutputLevel = INFO # Add it to MergeClusters Algorithm MergeClusters.Lcio2EDM4hepTool = MergeClustersLCIOConv @@ -1858,7 +1858,7 @@ CLICPfoSelectorDefault_HELCIOConv = Lcio2EDM4hepTool("CLICPfoSelectorDefault_HELCIOConv") CLICPfoSelectorDefault_HELCIOConv.convertAll = False CLICPfoSelectorDefault_HELCIOConv.collNameMapping = {"SelectedPandoraPFOs": "SelectedPandoraPFOs"} -CLICPfoSelectorDefault_HELCIOConv.OutputLevel = DEBUG +CLICPfoSelectorDefault_HELCIOConv.OutputLevel = INFO # Add it to CLICPfoSelectorDefault_HE Algorithm CLICPfoSelectorDefault_HE.Lcio2EDM4hepTool = CLICPfoSelectorDefault_HELCIOConv @@ -1917,7 +1917,7 @@ CLICPfoSelectorLoose_HELCIOConv = Lcio2EDM4hepTool("CLICPfoSelectorLoose_HELCIOConv") CLICPfoSelectorLoose_HELCIOConv.convertAll = False CLICPfoSelectorLoose_HELCIOConv.collNameMapping = {"CLICPfoSelectorLoose_HE": "CLICPfoSelectorLoose_HE"} -CLICPfoSelectorLoose_HELCIOConv.OutputLevel = DEBUG +CLICPfoSelectorLoose_HELCIOConv.OutputLevel = INFO # Add it to CLICPfoSelectorLoose_HE Algorithm CLICPfoSelectorLoose_HE.Lcio2EDM4hepTool = CLICPfoSelectorLoose_HELCIOConv @@ -1976,7 +1976,7 @@ CLICPfoSelectorTight_HELCIOConv = Lcio2EDM4hepTool("CLICPfoSelectorTight_HELCIOConv") CLICPfoSelectorTight_HELCIOConv.convertAll = False CLICPfoSelectorTight_HELCIOConv.collNameMapping = {"TightSelectedPandoraPFOs": "TightSelectedPandoraPFOs"} 
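# Editor's sketch (comment only, not part of this patch): the DEBUG -> INFO
# change is applied by hand to each converter tool in this steering file.
# If the Lcio2EDM4hepTool instances were collected into a list, one loop
# could set the verbosity in a single place; the list below is illustrative
# and reuses tool names defined earlier in this file.
#
#   converter_tools = [
#       VXDBarrelDigitiserLCIOConv,
#       InnerPlanarDigiProcessorLCIOConv,
#       MyConformalTrackingLCIOConv,
#       # ...the remaining Lcio2EDM4hepTool instances...
#   ]
#   for tool in converter_tools:
#       tool.OutputLevel = INFO  # INFO as already in scope in this steering file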
-CLICPfoSelectorTight_HELCIOConv.OutputLevel = DEBUG +CLICPfoSelectorTight_HELCIOConv.OutputLevel = INFO # Add it to CLICPfoSelectorTight_HE Algorithm CLICPfoSelectorTight_HE.Lcio2EDM4hepTool = CLICPfoSelectorTight_HELCIOConv @@ -2035,7 +2035,7 @@ CLICPfoSelectorDefault_LELCIOConv = Lcio2EDM4hepTool("CLICPfoSelectorDefault_LELCIOConv") CLICPfoSelectorDefault_LELCIOConv.convertAll = False CLICPfoSelectorDefault_LELCIOConv.collNameMapping = {"LE_SelectedPandoraPFOs": "LE_SelectedPandoraPFOs"} -CLICPfoSelectorDefault_LELCIOConv.OutputLevel = DEBUG +CLICPfoSelectorDefault_LELCIOConv.OutputLevel = INFO # Add it to CLICPfoSelectorDefault_LE Algorithm CLICPfoSelectorDefault_LE.Lcio2EDM4hepTool = CLICPfoSelectorDefault_LELCIOConv @@ -2094,7 +2094,7 @@ CLICPfoSelectorLoose_LELCIOConv = Lcio2EDM4hepTool("CLICPfoSelectorLoose_LELCIOConv") CLICPfoSelectorLoose_LELCIOConv.convertAll = False CLICPfoSelectorLoose_LELCIOConv.collNameMapping = {"LE_LooseSelectedPandoraPFOs": "LE_LooseSelectedPandoraPFOs"} -CLICPfoSelectorLoose_LELCIOConv.OutputLevel = DEBUG +CLICPfoSelectorLoose_LELCIOConv.OutputLevel = INFO # Add it to CLICPfoSelectorLoose_LE Algorithm CLICPfoSelectorLoose_LE.Lcio2EDM4hepTool = CLICPfoSelectorLoose_LELCIOConv @@ -2153,7 +2153,7 @@ CLICPfoSelectorTight_LELCIOConv = Lcio2EDM4hepTool("CLICPfoSelectorTight_LELCIOConv") CLICPfoSelectorTight_LELCIOConv.convertAll = False CLICPfoSelectorTight_LELCIOConv.collNameMapping = {"LE_TightSelectedPandoraPFOs": "LE_TightSelectedPandoraPFOs"} -CLICPfoSelectorTight_LELCIOConv.OutputLevel = DEBUG +CLICPfoSelectorTight_LELCIOConv.OutputLevel = INFO # Add it to CLICPfoSelectorTight_LE Algorithm CLICPfoSelectorTight_LE.Lcio2EDM4hepTool = CLICPfoSelectorTight_LELCIOConv @@ -2219,7 +2219,7 @@ "BuildUpVertices": "BuildUpVertices", "PrimaryVertices": "PrimaryVertices", } -VertexFinderLCIOConv.OutputLevel = DEBUG +VertexFinderLCIOConv.OutputLevel = INFO # Add it to VertexFinder Algorithm VertexFinder.Lcio2EDM4hepTool = VertexFinderLCIOConv diff --git a/scripts/fcc/postprocessing.py b/scripts/fcc/postprocessing.py index 4dae644ef..83750bb9e 100644 --- a/scripts/fcc/postprocessing.py +++ b/scripts/fcc/postprocessing.py @@ -6,6 +6,7 @@ import os import sys import multiprocessing +import tqdm from scipy.sparse import coo_matrix track_coll = "SiTracks_Refitted" @@ -203,11 +204,11 @@ def get_calohit_matrix_and_genadj(hit_data, calohit_links, iev, collectionIDs): ) # add all edges from genparticle to calohit - calohit_to_gen_weight = calohit_links["CalohitMCTruthLink"]["CalohitMCTruthLink.weight"][iev] - calohit_to_gen_calo_colid = calohit_links["CalohitMCTruthLink#0"]["CalohitMCTruthLink#0.collectionID"][iev] - calohit_to_gen_gen_colid = calohit_links["CalohitMCTruthLink#1"]["CalohitMCTruthLink#1.collectionID"][iev] - calohit_to_gen_calo_idx = calohit_links["CalohitMCTruthLink#0"]["CalohitMCTruthLink#0.index"][iev] - calohit_to_gen_gen_idx = calohit_links["CalohitMCTruthLink#1"]["CalohitMCTruthLink#1.index"][iev] + calohit_to_gen_weight = calohit_links["CalohitMCTruthLink.weight"][iev] + calohit_to_gen_calo_colid = calohit_links["CalohitMCTruthLink#0.collectionID"][iev] + calohit_to_gen_gen_colid = calohit_links["CalohitMCTruthLink#1.collectionID"][iev] + calohit_to_gen_calo_idx = calohit_links["CalohitMCTruthLink#0.index"][iev] + calohit_to_gen_gen_idx = calohit_links["CalohitMCTruthLink#1.index"][iev] genparticle_to_hit_matrix_coo0 = [] genparticle_to_hit_matrix_coo1 = [] genparticle_to_hit_matrix_w = [] @@ -292,9 +293,9 @@ def gen_to_features(prop_data, iev): def 
genparticle_track_adj(sitrack_links, iev): - trk_to_gen_trkidx = sitrack_links["SiTracksMCTruthLink#0"]["SiTracksMCTruthLink#0.index"][iev] - trk_to_gen_genidx = sitrack_links["SiTracksMCTruthLink#1"]["SiTracksMCTruthLink#1.index"][iev] - trk_to_gen_w = sitrack_links["SiTracksMCTruthLink"]["SiTracksMCTruthLink.weight"][iev] + trk_to_gen_trkidx = sitrack_links["SiTracksMCTruthLink#0.index"][iev] + trk_to_gen_genidx = sitrack_links["SiTracksMCTruthLink#1.index"][iev] + trk_to_gen_w = sitrack_links["SiTracksMCTruthLink.weight"][iev] genparticle_to_track_matrix_coo0 = awkward.to_numpy(trk_to_gen_genidx) genparticle_to_track_matrix_coo1 = awkward.to_numpy(trk_to_gen_trkidx) @@ -454,7 +455,7 @@ def get_genparticles_and_adjacencies(prop_data, hit_data, calohit_links, sitrack gp_interacted_with_detector = gp_in_tracker | gp_in_calo mask_visible = (gen_features["energy"] > 0.01) & gp_interacted_with_detector - print("gps total={} visible={}".format(n_gp, np.sum(mask_visible))) + # print("gps total={} visible={}".format(n_gp, np.sum(mask_visible))) idx_all_masked = np.where(mask_visible)[0] genpart_idx_all_to_filtered = {idx_all: idx_filtered for idx_filtered, idx_all in enumerate(idx_all_masked)} @@ -702,6 +703,7 @@ def process_one_file(fn, ofn): print("{} exists".format(ofn)) return + print("loading {}".format(fn)) fi = uproot.open(fn) arrs = fi["events"] @@ -725,8 +727,24 @@ def process_one_file(fn, ofn): "MergedRecoParticles", ] ) - calohit_links = arrs.arrays(["CalohitMCTruthLink", "CalohitMCTruthLink#0", "CalohitMCTruthLink#1"]) - sitrack_links = arrs.arrays(["SiTracksMCTruthLink", "SiTracksMCTruthLink#0", "SiTracksMCTruthLink#1"]) + calohit_links = arrs.arrays( + [ + "CalohitMCTruthLink.weight", + "CalohitMCTruthLink#0.index", + "CalohitMCTruthLink#0.collectionID", + "CalohitMCTruthLink#1.index", + "CalohitMCTruthLink#1.collectionID", + ] + ) + sitrack_links = arrs.arrays( + [ + "SiTracksMCTruthLink.weight", + "SiTracksMCTruthLink#0.index", + "SiTracksMCTruthLink#0.collectionID", + "SiTracksMCTruthLink#1.index", + "SiTracksMCTruthLink#1.collectionID", + ] + ) # maps the recoparticle track/cluster index (in tracks_begin,end and clusters_begin,end) # to the index in the track/cluster collection @@ -744,7 +762,7 @@ def process_one_file(fn, ofn): } ret = [] - for iev in range(arrs.num_entries): + for iev in tqdm.tqdm(range(arrs.num_entries), total=arrs.num_entries): # get the reco particles reco_arr = get_reco_properties(prop_data, iev) @@ -835,7 +853,7 @@ def process_one_file(fn, ofn): sanitize(X_track) sanitize(X_cluster) - print("X_track={} X_cluster={}".format(len(X_track), len(X_cluster))) + # print("X_track={} X_cluster={}".format(len(X_track), len(X_cluster))) sanitize(ygen_track) sanitize(ygen_cluster) sanitize(ycand_track) @@ -858,10 +876,10 @@ def process_one_file(fn, ofn): def process_sample(sample): - inp = "/local/joosep/clic_edm4hep_2023_02_27/" - outp = "/local/joosep/mlpf/clic_edm4hep_2023_05_09/" + inp = "/local/joosep/clic_edm4hep/" + outp = "/local/joosep/mlpf/clic_edm4hep_2023_12_15/" - pool = multiprocessing.Pool(16) + pool = multiprocessing.Pool(4) inpath_samp = inp + sample outpath_samp = outp + sample @@ -869,6 +887,9 @@ def process_sample(sample): if not os.path.isdir(outpath_samp): os.makedirs(outpath_samp) + # for inf in infiles: + # of = inf.replace(inpath_samp, outpath_samp).replace(".root", ".parquet") + # process_one_file(inf, of) args = [] for inf in infiles: of = inf.replace(inpath_samp, outpath_samp).replace(".root", ".parquet") diff --git 
a/scripts/fcc/postprocessing_hits.py b/scripts/fcc/postprocessing_hits.py index 3009fa90e..64826f7ea 100644 --- a/scripts/fcc/postprocessing_hits.py +++ b/scripts/fcc/postprocessing_hits.py @@ -5,6 +5,7 @@ import os import sys import multiprocessing +import tqdm from scipy.sparse import coo_matrix from postprocessing import map_pdgid_to_candid, map_charged_to_neutral, map_neutral_to_charged, sanitize @@ -20,13 +21,14 @@ "p", "chi2", "ndf", + "dEdx", + "dEdxError", "radiusOfInnermostHit", "tanLambda", "D0", "omega", "Z0", "time", - "type", ] hit_feature_order = [ "elemtype", @@ -184,8 +186,24 @@ def process_one_file(fn, ofn): "MergedRecoParticles", ] ) - calohit_links = arrs.arrays(["CalohitMCTruthLink", "CalohitMCTruthLink#0", "CalohitMCTruthLink#1"]) - sitrack_links = arrs.arrays(["SiTracksMCTruthLink", "SiTracksMCTruthLink#0", "SiTracksMCTruthLink#1"]) + calohit_links = arrs.arrays( + [ + "CalohitMCTruthLink.weight", + "CalohitMCTruthLink#0.index", + "CalohitMCTruthLink#0.collectionID", + "CalohitMCTruthLink#1.index", + "CalohitMCTruthLink#1.collectionID", + ] + ) + sitrack_links = arrs.arrays( + [ + "SiTracksMCTruthLink.weight", + "SiTracksMCTruthLink#0.index", + "SiTracksMCTruthLink#0.collectionID", + "SiTracksMCTruthLink#1.index", + "SiTracksMCTruthLink#1.collectionID", + ] + ) # maps the recoparticle track/cluster index (in tracks_begin,end and clusters_begin,end) # to the index in the track/cluster collection @@ -203,7 +221,7 @@ def process_one_file(fn, ofn): } ret = [] - for iev in range(arrs.num_entries): + for iev in tqdm.tqdm(range(arrs.num_entries), total=arrs.num_entries): # get the reco particles reco_arr = get_reco_properties(prop_data, iev) @@ -230,7 +248,7 @@ def process_one_file(fn, ofn): n_tracks = len(gpdata.track_features["type"]) n_hits = len(gpdata.hit_features["type"]) n_gps = len(gpdata.gen_features["PDG"]) - print("hits={} tracks={} gps={}".format(n_hits, n_tracks, n_gps)) + # print("hits={} tracks={} gps={}".format(n_hits, n_tracks, n_gps)) assert len(gp_to_obj) == len(gpdata.gen_features["PDG"]) assert gp_to_obj.shape[1] == 2 diff --git a/scripts/fcc/run_pandora_timing.sh b/scripts/fcc/run_pandora_timing.sh index efc848d58..e548a114a 100755 --- a/scripts/fcc/run_pandora_timing.sh +++ b/scripts/fcc/run_pandora_timing.sh @@ -1,8 +1,11 @@ #!/bin/bash -SLURM_JOB_ID=1 ./run_sim_gun_np.sh 1 pi- 100 &> gun_np_100_1.txt -#for iseed in 6 7; do -# for nptcl in 25 50 100 200; do -# SLURM_JOB_ID=$iseed ./run_sim_gun_np.sh $iseed pi- $nptcl &> gun_np_${nptcl}_${iseed}.txt -# done -#done +#Important: ensure turbo boost is disabled for consistent timing +#echo "1" | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo + +#SLURM_JOB_ID=1 ./run_sim_gun_np.sh 1 pi- 100 &> gun_np_100_1.txt +for iseed in 1 2 3; do + for nptcl in 25 50 100 200; do + SLURM_JOB_ID=$iseed ./run_sim_gun_np.sh $iseed pi- $nptcl &> gun_np_${nptcl}_${iseed}.txt + done +done diff --git a/scripts/fcc/run_sim_gun_np.sh b/scripts/fcc/run_sim_gun_np.sh index bac7c4435..d8788ff04 100755 --- a/scripts/fcc/run_sim_gun_np.sh +++ b/scripts/fcc/run_sim_gun_np.sh @@ -44,7 +44,7 @@ ddsim --compactFile $LCGEO/CLIC/compact/CLIC_o3_v14/CLIC_o3_v14.xml \ --random.seed $NUM cp out_sim_edm4hep.root $FULLOUTDIR/sim_${SAMPLE}_${NUM}.root -k4run clicRec_e4h_input.py -n $NEV --EventDataSvc.input out_sim_edm4hep.root --PodioOutput.filename out_reco_edm4hep.root +\time -v k4run clicRec_e4h_input.py -n $NEV --EventDataSvc.input out_sim_edm4hep.root --PodioOutput.filename out_reco_edm4hep.root cp out_reco_edm4hep.root 
$FULLOUTDIR/reco_${SAMPLE}_${NUM}.root cp timing_histos.root $FULLOUTDIR/timing_${SAMPLE}_${NUM}.root diff --git a/scripts/generate_tfds.sh b/scripts/generate_tfds.sh index b45eaed09..4b5af9bac 100755 --- a/scripts/generate_tfds.sh +++ b/scripts/generate_tfds.sh @@ -1,8 +1,8 @@ #!/bin/bash # Tallinn -export MANUAL_DIR=/local/joosep/mlpf/cms/v2 -export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets +export MANUAL_DIR=/local/joosep/mlpf/cms/v3 +export DATA_DIR=/local/joosep/mlpf/cms/v3/tensorflow_datasets export IMG=/home/software/singularity/tf-2.14.0.simg export PYTHONPATH=`pwd`/mlpf export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$PYTHONPATH $IMG tfds build " @@ -14,20 +14,21 @@ export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$ # CMD="singularity exec -B /media/joosep/data --env PYTHONPATH=$PYTHONPATH $IMG tfds build " # CMS -# $CMD mlpf/heptfds/cms_pf/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar.log & -# $CMD mlpf/heptfds/cms_pf/qcd --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qcd.log & -# $CMD mlpf/heptfds/cms_pf/ztt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ztt.log & -# $CMD mlpf/heptfds/cms_pf/qcd_high_pt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qcd_high_pt.log & -# $CMD mlpf/heptfds/cms_pf/smst1tttt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_smst1tttt.log & -# $CMD mlpf/heptfds/cms_pf/singlepi0 --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singlepi0.log & -# $CMD mlpf/heptfds/cms_pf/singleneutron --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singleneutron.log & -# $CMD mlpf/heptfds/cms_pf/singleele --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singleele.log & -# $CMD mlpf/heptfds/cms_pf/singlegamma --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singlegamma.log & -# $CMD mlpf/heptfds/cms_pf/singlemu --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singlemu.log & -# $CMD mlpf/heptfds/cms_pf/singlepi --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singlepi.log & -# $CMD mlpf/heptfds/cms_pf/singleproton --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singleproton.log & -# $CMD mlpf/heptfds/cms_pf/singletau --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singletau.log & -# $CMD mlpf/heptfds/cms_pf/multiparticlegun --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_multiparticlegun.log & +#$CMD mlpf/heptfds/cms_pf/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ttbar.log & +#$CMD mlpf/heptfds/cms_pf/qcd --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_qcd.log & +#$CMD mlpf/heptfds/cms_pf/ztt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ztt.log & +#$CMD mlpf/heptfds/cms_pf/qcd_high_pt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_qcd_high_pt.log & +#$CMD mlpf/heptfds/cms_pf/smst1tttt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_smst1tttt.log & +#$CMD mlpf/heptfds/cms_pf/vbf --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_vbf.log & +#$CMD mlpf/heptfds/cms_pf/singlepi0 --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_singlepi0.log & +#$CMD 
mlpf/heptfds/cms_pf/singleneutron --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_singleneutron.log & +#$CMD mlpf/heptfds/cms_pf/singleele --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_singleele.log & +#$CMD mlpf/heptfds/cms_pf/singlegamma --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_singlegamma.log & +#$CMD mlpf/heptfds/cms_pf/singlemu --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_singlemu.log & +#$CMD mlpf/heptfds/cms_pf/singlepi --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_singlepi.log & +#$CMD mlpf/heptfds/cms_pf/singleproton --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_singleproton.log & +#$CMD mlpf/heptfds/cms_pf/singletau --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_singletau.log & +#$CMD mlpf/heptfds/cms_pf/multiparticlegun --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_multiparticlegun.log & # wait # CLIC cluster-based @@ -42,9 +43,10 @@ export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$ # CLIC hit-based # export MANUAL_DIR=/local/joosep/mlpf/clic_edm4hep_hits/ +# export DATA_DIR=/local/joosep/mlpf/tensorflow_datasets/clic/hits/ # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/qq --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qq_hits.log & -# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qq_hits_10k.log & # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar_hits.log & +# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qq_hits_10k.log & # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/ttbar_10k --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar_hits_10k.log & # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_kaon0L --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_kaon0L_hits.log & # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_ele --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_ele_hits.log & diff --git a/scripts/local_test_clic_hits_pipeline.sh b/scripts/local_test_clic_hits_pipeline.sh index 69712d9e3..38909c113 100755 --- a/scripts/local_test_clic_hits_pipeline.sh +++ b/scripts/local_test_clic_hits_pipeline.sh @@ -19,6 +19,6 @@ python3 scripts/fcc/postprocessing_hits.py data/p8_ee_tt_ecm380/reco_p8_ee_tt_ec tfds build mlpf/heptfds/clic_pf_edm4hep_hits/ttbar --manual_dir data # #Train, evaluate and make plots -python mlpf/pipeline.py train --config parameters/clic-hits.yaml --nepochs 1 --customize pipeline_test --ntrain 1 --ntest 1 +python mlpf/pipeline.py train --config parameters/tensorflow/clic-hits.yaml --nepochs 1 --customize pipeline_test --ntrain 1 --ntest 1 python mlpf/pipeline.py evaluate --nevents 10 --customize pipeline_test --train-dir ./experiments/clic* --weights ./experiments/clic*/weights/weights-01-*.hdf5 python mlpf/pipeline.py plots --train-dir ./experiments/clic* diff --git a/scripts/local_test_clic_pipeline.sh b/scripts/local_test_clic_pipeline.sh index 6262b9fe9..4c827a1ff 100755 --- a/scripts/local_test_clic_pipeline.sh +++ b/scripts/local_test_clic_pipeline.sh @@ -19,6 +19,6 @@ python3 scripts/fcc/postprocessing.py data/p8_ee_tt_ecm380/reco_p8_ee_tt_ecm380_ tfds build 
mlpf/heptfds/clic_pf_edm4hep/ttbar --manual_dir data # #Train, evaluate and make plots -python mlpf/pipeline.py train --config parameters/clic.yaml --nepochs 1 --customize pipeline_test --ntrain 10 --ntest 10 +python mlpf/pipeline.py train --config parameters/tensorflow/clic.yaml --nepochs 1 --customize pipeline_test --ntrain 10 --ntest 10 python mlpf/pipeline.py evaluate --nevents 10 --customize pipeline_test --train-dir ./experiments/clic* --weights ./experiments/clic*/weights/weights-01-*.hdf5 python mlpf/pipeline.py plots --train-dir ./experiments/clic* diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index 4e795f97b..2c448b8d3 100755 --- a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -28,7 +28,7 @@ mkdir -p experiments tfds build mlpf/heptfds/cms_pf/ttbar --manual_dir ./local_test_data #Run a simple training on a few events -python mlpf/pipeline.py train --config parameters/cms-gen.yaml --nepochs 1 --customize pipeline_test +python mlpf/pipeline.py train --config parameters/tensorflow/cms.yaml --nepochs 1 --customize pipeline_test ls ./experiments/cms*/weights/ @@ -39,4 +39,4 @@ python mlpf/pipeline.py evaluate --nevents 5 --customize pipeline_test --train-d python mlpf/pipeline.py plots --train-dir ./experiments/cms* #Retrain from existing weights -python mlpf/pipeline.py train --config parameters/cms-gen.yaml --nepochs 1 --customize pipeline_test --weights ./experiments/cms*/weights/weights-01-*.hdf5 +python mlpf/pipeline.py train --config parameters/tensorflow/cms.yaml --nepochs 1 --customize pipeline_test --weights ./experiments/cms*/weights/weights-01-*.hdf5 diff --git a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh index 692b8e090..2da12f47f 100755 --- a/scripts/local_test_delphes_pipeline.sh +++ b/scripts/local_test_delphes_pipeline.sh @@ -18,7 +18,7 @@ tfds build mlpf/heptfds/delphes_pf/delphes_ttbar_pf --download_dir data/ --manua tfds build mlpf/heptfds/delphes_pf/delphes_qcd_pf --download_dir data/ --manual_dir data/delphes_pf #Run a simple training on a few events -python mlpf/pipeline.py train --config parameters/delphes.yaml --nepochs 1 --ntrain 5 --ntest 5 --customize pipeline_test +python mlpf/pipeline.py train --config parameters/tensorflow/delphes.yaml --nepochs 1 --ntrain 5 --ntest 5 --customize pipeline_test #Check the weight files ls ./experiments/delphes_*/weights/ diff --git a/scripts/local_test_delphes_pytorch.sh b/scripts/local_test_delphes_pytorch.sh deleted file mode 100755 index 4d0fa84be..000000000 --- a/scripts/local_test_delphes_pytorch.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -set -e - -rm -Rf test_tmp_delphes -mkdir test_tmp_delphes -cd test_tmp_delphes - -mkdir -p experiments -mkdir -p data/pythia8_ttbar -mkdir -p data/pythia8_ttbar/raw -mkdir -p data/pythia8_ttbar/processed - -mkdir -p data/pythia8_qcd -mkdir -p data/pythia8_qcd/raw -mkdir -p data/pythia8_qcd/processed - -#download 2 files for training/validation -cd data/pythia8_ttbar/raw -echo Downloading the training/validation data files.. -wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_0.pkl.bz2 -wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_1.pkl.bz2 -bzip2 -d * -cd ../../.. - -#download 1 file for testing -cd data/pythia8_qcd/raw -echo Downloading the testing data files.. 
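# Editor's note (comment only): the Delphes-specific PyTorch flow deleted here
# (Zenodo pkl download, graph_data_delphes.py preprocessing, pytorch_pipeline.py
# training) is superseded by the unified TFDS-based entry point
# mlpf/pyg_pipeline.py, which scripts/local_test_pyg.sh below exercises.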
-wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_qcd_10_0.pkl.bz2 -bzip2 -d * -cd ../../.. - -#generate pytorch data files from pkl files -echo Processing the training/validation data files.. -python3 ../mlpf/pytorch_delphes/graph_data_delphes.py --dataset data/pythia8_ttbar \ - --processed_dir data/pythia8_ttbar/processed --num-files-merge 1 --num-proc 1 - -#generate pytorch data files from pkl files -echo Processing the testing data files.. -python3 ../mlpf/pytorch_delphes/graph_data_delphes.py --dataset data/pythia8_qcd/ \ - --processed_dir data/pythia8_qcd/processed --num-files-merge 1 --num-proc 1 - -#before training a model, first get rid of any previous models stored -rm -Rf experiments/* - -cd ../mlpf/ - -#run the pytorch training -echo Beginning the training.. -python3 pytorch_pipeline.py \ - --n_epochs=10 --n_train=1 --n_valid=1 --n_test=1 --batch_size=4 \ - --dataset='../test_tmp_delphes/data/pythia8_ttbar' \ - --dataset_qcd='../test_tmp_delphes/data/pythia8_qcd' \ - --outpath='../test_tmp_delphes/experiments' -echo Finished the training.. - -# #to run lrp uncomment the next few lines (note: lrp requires huge amounts of memory ~128Gi) -# echo Begining the LRP machinery.. -# python3 lrp_pipeline.py \ -# --n_test=1 --batch_size=4 \ -# --lrp_dataset_qcd='../test_tmp_delphes/data/pythia8_qcd' \ -# --lrp_outpath='../test_tmp_delphes/experiments/' \ -# --lrp_load_model='PFNet7_gen_ntrain_1_nepochs_10_batch_size_4_lr_0.0001_alpha_0.0002_both__nn1_nn3' -# --lrp_load_epoch=9 diff --git a/scripts/local_test_pyg.sh b/scripts/local_test_pyg.sh index 5031cce33..b9df0c145 100755 --- a/scripts/local_test_pyg.sh +++ b/scripts/local_test_pyg.sh @@ -27,6 +27,11 @@ mkdir -p experiments tfds build mlpf/heptfds/cms_pf/ttbar --manual_dir ./local_test_data -python mlpf/pyg_pipeline.py --config parameters/pyg-workflow-test.yaml --dataset cms --data-dir ./tensorflow_datasets/ --prefix MLPF_test_ --nvalid 1 --gpus 0 --train --test --make-plots +#test gravnet +python mlpf/pyg_pipeline.py --config parameters/pytorch/pyg-cms.yaml --dataset cms --data-dir ./tensorflow_datasets/ --prefix MLPF_test_ --num-epochs 2 --nvalid 1 --gpus 0 --train --test --make-plots --conv-type gravnet --pipeline -python mlpf/pyg_pipeline.py --config parameters/pyg-workflow-test.yaml --dataset cms --data-dir ./tensorflow_datasets/ --prefix MLPF_test_ --nvalid 1 --gpus 0 --train --test --make-plots --conv-type gnn_lsh --export-onnx +#test transformer +python mlpf/pyg_pipeline.py --config parameters/pytorch/pyg-cms.yaml --dataset cms --data-dir ./tensorflow_datasets/ --prefix MLPF_test_ --num-epochs 2 --nvalid 1 --gpus 0 --train --test --make-plots --conv-type attention --pipeline + +#test GNN-LSH with export +python mlpf/pyg_pipeline.py --config parameters/pytorch/pyg-cms.yaml --dataset cms --data-dir ./tensorflow_datasets/ --prefix MLPF_test_ --num-epochs 2 --nvalid 1 --gpus 0 --train --test --make-plots --conv-type gnn_lsh --export-onnx --pipeline diff --git a/scripts/lumi/clic_bin_size_128.sh b/scripts/lumi/clic_bin_size_128.sh new file mode 100644 index 000000000..5dc778376 --- /dev/null +++ b/scripts/lumi/clic_bin_size_128.sh @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH --job-name=mlpf-train-clic +#SBATCH --account=project_465000301 +#SBATCH --time=3-00:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=160G +#SBATCH --gpus-per-task=4 +#SBATCH --partition=small-g +#SBATCH --no-requeue +#SBATCH -o logs/slurm-%x-%j-%N.out + +cd 
/scratch/project_465000301/particleflow
+
+module load LUMI/22.08 partition/G
+
+export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg
+export PYTHONPATH=hep_tfds
+export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets
+#export MIOPEN_DISABLE_CACHE=true
+export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache
+export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
+export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages
+export ROCM_PATH=/opt/rocm
+#export NCCL_DEBUG=WARN
+#export MIOPEN_ENABLE_LOGGING=1
+#export MIOPEN_ENABLE_LOGGING_CMD=1
+#export MIOPEN_LOG_LEVEL=4
+
+singularity exec \
+ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \
+ --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram
+
+#TF training
+singularity exec \
+ --rocm \
+ -B /scratch/project_465000301 \
+ -B /tmp \
+ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \
+ $IMG python3 mlpf/pipeline.py train \
+ --config parameters/tensorflow/clic_studies/binsize/clic_bin_size_128.yaml --plot-freq 1 --num-cpus 32 \
+ --batch-multiplier 5
diff --git a/scripts/lumi/clic_bin_size_256.sh b/scripts/lumi/clic_bin_size_256.sh
new file mode 100644
index 000000000..fd1612fe5
--- /dev/null
+++ b/scripts/lumi/clic_bin_size_256.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#SBATCH --job-name=mlpf-train-clic
+#SBATCH --account=project_465000301
+#SBATCH --time=3-00:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=160G
+#SBATCH --gpus-per-task=4
+#SBATCH --partition=small-g
+#SBATCH --no-requeue
+#SBATCH -o logs/slurm-%x-%j-%N.out
+
+cd /scratch/project_465000301/particleflow
+
+module load LUMI/22.08 partition/G
+
+export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg
+export PYTHONPATH=hep_tfds
+export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets
+#export MIOPEN_DISABLE_CACHE=true
+export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache
+export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
+export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages
+export ROCM_PATH=/opt/rocm
+#export NCCL_DEBUG=WARN
+#export MIOPEN_ENABLE_LOGGING=1
+#export MIOPEN_ENABLE_LOGGING_CMD=1
+#export MIOPEN_LOG_LEVEL=4
+
+singularity exec \
+ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \
+ --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram
+
+#TF training
+singularity exec \
+ --rocm \
+ -B /scratch/project_465000301 \
+ -B /tmp \
+ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \
+ $IMG python3 mlpf/pipeline.py train \
+ --config parameters/tensorflow/clic_studies/binsize/clic_bin_size_256.yaml --plot-freq 1 --num-cpus 32 \
+ --batch-multiplier 5
diff --git a/scripts/lumi/clic_bin_size_32.sh b/scripts/lumi/clic_bin_size_32.sh
new file mode 100755
index 000000000..f9fae6c14
--- /dev/null
+++ b/scripts/lumi/clic_bin_size_32.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#SBATCH --job-name=mlpf-train-clic
+#SBATCH --account=project_465000301
+#SBATCH --time=3-00:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=160G
+#SBATCH --gpus-per-task=4
+#SBATCH --partition=small-g
+#SBATCH --no-requeue
+#SBATCH -o logs/slurm-%x-%j-%N.out
+
+cd /scratch/project_465000301/particleflow
+
+module load LUMI/22.08 partition/G
+
+export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg
+export PYTHONPATH=hep_tfds
+export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets
+#export MIOPEN_DISABLE_CACHE=true
+export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache
+export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
+export
TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages
+export ROCM_PATH=/opt/rocm
+#export NCCL_DEBUG=WARN
+#export MIOPEN_ENABLE_LOGGING=1
+#export MIOPEN_ENABLE_LOGGING_CMD=1
+#export MIOPEN_LOG_LEVEL=4
+
+singularity exec \
+ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \
+ --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram
+
+#TF training
+singularity exec \
+ --rocm \
+ -B /scratch/project_465000301 \
+ -B /tmp \
+ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \
+ $IMG python3 mlpf/pipeline.py train \
+ --config parameters/tensorflow/clic_studies/binsize/clic_bin_size_32.yaml --plot-freq 1 --num-cpus 32 \
+ --batch-multiplier 5
diff --git a/scripts/lumi/clic_bin_size_512.sh b/scripts/lumi/clic_bin_size_512.sh
new file mode 100644
index 000000000..cecd82a17
--- /dev/null
+++ b/scripts/lumi/clic_bin_size_512.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#SBATCH --job-name=mlpf-train-clic
+#SBATCH --account=project_465000301
+#SBATCH --time=3-00:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=160G
+#SBATCH --gpus-per-task=4
+#SBATCH --partition=small-g
+#SBATCH --no-requeue
+#SBATCH -o logs/slurm-%x-%j-%N.out
+
+cd /scratch/project_465000301/particleflow
+
+module load LUMI/22.08 partition/G
+
+export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg
+export PYTHONPATH=hep_tfds
+export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets
+#export MIOPEN_DISABLE_CACHE=true
+export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache
+export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
+export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages
+export ROCM_PATH=/opt/rocm
+#export NCCL_DEBUG=WARN
+#export MIOPEN_ENABLE_LOGGING=1
+#export MIOPEN_ENABLE_LOGGING_CMD=1
+#export MIOPEN_LOG_LEVEL=4
+
+singularity exec \
+ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \
+ --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram
+
+#TF training
+singularity exec \
+ --rocm \
+ -B /scratch/project_465000301 \
+ -B /tmp \
+ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \
+ $IMG python3 mlpf/pipeline.py train \
+ --config parameters/tensorflow/clic_studies/binsize/clic_bin_size_512.yaml --plot-freq 1 --num-cpus 32 \
+ --batch-multiplier 5
diff --git a/scripts/lumi/clic_bin_size_64.sh b/scripts/lumi/clic_bin_size_64.sh
new file mode 100755
index 000000000..02f0343f7
--- /dev/null
+++ b/scripts/lumi/clic_bin_size_64.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#SBATCH --job-name=mlpf-train-clic
+#SBATCH --account=project_465000301
+#SBATCH --time=3-00:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=32
+#SBATCH --mem=160G
+#SBATCH --gpus-per-task=4
+#SBATCH --partition=small-g
+#SBATCH --no-requeue
+#SBATCH -o logs/slurm-%x-%j-%N.out
+
+cd /scratch/project_465000301/particleflow
+
+module load LUMI/22.08 partition/G
+
+export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg
+export PYTHONPATH=hep_tfds
+export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets
+#export MIOPEN_DISABLE_CACHE=true
+export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache
+export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
+export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages
+export ROCM_PATH=/opt/rocm
+#export NCCL_DEBUG=WARN
+#export MIOPEN_ENABLE_LOGGING=1
+#export MIOPEN_ENABLE_LOGGING_CMD=1
+#export MIOPEN_LOG_LEVEL=4
+
+singularity exec \
+ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \
+ --rocm $IMG rocm-smi --showdriverversion --showmeminfo vram
+
+#TF training
+singularity exec \
+ --rocm \
+ -B
/scratch/project_465000301 \
+ -B /tmp \
+ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \
+ $IMG python3 mlpf/pipeline.py train \
+ --config parameters/tensorflow/clic_studies/binsize/clic_bin_size_64.yaml --plot-freq 1 --num-cpus 32 \
+ --batch-multiplier 5
diff --git a/scripts/lumi/pytorch.sh b/scripts/lumi/pytorch.sh
index 071490806..2b20940d8 100755
--- a/scripts/lumi/pytorch.sh
+++ b/scripts/lumi/pytorch.sh
@@ -6,7 +6,7 @@
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=32
 #SBATCH --mem=130G
-#SBATCH --gpus-per-task=8
+#SBATCH --gpus-per-task=4
 #SBATCH --partition=small-g
 #SBATCH --no-requeue
 #SBATCH -o logs/slurm-%x-%j-%N.out
@@ -22,17 +22,19 @@ export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets
 export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache
 export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
 export ROCM_PATH=/opt/rocm
+#export NCCL_DEBUG=WARN
 #export MIOPEN_ENABLE_LOGGING=1
 #export MIOPEN_ENABLE_LOGGING_CMD=1
 #export MIOPEN_LOG_LEVEL=4
+
+env
+
 singularity exec --rocm \
 -B /scratch/project_465000301 \
 -B /tmp \
 --env PYTHONPATH=hep_tfds \
- $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus 8 \
- --data-dir $TFDS_DATA_DIR --config parameters/pyg-cms.yaml \
+ $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus $SLURM_GPUS_PER_TASK \
+ --data-dir $TFDS_DATA_DIR --config parameters/pytorch/pyg-cms.yaml \
 --train \
 --conv-type gnn_lsh \
- --gpus 8 \
- --num-epochs 100 --gpu-batch-multiplier 10 --num-workers 1 --prefetch-factor 10 --load experiments/pyg-cms_20231129_193432_982100
+ --num-epochs 20 --gpu-batch-multiplier 4 --num-workers 1 --prefetch-factor 5 --checkpoint-freq 1
diff --git a/scripts/lumi/train-gpu-1.sh b/scripts/lumi/train-gpu-1.sh
index 0a9786ad0..01a838bb5 100755
--- a/scripts/lumi/train-gpu-1.sh
+++ b/scripts/lumi/train-gpu-1.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
 #SBATCH --job-name=mlpf-train-cms
 #SBATCH --account=project_465000301
-#SBATCH --time=1-00:00:00
+#SBATCH --time=3-00:00:00
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1
-#SBATCH --cpus-per-task=8
-#SBATCH --mem=130G
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=160G
 #SBATCH --gpus-per-task=1
 #SBATCH --partition=small-g
 #SBATCH --no-requeue
@@ -15,7 +15,7 @@
 cd /scratch/project_465000301/particleflow
 module load LUMI/22.08 partition/G
-export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg
+export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12-2024-01-11.simg
 export PYTHONPATH=hep_tfds
 export TFDS_DATA_DIR=/scratch/project_465000301/tensorflow_datasets
 #export MIOPEN_DISABLE_CACHE=true
@@ -23,6 +23,7 @@ export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache
 export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
 export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages
 export ROCM_PATH=/opt/rocm
+export NCCL_DEBUG=WARN
 #export MIOPEN_ENABLE_LOGGING=1
 #export MIOPEN_ENABLE_LOGGING_CMD=1
 #export MIOPEN_LOG_LEVEL=4
@@ -37,6 +38,4 @@
 singularity exec \
 -B /scratch/project_465000301 \
 -B /tmp \
 --env LD_LIBRARY_PATH=/opt/rocm/lib/ \
- $IMG python3 mlpf/pipeline.py train \
- --config parameters/cms-gen.yaml --plot-freq 1 --num-cpus 8 \
- --batch-multiplier 4 --plot-freq -1
+ $IMG python3.9 mlpf/pipeline.py hypertune --config parameters/tensorflow/bench/clic-hits-bench.yaml --ntrain 100 --ntest 100 -o hypertuning_100
diff --git a/scripts/lumi/train-gpu-4.sh b/scripts/lumi/train-gpu-4.sh
index 78ad47569..bcfa28b68 100755
--- a/scripts/lumi/train-gpu-4.sh
+++ b/scripts/lumi/train-gpu-4.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
 #SBATCH --job-name=mlpf-train-cms
 #SBATCH
--account=project_465000301 -#SBATCH --time=1-00:00:00 +#SBATCH --time=3-00:00:00 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=8 -#SBATCH --mem=130G +#SBATCH --cpus-per-task=16 +#SBATCH --mem=160G #SBATCH --gpus-per-task=4 #SBATCH --partition=small-g #SBATCH --no-requeue @@ -23,6 +23,7 @@ export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages export ROCM_PATH=/opt/rocm +#export NCCL_DEBUG=WARN #export MIOPEN_ENABLE_LOGGING=1 #export MIOPEN_ENABLE_LOGGING_CMD=1 #export MIOPEN_LOG_LEVEL=4 @@ -39,4 +40,4 @@ singularity exec \ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ $IMG python3 mlpf/pipeline.py train \ --config parameters/cms-gen.yaml --plot-freq 1 --num-cpus 8 \ - --batch-multiplier 4 --plot-freq -1 + --batch-multiplier 2 --plot-freq -1 --weights experiments/cms-gen_20240108_154245_299103.nid005026/weights/weights-05-4.250307.hdf5 diff --git a/scripts/lumi/train-gpu-8.sh b/scripts/lumi/train-gpu-8.sh index 61936767f..3e2f3ea23 100755 --- a/scripts/lumi/train-gpu-8.sh +++ b/scripts/lumi/train-gpu-8.sh @@ -1,11 +1,11 @@ #!/bin/bash #SBATCH --job-name=mlpf-train-cms #SBATCH --account=project_465000301 -#SBATCH --time=1-00:00:00 +#SBATCH --time=3-00:00:00 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=8 -#SBATCH --mem=130G +#SBATCH --cpus-per-task=16 +#SBATCH --mem=160G #SBATCH --gpus-per-task=8 #SBATCH --partition=small-g #SBATCH --no-requeue @@ -13,7 +13,8 @@ cd /scratch/project_465000301/particleflow -module load LUMI/22.08 partition/G +#module load LUMI/22.08 partition/G +module load LUMI/23.09 partition/G export IMG=/scratch/project_465000301/tf-rocm5.6-tf2.12.simg export PYTHONPATH=hep_tfds @@ -23,9 +24,10 @@ export MIOPEN_USER_DB_PATH=/tmp/${USER}-${SLURM_JOB_ID}-miopen-cache export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} export TF_CPP_MAX_VLOG_LEVEL=-1 #to suppress ROCm fusion is enabled messages export ROCM_PATH=/opt/rocm -#export MIOPEN_ENABLE_LOGGING=1 -#export MIOPEN_ENABLE_LOGGING_CMD=1 -#export MIOPEN_LOG_LEVEL=4 +export NCCL_DEBUG=WARN +export MIOPEN_ENABLE_LOGGING=1 +export MIOPEN_ENABLE_LOGGING_CMD=1 +export MIOPEN_LOG_LEVEL=4 singularity exec \ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ @@ -39,4 +41,4 @@ singularity exec \ --env LD_LIBRARY_PATH=/opt/rocm/lib/ \ $IMG python3 mlpf/pipeline.py train \ --config parameters/cms-gen.yaml --plot-freq 1 --num-cpus 8 \ - --batch-multiplier 4 --plot-freq -1 + --batch-multiplier 2 --plot-freq -1 --weights experiments/cms-gen_20240108_154245_299103.nid005026/weights/weights-05-4.250307.hdf5 diff --git a/scripts/tallinn/a100/clic-hits-train.sh b/scripts/tallinn/a100/clic-hits-train.sh index fc36ed61f..ee87b1ccd 100755 --- a/scripts/tallinn/a100/clic-hits-train.sh +++ b/scripts/tallinn/a100/clic-hits-train.sh @@ -11,5 +11,5 @@ cd ~/particleflow singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python3.10 mlpf/pipeline.py train -c parameters/bench/clic-hits-bench.yaml \ - --plot-freq 1 --num-cpus 32 --batch-multiplier 1 + $IMG python3.10 mlpf/pipeline.py train -c parameters/tensorflow/clic-hits.yaml \ + --plot-freq 1 --num-cpus 32 --batch-multiplier 8 diff --git a/scripts/tallinn/a100/clic-train-hvd.sh b/scripts/tallinn/a100/clic-train-hvd.sh deleted file mode 100755 index 773db3f97..000000000 --- a/scripts/tallinn/a100/clic-train-hvd.sh +++ /dev/null 
@@ -1,20 +0,0 @@ -#!/bin/bash -#SBATCH --partition gpu -#SBATCH --gres gpu:a100:2 -#SBATCH --mem-per-gpu 40G -#SBATCH -o logs/slurm-%x-%j-%N.out - -IMG=/home/software/singularity/tf-2.14.0.simg -cd ~/particleflow - -#TF training -singularity exec -B /scratch/persistent --nv \ - --env PYTHONPATH=hep_tfds \ - --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG horovodrun -np 2 -H localhost:2 python3.10 mlpf/pipeline.py train -c parameters/clic-test.yaml \ - --plot-freq 0 --num-cpus 32 --batch-multiplier 5 \ - --horovod-enabled --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir - -# --env TF_GPU_THREAD_MODE=gpu_private \ -# --env TF_GPU_THREAD_COUNT=8 \ -# --env TF_XLA_FLAGS="--tf_xla_auto_jit=2" \ diff --git a/scripts/tallinn/a100/clic-train.sh b/scripts/tallinn/a100/clic-train.sh index 3d096f02d..4a4004ec1 100755 --- a/scripts/tallinn/a100/clic-train.sh +++ b/scripts/tallinn/a100/clic-train.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --partition gpu -#SBATCH --gres gpu:a100:2 +#SBATCH --gres gpu:a100:1 #SBATCH --mem-per-gpu 40G #SBATCH -o logs/slurm-%x-%j-%N.out @@ -11,5 +11,5 @@ cd ~/particleflow singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python3.10 mlpf/pipeline.py train -c parameters/clic.yaml \ - --plot-freq 1 --num-cpus 32 --batch-multiplier 5 + $IMG python3.10 mlpf/pipeline.py train -c parameters/tensorflow/clic.yaml \ + --plot-freq 1 --num-cpus 32 --batch-multiplier 8 diff --git a/scripts/tallinn/a100/cms-train.sh b/scripts/tallinn/a100/cms-train.sh index ce12499ed..3ea5adaa5 100755 --- a/scripts/tallinn/a100/cms-train.sh +++ b/scripts/tallinn/a100/cms-train.sh @@ -13,5 +13,5 @@ cd ~/particleflow singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python3.10 mlpf/pipeline.py train -c parameters/cms-gen.yaml --plot-freq 1 --num-cpus 32 \ + $IMG python3.10 mlpf/pipeline.py train -c parameters/tensorflow/cms.yaml --plot-freq 1 --num-cpus 32 \ --batch-multiplier 5 diff --git a/scripts/tallinn/a100/eval.sh b/scripts/tallinn/a100/eval.sh index 1c7d1c74f..9e6c8456f 100755 --- a/scripts/tallinn/a100/eval.sh +++ b/scripts/tallinn/a100/eval.sh @@ -8,8 +8,8 @@ IMG=/home/software/singularity/tf-2.14.0.simg cd ~/particleflow #change these -EXPDIR=experiments/cms-gen_20231206_182649_456797.gpu1.local -WEIGHTS=experiments/cms-gen_20231206_182649_456797.gpu1.local/weights/weights-01-4.213115.hdf5 +EXPDIR=experiments/cms-gen_20231213_152224_108072.gpu1.local +WEIGHTS=experiments/cms-gen_20231213_152224_108072.gpu1.local/weights/weights-10-3.068836.hdf5 singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ @@ -17,8 +17,8 @@ singularity exec -B /scratch/persistent --nv \ $IMG python3.10 mlpf/pipeline.py evaluate \ --train-dir $EXPDIR --weights $WEIGHTS -#singularity exec -B /scratch/persistent --nv \ -# --env PYTHONPATH=hep_tfds \ -# --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ -# $IMG python3.10 mlpf/pipeline.py plots \ -# --train-dir $EXPDIR +singularity exec -B /scratch/persistent --nv \ + --env PYTHONPATH=hep_tfds \ + --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ + $IMG python3.10 mlpf/pipeline.py plots \ + --train-dir $EXPDIR diff --git a/scripts/tallinn/a100/pytorch.sh b/scripts/tallinn/a100/pytorch.sh index 3217c61b4..1798eab3d 100755 --- a/scripts/tallinn/a100/pytorch.sh +++ 
b/scripts/tallinn/a100/pytorch.sh @@ -11,5 +11,5 @@ cd ~/particleflow singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ $IMG python3.10 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \ - --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pyg-cms.yaml \ - --train --conv-type mamba --num-epochs 50 --gpu-batch-multiplier 20 --num-workers 1 --prefetch-factor 10 + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ + --train --conv-type mamba --num-epochs 20 --gpu-batch-multiplier 10 --num-workers 1 --prefetch-factor 10 --ntrain 10000 --nvalid 10000 diff --git a/scripts/tallinn/rtx/clic-train.sh b/scripts/tallinn/rtx/clic-train.sh index f4eba1ed7..57b10a178 100755 --- a/scripts/tallinn/rtx/clic-train.sh +++ b/scripts/tallinn/rtx/clic-train.sh @@ -11,6 +11,6 @@ cd ~/particleflow singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python3.10 mlpf/pipeline.py train -c parameters/clic.yaml \ + $IMG python3.10 mlpf/pipeline.py train -c parameters/tensorflow/clic.yaml \ --plot-freq 1 \ --batch-multiplier 0.5 diff --git a/scripts/tallinn/rtx/clic.sh b/scripts/tallinn/rtx/clic.sh new file mode 100755 index 000000000..024936422 --- /dev/null +++ b/scripts/tallinn/rtx/clic.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --partition gpu +#SBATCH --gres gpu:rtx:4 +#SBATCH --mem-per-gpu 40G +#SBATCH -o logs/slurm-%x-%j-%N.out + +IMG=/home/software/singularity/tf-2.14.0.simg +cd ~/particleflow + +#TF training +singularity exec -B /scratch/persistent --nv \ + --env PYTHONPATH=hep_tfds \ + --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ + $IMG python3.10 mlpf/pipeline.py train -c parameters/tensorflow/clic.yaml \ + --plot-freq 1 \ + --batch-multiplier 0.5 diff --git a/scripts/tallinn/rtx/delphes-train.sh b/scripts/tallinn/rtx/delphes-train.sh index 67a987722..9019fbe70 100755 --- a/scripts/tallinn/rtx/delphes-train.sh +++ b/scripts/tallinn/rtx/delphes-train.sh @@ -11,6 +11,6 @@ cd ~/particleflow singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python3.10 mlpf/pipeline.py train -c parameters/delphes.yaml \ + $IMG python3.10 mlpf/pipeline.py train -c parameters/tensorflow/delphes.yaml \ --plot-freq 1 \ --batch-multiplier 0.5 diff --git a/scripts/tallinn/rtx/eval.sh b/scripts/tallinn/rtx/eval.sh index 82c650288..c25f7e380 100755 --- a/scripts/tallinn/rtx/eval.sh +++ b/scripts/tallinn/rtx/eval.sh @@ -8,8 +8,8 @@ IMG=/home/software/singularity/tf-2.14.0.simg cd ~/particleflow #change these -EXPDIR=experiments/cms-gen_20231019_180157_131092.gpu1.local/ -WEIGHTS=experiments/cms-gen_20231019_180157_131092.gpu1.local/weights/weights-07-0.993207.hdf5 +EXPDIR=experiments/clic_20240119_194512_817807.gpu1.local +WEIGHTS=experiments/clic_20240119_194512_817807.gpu1.local/weights/weights-68-3.200590.hdf5 singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ diff --git a/scripts/tallinn/rtx/pytorch.sh b/scripts/tallinn/rtx/pytorch.sh index ebd197ee4..a8f50d7a7 100755 --- a/scripts/tallinn/rtx/pytorch.sh +++ b/scripts/tallinn/rtx/pytorch.sh @@ -6,26 +6,33 @@ IMG=/home/software/singularity/pytorch.simg:2023-12-06 -singularity exec -B /scratch/persistent --nv \ - --env PYTHONPATH=hep_tfds \ - $IMG python3.10 mlpf/pyg_pipeline.py --dataset cms --gpus 4 \ - --data-dir 
/scratch/persistent/joosep/tensorflow_datasets --config parameters/pyg-cms-small.yaml \ - --train --test --make-plots --conv-type gnn_lsh --num-epochs 20 --gpu-batch-multiplier 1 --num-workers 1 --prefetch-factor 10 - -singularity exec -B /scratch/persistent --nv \ - --env PYTHONPATH=hep_tfds \ - $IMG python3.10 mlpf/pyg_pipeline.py --dataset cms --gpus 4 \ - --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pyg-cms-small.yaml \ - --train --test --make-plots --conv-type gravnet --num-epochs 20 --gpu-batch-multiplier 1 --num-workers 1 --prefetch-factor 10 - -singularity exec -B /scratch/persistent --nv \ - --env PYTHONPATH=hep_tfds \ - $IMG python3.10 mlpf/pyg_pipeline.py --dataset cms --gpus 4 \ - --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pyg-cms.yaml \ - --train --test --make-plots --conv-type mamba --num-epochs 20 --gpu-batch-multiplier 5 --num-workers 1 --prefetch-factor 10 - -singularity exec -B /scratch/persistent --nv \ - --env PYTHONPATH=hep_tfds \ - $IMG python3.10 mlpf/pyg_pipeline.py --dataset cms --gpus 4 \ - --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pyg-cms.yaml \ - --train --test --make-plots --conv-type attention --num-epochs 20 --gpu-batch-multiplier 5 --num-workers 1 --prefetch-factor 10 +# singularity exec -B /scratch/persistent --nv \ +# --env PYTHONPATH=hep_tfds \ +# $IMG python3.10 mlpf/pyg_pipeline.py --dataset cms --gpus 4 \ +# --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ +# --train --test --make-plots --conv-type gnn_lsh --num-epochs 20 --gpu-batch-multiplier 1 --num-workers 1 --prefetch-factor 10 +# +# singularity exec -B /scratch/persistent --nv \ +# --env PYTHONPATH=hep_tfds \ +# $IMG python3.10 mlpf/pyg_pipeline.py --dataset cms --gpus 4 \ +# --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ +# --train --test --make-plots --conv-type gravnet --num-epochs 20 --gpu-batch-multiplier 1 --num-workers 1 --prefetch-factor 10 +# +# singularity exec -B /scratch/persistent --nv \ +# --env PYTHONPATH=hep_tfds \ +# $IMG python3.10 mlpf/pyg_pipeline.py --dataset cms --gpus 4 \ +# --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ +# --train --test --make-plots --conv-type mamba --num-epochs 50 --gpu-batch-multiplier 5 --num-workers 1 --prefetch-factor 10 +# +# singularity exec -B /scratch/persistent --nv \ +# --env PYTHONPATH=hep_tfds \ +# $IMG python3.10 mlpf/pyg_pipeline.py --dataset cms --gpus 4 \ +# --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ +# --train --test --make-plots --conv-type attention --num-epochs 20 --gpu-batch-multiplier 5 --num-workers 1 --prefetch-factor 10 +# +# +# singularity exec -B /scratch/persistent --nv \ +# --env PYTHONPATH=hep_tfds \ +# $IMG python3.10 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \ +# --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ +# --test --make-plots --conv-type mamba --gpu-batch-multiplier 5 --num-workers 1 --prefetch-factor 10 --load experiments/pyg-cms_20240126_221457_189384/sub1/best_weights.pth --ntest 1000
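Editor's note on the parameters/ reorganization in this patch: the TensorFlow configs now live under parameters/tensorflow/ and the PyTorch configs under parameters/pytorch/, so any script whose --config argument was not updated fails at startup with a missing file. A minimal sketch that flags stale references (an editor's illustration, not part of this patch; assumes it is run from the repository root):

import pathlib
import re

# Match "-c path.yaml" or "--config path.yaml" in the shell scripts and
# report any YAML path that does not exist in the working tree.
pattern = re.compile(r"(?:-c|--config)\s+(\S+?\.yaml)")
for script in sorted(pathlib.Path("scripts").rglob("*.sh")):
    for cfg in pattern.findall(script.read_text()):
        if not pathlib.Path(cfg).exists():
            print(f"{script}: references missing config {cfg}")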
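The clic_bin_size_{32,64,128,256,512}.yaml configs added above differ only in combined_graph_layer.bin_size, and the scripts/lumi/clic_bin_size_*.sh jobs train one model per value for the bin-size ablation (plotted in notebooks/clic/paper_plots_2023_bin_size_ablation.ipynb). A rough sketch of the cost trade-off being probed, under the assumption that the LSH layer groups elements into bins of bin_size, capped at max_num_bins, and evaluates the pairwise kernel densely within each bin:

import math

def dense_pairwise_ops(n_elements: int, bin_size: int, max_num_bins: int = 200) -> int:
    # ceil(n / bin_size) bins, each padded to bin_size elements; within a bin
    # the NodePairGaussianKernel is dense, so cost scales as n_bins * bin_size**2
    n_bins = min(math.ceil(n_elements / bin_size), max_num_bins)
    return n_bins * bin_size**2

# Larger bins mean fewer bins but quadratically more work per bin.
for bs in (32, 64, 128, 256, 512):
    print(bs, dense_pairwise_ops(5000, bs))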
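Likewise, the cls_loss block in these configs (type: SigmoidFocalCrossEntropy, from_logits: yes, gamma: 2.0) presumably resolves to the TensorFlow Addons loss of the same name; a minimal sketch of what that setting computes, with illustrative tensors:

import tensorflow as tf
import tensorflow_addons as tfa

# Assumption: cls_loss maps onto tfa.losses.SigmoidFocalCrossEntropy.
# gamma=2.0 down-weights already well-classified elements relative to plain
# cross-entropy; from_logits=True matches cls_output_as_logits: yes.
cls_loss = tfa.losses.SigmoidFocalCrossEntropy(from_logits=True, gamma=2.0)
y_true = tf.constant([[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]])  # one-hot over the 6 output classes
y_pred = tf.constant([[0.1, 2.5, -1.0, 0.3, -2.0, -3.0]])  # raw logits
print(cls_loss(y_true, y_pred).numpy())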