Skip to content

Commit

Permalink
towards v1.7: new CMS datasets, CLIC hit-based datasets, TF backward-compat optimizations (#285)
Browse files Browse the repository at this point in the history


* training with cms 1.7.0

* fix postprocessing for new uproot

* track memory usage of pandora

* readd dim decrease options

* optimizer save/restore

* hypertune

* update track feats

* add pytorch training on clic hits

---------

Co-authored-by: Joosep Pata <joosep.pata@kbfi.ee>
  • Loading branch information
jpata and Joosep Pata authored Jan 31, 2024
1 parent bae2907 commit d335cd3
Show file tree
Hide file tree
Showing 99 changed files with 2,418 additions and 1,229 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ on:
jobs:
lint:
name: Lint PR or Push to main
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: [3.8.10]
python-version: [3.10.12]

steps:
- name: Checkout
Expand Down
18 changes: 9 additions & 9 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.10.12"
cache: "pip"
- run: pip install -r requirements.txt
deps-pyg:
Expand All @@ -25,7 +25,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.10.12"
cache: "pip"
- run: pip install -r requirements.txt
- run: pip3 install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
Expand All @@ -38,7 +38,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.10.12"
cache: "pip"
- run: pip install -r requirements.txt
- run: PYTHONPATH=. python3 -m unittest tests/test_tf.py
Expand All @@ -50,7 +50,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.10.12"
cache: "pip"
- run: pip install -r requirements.txt
- run: ./scripts/local_test_clic_pipeline.sh
Expand All @@ -62,7 +62,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.10.12"
cache: "pip"
- run: pip install -r requirements.txt
- run: ./scripts/local_test_clic_hits_pipeline.sh
Expand All @@ -74,7 +74,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.10.12"
cache: "pip"
- run: pip install -r requirements.txt
- run: ./scripts/local_test_delphes_pipeline.sh
Expand All @@ -86,7 +86,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.10.12"
cache: "pip"
- run: pip install -r requirements.txt
- run: ./scripts/local_test_cms_pipeline.sh
Expand All @@ -98,7 +98,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.10.12"
cache: "pip"
- run: pip install -r requirements.txt
- run: pip3 install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
Expand All @@ -112,7 +112,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.10.12"
cache: "pip"
- run: pip install -r requirements.txt
- run: pip3 install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
Expand Down
4 changes: 2 additions & 2 deletions mlpf/data_cms/genjob_nopu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ pwd
ls -lrt

echo "process.RandomNumberGeneratorService.generator.initialSeed = $SEED" >> step2_phase1_new.py
cmsRun step2_phase1_new.py
cmsRun step3_phase1_new.py
cmsRun step2_phase1_new.py > /dev/null
cmsRun step3_phase1_new.py > /dev/null
#cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py
mv pfntuple.root pfntuple_${SEED}.root
python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table
Expand Down
4 changes: 2 additions & 2 deletions mlpf/data_cms/genjob_pu55to75.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ pwd
ls -lrt

echo "process.RandomNumberGeneratorService.generator.initialSeed = $SEED" >> step2_phase1_new.py
cmsRun step2_phase1_new.py
cmsRun step3_phase1_new.py
cmsRun step2_phase1_new.py > /dev/null
cmsRun step3_phase1_new.py > /dev/null
#cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py
mv pfntuple.root pfntuple_${SEED}.root
python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table
Expand Down
18 changes: 10 additions & 8 deletions mlpf/data_cms/prepare_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,16 @@
("TTbar_14TeV_TuneCUETP8M1_cfi", 700000, 701000, "genjob_nopu.sh", outdir + "/nopu"),
("MultiParticlePFGun50_cfi", 800000, 810000, "genjob_nopu.sh", outdir + "/nopu"),

("SingleElectronFlatPt1To1000_pythia8_cfi", 900000, 901000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleGammaFlatPt1To1000_pythia8_cfi", 1000000,1001000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleMuFlatPt1To1000_pythia8_cfi", 1100000,1101000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleNeutronFlatPt0p7To1000_cfi", 1200000,1201000, "genjob_nopu.sh", outdir + "/nopu"),
("SinglePi0Pt1To1000_pythia8_cfi", 1300000,1301000, "genjob_nopu.sh", outdir + "/nopu"),
("SinglePiMinusFlatPt0p7To1000_cfi", 1400000,1401000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleProtonMinusFlatPt0p7To1000_cfi", 1500000,1501000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleTauFlatPt1To1000_cfi", 1600000,1601000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleElectronFlatPt1To1000_pythia8_cfi", 900000, 910000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleGammaFlatPt1To1000_pythia8_cfi", 1000000,1010000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleMuFlatPt1To1000_pythia8_cfi", 1100000,1110000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleNeutronFlatPt0p7To1000_cfi", 1200000,1210000, "genjob_nopu.sh", outdir + "/nopu"),
("SinglePi0Pt1To1000_pythia8_cfi", 1300000,1310000, "genjob_nopu.sh", outdir + "/nopu"),
("SinglePiMinusFlatPt0p7To1000_cfi", 1400000,1410000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleProtonMinusFlatPt0p7To1000_cfi", 1500000,1510000, "genjob_nopu.sh", outdir + "/nopu"),
("SingleTauFlatPt1To1000_cfi", 1600000,1610000, "genjob_nopu.sh", outdir + "/nopu"),

("VBF_TuneCP5_14TeV_pythia8_cfi", 1700000,1705010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
]

if __name__ == "__main__":
Expand Down
78 changes: 78 additions & 0 deletions mlpf/heptfds/clic_pf_edm4hep/single_gamma.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from pathlib import Path

import tensorflow as tf
from utils_edm import (
X_FEATURES_CL,
X_FEATURES_TRK,
Y_FEATURES,
generate_examples,
split_sample,
)

import tensorflow_datasets as tfds
import numpy as np

_DESCRIPTION = """
CLIC EDM4HEP dataset with single gamma particle gun
- X: reconstructed tracks and clusters, variable number N per event
- ygen: stable generator particles, zero-padded to N per event
- ycand: baseline particle flow particles, zero-padded to N per event
"""

_CITATION = """
Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023).
Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set].
Zenodo. https://doi.org/10.5281/zenodo.8260741
"""


class ClicEdmSingleGammaPf(tfds.core.GeneratorBasedBuilder):
    """TFDS builder for the CLIC EDM4HEP single-photon (gamma) particle-gun sample.

    Each example contains:
      - ``X``: reconstructed tracks and clusters, variable number N per event
      - ``ygen``: stable generator particles, zero-padded to N
      - ``ycand``: baseline particle-flow particles, zero-padded to N
    """

    VERSION = tfds.core.Version("1.5.0")
    RELEASE_NOTES = {
        "1.5.0": "Regenerate with ARRAY_RECORD",
    }
    MANUAL_DOWNLOAD_INSTRUCTIONS = """
    For the raw input files in ROOT EDM4HEP format, please see the citation above.
    The processed tensorflow_dataset can also be downloaded from:
    rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./
    """

    def __init__(self, *args, **kwargs):
        # Force the ARRAY_RECORD file format for this dataset.
        kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD
        super().__init__(*args, **kwargs)

    def _info(self) -> tfds.core.DatasetInfo:
        """Returns the dataset metadata."""
        return tfds.core.DatasetInfo(
            builder=self,
            description=_DESCRIPTION,
            features=tfds.features.FeaturesDict(
                {
                    # Feature width is the larger of the track/cluster feature
                    # counts; presumably the shorter rows are zero-padded in
                    # utils_edm — confirm there.
                    # NOTE: dtype unified to np.float32 (was tf.float32 here but
                    # np.float32 for ygen/ycand); both map to the same feature
                    # dtype, and NumPy dtypes are the preferred TFDS spelling.
                    "X": tfds.features.Tensor(
                        shape=(
                            None,
                            max(len(X_FEATURES_TRK), len(X_FEATURES_CL)),
                        ),
                        dtype=np.float32,
                    ),
                    "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32),
                    "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32),
                }
            ),
            supervised_keys=None,
            homepage="",
            citation=_CITATION,
            metadata=tfds.core.MetadataDict(
                x_features_track=X_FEATURES_TRK,
                x_features_cluster=X_FEATURES_CL,
                y_features=Y_FEATURES,
            ),
        )

    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Locate the manually downloaded gamma sample and produce the splits."""
        path = dl_manager.manual_dir
        return split_sample(Path(path / "gamma/"))

    def _generate_examples(self, files):
        """Yield (key, example) pairs from the processed input files."""
        return generate_examples(files)
78 changes: 78 additions & 0 deletions mlpf/heptfds/clic_pf_edm4hep/single_kaon0L.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from pathlib import Path

import tensorflow as tf
from utils_edm import (
X_FEATURES_CL,
X_FEATURES_TRK,
Y_FEATURES,
generate_examples,
split_sample,
)

import tensorflow_datasets as tfds
import numpy as np

_DESCRIPTION = """
CLIC EDM4HEP dataset with single kaon0L particle gun
- X: reconstructed tracks and clusters, variable number N per event
- ygen: stable generator particles, zero-padded to N per event
- ycand: baseline particle flow particles, zero-padded to N per event
"""

_CITATION = """
Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023).
Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set].
Zenodo. https://doi.org/10.5281/zenodo.8260741
"""


class ClicEdmSingleKaon0lPf(tfds.core.GeneratorBasedBuilder):
    """TFDS builder for the CLIC EDM4HEP single-kaon0L particle-gun sample.

    Each example contains:
      - ``X``: reconstructed tracks and clusters, variable number N per event
      - ``ygen``: stable generator particles, zero-padded to N
      - ``ycand``: baseline particle-flow particles, zero-padded to N
    """

    VERSION = tfds.core.Version("1.5.0")
    RELEASE_NOTES = {
        "1.5.0": "Regenerate with ARRAY_RECORD",
    }
    MANUAL_DOWNLOAD_INSTRUCTIONS = """
    For the raw input files in ROOT EDM4HEP format, please see the citation above.
    The processed tensorflow_dataset can also be downloaded from:
    rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./
    """

    def __init__(self, *args, **kwargs):
        # Force the ARRAY_RECORD file format for this dataset.
        kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD
        super().__init__(*args, **kwargs)

    def _info(self) -> tfds.core.DatasetInfo:
        """Returns the dataset metadata."""
        return tfds.core.DatasetInfo(
            builder=self,
            description=_DESCRIPTION,
            features=tfds.features.FeaturesDict(
                {
                    # Feature width is the larger of the track/cluster feature
                    # counts; presumably the shorter rows are zero-padded in
                    # utils_edm — confirm there.
                    # NOTE: dtype unified to np.float32 (was tf.float32 here but
                    # np.float32 for ygen/ycand); both map to the same feature
                    # dtype, and NumPy dtypes are the preferred TFDS spelling.
                    "X": tfds.features.Tensor(
                        shape=(
                            None,
                            max(len(X_FEATURES_TRK), len(X_FEATURES_CL)),
                        ),
                        dtype=np.float32,
                    ),
                    "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32),
                    "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32),
                }
            ),
            supervised_keys=None,
            homepage="",
            citation=_CITATION,
            metadata=tfds.core.MetadataDict(
                x_features_track=X_FEATURES_TRK,
                x_features_cluster=X_FEATURES_CL,
                y_features=Y_FEATURES,
            ),
        )

    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Locate the manually downloaded kaon0L sample and produce the splits."""
        path = dl_manager.manual_dir
        return split_sample(Path(path / "kaon0L/"))

    def _generate_examples(self, files):
        """Yield (key, example) pairs from the processed input files."""
        return generate_examples(files)
78 changes: 78 additions & 0 deletions mlpf/heptfds/clic_pf_edm4hep/single_pi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from pathlib import Path

import tensorflow as tf
from utils_edm import (
X_FEATURES_CL,
X_FEATURES_TRK,
Y_FEATURES,
generate_examples,
split_sample_several,
)

import tensorflow_datasets as tfds
import numpy as np

_DESCRIPTION = """
CLIC EDM4HEP dataset with single-pion particle gun
- X: reconstructed tracks and clusters, variable number N per event
- ygen: stable generator particles, zero-padded to N per event
- ycand: baseline particle flow particles, zero-padded to N per event
"""

_CITATION = """
Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023).
Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set].
Zenodo. https://doi.org/10.5281/zenodo.8260741
"""


class ClicEdmSinglePiPf(tfds.core.GeneratorBasedBuilder):
    """TFDS builder for the CLIC EDM4HEP single-pion particle-gun sample.

    Combines the pi- and pi+ samples into one dataset. Each example contains:
      - ``X``: reconstructed tracks and clusters, variable number N per event
      - ``ygen``: stable generator particles, zero-padded to N
      - ``ycand``: baseline particle-flow particles, zero-padded to N
    """

    VERSION = tfds.core.Version("1.5.0")
    RELEASE_NOTES = {
        "1.5.0": "Regenerate with ARRAY_RECORD",
    }
    MANUAL_DOWNLOAD_INSTRUCTIONS = """
    For the raw input files in ROOT EDM4HEP format, please see the citation above.
    The processed tensorflow_dataset can also be downloaded from:
    rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./
    """

    def __init__(self, *args, **kwargs):
        # Force the ARRAY_RECORD file format for this dataset.
        kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD
        super().__init__(*args, **kwargs)

    def _info(self) -> tfds.core.DatasetInfo:
        """Returns the dataset metadata."""
        return tfds.core.DatasetInfo(
            builder=self,
            description=_DESCRIPTION,
            features=tfds.features.FeaturesDict(
                {
                    # Feature width is the larger of the track/cluster feature
                    # counts; presumably the shorter rows are zero-padded in
                    # utils_edm — confirm there.
                    # NOTE: dtype unified to np.float32 (was tf.float32 here but
                    # np.float32 for ygen/ycand); both map to the same feature
                    # dtype, and NumPy dtypes are the preferred TFDS spelling.
                    "X": tfds.features.Tensor(
                        shape=(
                            None,
                            max(len(X_FEATURES_TRK), len(X_FEATURES_CL)),
                        ),
                        dtype=np.float32,
                    ),
                    "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32),
                    "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=np.float32),
                }
            ),
            supervised_keys=None,
            homepage="",
            citation=_CITATION,
            metadata=tfds.core.MetadataDict(
                x_features_track=X_FEATURES_TRK,
                x_features_cluster=X_FEATURES_CL,
                y_features=Y_FEATURES,
            ),
        )

    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Locate the manually downloaded pi-/pi+ samples and produce the splits."""
        path = dl_manager.manual_dir
        return split_sample_several([Path(path / "pi-/"), Path(path / "pi+/")])

    def _generate_examples(self, files):
        """Yield (key, example) pairs from the processed input files."""
        return generate_examples(files)
Loading

0 comments on commit d335cd3

Please sign in to comment.