diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e870ef5d2..131c10ac4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,32 +9,6 @@ on: workflow_dispatch: jobs: - delphes-tf: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Install python deps - run: | - sudo apt install python3 python3-pip wget - sudo python3 -m pip install --upgrade pip - sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm - - name: Run delphes TF model - run: ./scripts/local_test_delphes_tf.sh - - cms-tf: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Install python deps - run: | - sudo apt install python3 python3-pip wget - sudo python3 -m pip install --upgrade pip - sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm - - name: Run CMS TF model - run: ./scripts/local_test_cms_tf.sh - delphes-pipeline: runs-on: ubuntu-latest steps: @@ -44,7 +18,7 @@ jobs: sudo apt install python3 python3-pip wget sudo python3 -m pip install --upgrade pip sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click + sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click 'ray[default]' 'ray[tune]' - name: Run delphes TF model run: ./scripts/local_test_delphes_pipeline.sh @@ -57,7 +31,7 @@ jobs: sudo apt install python3 python3-pip wget sudo python3 -m pip install --upgrade pip sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click + sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click 'ray[default]' 'ray[tune]' - name: Run CMS TF model using the pipeline run: ./scripts/local_test_cms_pipeline.sh diff --git a/.gitignore b/.gitignore index 96659e5c9..56c75e84c 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,12 @@ mlpf/pytorch/data test_tmp/ test_tmp_delphes/ .DS_Store + +prp +*.pyc +*.pyo + +mlpf/updated/LRP/pid* +mlpf/updated/LRP/class* + +*.ipynb_checkpoints diff --git a/README.md b/README.md index 66fc7e6aa..d29529727 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Short instructions with a single test file in [notebooks/delphes-tf-mlpf-quickstart.ipynb](notebooks/delphes-tf-mlpf-quickstart.ipynb). Long instructions for reproducing the full training from scratch in [README_delphes.md](README_delphes.md). -The plots can be generated using the notebook [delphes/resolution_checks.ipynb](delphes/resolution_checks.ipynb). 
+The plots can be generated using the notebook [delphes/delphes_model_analysis.ipynb](delphes/delphes_model_analysis.ipynb). ### Delphes dataset The dataset is available from zenodo: https://doi.org/10.5281/zenodo.4559324. diff --git a/README_cms.md b/README_cms.md index 83a7574ff..f3bb90f90 100644 --- a/README_cms.md +++ b/README_cms.md @@ -33,5 +33,5 @@ git clone https://github.com/jpata/particleflow.git cd particleflow #run a small local test including data prep and training -./scripts/local_test_cms_tf.sh +./scripts/local_test_cms_pipeline.sh ``` diff --git a/README_delphes.md b/README_delphes.md index e6a6ea282..89fe9b6c6 100644 --- a/README_delphes.md +++ b/README_delphes.md @@ -2,28 +2,55 @@ The following instructions use singularity, but you may have a different local setup. +```bash +#Download all pkl.bz2 files from https://zenodo.org/record/4559324 + +#now move the data into the right place +mv *pythia8_qcd*.pkl.bz2 data/pythia8_qcd/val +mv *pythia8_ttbar*.pkl.bz2 data/pythia8_qcd/raw +mv data/pythia8_qcd/raw/*pythia8_ttbar_9_*.pkl.bz2 data/pythia8_qcd/val + +# Generate the TFRecord datasets needed for larger-than-RAM training +python3 mlpf/pipeline.py data -c parameters/delphes.yaml + +# Run the training of the base GNN model using e.g. 5 GPUs in a data-parallel mode +CUDA_VISIBLE_DEVICES=0,1,2,3,4 python3 mlpf/pipeline.py train -c parameters/delphes.yaml + +#Run the validation to produce the predictions file +python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t experiments/delphes_* -v "data/pythia8_qcd/val/*.pkl.bz2" -e evaluate_qcd +python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t experiments/delphes_* -v "data/pythia8_ttbar/val/*.pkl.bz2" -e evaluate_ttbar +``` + +## Recipe for generation +The Delphes AngularSmearing module has been modified to correctly take into account the smearing for tracks, see [delphes/install.sh](delphes/install.sh). + +```bash +wget http://atlaswww.hep.anl.gov/hepsim/soft/centos7hepsim.img +sudo singularity build --sandbox centos7hepsim.sandbox centos7hepsim.img +sudo singularity exec -B /home --writable centos7hepsim.sandbox ./install.sh +sudo singularity build centos7hepsim.sif centos7hepsim.sandbox +sudo rm -Rf centos7hepsim.sandbox +``` + ```bash cd delphes # Run the simulation step # Generate events with pythia, mix them with PU and run a detector simulation using Delphes -singularity exec http://jpata.web.cern.ch/jpata/centos7hepsim.sif ./run_sim.sh +singularity exec centos7hepsim.sif ./run_sim.sh # Run the ntuplization step # generate X,y input matrices for NN training in out/pythia8_ttbar/*.pkl.bz2 -singularity exec http://jpata.web.cern.ch/jpata/centos7hepsim.sif ./run_ntuple.sh -singularity exec http://jpata.web.cern.ch/jpata/centos7hepsim.sif ./run_ntuple_qcd.sh +singularity exec centos7hepsim.sif ./run_ntuple.sh +singularity exec centos7hepsim.sif ./run_ntuple_qcd.sh -#Alternatively, to skip run_sim.sh and run_ntuple.sh, download everything from https://doi.org/10.5281/zenodo.4452283 and put into out/pythia8_ttbar - -#now move the data into the right place mv out/pythia8_ttbar ../data/ cd ../data/pythia8_ttbar mkdir raw mkdir val mkdir root mv *.root root/ -mb *.promc root/ +mv *.promc root/ mv *.pkl.bz2 raw/ cd ../.. @@ -35,26 +62,4 @@ mv *.root root/ mv *.promc root/ mv *.pkl.bz2 val/ cd ../.. 
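# at this point the ttbar pkl.bz2 files sit under data/pythia8_ttbar/raw and the qcd ones under data/pythia8_qcd/val;
# the val/*.pkl.bz2 globs are what `mlpf/pipeline.py evaluate -v ...` (shown at the top of this README) reads,
# and `python3 mlpf/pipeline.py data -c parameters/delphes.yaml` builds the TFRecord datasets for training (presumably from the raw/ files)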
- -# Generate the TFRecord datasets needed for larger-than-RAM training -singularity exec --nv http://jpata.web.cern.ch/jpata/base.simg python3 mlpf/launcher.py --action data --model-spec parameters/delphes-gnn-skipconn.yaml - -# Run the training of the base GNN model using e.g. 5 GPUs in a data-parallel mode -CUDA_VISIBLE_DEVICES=0,1,2,3,4 singularity exec --nv http://jpata.web.cern.ch/jpata/base.simg python3 mlpf/launcher.py --action train --model-spec parameters/delphes-gnn-skipconn.yaml - -#Run the validation to produce the predictions file -singularity exec --nv http://jpata.web.cern.ch/jpata/base.simg python3 mlpf/launcher.py --action eval --model-spec parameters/delphes-gnn-skipconn.yaml --weights ./experiments/delphes-gnn-skipconn-*/weights-300-*.hdf5 - -singularity exec --nv http://jpata.web.cern.ch/jpata/base.simg python3 mlpf/launcher.py --action time --model-spec parameters/delphes-gnn-skipconn.yaml --weights ./experiments/delphes-gnn-skipconn-*/weights-300-*.hdf5 -``` - -## Recipe to prepare Delphes singularity image -NB: The Delphes AngularSmearing module has been modified to correctly take into account the smearing for tracks, see [delphes/install.sh](delphes/install.sh) - -```bash -wget http://atlaswww.hep.anl.gov/hepsim/soft/centos7hepsim.img -sudo singularity build --sandbox centos7hepsim.sandbox centos7hepsim.img -sudo singularity exec -B /home --writable centos7hepsim.sandbox ./install.sh -sudo singularity build centos7hepsim.sif centos7hepsim.sandbox -sudo rm -Rf centos7hepsim.sandbox ``` diff --git a/clic/dumper.py b/clic/dumper.py index 15307f354..6e1b523c4 100644 --- a/clic/dumper.py +++ b/clic/dumper.py @@ -61,7 +61,8 @@ def pfParticleToDict(par): "px": mom[0], "py": mom[1], "pz": mom[2], - "energy": par.getEnergy() + "energy": par.getEnergy(), + "charge": par.getCharge() } return vec @@ -210,6 +211,7 @@ def caloHitToDict(par, calohit_to_cluster, genparticle_dict, calohit_recotosim): nPF=colPF.getNumberOfElements() nCl=colCl.getNumberOfElements() nTr=colTr.getNumberOfElements() + nHit=simTrackHits.getNumberOfElements() nHCB=colHCB.getNumberOfElements() nHCE=colHCE.getNumberOfElements() nECB=colECB.getNumberOfElements() @@ -223,7 +225,7 @@ def caloHitToDict(par, calohit_to_cluster, genparticle_dict, calohit_recotosim): assert(not (recohit in calohit_recotosim)) calohit_recotosim[recohit] = simhit - print "Event %d, nGen=%d, nPF=%d, nClusters=%d, nTracks=%d, nHCAL=%d, nECAL=%d" % (nEvent, nMc, nPF, nCl, nTr, nHCB+nHCE, nECB+nECE) + print "Event %d, nGen=%d, nPF=%d, nClusters=%d, nTracks=%d, nHCAL=%d, nECAL=%d, nHits=%d" % (nEvent, nMc, nPF, nCl, nTr, nHCB+nHCE, nECB+nECE, nHit) genparticles = [] genparticle_dict = {} diff --git a/mlpf/adv_training.py b/mlpf/adv_training.py index b63474906..940e067d9 100644 --- a/mlpf/adv_training.py +++ b/mlpf/adv_training.py @@ -4,31 +4,49 @@ import glob import random +from tqdm import tqdm from tfmodel.model_setup import make_model, targets_multi_output, CustomCallback from tfmodel.data import Dataset +#A deep sets conditional discriminator def make_disc_model(config, reco_features): input_elems = tf.keras.layers.Input(shape=(config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) input_reco = tf.keras.layers.Input(shape=(config["dataset"]["padded_num_elem_size"], reco_features)) - da1 = tf.keras.layers.Dense(128, activation="elu")(input_elems) - da2 = tf.keras.layers.Dense(128, activation="elu")(da1) - da3 = tf.keras.layers.Dense(128, activation="elu")(da2) - db1 = tf.keras.layers.Dense(128, 
activation="elu")(input_reco) - db2 = tf.keras.layers.Dense(128, activation="elu")(db1) - db3 = tf.keras.layers.Dense(128, activation="elu")(db2) + + nhidden = 512 + #process the input elements + da1 = tf.keras.layers.Dense(nhidden, activation="elu")(input_elems) + da2 = tf.keras.layers.Dense(nhidden, activation="elu")(da1) + da3 = tf.keras.layers.Dense(nhidden, activation="elu")(da2) + + #process the target reco particles + db1 = tf.keras.layers.Dense(nhidden, activation="elu")(input_reco) + db2 = tf.keras.layers.Dense(nhidden, activation="elu")(db1) + db3 = tf.keras.layers.Dense(nhidden, activation="elu")(db2) + + #concatenate the input element and reco target c = tf.keras.layers.Concatenate()([da3, db3]) - dc1 = tf.keras.layers.Dense(128, activation="elu")(c) - dc2 = tf.keras.layers.Dense(128, activation="elu")(dc1) - - sc = tf.keras.layers.Lambda(lambda x: tf.reduce_sum(x, axis=-2))(dc2) - c1 = tf.keras.layers.Dense(128, activation="elu")(sc) - c2 = tf.keras.layers.Dense(128, activation="elu")(c1) - c3 = tf.keras.layers.Dense(1, activation="linear")(c2) - model_disc = tf.keras.models.Model(inputs=[input_elems, input_reco], outputs=[c3]) + + #encode the (element, target) pairs using a feedforward net + dc1 = tf.keras.layers.Dense(nhidden, activation="elu")(c) + dc2 = tf.keras.layers.Dense(nhidden/2, activation="elu")(dc1) + + #sum across the encoded (element, target) pairs in the event to create an event encoding + msk = tf.keras.layers.Lambda(lambda x: tf.cast(x[:, :, 0:1]!=0, tf.float32))(input_elems) + sc = tf.keras.layers.Lambda(lambda args: tf.reduce_sum(args[0]*args[1], axis=-2))([dc2, msk]) + + #classify the embedded event as real (true target) or fake (MLPF reconstructed) + c1 = tf.keras.layers.Dense(nhidden/2, activation="elu")(sc) + c2 = tf.keras.layers.Dense(nhidden/4, activation="elu")(c1) + c3 = tf.keras.layers.Dense(nhidden/8, activation="elu")(c2) + + #classification output + c4 = tf.keras.layers.Dense(1, activation="linear")(c3) + model_disc = tf.keras.models.Model(inputs=[input_elems, input_reco], outputs=[c4]) return model_disc def concat_pf(ypred): - return tf.concat([ypred["cls"], ypred["charge"], ypred["pt"], ypred["eta"], ypred["sin_phi"], ypred["cos_phi"], ypred["energy"]], axis=-1) + return tf.concat([tf.keras.activations.softmax(ypred["cls"]*10), ypred["charge"], ypred["pt"], ypred["eta"], ypred["sin_phi"], ypred["cos_phi"], ypred["energy"]], axis=-1) def main(config): tf.config.run_functions_eagerly(False) @@ -51,10 +69,9 @@ def main(config): x = np.random.randn(1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]) ypred = concat_pf(model_pf(x)) - model_pf.load_weights("./experiments/cms-gnn-dense-lite-e0108f63.gpu0.local/weights-254-41.512947.hdf5") + model_pf.load_weights("./experiments/cms-gnn-dense-dev_20210814_123346.gpu0.local/weights/weights-100-195.351807.hdf5") model_disc = make_disc_model(config, ypred.shape[-1]) - model_disc.summary() cds = config["dataset"] dataset_def = Dataset( @@ -78,7 +95,7 @@ def main(config): ycand_val = np.concatenate(ycands) dataset_transform = targets_multi_output(config['dataset']['num_output_classes']) - cb = CustomCallback("logs", X_val, ycand_val, dataset_transform, config['dataset']['num_output_classes']) + cb = CustomCallback("logs", X_val, ycand_val, dataset_transform, config['dataset']['num_output_classes'], freq=1) cb.set_model(model_pf) tfr_files = sorted(glob.glob(dataset_def.processed_path)) @@ -91,9 +108,9 @@ def main(config): tf.TensorShape([dataset_def.padded_num_elem_size, ]) 
) - n_train = 1000 + n_train = 10000 n_test = 1000 - batch_size = 2 + batch_size = 4 ds_train = dataset.take(n_train).padded_batch(batch_size, padded_shapes=ps) ds_test = dataset.skip(n_train).take(n_test).padded_batch(batch_size, padded_shapes=ps) @@ -109,45 +126,51 @@ def main(config): m1 = tf.keras.models.Model(inputs=[input_elems], outputs=[disc_out1]) m2 = tf.keras.models.Model(inputs=[input_elems, input_reco], outputs=[disc_out2]) - optimizer1 = tf.keras.optimizers.Adam(lr=1e-5) + def loss(x,y): + return tf.keras.losses.binary_crossentropy(x,y, from_logits=True) + + #The MLPF reconstruction model (generator) is optimized to confuse the classifier + optimizer1 = tf.keras.optimizers.Adam(lr=1e-6) model_pf.trainable = True model_disc.trainable = False - m1.compile(loss=lambda x,y: tf.keras.losses.binary_crossentropy(x,y, from_logits=True), optimizer=optimizer1) + m1.compile(loss=loss, optimizer=optimizer1) + m1.summary() - optimizer2 = tf.keras.optimizers.Adam(lr=1e-5) + #The discriminator model (adversarial) is optimized to distinguish between the true target and MLPF-reconstructed events + optimizer2 = tf.keras.optimizers.Adam(lr=1e-4) model_pf.trainable = False model_disc.trainable = True - m2.compile(loss=lambda x,y: tf.keras.losses.binary_crossentropy(x,y, from_logits=True), optimizer=optimizer2) + m2.compile(loss=loss, optimizer=optimizer2) + m2.summary() - epochs = 100 - from itertools import cycle - disc_or_pf = 2*[True] + 8*[False] - disc_or_pf = cycle(disc_or_pf) + epochs = 1000 for epoch in range(epochs): - loss_tot1 = 0.0 loss_tot2 = 0.0 loss_tot1_test = 0.0 loss_tot2_test = 0.0 - train_disc = next(disc_or_pf) - for step, (xb, yb, _) in enumerate(ds_train): + + for step, (xb, yb, wb) in tqdm(enumerate(ds_train), desc="Training"): yp = concat_pf(model_pf(xb)) - xb = tf.concat([xb, xb], axis=0) yid = tf.one_hot(tf.cast(yb[:, :, 0], tf.int32), cds["num_output_classes"]) yb = tf.concat([yid, yb[:, :, 1:]], axis=-1) yb = tf.concat([yb, yp], axis=0) + + #Train the MLPF reconstruction (generative) model with an inverted target + yt = tf.concat([batch_size*[1]], axis=0) + loss1 = m1.train_on_batch(xb, yt) + + xb = tf.concat([xb, xb], axis=0) + #Train the discriminative (adversarial) model + #true target particles have a classification target of 1, MLPF reconstructed a target of 0 yt = tf.concat([batch_size*[1], batch_size*[0]], axis=0) + loss2 = m2.train_on_batch([xb, yb], yt) - if train_disc: - loss_tot1 += m1.test_on_batch(xb, yt) - loss_tot2 += m2.train_on_batch([xb, yb], yt) - else: - yt = tf.concat([batch_size*[1], batch_size*[1]], axis=0) - loss_tot1 += m1.train_on_batch(xb, yt) - loss_tot2 += m2.test_on_batch([xb, yb], yt) + loss_tot1 += loss1 + loss_tot2 += loss2 import boost_histogram as bh import mplhep @@ -156,23 +179,29 @@ def main(config): preds_0 = [] preds_1 = [] - for step, (xb, yb, _) in enumerate(ds_test): + for step, (xb, yb, wb) in tqdm(enumerate(ds_test), desc="Testing"): yp = concat_pf(model_pf(xb)) - xb = tf.concat([xb, xb], axis=0) yid = tf.one_hot(tf.cast(yb[:, :, 0], tf.int32), cds["num_output_classes"]) yb = tf.concat([yid, yb[:, :, 1:]], axis=-1) yb = tf.concat([yb, yp], axis=0) - yt = tf.concat([batch_size*[1], batch_size*[0]], axis=0) - loss_tot1_test += m1.test_on_batch(xb, yt) - loss_tot2_test += m2.test_on_batch([xb, yb], yt) + yt = tf.concat([batch_size*[1]], axis=0) + loss1 = m1.test_on_batch(xb, yt) + + xb = tf.concat([xb, xb], axis=0) + yt = tf.concat([batch_size*[1], batch_size*[0]], axis=0) + loss2 = m2.test_on_batch([xb, yb], yt) p = 
m2.predict_on_batch([xb, yb]) preds_0 += list(p[yt==0, 0]) preds_1 += list(p[yt==1, 0]) + loss_tot1_test += loss1 + loss_tot2_test += loss2 + print("Epoch {}, l1={:.5E}/{:.5E}, l2={:.5E}/{:.5E}".format(epoch, loss_tot1, loss_tot1_test, loss_tot2, loss_tot2_test)) + #Draw histograms of the discriminator outputs for monitoring minval = np.min(preds_0 + preds_1) maxval = np.max(preds_0 + preds_1) h0 = bh.Histogram(bh.axis.Regular(50, minval, maxval)) @@ -181,8 +210,10 @@ def main(config): h1.fill(preds_1) fig = plt.figure(figsize=(4,4)) - mplhep.histplot(h0) - mplhep.histplot(h1) + mplhep.histplot(h0, label="MLPF") + mplhep.histplot(h1, label="Target") + plt.xlabel("Adversarial classification output") + plt.legend(loc="best", frameon=False) plt.savefig("logs/disc_{}.pdf".format(epoch), bbox_inches="tight") plt.close("all") @@ -195,5 +226,5 @@ def main(config): cb.on_epoch_end(epoch) if __name__ == "__main__": - config = yaml.load(open("parameters/cms-gnn-dense-lite.yaml")) + config = yaml.load(open("parameters/cms-gnn-dense-dev.yaml")) main(config) \ No newline at end of file diff --git a/mlpf/flatiron/find_lr_4GPUs.slurm b/mlpf/flatiron/find_lr_4GPUs.slurm new file mode 100644 index 000000000..ebf1bb569 --- /dev/null +++ b/mlpf/flatiron/find_lr_4GPUs.slurm @@ -0,0 +1,45 @@ +#!/bin/sh + +#SBATCH -t 1:00:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --gpus 4 +#SBATCH --constraint=a100 + +# Job name +#SBATCH -J find_lr + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 +nvidia-smi + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + +mkdir $TMPDIR/particleflow +rsync -ar --exclude={".git","experiments"} . $TMPDIR/particleflow +cd $TMPDIR/particleflow +if [ $? -eq 0 ] +then + echo "Successfully changed directory" +else + echo "Could not change directory" >&2 + exit 1 +fi + +python3 tf_list_gpus.py + +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py find-lr -c $1 + +cp lr_finder.jpg $SLURM_SUBMIT_DIR/ diff --git a/mlpf/flatiron/hypertune.slurm b/mlpf/flatiron/hypertune.slurm new file mode 100644 index 000000000..4c9596f73 --- /dev/null +++ b/mlpf/flatiron/hypertune.slurm @@ -0,0 +1,73 @@ +#!/bin/sh + +#SBATCH -t 168:00:00 +#SBATCH -N 4 +#SBATCH --exclusive +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --constraint=a100,sxm4 +#SBATCH --gpus-per-task=4 + +# Job name +#SBATCH -J hypertune + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + + +# Getting the node hostnames +nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +nodes_array=($nodes) + +echo $nodes + +head_node=${nodes_array[0]} +head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) + +# if we detect a space character in the head node IP, we'll +# convert it to an ipv4 address. This step is optional. 
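# After the address handling below, one KerasTuner chief is launched on the head node and one tuner per
# remaining node; run_chief.sh / run_tuner.sh export KERASTUNER_TUNER_ID, KERASTUNER_ORACLE_IP and
# KERASTUNER_ORACLE_PORT so that all tuners talk to the chief's oracle at $head_node_ip:$port.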
+if [[ "$head_node_ip" == *" "* ]]; then +IFS=' ' read -ra ADDR <<<"$head_node_ip" +if [[ ${#ADDR[0]} -gt 16 ]]; then + head_node_ip=${ADDR[1]} +else + head_node_ip=${ADDR[0]} +fi +echo "IPV6 address detected. We split the IPV4 address as $head_node_ip" +fi + +port=6379 +ip_head=$head_node_ip:$port +export ip_head +echo "IP Head: $ip_head" + +echo "Starting HEAD at $head_node" +srun --nodes=1 --ntasks=1 -w "$head_node" mlpf/hypertune_scripts/run_chief.sh "chief" $head_node_ip $port $1 "/mnt/ceph/users/ewulff/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}" &> logs_slurm/chief_${SLURM_JOB_ID} & +sleep 5 + +# number of nodes other than the head node +worker_num=$((SLURM_JOB_NUM_NODES - 1)) + +for ((i = 1; i <= worker_num; i++)); do + node_i=${nodes_array[$i]} + echo "Starting WORKER $i at $node_i" + tunerID="tuner$i" + srun --nodes=1 --ntasks=1 -w "$node_i" \ + mlpf/hypertune_scripts/run_tuner.sh $tunerID $head_node_ip $port $1 "/mnt/ceph/users/ewulff/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}" &> logs_slurm/tuner_${SLURM_JOB_ID}_${i} & + sleep 1 +done +wait # keep the wait statement, it is important +echo "Done." diff --git a/mlpf/flatiron/pipeline_evaluate_1GPU.slurm b/mlpf/flatiron/pipeline_evaluate_1GPU.slurm new file mode 100644 index 000000000..d3b2e6978 --- /dev/null +++ b/mlpf/flatiron/pipeline_evaluate_1GPU.slurm @@ -0,0 +1,36 @@ +#!/bin/sh + +# Walltime limit +#SBATCH -t 03:00:00 +#SBATCH -N 1 +#SBATCH --exclusive +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --gpus 1 +#SBATCH --constraint=a100 + +# Job name +#SBATCH -J pipeeval + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 +nvidia-smi + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + +python3 tf_list_gpus.py + +echo 'Starting evaluation.' +CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate -c $1 -t $2 +echo 'Evaluation done.' diff --git a/mlpf/flatiron/pipeline_train_4GPUs.slurm b/mlpf/flatiron/pipeline_train_4GPUs.slurm new file mode 100644 index 000000000..cbdb21308 --- /dev/null +++ b/mlpf/flatiron/pipeline_train_4GPUs.slurm @@ -0,0 +1,51 @@ +#!/bin/sh + +# Walltime limit +#SBATCH -t 168:00:00 +#SBATCH -N 1 +#SBATCH --exclusive +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --gpus-per-task=4 +#SBATCH --constraint=a100,sxm4 + +# Job name +#SBATCH -J pipetrain + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 +nvidia-smi + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + +mkdir $TMPDIR/particleflow +rsync -ar --exclude={".git","experiments"} . $TMPDIR/particleflow +cd $TMPDIR/particleflow +if [ $? -eq 0 ] +then + echo "Successfully changed directory" +else + echo "Could not change directory" >&2 + exit 1 +fi +mkdir experiments + +python3 tf_list_gpus.py + +echo 'Starting training.' +# Run the training of the base GNN model using e.g. 
4 GPUs in a data-parallel mode +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py train -c $1 -p $2 +echo 'Training done.' + +rsync -a experiments/ /mnt/ceph/users/ewulff/experiments/ diff --git a/mlpf/flatiron/raytune.sh b/mlpf/flatiron/raytune.sh new file mode 100755 index 000000000..6dfbb71bb --- /dev/null +++ b/mlpf/flatiron/raytune.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +#SBATCH -t 168:00:00 +#SBATCH -N 4 +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --constraint=a100,sxm4 +#SBATCH --gpus-per-task=4 +#SBATCH --cpus-per-task=64 + +# Job name +#SBATCH -J raytune + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +export TUNE_RESULT_DIR="/mnt/ceph/users/ewulff/ray_results/tune_result_dir" +export TUNE_MAX_PENDING_TRIALS_PG=${SLURM_NNODES} + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 +nvidia-smi + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + + +################# DON NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ############### +# This script is a modification to the implementation suggest by gregSchwartz18 here: +# https://github.com/ray-project/ray/issues/826#issuecomment-522116599 +redis_password=$(uuidgen) +export redis_password +echo "Redis password: ${redis_password}" + +nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names +nodes_array=( $nodes ) + +node_1=${nodes_array[0]} +ip=$(srun --nodes=1 --ntasks=1 -w $node_1 hostname --ip-address) # making redis-address +port=6379 +ip_head=$ip:$port +export ip_head +echo "IP Head: $ip_head" + +echo "STARTING HEAD at $node_1" +srun --nodes=1 --ntasks=1 -w $node_1 mlpf/flatiron/start-head.sh $ip $SLURM_JOB_ID $2 & +sleep 30 + +worker_num=$(($SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node +for (( i=1; i<=$worker_num; i++ )) +do + node_i=${nodes_array[$i]} + echo "STARTING WORKER $i at $node_i" + srun --nodes=1 --ntasks=1 -w $node_i mlpf/flatiron/start-worker.sh $ip_head $SLURM_JOB_ID $i $2 & + sleep 5 +done +############################################################################################## + +#### call your code below +python3 mlpf/pipeline.py raytune -c $1 -n $2 --cpus "${SLURM_CPUS_PER_TASK}" --gpus "${SLURM_GPUS_PER_TASK}" +exit diff --git a/mlpf/flatiron/start-head.sh b/mlpf/flatiron/start-head.sh new file mode 100755 index 000000000..6849d1d08 --- /dev/null +++ b/mlpf/flatiron/start-head.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 + +echo "starting ray head node" +# Launch the head node +mkdir -p "/mnt/ceph/users/ewulff/nvidia_smi_logs/$3_$2" +nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/$3_$2/head.csv" & +ray start --head --node-ip-address=$1 --port=6379 +sleep infinity diff --git a/mlpf/flatiron/start-worker.sh b/mlpf/flatiron/start-worker.sh new file mode 100755 index 000000000..14c8951b4 --- /dev/null +++ b/mlpf/flatiron/start-worker.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 + +echo "starting ray worker node" +mkdir -p "/mnt/ceph/users/ewulff/nvidia_smi_logs/$4_$2" +nvidia-smi 
--query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/$4_$2/worker_$3.csv" & +ray start --address $1 +sleep infinity diff --git a/mlpf/flatiron/train_4GPUs.slurm b/mlpf/flatiron/train_4GPUs.slurm new file mode 100644 index 000000000..6b9cfc3e3 --- /dev/null +++ b/mlpf/flatiron/train_4GPUs.slurm @@ -0,0 +1,50 @@ +#!/bin/sh + +# Walltime limit +#SBATCH -t 72:00:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --gpus 4 +#SBATCH --constraint=a100 + +# Job name +#SBATCH -J train + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 +nvidia-smi + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + +mkdir $TMPDIR/particleflow +rsync -ar --exclude={".git","experiments"} . $TMPDIR/particleflow +cd $TMPDIR/particleflow +if [ $? -eq 0 ] +then + echo "Successfully changed directory" +else + echo "Could not change directory" >&2 + exit 1 +fi + +python3 tf_list_gpus.py + +echo 'Starting training.' +# Run the training of the base GNN model using e.g. 4 GPUs in a data-parallel mode +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/launcher.py --action train --model-spec $1 +echo 'Training done.' +ls -l experiments + +rsync -a experiments/ /mnt/ceph/users/ewulff/experiments/ diff --git a/mlpf/flatiron/validate_4GPUs.slurm b/mlpf/flatiron/validate_4GPUs.slurm new file mode 100644 index 000000000..566a2d8a9 --- /dev/null +++ b/mlpf/flatiron/validate_4GPUs.slurm @@ -0,0 +1,50 @@ +#!/bin/sh + +# Walltime limit +#SBATCH -t 8:00:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --gpus 4 +#SBATCH --constraint=a100 + + +# Job name +#SBATCH -J eval + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 +nvidia-smi + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + +mkdir $TMPDIR/particleflow +rsync -ar --exclude=".git" . $TMPDIR/particleflow +cd $TMPDIR/particleflow +if [ $? -eq 0 ] +then + echo "Successfully changed directory" +else + echo "Could not change directory" >&2 + exit 1 +fi + +python3 tf_list_gpus.py + +echo 'Starting validation.' +#Run the validation to produce the predictions file +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/launcher.py --action eval --model-spec parameters/delphes-gnn-skipconn.yaml --weights $1 +echo 'Valdiation done.' 
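# $1 is the trained-weights checkpoint handed to launcher.py --weights (e.g. experiments/delphes-gnn-skipconn-*/weights-*.hdf5);
# the rsync below then copies the experiments/ output from the scratch copy of the repository back to /mnt/ceph.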
+ +rsync -a experiments/ /mnt/ceph/users/ewulff/experiments/ diff --git a/mlpf/hypertune_scripts/run_chief.sh b/mlpf/hypertune_scripts/run_chief.sh new file mode 100755 index 000000000..f91a8512f --- /dev/null +++ b/mlpf/hypertune_scripts/run_chief.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +export KERASTUNER_TUNER_ID=$1 +export KERASTUNER_ORACLE_IP=$2 +export KERASTUNER_ORACLE_PORT=$3 +echo "KERASTUNER_TUNER_ID:" +echo $KERASTUNER_TUNER_ID +echo "KERASTUNER_ORACLE_IP:" +echo $KERASTUNER_ORACLE_IP +echo "KERASTUNER_ORACLE_PORT:" +echo $KERASTUNER_ORACLE_PORT + + +nvidia-smi +echo 'Starting chief.' +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py hypertune -c $4 -o $5 +echo 'Chief done.' \ No newline at end of file diff --git a/mlpf/hypertune_scripts/run_tuner.sh b/mlpf/hypertune_scripts/run_tuner.sh new file mode 100755 index 000000000..efcf648ae --- /dev/null +++ b/mlpf/hypertune_scripts/run_tuner.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +export KERASTUNER_TUNER_ID=$1 +export KERASTUNER_ORACLE_IP=$2 +export KERASTUNER_ORACLE_PORT=$3 +echo "KERASTUNER_TUNER_ID:" +echo $KERASTUNER_TUNER_ID +echo "KERASTUNER_ORACLE_IP:" +echo $KERASTUNER_ORACLE_IP +echo "KERASTUNER_ORACLE_PORT:" +echo $KERASTUNER_ORACLE_PORT + + +nvidia-smi +echo 'Starting tuner.' +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py hypertune -c $4 -o $5 +echo 'Tuner done.' diff --git a/mlpf/juwels/hypertune.slurm b/mlpf/juwels/hypertune.slurm new file mode 100644 index 000000000..ecacda15f --- /dev/null +++ b/mlpf/juwels/hypertune.slurm @@ -0,0 +1,79 @@ +#!/bin/sh + +#SBATCH --account=prcoe12 +#SBATCH --partition=booster +#SBATCH --time 23:59:59 +#SBATCH --nodes 4 +#SBATCH --tasks-per-node=1 +#SBATCH --gres=gpu:4 + + +# Job name +#SBATCH -J hypertune + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load GCC/10.3.0 CUDA/11.0 cuDNN/8.0.2.39-CUDA-11.0 + +export CUDA_VISIBLE_DEVICES=0,1,2,3 +jutil env activate -p prcoe12 +nvidia-smi + +source /p/project/prcoe12/wulff1/miniconda3/bin/activate tf2 +echo "Python used:" +which python3 +python3 --version + + +# Getting the node hostnames +nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +nodes_array=($nodes) + +echo "Using nodes:" +echo $nodes + +head_node=${nodes_array[0]} +head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) + +# if we detect a space character in the head node IP, we'll +# convert it to an ipv4 address. This step is optional. +if [[ "$head_node_ip" == *" "* ]]; then +IFS=' ' read -ra ADDR <<<"$head_node_ip" +if [[ ${#ADDR[0]} -gt 16 ]]; then + head_node_ip=${ADDR[1]} +else + head_node_ip=${ADDR[0]} +fi +echo "IPV6 address detected. 
We split the IPV4 address as $head_node_ip" +fi + +port=6379 +ip_head=$head_node_ip:$port +export ip_head +echo "IP Head: $ip_head" + +echo "Starting HEAD at $head_node" +srun --nodes=1 --ntasks=1 -w "$head_node" mlpf/hypertune_scripts/run_chief.sh "chief" $head_node_ip $port $1 "/p/project/prcoe12/wulff1/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}" &> logs_slurm/chief_${SLURM_JOB_ID} & +sleep 5 + +# number of nodes other than the head node +worker_num=$((SLURM_JOB_NUM_NODES - 1)) + +for ((i = 1; i <= worker_num; i++)); do + node_i=${nodes_array[$i]} + echo "Starting WORKER $i at $node_i" + tunerID="tuner$i" + srun --nodes=1 --ntasks=1 -w "$node_i" \ + mlpf/hypertune_scripts/run_tuner.sh $tunerID $head_node_ip $port $1 "/p/project/prcoe12/wulff1/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}" &> logs_slurm/tuner_${SLURM_JOB_ID}_${i} & + sleep 1 +done +wait # keep the wait statement, it is important! +echo "Done." diff --git a/mlpf/juwels/pipeline_evaluate.slurm b/mlpf/juwels/pipeline_evaluate.slurm new file mode 100644 index 000000000..fd2beb45b --- /dev/null +++ b/mlpf/juwels/pipeline_evaluate.slurm @@ -0,0 +1,40 @@ +#!/bin/sh + +#SBATCH --account=prcoe12 +#SBATCH --partition=booster +#SBATCH --time 01:59:59 +#SBATCH --nodes 1 +#SBATCH --tasks-per-node=1 +#SBATCH --gres=gpu:4 + +# Job name +#SBATCH -J pipeeval + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load GCC/10.3.0 CUDA/11.0 cuDNN/8.0.2.39-CUDA-11.0 + +export CUDA_VISIBLE_DEVICES=0 + +jutil env activate -p prcoe12 +nvidia-smi + +source /p/project/prcoe12/wulff1/miniconda3/bin/activate tf2 +echo "Python used:" +which python3 +python3 --version + +python3 tf_list_gpus.py +echo 'Starting evaluation.' +CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate -c $1 -t $2 +echo 'Evaluation done.' + +rsync -a experiments/ $SLURM_SUBMIT_DIR/experiments/ diff --git a/mlpf/juwels/pipeline_train.slurm b/mlpf/juwels/pipeline_train.slurm new file mode 100644 index 000000000..7482264ca --- /dev/null +++ b/mlpf/juwels/pipeline_train.slurm @@ -0,0 +1,52 @@ +#!/bin/sh + +#SBATCH --account=prcoe12 +#SBATCH --partition=booster +#SBATCH --time 23:59:59 +#SBATCH --nodes 1 +#SBATCH --tasks-per-node=1 +#SBATCH --gres=gpu:4 + +# Job name +#SBATCH -J pipetrain + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load GCC/10.3.0 CUDA/11.0 cuDNN/8.0.2.39-CUDA-11.0 + +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +jutil env activate -p prcoe12 + +nvidia-smi + +source /p/project/prcoe12/wulff1/miniconda3/bin/activate tf2 +echo "Python used:" +which python3 +python3 --version + +mkdir $SCRATCH/particleflow +rsync -ar --exclude={".git","experiments"} . $SCRATCH/particleflow/ +cd $SCRATCH/particleflow +if [ $? -eq 0 ] +then + echo "Successfully changed directory" +else + echo "Could not change directory" >&2 + exit 1 +fi + +python3 tf_list_gpus.py +echo 'Starting training.' +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py train -c $1 -p $2 +echo 'Training done.' 
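# the job ran from a copy of the repository in $SCRATCH, so the experiments/ output is copied back to the
# submission directory below before the job ends.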
+ +rsync -a experiments/ $SLURM_SUBMIT_DIR/experiments/ diff --git a/mlpf/juwels/train_mlpf.slurm b/mlpf/juwels/train_mlpf.slurm new file mode 100644 index 000000000..bdd06723c --- /dev/null +++ b/mlpf/juwels/train_mlpf.slurm @@ -0,0 +1,52 @@ +#!/bin/sh + +#SBATCH --account=prcoe12 +#SBATCH --partition=booster +#SBATCH --time 23:59:59 +#SBATCH --nodes 1 +#SBATCH --tasks-per-node=1 +#SBATCH --gres=gpu:4 + +# Job name +#SBATCH -J mlpf_tr + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load GCC/10.3.0 CUDA/11.0 cuDNN/8.0.2.39-CUDA-11.0 + +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +jutil env activate -p prcoe12 + +nvidia-smi + +source /p/project/prcoe12/wulff1/miniconda3/bin/activate tf2 +echo "Python used:" +which python3 +python3 --version + +mkdir $SCRATCH/particleflow +rsync -ar --exclude=".git" . $SCRATCH/particleflow/ +cd $SCRATCH/particleflow +if [ $? -eq 0 ] +then + echo "Successfully changed directory" +else + echo "Could not change directory" >&2 + exit 1 +fi + +python3 tf_list_gpus.py +echo 'Starting training.' +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/launcher.py --action train --model-spec $1 +echo 'Training done.' + +rsync -a experiments/ $SLURM_SUBMIT_DIR/experiments/ diff --git a/mlpf/launcher.py b/mlpf/launcher.py deleted file mode 100644 index 4f2eb0674..000000000 --- a/mlpf/launcher.py +++ /dev/null @@ -1,57 +0,0 @@ -import yaml -import tfmodel -import tfmodel.model_setup - -def load_config(yaml_path): - with open(yaml_path) as f: - config = yaml.load(f) - return config - -def parse_args(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--model-spec", type=str, default="parameters/delphes-gnn-skipconn.yaml", help="the model specification") - parser.add_argument("--action", type=str, choices=["data", "train", "eval", "time"], help="Run training, validation or timing", default="train") - parser.add_argument("--modifier", type=str, choices=["retrain_energy", None], help="Apply a modification on the standard training", default=None) - parser.add_argument("--weights", type=str, help="weight file to load", default=None) - parser.add_argument("--ntrain", type=int, help="override the number of training events", default=None) - parser.add_argument("--ntest", type=int, help="override the number of testing events", default=None) - parser.add_argument("--recreate", action="store_true", help="recreate a new output dir", default=None) - parser.add_argument("--raw-path", type=str, help="Override the dataset raw files path", default=None) - parser.add_argument("--processed-path", type=str, help="Override the dataset processed files path", default=None) - args = parser.parse_args() - return args - -def apply_modifier_retrain_energy(config): - assert(config["parameters"]["model"] == "gnn_dense") - config["setup"]["trainable"] = "ffn_momentum4" - for loss in [ - "classification_loss_coef", - "charge_loss_coef", - "pt_loss_coef", - "eta_loss_coef", - "sin_phi_loss_coef", - "cos_phi_loss_coef"]: - config["dataset"][loss] = 0.0 - - config["dataset"]["energy_loss_coef"] = 1.0 - config["setup"]["batch_size"] = 20 - return config - -modifiers = { - "retrain_energy": apply_modifier_retrain_energy -} - -if __name__ == "__main__": - args = parse_args() - yaml_path = args.model_spec - - config = 
load_config(yaml_path) - - if args.modifier: - config = modifiers[args.modifier](config) - - if config["backend"] == "tensorflow": - tfmodel.model_setup.main(args, yaml_path, config) - elif config["backend"] == "pytorch": - pass diff --git a/mlpf/lrp/__init__.py b/mlpf/lrp/__init__.py new file mode 100644 index 000000000..d59653ee0 --- /dev/null +++ b/mlpf/lrp/__init__.py @@ -0,0 +1,8 @@ +from lrp.args import parse_args +from lrp.plots import make_heatmaps +from lrp.model_io import model_io +from lrp.model_lrp import PFNet7 +from lrp.gravnet_lrp import GravNetConv + +from lrp.lrp_clf_gpu import lrp_clf +from lrp.lrp_reg_gpu import lrp_reg diff --git a/mlpf/lrp/args.py b/mlpf/lrp/args.py new file mode 100644 index 000000000..61b0fed9e --- /dev/null +++ b/mlpf/lrp/args.py @@ -0,0 +1,91 @@ +import argparse +from math import inf + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--lrp_dataset_qcd", type=str, default='../test_tmp_delphes/data/pythia8_qcd', help="dataset path", required=True) + parser.add_argument("--lrp_outpath", type=str, default = '../test_tmp_delphes/experiments/lrp/', help="Output folder for the lrp relevance scores and heatmaps", required=True) + + # usual specs + parser.add_argument("--n_test", type=int, default=2, help="number of data files to use for testing lrp.. each file contains 100 events") + parser.add_argument("--batch_size", type=int, default=1, help="Number of .pt files to load in parallel") + + parser.add_argument("--hidden_dim", type=int, default=256, help="hidden dimension") + parser.add_argument("--hidden_dim_nn1", type=int, default=64, help="hidden dimension") + parser.add_argument("--input_encoding", type=int, default=12, help="use an input encoding layer") + parser.add_argument("--encoding_dim", type=int, default=64, help="encoded element dimension") + parser.add_argument("--space_dim", type=int, default=4, help="Spatial dimension for clustering in gravnet layer") + parser.add_argument("--propagate_dimensions", type=int, default=22, help="The number of features to be propagated between the vertices") + parser.add_argument("--nearest", type=int, default=16, help="k nearest neighbors in gravnet layer") + + # extras for lrp + parser.add_argument("--explain", action=BoolArg, default=True, help="General setup mode: if True then you want to explain.. if False then you will load an already explained model (already made R-scores)..") + parser.add_argument("--lrp_reg", action=BoolArg, default=True, help="Works only if --explain is True.. Runs lrp for interpreting the regression part..") + parser.add_argument("--lrp_clf", action=BoolArg, default=True, help="Works only if --explain is True.. 
Runs lrp for interpreting the classification part..") + + parser.add_argument("--lrp_load_model", type=str, default="/PFNet7_cand_ntrain_2", help="Loads the model to explain (or only make_heatmaps)", required=False) + parser.add_argument("--lrp_load_epoch", type=int, default=0, help="Loads the epoch after which to explain (or only make_heatmaps)") + + parser.add_argument("--make_heatmaps_reg", action=BoolArg, default=True, help="Constructs heatmaps for the regressed p4 (must run with explain=True or else you load a pre-explained model with explain=False)..") + parser.add_argument("--make_heatmaps_clf", action=BoolArg, default=True, help="Constructs heatmaps for the classified pid (must run with explain=True or else you load a pre-explained model with explain=False)..") + + args = parser.parse_args() + + return args + + +class BoolArg(argparse.Action): + """ + Take an argparse argument that is either a boolean or a string and return a boolean. + """ + def __init__(self, default=None, nargs=None, *args, **kwargs): + if nargs is not None: + raise ValueError("nargs not allowed") + + # Set default + if default is None: + raise ValueError("Default must be set!") + + default = _arg_to_bool(default) + + super().__init__(*args, default=default, nargs='?', **kwargs) + + def __call__(self, parser, namespace, argstring, option_string): + + if argstring is not None: + # If called with an argument, convert to bool + argval = _arg_to_bool(argstring) + else: + # BoolArg will invert default option + argval = True + + setattr(namespace, self.dest, argval) + +def _arg_to_bool(arg): + # Convert argument to boolean + + if type(arg) is bool: + # If argument is bool, just return it + return arg + + elif type(arg) is str: + # If string, convert to true/false + arg = arg.lower() + if arg in ['true', 't', '1']: + return True + elif arg in ['false', 'f', '0']: + return False + else: + return ValueError('Could not parse a True/False boolean') + else: + raise ValueError('Input must be boolean or string! {}'.format(type(arg))) + + +# From https://stackoverflow.com/questions/12116685/how-can-i-require-my-python-scripts-argument-to-be-a-float-between-0-0-1-0-usin +class Range(object): + def __init__(self, start, end): + self.start = start + self.end = end + def __eq__(self, other): + return self.start <= other <= self.end diff --git a/mlpf/lrp/gravnet_lrp.py b/mlpf/lrp/gravnet_lrp.py new file mode 100644 index 000000000..67da14e1b --- /dev/null +++ b/mlpf/lrp/gravnet_lrp.py @@ -0,0 +1,125 @@ +from typing import Optional, Union +from torch_geometric.typing import OptTensor, PairTensor, PairOptTensor + +import torch +from torch import Tensor +from torch.nn import Linear +from torch_scatter import scatter +from torch_geometric.nn.conv import MessagePassing + +try: + from torch_cluster import knn +except ImportError: + knn = None + +# copied it from pytorch_geometric source code +# ADDED: retrieve edge_index, retrieve edge_weight +# CHANGED: used reduce='sum' instead of reduce='mean' in the message passing +# REMOVED: skip connection + +# ADDED: retrieve before and after message MessagePassing + +class GravNetConv(MessagePassing): + r"""The GravNet operator from the `"Learning Representations of Irregular + Particle-detector Geometry with Distance-weighted Graph + Networks" `_ paper, where the graph is + dynamically constructed using nearest neighbors. + The neighbors are constructed in a learnable low-dimensional projection of + the feature space. 
+ A second projection of the input feature space is then propagated from the + neighbors to each vertex using distance weights that are derived by + applying a Gaussian function to the distances. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + space_dimensions (int): The dimensionality of the space used to + construct the neighbors; referred to as :math:`S` in the paper. + propagate_dimensions (int): The number of features to be propagated + between the vertices; referred to as :math:`F_{\textrm{LR}}` in the + paper. + k (int): The number of nearest neighbors. + num_workers (int): Number of workers to use for k-NN computation. + Has no effect in case :obj:`batch` is not :obj:`None`, or the input + lies on the GPU. (default: :obj:`1`) + **kwargs (optional): Additional arguments of + :class:`torch_geometric.nn.conv.MessagePassing`. + """ + def __init__(self, in_channels: int, out_channels: int, + space_dimensions: int, propagate_dimensions: int, k: int, + num_workers: int = 1, **kwargs): + super(GravNetConv, self).__init__(flow='target_to_source', **kwargs) + + if knn is None: + raise ImportError('`GravNetConv` requires `torch-cluster`.') + + self.in_channels = in_channels + self.out_channels = out_channels + self.k = k + self.num_workers = num_workers + + self.lin_s = Linear(in_channels, space_dimensions) + self.lin_h = Linear(in_channels, propagate_dimensions) + self.lin_p = Linear(propagate_dimensions, out_channels) + + self.reset_parameters() + + def reset_parameters(self): + self.lin_s.reset_parameters() + self.lin_h.reset_parameters() + self.lin_p.reset_parameters() + + + def forward( + self, x: Union[Tensor, PairTensor], + batch: Union[OptTensor, Optional[PairTensor]] = None) -> Tensor: + """""" + + is_bipartite: bool = True + if isinstance(x, Tensor): + x: PairTensor = (x, x) + is_bipartite = False + assert x[0].dim() == 2, 'Static graphs not supported in `GravNetConv`.' + + b: PairOptTensor = (None, None) + if isinstance(batch, Tensor): + b = (batch, batch) + elif isinstance(batch, tuple): + assert batch is not None + b = (batch[0], batch[1]) + + h_l: Tensor = self.lin_h(x[0]) + + s_l: Tensor = self.lin_s(x[0]) + s_r: Tensor = self.lin_s(x[1]) if is_bipartite else s_l + + edge_index = knn(s_l, s_r, self.k, b[0], b[1], + num_workers=self.num_workers) + + edge_weight = (s_l[edge_index[1]] - s_r[edge_index[0]]).pow(2).sum(-1) + edge_weight = torch.exp(-10. 
* edge_weight) # 10 gives a better spread + + # propagate_type: (x: OptPairTensor, edge_weight: OptTensor) + out = self.propagate(edge_index, x=(h_l, None), + edge_weight=edge_weight, + size=(s_l.size(0), s_r.size(0))) + + return self.lin_p(out), edge_index, edge_weight, out, h_l + + + def message(self, x_j: Tensor, edge_weight: Tensor) -> Tensor: + return x_j * edge_weight.unsqueeze(1) + + def aggregate(self, inputs: Tensor, index: Tensor, + dim_size: Optional[int] = None) -> Tensor: + out_mean = scatter(inputs, index, dim=self.node_dim, dim_size=dim_size, + reduce='sum') + out_max = scatter(inputs, index, dim=self.node_dim, dim_size=dim_size, + reduce='max') + # return torch.cat([out_mean, out_max], dim=-1) + return out_mean + + def __repr__(self): + return '{}({}, {}, k={})'.format(self.__class__.__name__, + self.in_channels, self.out_channels, + self.k) diff --git a/mlpf/lrp/lrp_clf_gpu.py b/mlpf/lrp/lrp_clf_gpu.py new file mode 100644 index 000000000..831a8fba7 --- /dev/null +++ b/mlpf/lrp/lrp_clf_gpu.py @@ -0,0 +1,251 @@ +import torch +import torch.nn as nn +from torch.nn import Sequential as Seq,Linear,ReLU,BatchNorm1d +from torch_scatter import scatter_mean +import numpy as np +import json +from torch_geometric.utils import to_scipy_sparse_matrix +import scipy +import pickle, math, time +import _pickle as cPickle +from sys import getsizeof +from tqdm import tqdm + +from torch_geometric.data import Data +import networkx as nx +from torch_geometric.utils.convert import to_networkx +from torch_geometric.utils import to_dense_adj + +import lrp + +class lrp_clf: + EPSILON=1e-9 + + def __init__(self, device, model:lrp.model_io): + self.device=device + self.model=model + + def register_model(model:lrp.model_io): + self.model=model + + """ + lrp rules + """ + + # this rule is wrong.. 
it is just here because it is much quicker for experimentation and gives the correct dimensions needed for debugging (if you haven't hit the message passing step) + @staticmethod + def easy_rule(self, layer, input, R, index,output_layer, activation_layer, print_statement): + EPSILON=1e-9 + # input.retain_grad() + # z = layer.forward(input) + # basically layer.forward does this: output=(torch.matmul(input,torch.transpose(w,0,1))+b) , assuming the following w & b are retrieved + + if activation_layer: + w = torch.eye(input.shape[1]).to(self.device) + else: + w = layer.weight.detach().to(self.device) + + if output_layer: # for the output layer + T, W, r = [], [], [] + + for i in range(R.shape[1]): + T.append(R[:,i].reshape(-1,1).to(self.device)) + W.append(w[i,:].reshape(1,-1).to(self.device)) + I = torch.ones_like(R[:,i]).reshape(-1,1).to(self.device) + + Numerator = (input*torch.matmul(T[i],W[i])) + Denominator = (input*torch.matmul(I,W[i])).sum(axis=1) + + Denominator = Denominator.reshape(-1,1).expand(Denominator.size()[0],Numerator.size()[1]) + r.append(torch.abs(Numerator / (Denominator+EPSILON*torch.sign(Denominator)))) + + print('- Finished computing R-scores') + return r + else: + for i in range(len(R)): + I = torch.ones_like(R[i]) + + Numerator = (input*torch.matmul(R[i],w)) + Denominator = (input*torch.matmul(I,w)).sum(axis=1) + + Denominator = Denominator.reshape(-1,1).expand(Denominator.size()[0],Numerator.size()[1]) + R[i]=(torch.abs(Numerator / (Denominator+EPSILON*torch.sign(Denominator)))) + + print('- Finished computing R-scores') + return R + + + @staticmethod + def eps_rule(self, layer, input, R, index, output_layer, activation_layer, print_statement, adjacency_matrix=None, message_passing=False): + + if activation_layer: + w = torch.eye(input.shape[1]).detach().to(self.device) + elif message_passing: # message passing hack + w = adjacency_matrix.detach().to(self.device) + else: + w = layer.weight.detach().to(self.device) + + wt = torch.transpose(w,0,1) + + if output_layer: + R_list = [None]*R.shape[1] + Wt = [None]*R.shape[1] + for output_neuron in range(R.shape[1]): + R_list[output_neuron] = (R[:,output_neuron].reshape(-1,1).clone()) + Wt[output_neuron] = (wt[:,output_neuron].reshape(-1,1)) + else: + R_list = R + Wt = [wt]*len(R_list) + + R_previous=[None]*len(R_list) + + for output_neuron in range(len(R_list)): + + if message_passing: # message passing hack + R_list[output_neuron] = torch.transpose(R_list[output_neuron],0,1) + + # rep stands for repeated/expanded + a_rep = input.reshape(input.shape[0],input.shape[1],1).expand(-1,-1,R_list[output_neuron].shape[1]).to(self.device) + wt_rep = Wt[output_neuron].reshape(1,Wt[output_neuron].shape[0],Wt[output_neuron].shape[1]).expand(input.shape[0],-1,-1).to(self.device) + + H = a_rep*wt_rep + deno = H.sum(axis=1).reshape(H.sum(axis=1).shape[0],1,H.sum(axis=1).shape[1]).expand(-1,input.shape[1],-1) + + G = H/deno + + R_previous[output_neuron] = (torch.matmul(G, R_list[output_neuron].reshape(R_list[output_neuron].shape[0],R_list[output_neuron].shape[1],1).to(self.device))) + R_previous[output_neuron] = R_previous[output_neuron].reshape(R_previous[output_neuron].shape[0], R_previous[output_neuron].shape[1]).to('cpu') + + if message_passing: # message passing hack + R_previous[output_neuron] = torch.transpose(R_previous[output_neuron],0,1) + + if print_statement: + print('- Finished computing R-scores') + if message_passing: + if (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), 
R_list[output_neuron].to('cpu').sum(axis=1))): + print('- R score is conserved up to relative tolerance 1e-5') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-4)): + print('- R score is conserved up to relative tolerance 1e-4') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-3)): + print('- R score is conserved up to relative tolerance 1e-3') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-2)): + print('- R score is conserved up to relative tolerance 1e-2') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-1)): + print('- R score is conserved up to relative tolerance 1e-1') + else: + if (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1))): + print('- R score is conserved up to relative tolerance 1e-5') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-4)): + print('- R score is conserved up to relative tolerance 1e-4') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-3)): + print('- R score is conserved up to relative tolerance 1e-3') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-2)): + print('- R score is conserved up to relative tolerance 1e-2') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-1)): + print('- R score is conserved up to relative tolerance 1e-1') + + return R_previous + + @staticmethod + def message_passing_rule(self, layer, input, R, big_list, edge_index, edge_weight, after_message, before_message, index, outpath, load_model): + + # first time you hit message passing: construct and start filling the big tensor from scratch + if len(big_list)==0: + big_list = [[torch.zeros(R[0].shape[0],R[0].shape[1]) for i in range(len(R))] for i in range(R[0].shape[0])] + print('- Finished allocating memory for the big tensor of R-scores for all nodes') + + for node_i in range(len(big_list)): + for output_neuron in range(len(big_list[0])): + big_list[node_i][output_neuron][node_i] = R[output_neuron][node_i] + print('- Finished initializing the big tensor') + + # build the adjacency matrix + A = to_dense_adj(edge_index, edge_attr=edge_weight)[0] # adjacency matrix + + if torch.allclose(torch.matmul(A, before_message), after_message, rtol=1e-3): + print("- Adjacency matrix is correctly computed") + + # # the following saves a version of the R-scores before the message passing + # torch.save(big_list, outpath + '/lrp/R_score_layer_before_msg_passing.pt') + + # modify the big tensor based on message passing rule + for node_i in tqdm(range(len(big_list))): + big_list[node_i] = self.eps_rule(self, layer, torch.transpose(before_message,0,1), big_list[node_i], index, output_layer=False, activation_layer=False, print_statement=True, adjacency_matrix=A, message_passing=True) + print(f'- Finished computing R-score for node {node_i+1}/{len(big_list)} for the message passing..') + print('- Finished computing R-scores for the message passing layer') + return big_list + + + """ + explanation functions + """ + + def explain(self, to_explain): + + 
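        # Starting from the classification output, walk backwards through the network and redistribute the
        # relevance R onto each previous layer: eps_rule handles Linear and activation layers, the GravNet
        # message-passing step (reached via the conv1.lin_s layer) is handled by message_passing_rule, and the
        # DNN3 regression layers are skipped since this class only explains the classification part
        # (see explain_single_layer below).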
start_index = self.model.n_layers ########################## + outpath = to_explain["outpath"]+'/'+to_explain["load_model"] + + print('Total number of layers (including activation layers):', start_index) + + # store the R-scores for the output layer (they are basically the model predictions) + torch.save(to_explain["pred_id"].detach(), outpath + f'/lrp/R_score_layer{start_index+1}.pt') + + ### loop over each single layer + big_list = [] + output_layer_index = start_index+1 + for index in range(start_index+1, 1,-1): + if index==start_index+1: + R, big_list, output_layer_index = self.explain_single_layer(to_explain["pred_id"].detach(), to_explain, big_list, outpath, output_layer_index, index) + else: + R, big_list, output_layer_index = self.explain_single_layer(R, to_explain, big_list, outpath, output_layer_index, index) + print("Finished explaining all layers.") + return big_list # returns the heatmaps for layer0 (i.e. input features) + + + def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_index, index=None, name=None): + + layer = self.model.get_layer(index=index,name=name) + + if name is None: + name = self.model.index2name(index) + if index is None: + index = self.model.name2index(name) + + input = to_explain['A'][name].detach() + + # skip the last DNN because this is only for classification + if 'nn3' in str(name): + print('Skipping a DNN3 regression layer..') + output_layer_index = output_layer_index-1 + return R, big_list, output_layer_index + + if index==output_layer_index: + output_layer_bool = True + else: + output_layer_bool = False + + # it works out of the box that the conv1.lin_s layer which we don't care about is in the same place of the message passing.. so we can just replace its action + if 'conv1.lin_s' in str(name): + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: Message Passing") + big_list = self.message_passing_rule(self, layer, input, R, big_list, to_explain["edge_index"].detach(), to_explain["edge_weight"].detach(), to_explain["after_message"].detach(), to_explain["before_message"].detach(), index, outpath, to_explain["load_model"]) + return R, big_list, output_layer_index + + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer}") + + if len(big_list)==0: # if you haven't hit the message passing step yet + if 'Linear' in str(layer): + R = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) + elif 'LeakyReLU' or 'ELU' in str(layer): + R = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=True, print_statement=True) + else: + for node_i in tqdm(range(len(big_list))): + if 'Linear' in str(layer): + big_list[node_i] = self.eps_rule(self, layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) + elif 'LeakyReLU' or 'ELU' in str(layer): + big_list[node_i] = self.eps_rule(self, layer, input, big_list[node_i], index, output_layer_bool, activation_layer=True, print_statement=False) + return R, big_list, output_layer_index + +##----------------------------------------------------------------------------- +# # big_list is a list of length 5k +# # each element is another list of length 6 (corresponding to each of the output pid probability prediction) +# # each element of that second list is a tensor of shape (5k,x) where x is the dimension of the latent space diff --git a/mlpf/lrp/lrp_reg_gpu.py b/mlpf/lrp/lrp_reg_gpu.py new file mode 100644 index 
000000000..9f3ca31ab --- /dev/null +++ b/mlpf/lrp/lrp_reg_gpu.py @@ -0,0 +1,311 @@ +import torch +import torch.nn as nn +from torch.nn import Sequential as Seq,Linear,ReLU,BatchNorm1d +from torch_scatter import scatter_mean +import numpy as np +import json +from torch_geometric.utils import to_scipy_sparse_matrix +import scipy +import pickle, math, time +import _pickle as cPickle +from sys import getsizeof +from tqdm import tqdm + +from torch_geometric.data import Data +import networkx as nx +from torch_geometric.utils.convert import to_networkx +from torch_geometric.utils import to_dense_adj + +import lrp + +class lrp_reg: + EPSILON=1e-9 + + def __init__(self, device, model:lrp.model_io): + self.device=device + self.model=model + + def register_model(model:lrp.model_io): + self.model=model + + """ + lrp rules + """ + + # this rule is wrong.. it is just here because it is much quicker for experimentation and gives the correct dimensions needed for debugging (if you haven't hit the message passing step) + @staticmethod + def easy_rule(self, layer, input, R, index, output_layer, activation_layer, print_statement, skip_connection=False, adjacency_matrix=False, message_passing=False): + EPSILON=1e-9 + # input.retain_grad() + # z = layer.forward(input) + # basically layer.forward does this: output=(torch.matmul(input,torch.transpose(w,0,1))+b) , assuming the following w & b are retrieved + + if activation_layer: + w = torch.eye(input.shape[1]).to(self.device) + else: + w = layer.weight.detach().to(self.device) + + if output_layer: # for the output layer + T, W, r = [], [], [] + + for i in range(R.shape[1]): + T.append(R[:,i].reshape(-1,1).to(self.device)) + W.append(w[i,:].reshape(1,-1).to(self.device)) + I = torch.ones_like(R[:,i]).reshape(-1,1).to(self.device) + + Numerator = (input*torch.matmul(T[i],W[i])) + Denominator = (input*torch.matmul(I,W[i])).sum(axis=1) + + Denominator = Denominator.reshape(-1,1).expand(Denominator.size()[0],Numerator.size()[1]) + r.append(torch.abs(Numerator / (Denominator+EPSILON*torch.sign(Denominator)))) + + if print_statement: + print('- Finished computing R-scores') + return r + else: + for i in range(len(R)): + I = torch.ones_like(R[i]) + + Numerator = (input*torch.matmul(R[i],w)) + Denominator = (input*torch.matmul(I,w)).sum(axis=1) + + Denominator = Denominator.reshape(-1,1).expand(Denominator.size()[0],Numerator.size()[1]) + R[i] = (torch.abs(Numerator / (Denominator+EPSILON*torch.sign(Denominator)))) + + if skip_connection: + input_relevance, pid_relevance, embedding_relevance = [None]*len(R), [None]*len(R), [None]*len(R) + for output_neuron in range(len(R)): + input_relevance[output_neuron] = R[output_neuron][:,:12] + pid_relevance[output_neuron] = R[output_neuron][:,12:18] + embedding_relevance[output_neuron] = R[output_neuron][:,18:] + return input_relevance, pid_relevance, embedding_relevance + + if print_statement: + print('- Finished computing R-scores') + return R + + + @staticmethod + def eps_rule(self, layer, input, R, index, output_layer, activation_layer, print_statement, skip_connection=False, adjacency_matrix=None, message_passing=False): + + if activation_layer: + w = torch.eye(input.shape[1]).detach().to(self.device) + elif message_passing: # message passing hack + w = adjacency_matrix.detach().to(self.device) + else: + w = layer.weight.detach().to(self.device) + + wt = torch.transpose(w,0,1) + + if output_layer: + R_list = [None]*R.shape[1] + Wt = [None]*R.shape[1] + for output_neuron in range(R.shape[1]): + R_list[output_neuron] = 
(R[:,output_neuron].reshape(-1,1).clone()) + Wt[output_neuron] = (wt[:,output_neuron].reshape(-1,1)) + else: + R_list = R + Wt = [wt]*len(R_list) + + R_previous=[None]*len(R_list) + + for output_neuron in range(len(R_list)): + + if message_passing: # message passing hack + R_list[output_neuron] = torch.transpose(R_list[output_neuron],0,1) + + # rep stands for repeated/expanded + a_rep = input.reshape(input.shape[0],input.shape[1],1).expand(-1,-1,R_list[output_neuron].shape[1]).to(self.device) + wt_rep = Wt[output_neuron].reshape(1,Wt[output_neuron].shape[0],Wt[output_neuron].shape[1]).expand(input.shape[0],-1,-1).to(self.device) + + H = a_rep*wt_rep + deno = H.sum(axis=1).reshape(H.sum(axis=1).shape[0],1,H.sum(axis=1).shape[1]).expand(-1,input.shape[1],-1) + + G = H/deno + + R_previous[output_neuron] = (torch.matmul(G, R_list[output_neuron].reshape(R_list[output_neuron].shape[0],R_list[output_neuron].shape[1],1).to(self.device))) + R_previous[output_neuron] = R_previous[output_neuron].reshape(R_previous[output_neuron].shape[0], R_previous[output_neuron].shape[1]).to('cpu') + + if message_passing: # message passing hack + R_previous[output_neuron] = torch.transpose(R_previous[output_neuron],0,1) + + if print_statement: + print('- Finished computing R-scores') + if message_passing: + if (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1))): + print('- R score is conserved up to relative tolerance 1e-5') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-4)): + print('- R score is conserved up to relative tolerance 1e-4') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-3)): + print('- R score is conserved up to relative tolerance 1e-3') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-2)): + print('- R score is conserved up to relative tolerance 1e-2') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-1)): + print('- R score is conserved up to relative tolerance 1e-1') + else: + if (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1))): + print('- R score is conserved up to relative tolerance 1e-5') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-4)): + print('- R score is conserved up to relative tolerance 1e-4') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-3)): + print('- R score is conserved up to relative tolerance 1e-3') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-2)): + print('- R score is conserved up to relative tolerance 1e-2') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-1)): + print('- R score is conserved up to relative tolerance 1e-1') + + if skip_connection: + input_relevance, pid_relevance, embedding_relevance = [None]*len(R_list), [None]*len(R_list), [None]*len(R_list) + for output_neuron in range(len(R_list)): + input_relevance[output_neuron] = R_previous[output_neuron][:,:12] + pid_relevance[output_neuron] = R_previous[output_neuron][:,12:18] + 
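                # this column split mirrors how the regression head input was built in model_lrp.py
                # (nn3_input = torch.cat([x0, pred_ids, x], axis=-1)):
                #   [:, :12]   -> relevance of the 12 raw input features      (input_relevance)
                #   [:, 12:18] -> relevance of the 6 one-hot pid predictions  (pid_relevance)
                #   [:, 18:]   -> relevance of the latent GravNet embedding   (embedding_relevance)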
embedding_relevance[output_neuron] = R_previous[output_neuron][:,18:] + return input_relevance, pid_relevance, embedding_relevance + + return R_previous + + @staticmethod + def message_passing_rule(self, layer, input, R, big_list, edge_index, edge_weight, after_message, before_message, index, outpath, load_model): + + # first time you hit message passing: construct and start filling the big tensor from scratch + if len(big_list)==0: + # big_list = [[torch.zeros(R[0].shape[0],R[0].shape[1])]*len(R)]*R[0].shape[0] # this is wrong but it's faster for debugging (the correct way is the following line) + big_list = [[torch.zeros(R[0].shape[0],R[0].shape[1]) for i in range(len(R))] for i in range(R[0].shape[0])] + print('- Finished allocating memory for the big tensor of R-scores for all nodes') + + for node_i in range(len(big_list)): + for output_neuron in range(len(big_list[0])): + big_list[node_i][output_neuron][node_i] = R[output_neuron][node_i] + print('- Finished initializing the big tensor') + + # build the adjacency matrix + A = to_dense_adj(edge_index, edge_attr=edge_weight)[0] # adjacency matrix + + if torch.allclose(torch.matmul(A, before_message), after_message, rtol=1e-3): + print("- Adjacency matrix is correctly computed") + + # # the following saves a version of the R-scores before the message passing + # torch.save(big_list, outpath + '/lrp/R_score_layer_before_msg_passing.pt') + + # modify the big tensor based on message passing rule + for node_i in tqdm(range(len(big_list))): + big_list[node_i] = self.eps_rule(self, layer, torch.transpose(before_message,0,1), big_list[node_i], index, output_layer=False, activation_layer=False, print_statement=True, skip_connection=False, adjacency_matrix=A, message_passing=True) + print(f'- Finished computing R-score for node {node_i+1}/{len(big_list)} for the message passing..') + print('- Finished computing R-scores for the message passing layer') + return big_list + + + """ + explanation functions + """ + + def explain(self, + to_explain:dict, + save:bool=True, + save_to:str="./relevance.pt", + sort_nodes_by:int=0, + return_result:bool=False): + + start_index = self.model.n_layers ########################## + outpath = to_explain["outpath"]+'/'+to_explain["load_model"] + + print('Total number of layers (including activation layers):', start_index) + + # store the R-scores for the output layer (they are basically the model predictions) + torch.save(to_explain["pred_p4"].detach(), outpath + f'/lrp/R_score_layer{start_index+1}.pt') + + ### loop over each single layer + big_list = [] + for index in range(start_index+1, 1,-1): + if index==start_index+1: + R, big_list = self.explain_single_layer(to_explain["pred_p4"].detach(), to_explain, big_list, outpath, start_index+1, index) + else: + R, big_list = self.explain_single_layer(R, to_explain, big_list, outpath, start_index+1, index) + print("Finished explaining all layers.") + return big_list # returns the heatmaps for layer0 (i.e. 
input features) + + def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_index, index=None, name=None): + # preparing variables required for computing lrp + layer = self.model.get_layer(index=index,name=name) + + if name is None: + name = self.model.index2name(index) + if index is None: + index = self.model.name2index(name) + + input = to_explain['A'][name].detach() + + if index==output_layer_index: + output_layer_bool = True + else: + output_layer_bool = False + + #### THERE ARE 4 SPECIAL LAYERS TO BE TREATED UNIQUELY + # (1) for skip connection purposes + if 'nn3.0' in str(name): + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer} - Skip connection") + input_relevance, pid_relevance, embedding_relevance = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True, skip_connection=True) + + torch.save(input_relevance, outpath + f'/lrp/input_relevance.pt') + torch.save(embedding_relevance, outpath + f'/lrp/embedding_relevance.pt') + + return pid_relevance, big_list + + # (2) for skip connection purposes + if 'nn2.0' in str(name): + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer}") + R = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) + + # add the embedding_relevance computed in the nn3.0 skip connection + embedding_relevance = torch.load(outpath + f'/lrp/embedding_relevance.pt', map_location=torch.device('cpu')) + + for i in range(len(R)): + R[i] = R[i] + embedding_relevance[i] + + return R, big_list + + # (3) for skip connection purposes + if 'nn1.0' in str(name): + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer}") + + # add the input_relevance computed in the nn3.0 skip connection + input_relevance = torch.load(outpath + f'/lrp/input_relevance.pt', map_location=torch.device('cpu')) + + for node_i in tqdm(range(len(big_list))): + big_list[node_i] = self.eps_rule(self, layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) + for i in range(len(R)): + # for row in range(len(big_list[node_i][i])): + # # check if row is nonzero + # if big_list[node_i][i][row,:].sum()!=0: + # big_list[node_i][i][row,:] = big_list[node_i][i][row,:] + input_relevance[i][row,:] + big_list[node_i][i][node_i,:] = big_list[node_i][i][node_i,:] + input_relevance[i][node_i,:] + + return R, big_list + + # (4) Message Passing: it works out of the box that the conv1.lin_s layer which we don't care about is in the same place as the message passing..
so we can just replace its action + if 'conv1.lin_s' in str(name): + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: Message Passing") + big_list = self.message_passing_rule(self, layer, input, R, big_list, to_explain["edge_index"].detach(), to_explain["edge_weight"].detach(), to_explain["after_message"].detach(), to_explain["before_message"].detach(), index, outpath, to_explain["load_model"]) + return R, big_list + + # All the other layers: + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer}") + + if len(big_list)==0: # if you haven't hit the message passing step yet + if 'Linear' in str(layer): + R = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) + elif 'LeakyReLU' or 'ELU' in str(layer): + R = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=True, print_statement=True) + else: + # in this way: big_list is a list of length 5k (nodes) that contains a list of length 6 (output_neurons) that contains tensors (5k,x) which are the heatmap of R-scores + for node_i in tqdm(range(len(big_list))): + if 'Linear' in str(layer): + big_list[node_i] = self.eps_rule(self, layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) + elif 'LeakyReLU' or 'ELU' in str(layer): + big_list[node_i] = self.eps_rule(self, layer, input, big_list[node_i], index, output_layer_bool, activation_layer=True, print_statement=False) + return R, big_list + +##----------------------------------------------------------------------------- +# # big_list is a list of length 5k +# # each element is another list of length 6 (corresponding to each of the output pid probability prediction) +# # each element of that second list is a tensor of shape (5k,x) where x is the dimension of the latent space diff --git a/mlpf/lrp/model_io.py b/mlpf/lrp/model_io.py new file mode 100644 index 000000000..bdbe73123 --- /dev/null +++ b/mlpf/lrp/model_io.py @@ -0,0 +1,148 @@ +import torch +import torch.nn as nn +from torch.nn import Sequential as Seq,Linear,ReLU,BatchNorm1d +from torch_scatter import scatter_mean +import numpy as np +import json + +class model_io: + SPECIAL_LAYERS=[ + ".nn2.0", + # ".conv1.lin_h", + # ".conv1.lin_p" + ] + + def __init__(self,device,model, + model_state_dict, + activation_dest, dic): + + self.device=device + self.model=model + self.model.load_state_dict(model_state_dict) + self.dest=activation_dest + self.dic=dic + + # declare variables + self.L=dict() # layers + self.A=activation_dest # activations + # self.R=dict() # relevance scores + + self._rules=dict() # rules to use for each layer + self._hook_handles=[] # collection of all hook handles + + # # extract layers and register hooks + # self._extract_layers("",model,) + + self.L=dict() + for name, module in model.named_modules(): + # print(name) + if name=='conv1' or name=='conv2': + self.L[name]=module + else: + self.L['.'+name]=module + + for key, value in list(self.L.items()): + if key not in self.dic.keys(): + del self.L[key] + + self.n_layers=len(self.L.keys()) + + # register rules for each layer + self._register_rules() + + """ + rules functions + """ + def _register_rules(self): + for layer_name in self.L.keys(): + layer=self.L[layer_name] + layer_class=layer.__class__.__name__ + if layer_class=="BatchNorm1d": + rule="z" + else: + rule="eps" + self._rules[layer_name]=rule + + def get_rule(self,index=None,layer_name=None): + assert (not index is None) or 
(not layer_name is None), "at least one of (index,name) must be provided" + if layer_name is None: + layer_name=self.index2name(index) + + if hasattr(self,"_rules"): + return self._rules[layer_name] + else: + self._register_rules() + return self._rules[layer_name] + + """ + layer functions + """ + + def _extract_layers(self,name,model): + l=list(model.named_children()) + + if len(l)==0: + self.L[name]=copy_layer(model) + else: + l=list(model.named_children()) + for i in l: + self._extract_layers(name+"."+i[0],i[1]) + + def get_layer(self,index=None,name=None): + assert (not index is None) or (not name is None), "at least one of (index,name) must be provided" + if name is None: + name=self.index2name(index) + return self.L[name] + + """ + general getters + """ + def index2name(self,idx:int)->str: + if not hasattr(self,"_i2n"): + self._i2n=[] + for i,n in enumerate(self.A.keys()): + self._i2n.append(n) + return self._i2n[idx-2] + + def name2index(self,name:str)->int: + if not hasattr(self,"_i2n"): + self._i2n=[] + for i,n in enumerate(self.A.keys()): + self._i2n.append(n) + return self._i2n.index(name) + + """ + reset and setter functions + """ + def _clear_hooks(self): + for hook in self._hook_handles: + hook.remove() + + def reset(self): + """ + reset the prepared model + """ + pass + # self._clear_hooks() + # self.A=dict() + # self.R=dict() + + def set_dest(self,activation_dest): + self.A=activation_dest + +def copy_layer(layer): + """ + create a deep copy of provided layer + """ + layer_cp=eval("nn."+layer.__repr__()) + layer_cp.load_state_dict(layer.state_dict()) + + return layer_cp.to(self.device) + +def copy_tensor(tensor,dtype=torch.float32): + """ + create a deep copy of the provided tensor, + outputs the copy with specified dtype + """ + + return tensor.clone().detach().requires_grad_(True).to(self.device) diff --git a/mlpf/lrp/model_lrp.py b/mlpf/lrp/model_lrp.py new file mode 100644 index 000000000..502e7b93a --- /dev/null +++ b/mlpf/lrp/model_lrp.py @@ -0,0 +1,84 @@ +import numpy as np +import mplhep + +import torch +import torch_geometric + +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv +from torch_geometric.nn import TopKPooling, SAGPooling, SGConv +from torch.nn import Sequential as Seq, Linear as Lin, ReLU + +import lrp + +#Model with gravnet clustering +class PFNet7(nn.Module): + def __init__(self, + input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, + output_dim_id=6, + output_dim_p4=6, + space_dim=4, propagate_dimensions=22, nearest=16, + target="gen", nn1=True, nn3=True): + + super(PFNet7, self).__init__() + + self.act = nn.LeakyReLU + self.act_f = torch.nn.functional.leaky_relu + self.elu = nn.ELU + + # (1) DNN + self.nn1 = nn.Sequential( + nn.Linear(input_dim, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, input_encoding), + ) + + # (2) CNN: Gravnet layer + self.conv1 = lrp.GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) + + # (3) DNN layer: classifying PID + self.nn2 = nn.Sequential( + nn.Linear(encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_id), + ) + + # (4) DNN layer: regressing p4 + self.nn3 = 
nn.Sequential( + nn.Linear(encoding_dim + output_dim_id + input_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_p4), + ) + + + def forward(self, data): + + x0 = data.x + + # Encoder/Decoder step + x = self.nn1(x0) + + # Gravnet step + x, edge_index, edge_weight, after_message, before_message = self.conv1(x) + x = self.act_f(x) # act by nonlinearity + + # DNN to predict PID + pred_ids = self.nn2(x) + + # DNN to predict p4 + nn3_input = torch.cat([x0, pred_ids, x], axis=-1) + pred_p4 = self.nn3(nn3_input) + + return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand, edge_index, edge_weight, after_message, before_message diff --git a/mlpf/lrp/plots.py b/mlpf/lrp/plots.py new file mode 100644 index 000000000..d9927e1d3 --- /dev/null +++ b/mlpf/lrp/plots.py @@ -0,0 +1,166 @@ +from glob import glob +import sys, os +import os.path as osp +import pickle as pkl +import _pickle as cPickle +import math, time, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib, mplhep +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +import torch + +def map_index_to_pid(id): + if id==0: + return 'null' + if id==1: + return 'charged hadron' + if id==2: + return 'neutral hadron' + if id==3: + return 'photon' + if id==4: + return 'electron' + if id==5: + return 'muon' + +def map_index_to_p4(index): + if index==0: + return 'charge' + if index==1: + return 'pt' + if index==2: + return 'eta' + if index==3: + return 'sin phi' + if index==4: + return 'cos phi' + if index==5: + return 'energy' + +def make_heatmaps(big_list, to_explain, device, outpath, output_dim_id, output_dim_p4, task): + + print(f'Making heatmaps for {task}..') + + X = to_explain["inputs"] + gen_ids_one_hot = to_explain["gen_id"] + pred_ids_one_hot = to_explain["pred_id"] + + gen_ids = gen_ids_one_hot.argmax(axis=1) + pred_ids = pred_ids_one_hot.argmax(axis=1) + + # make directories to hold the heatmaps + for i in range(6): + if not osp.isdir(outpath + '/lrp'): + os.makedirs(outpath + '/lrp') + if not osp.isdir(outpath + f'/lrp/class{str(i)}'): + os.makedirs(outpath + f'/lrp/class{str(i)}') + for j in range(6): + if task=='regression': + if not osp.isdir(outpath + f'/lrp/class{str(i)}'+f'/p4_elem{str(j)}'): + os.makedirs(outpath + f'/lrp/class{str(i)}'+f'/p4_elem{str(j)}') + elif task=='classification': + if not osp.isdir(outpath + f'/lrp/class{str(i)}'+f'/pid{str(j)}'): + os.makedirs(outpath + f'/lrp/class{str(i)}'+f'/pid{str(j)}') + + # attempt to break down big_list onto 6 smaller lists, 1 for each pid + list0, list1, list2, list3, list4, list5 = [], [], [], [], [], [] + dist0, dist1, dist2, dist3, dist4, dist5 = [], [], [], [], [], [] + + for node_i in range(len(big_list)): # iterate over the nodes + + if gen_ids[node_i]==0: # if it's a null then add it to the null list + list0.append(big_list[node_i]) + dist0.append(node_i) + if gen_ids[node_i]==1: # if it's a chhadron then add it to the chhadron list + list1.append(big_list[node_i]) + dist1.append(node_i) + if gen_ids[node_i]==2: # if it's a nhadron then add it to the nhadron list + list2.append(big_list[node_i]) + dist2.append(node_i) + if gen_ids[node_i]==3: # if it's a photon then add it to the photon list + list3.append(big_list[node_i]) + dist3.append(node_i) + if gen_ids[node_i]==4: # if it's a electron then add it to the electron list + 
list4.append(big_list[node_i]) + dist4.append(node_i) + if gen_ids[node_i]==5: # if it's a muon then add it to the muon list + list5.append(big_list[node_i]) + dist5.append(node_i) + + list = [list0,list1,list2,list3,list4,list5] + dist = [dist0,dist1,dist2,dist3,dist4,dist5] + + if task=='regression': + output_dim = output_dim_p4 + elif task=='classification': + output_dim = output_dim_id + + for pid in range(output_dim_id): + if pid!=1: + continue + for node_i in range(len(list[pid])): # iterate over the nodes in each list + print('- making heatmap for', map_index_to_pid(pid), 'node #:', node_i+1, '/', len(list[pid])) + for output_neuron in range(output_dim): + R_cat_feat = torch.cat([list[pid][node_i][output_neuron].to(device), X['x'].to(device), torch.arange(start=0, end=X['x'].shape[0], step=1).float().reshape(-1,1).to(device)], dim=1) + + non_empty_mask = R_cat_feat[:,:12].abs().sum(dim=1).bool() + R_cat_feat_msk = R_cat_feat[non_empty_mask,:] # R_cat_feat masked (non-zero) + pos = dist[pid][node_i] + probability = pred_ids_one_hot[pos] + + def get_type(t): + l = [] + for elem in t: + if elem==1: + l.append('cluster') + if elem==2: + l.append('track') + return l + + node_types = get_type(R_cat_feat_msk[:,12]) + + fig, ax = plt.subplots() + fig.tight_layout() + + if task=='regression': + if (torch.argmax(probability)==pid): + ax.set_title('Heatmap for the "'+map_index_to_p4(output_neuron)+'" prediction of a correctly classified ' + map_index_to_pid(pid)) + else: + ax.set_title('Heatmap for the "'+map_index_to_p4(output_neuron)+'" prediction of an incorrectly classified ' + map_index_to_pid(pid)) + + elif task=='classification': + if (torch.argmax(probability)==pid): + ax.set_title('Heatmap for the "'+map_index_to_pid(output_neuron)+'" prediction of a correctly classified ' + map_index_to_pid(pid)) + else: + ax.set_title('Heatmap for the "'+map_index_to_pid(output_neuron)+'" prediction of an incorrectly classified ' + map_index_to_pid(pid)) + + ### TODO: Not the best way to do it.. I am assuming here that only charged hadrons are connected to all tracks + if pid==1: + features = ["type", " pt", "eta", + "sphi", "cphi", "E", "eta_out", "sphi_out", "cphi_out", "charge", "is_gen_mu", "is_gen_el"] + else: + features = ["type", "Et", "eta", "sphi", "cphi", "E", "Eem", "Ehad", "pad", "pad", "pad", "pad"] + + ax.set_xticks(np.arange(len(features))) + ax.set_yticks(np.arange(len(node_types))) + for col in range(len(features)): + for row in range(len(node_types)): + text = ax.text(col, row, round(R_cat_feat_msk[row,12+col].item(),2), + ha="center", va="center", color="w") + # ... 
and label them with the respective list entries + ax.set_xticklabels(features) + ax.set_yticklabels(node_types) + plt.xlabel("\nposition of node is row # {pos} from the top \n class prediction: {R} \n where prob = [null, chhadron, nhadron, photon, electron, muon]".format(R=[round(num,2) for num in probability.detach().tolist()], pos=((R_cat_feat_msk[:,-1] == pos).nonzero(as_tuple=True)[0].item()+1))) + plt.imshow(torch.abs(R_cat_feat_msk[:,:12]).detach().cpu().numpy(), interpolation="nearest", cmap='copper', aspect='auto') + plt.colorbar() + fig.set_size_inches(12, 12) + if task=='regression': + plt.savefig(outpath + f'/lrp/class{str(pid)}'+f'/p4_elem{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') + elif task=='classification': + plt.savefig(outpath + f'/lrp/class{str(pid)}'+f'/pid{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') + plt.close(fig) diff --git a/mlpf/lrp_pipeline.py b/mlpf/lrp_pipeline.py new file mode 100644 index 000000000..e57d9c8e7 --- /dev/null +++ b/mlpf/lrp_pipeline.py @@ -0,0 +1,210 @@ +from glob import glob +import sys, os +import os.path as osp +import pickle as pkl +import _pickle as cPickle +import math, time, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib, mplhep +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +#Check if the GPU configuration has been provided +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + import setGPU + if multi_gpu: + print('Will use multi_gpu..') + print("Let's use", torch.cuda.device_count(), "GPUs!") + else: + print('Will use single_gpu..') +except Exception as e: + print("Could not import setGPU, running CPU-only") + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') + print("GPU model:", torch.cuda.get_device_name(0)) +else: + device = torch.device('cpu') + +import torch_geometric +import torch.nn as nn + +from pytorch_delphes import PFGraphDataset, data_to_loader_ttbar, data_to_loader_qcd +from lrp import parse_args, make_heatmaps, model_io, PFNet7, lrp_clf, lrp_reg + +# NOTE: this script works by loading an already trained model with very specefic specs + +if __name__ == "__main__": + + args = parse_args() + + # # the next part initializes some args values (to run the script not from terminal) + # class objectview(object): + # def __init__(self, d): + # self.__dict__ = d + # + # args = objectview({'n_test': 2, 'batch_size': 1,' hidden_dim':256, 'hidden_dim_nn1': 64, + # 'input_encoding': 12, 'encoding_dim': 64, 'space_dim': 4, 'propagate_dimensions': 22,'nearest': 16, + # 'lrp_dataset': '../test_tmp_delphes/data/pythia8_ttbar', 'lrp_dataset_qcd': '../test_tmp_delphes/data/pythia8_qcd', + # 'lrp_outpath': '../test_tmp_delphes/experiments/lrp/', + # 'lrp_load_epoch': 9, 'lrp_load_model': 'lrp_reg_PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + # 'explain': True, 'lrp_clf': False, 'lrp_reg': False, + # 'make_heatmaps_clf': True,'make_heatmaps_reg': True}) + + # define the dataset (assumes the data exists as .pt files in "processed") + print('Processing the data..') + full_dataset_qcd = PFGraphDataset(args.lrp_dataset_qcd) + + # constructs a loader from the data to iterate over batches + print('Constructing data loader..') + test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) + + # element parameters + 
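    # (each graph node is one detector element, i.e. a cluster or a track, described by 12 input
    # features; the model predicts a 6-class one-hot particle id and a 6-component momentum vector,
    # with the class and feature names spelled out in map_index_to_pid / map_index_to_p4 of
    # mlpf/lrp/plots.py)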
input_dim = 12 + + # one-hot particle ID and momentum + output_dim_id = 6 + output_dim_p4 = 6 + + outpath = args.lrp_outpath + args.lrp_load_model + PATH = outpath + '/epoch_' + str(args.lrp_load_epoch) + '_weights.pth' + + # loading the model + print('Loading a previously trained model..') + with open(outpath + '/model_kwargs.pkl', 'rb') as f: + model_kwargs = pkl.load(f) + + model = PFNet7(**model_kwargs) + + state_dict = torch.load(PATH, map_location=device) + + # if model was trained using DataParallel then we have to load it differently + if "DataParallel" in args.lrp_load_model: + state_dict = torch.load(PATH, map_location=device) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove module. + new_state_dict[name] = v + # print('name is:', name) + state_dict=new_state_dict + + model.load_state_dict(state_dict) + model.to(device) + + if args.explain: + model.eval() + print(model) + + # create some hooks to retrieve intermediate activations + activation = {} + hooks={} + + def get_activation(name): + def hook(model, input, output): + activation[name] = input[0] + return hook + + for name, module in model.named_modules(): + if (type(module)==nn.Linear) or (type(module)==nn.LeakyReLU) or (type(module)==nn.ELU): + hooks[name] = module.register_forward_hook(get_activation("." + name)) + + for i, batch in enumerate(test_loader): + + if multi_gpu: + X = batch + else: + X = batch.to(device) + + if i==0: + # code can be written better + # basically I run at least one forward pass to get the activations to use their shape in defining the lrp layers + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model(X) + model = model_io(device, model, state_dict, dict(), activation) + explainer_reg = lrp_reg(device, model) + explainer_clf = lrp_clf(device, model) + + else: + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model.model(X) + + if not osp.isdir(outpath + '/lrp'): + os.makedirs(outpath + '/lrp') + + if (not args.lrp_reg) and (not args.lrp_clf): + print('EXITING: Did not specify whether to explain lrp_reg or lrp_clf') + sys.exit(0) + + if args.lrp_reg: + print('Explaining the p4 predictions:') + to_explain_reg = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), + "gen_p4": gen_p4.detach(), "gen_id": gen_ids_one_hot.detach(), + "pred_p4": pred_p4.detach(), "pred_id": pred_ids_one_hot.detach(), + "edge_index": edge_index.detach(), "edge_weight": edge_weight.detach(), "after_message": after_message.detach(), "before_message": before_message.detach(), + "outpath": args.lrp_outpath, "load_model": args.lrp_load_model} + + model.set_dest(to_explain_reg["A"]) + + big_list_reg = explainer_reg.explain(to_explain_reg) + torch.save(big_list_reg, outpath + '/lrp/big_list_reg.pt') + torch.save(to_explain_reg, outpath + '/lrp/to_explain_reg.pt') + + if args.lrp_clf: + print('Explaining the pid predictions:') + to_explain_clf = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), + "gen_p4": gen_p4.detach(), "gen_id": gen_ids_one_hot.detach(), + "pred_p4": pred_p4.detach(), "pred_id": pred_ids_one_hot.detach(), + "edge_index": edge_index.detach(), "edge_weight": edge_weight.detach(), "after_message": after_message.detach(), "before_message": before_message.detach(), + "outpath": args.lrp_outpath, "load_model": args.lrp_load_model} + +
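            # set_dest points the explainer at the hooked activations; explain() then walks the
            # layers backwards and returns big_list_clf: a list over the ~5k graph nodes, where
            # each entry is a list of 6 R-score tensors (one per pid class) of shape (num_nodes, x),
            # which make_heatmaps() below turns into per-node heatmap images.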
model.set_dest(to_explain_clf["A"]) + + big_list_clf = explainer_clf.explain(to_explain_clf) + + torch.save(big_list_clf, outpath + '/lrp/big_list_clf.pt') + torch.save(to_explain_clf, outpath + '/lrp/to_explain_clf.pt') + + break # explain only one single event + + if args.make_heatmaps_reg: + # load the necessary R-scores + big_list_reg = torch.load(outpath + '/lrp/big_list_reg.pt', map_location=device) + to_explain_reg = torch.load(outpath + '/lrp/to_explain_reg.pt', map_location=device) + + make_heatmaps(big_list_reg, to_explain_reg, device, outpath, output_dim_id, output_dim_p4, 'regression') + + if args.make_heatmaps_clf: + # load the necessary R-scores + big_list_clf = torch.load(outpath + '/lrp/big_list_clf.pt', map_location=device) + to_explain_clf = torch.load(outpath + '/lrp/to_explain_clf.pt', map_location=device) + + make_heatmaps(big_list_clf, to_explain_clf, device, outpath, output_dim_id, output_dim_p4, 'classification') + +# # ------------------------------------------------------------------------------------------------ +# # if you got all the intermediate R-score heatmaps stored then you can check if these are equal as a check of conservation across all layers: +# print(R16[0].sum(axis=1)[0]) +# print(R15[0].sum(axis=1)[0]) +# print(R14[0].sum(axis=1)[0]) +# print(R13[0].sum(axis=1)[0]) +# print(R13[0].sum(axis=1)[0]) +# print(R12[0].sum(axis=1)[0]) +# print(R11[0].sum(axis=1)[0]) +# print(R10[0].sum(axis=1)[0]) +# print(R9[0].sum(axis=1)[0]) +# print(R8[0].sum(axis=1)[0]) +# print(R_score_layer_before_msg_passing[0][0].sum(axis=0).sum()) +# print(R7[0][0].sum(axis=0).sum()) +# print(R6[0][0].sum(axis=1).sum()) +# print(R5[0][0].sum(axis=1).sum()) +# print(R4[0][0].sum(axis=1).sum()) +# print(R3[0][0].sum(axis=1).sum()) +# print(R2[0][0].sum(axis=1).sum()) +# print(R1[0][0].sum(axis=1).sum()) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 742e9b99b..0c80dff1b 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -1,3 +1,8 @@ +try: + import comet_ml +except ModuleNotFoundError as e: + print("comet_ml not found, ignoring") + import sys import os import yaml @@ -11,10 +16,15 @@ import click from tqdm import tqdm import shutil +from functools import partial +import shlex +import subprocess +import matplotlib.pyplot as plt import tensorflow as tf from tensorflow.keras import mixed_precision import tensorflow_addons as tfa +import keras_tuner as kt from tfmodel.data import Dataset from tfmodel.model_setup import ( @@ -23,12 +33,14 @@ LearningRateLoggingCallback, prepare_callbacks, FlattenedCategoricalAccuracy, + SingleClassRecall, eval_model, freeze_model, ) from tfmodel.utils import ( get_lr_schedule, + get_optimizer, create_experiment_dir, get_strategy, make_weight_function, @@ -44,10 +56,20 @@ parse_config, get_best_checkpoint, delete_all_but_best_checkpoint, + get_tuner, + get_raytune_schedule, ) -from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler from tfmodel.lr_finder import LRFinder +from tfmodel.callbacks import CustomTensorBoard +from tfmodel import hypertuning + +import ray +from ray import tune +from ray.tune.integration.keras import TuneReportCheckpointCallback +from ray.tune.integration.tensorflow import DistributedTrainableCreator +from ray.tune.logger import TBXLoggerCallback +from ray.tune import Analysis @click.group() @@ -56,42 +78,114 @@ def main(): pass +@main.command() +@click.help_option("-h", "--help") +@click.option("-c", "--config", help="configuration file", type=click.Path()) +@click.option("--customize", 
help="customization function", type=str, default=None) +def data(config, customize): + + config, _, _, _, _, _, _ = parse_config(config) + + if customize: + config = customization_functions[customize](config) + + cds = config["dataset"] + + dataset_def = Dataset( + num_input_features=int(cds["num_input_features"]), + num_output_features=int(cds["num_output_features"]), + padded_num_elem_size=int(cds["padded_num_elem_size"]), + raw_path=cds.get("raw_path"), + raw_files=cds.get("raw_files", None), + processed_path=cds.get("processed_path"), + validation_file_path=cds["validation_file_path"], + schema=cds["schema"] + ) + + dataset_def.process( + config["dataset"]["num_files_per_chunk"] + ) + + @main.command() @click.help_option("-h", "--help") @click.option("-c", "--config", help="configuration file", type=click.Path()) @click.option("-w", "--weights", default=None, help="trained weights to load", type=click.Path()) @click.option("--ntrain", default=None, help="override the number of training events", type=int) @click.option("--ntest", default=None, help="override the number of testing events", type=int) +@click.option("--nepochs", default=None, help="override the number of training epochs", type=int) @click.option("-r", "--recreate", help="force creation of new experiment dir", is_flag=True) @click.option("-p", "--prefix", default="", help="prefix to put at beginning of training dir name", type=str) -def train(config, weights, ntrain, ntest, recreate, prefix): +@click.option("--plot-freq", default=None, help="plot detailed validation every N epochs", type=int) +@click.option("--customize", help="customization function", type=str, default=None) +def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, customize): + + try: + from comet_ml import Experiment + experiment = Experiment( + project_name="particleflow-tf", + auto_metric_logging=True, + auto_param_logging=True, + auto_histogram_weight_logging=True, + auto_histogram_gradient_logging=False, + auto_histogram_activation_logging=False, + ) + except Exception as e: + print("Failed to initialize comet-ml dashboard") + experiment = None + + """Train a model defined by config""" config_file_path = config config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = parse_config( config, ntrain, ntest, weights ) + if nepochs: + n_epochs = nepochs + if plot_freq: + config["callbacks"]["plot_freq"] = plot_freq - dataset_def = get_dataset_def(config) - ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) - X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=False) + if customize: + prefix += customize + "_" + config = customization_functions[customize](config) + #FIXME: refactor this + global_batch_size = config["setup"]["batch_size"] if recreate or (weights is None): outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node()) else: outdir = str(Path(weights).parent) - shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference # Decide tf.distribute.strategy depending on number of available GPUs strategy, maybe_global_batch_size = get_strategy(global_batch_size) - # If using more than 1 GPU, we scale the batch size by the number of GPUs + if "CPU" not in strategy.extended.worker_devices[0]: + nvidia_smi_call = "nvidia-smi 
--query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f {}/nvidia_smi_log.csv".format(outdir) + p = subprocess.Popen(shlex.split(nvidia_smi_call)) + # If using more than 1 GPU, we scale the batch size by the number of GPUs before the dataset is loaded if maybe_global_batch_size is not None: global_batch_size = maybe_global_batch_size + + dataset_def = get_dataset_def(config) + ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) + + #FIXME: split up training/test and validation dataset and parameters + dataset_def.padded_num_elem_size = 6400 + + X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=False) + + if experiment: + experiment.set_name(outdir) + experiment.log_code("mlpf/tfmodel/model.py") + experiment.log_code("mlpf/tfmodel/utils.py") + experiment.log_code(config_file_path) + + shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference + total_steps = n_epochs * n_train // global_batch_size - lr = float(config["setup"]["lr"]) with strategy.scope(): - lr_schedule, optim_callbacks = get_lr_schedule(config, lr=lr, steps=total_steps) - opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + lr_schedule, optim_callbacks = get_lr_schedule(config, steps=total_steps) + opt = get_optimizer(config, lr_schedule) if config["setup"]["dtype"] == "float16": model_dtype = tf.dtypes.float16 @@ -105,7 +199,11 @@ def train(config, weights, ntrain, ntest, recreate, prefix): # Run model once to build the layers print(X_val.shape) - model(tf.cast(X_val[:1], model_dtype)) + + if config["tensorflow"]["eager"]: + model(X_val[:1]) + else: + model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) initial_epoch = 0 if weights: @@ -113,11 +211,16 @@ def train(config, weights, ntrain, ntest, recreate, prefix): configure_model_weights(model, config["setup"].get("weights_config", "all")) model.load_weights(weights, by_name=True) initial_epoch = int(weights.split("/")[-1].split("-")[1]) - model(tf.cast(X_val[:1], model_dtype)) + model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) config = set_config_loss(config, config["setup"]["trainable"]) configure_model_weights(model, config["setup"]["trainable"]) - model(tf.cast(X_val[:1], model_dtype)) + model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) + + print("model weights") + tw_names = [m.name for m in model.trainable_weights] + for w in model.weights: + print("layer={} trainable={} shape={} num_weights={}".format(w.name, w.name in tw_names, w.shape, np.prod(w.shape))) loss_dict, loss_weights = get_loss_dict(config) model.compile( @@ -129,47 +232,54 @@ def train(config, weights, ntrain, ntest, recreate, prefix): "cls": [ FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), + ] + [ + SingleClassRecall( + icls, + name="rec_cls{}".format(icls), + dtype=tf.float64) for icls in range(config["dataset"]["num_output_classes"]) ] }, ) model.summary() - callbacks = prepare_callbacks( - model, - outdir, - X_val[: config["setup"]["batch_size"]], - ycand_val[: config["setup"]["batch_size"]], - dataset_transform, - config["dataset"]["num_output_classes"], - ) - 
callbacks.append(optim_callbacks) - - fit_result = model.fit( - ds_train_r, - validation_data=ds_test_r, - epochs=initial_epoch + n_epochs, - callbacks=callbacks, - steps_per_epoch=n_train // global_batch_size, - validation_steps=n_test // global_batch_size, - initial_epoch=initial_epoch, - ) - history_path = Path(outdir) / "history" - history_path = str(history_path) - with open("{}/history.json".format(history_path), "w") as fi: - json.dump(fit_result.history, fi) - model.save(outdir + "/model_full", save_format="tf") + validation_particles = None + if config["dataset"]["target_particles"] == "cand": + validation_particles = ycand_val + elif config["dataset"]["target_particles"] == "gen": + validation_particles = ygen_val + + callbacks = prepare_callbacks( + config["callbacks"], + outdir, + X_val, + validation_particles, + dataset_transform, + config["dataset"]["num_output_classes"], + dataset_def, + experiment + ) + callbacks.append(optim_callbacks) + + fit_result = model.fit( + ds_train_r, + validation_data=ds_test_r, + epochs=initial_epoch + n_epochs, + callbacks=callbacks, + steps_per_epoch=n_train // global_batch_size, + validation_steps=n_test // global_batch_size, + initial_epoch=initial_epoch, + ) - print("Training done.") + history_path = Path(outdir) / "history" + history_path = str(history_path) + with open("{}/history.json".format(history_path), "w") as fi: + json.dump(fit_result.history, fi) + model.save(outdir + "/model_full", save_format="tf") - print("Starting evaluation...") - eval_dir = Path(outdir) / "evaluation" - eval_dir.mkdir() - eval_dir = str(eval_dir) - # TODO: change to use the evaluate() function below instead of eval_model() - eval_model(X_val, ygen_val, ycand_val, model, config, eval_dir, global_batch_size) - print("Evaluation done.") + print("Training done.") - freeze_model(model, config, outdir) + if "CPU" not in strategy.extended.worker_devices[0]: + p.terminate() @main.command() @@ -178,7 +288,8 @@ def train(config, weights, ntrain, ntest, recreate, prefix): @click.option("-c", "--config", help="configuration file", type=click.Path()) @click.option("-w", "--weights", default=None, help="trained weights to load", type=click.Path()) @click.option("-e", "--evaluation_dir", help="optionally specify evaluation output dir", type=click.Path()) -def evaluate(config, train_dir, weights, evaluation_dir): +@click.option("-v", "--validation_files", help="optionally override validation file path", type=click.Path(), default=None) +def evaluate(config, train_dir, weights, evaluation_dir, validation_files): """Evaluate the trained model in train_dir""" if config is None: config = Path(train_dir) / "config.yaml" @@ -202,33 +313,31 @@ def evaluate(config, train_dir, weights, evaluation_dir): model_dtype = tf.dtypes.float32 dataset_def = get_dataset_def(config) - X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=False) - strategy, maybe_global_batch_size = get_strategy(global_batch_size) - if maybe_global_batch_size is not None: - global_batch_size = maybe_global_batch_size + if not (validation_files is None): + dataset_def.val_filelist = glob.glob(str(validation_files)) - with strategy.scope(): - model = make_model(config, model_dtype) + X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=False) - # Evaluate model once to build the layers - print(X_val.shape) - model(tf.cast(X_val[:1], model_dtype)) + model = make_model(config, model_dtype) - # need to load the weights in the same trainable configuration as the 
model was set up - configure_model_weights(model, config["setup"].get("weights_config", "all")) - if weights: - model.load_weights(weights, by_name=True) - else: - weights = get_best_checkpoint(train_dir) - print("Loading best weights that could be found from {}".format(weights)) - model.load_weights(weights, by_name=True) - model(tf.cast(X_val[:1], model_dtype)) + # Evaluate model once to build the layers + print(X_val.shape) + model(tf.cast(X_val[:1], model_dtype)) - model.compile() - eval_model(X_val, ygen_val, ycand_val, model, config, eval_dir, global_batch_size) - freeze_model(model, config, train_dir) + # need to load the weights in the same trainable configuration as the model was set up + configure_model_weights(model, config["setup"].get("weights_config", "all")) + if weights: + model.load_weights(weights, by_name=True) + else: + weights = get_best_checkpoint(train_dir) + print("Loading best weights that could be found from {}".format(weights)) + model.load_weights(weights, by_name=True) + model(tf.cast(X_val[:1], model_dtype)) + model.compile() + eval_model(X_val, ygen_val, ycand_val, model, config, eval_dir, global_batch_size) + freeze_model(model, config, train_dir) @main.command() @click.help_option("-h", "--help") @@ -299,6 +408,26 @@ def find_lr(config, outdir, figname, logscale): lr_finder.plot(save_dir=outdir, figname=figname, log_scale=logscale) +def customize_gun_sample(config): + + #FIXME: must be at least 2x bin_size + config["dataset"]["padded_num_elem_size"] = 1280 + + config["dataset"]["processed_path"] = "data/SinglePiFlatPt0p7To10_cfi/tfr_cand/*.tfrecords" + config["dataset"]["raw_path"] = "data/SinglePiFlatPt0p7To10_cfi/raw/*.pkl*" + config["dataset"]["classification_loss_coef"] = 0.0 + config["dataset"]["charge_loss_coef"] = 0.0 + config["dataset"]["eta_loss_coef"] = 0.0 + config["dataset"]["sin_phi_loss_coef"] = 0.0 + config["dataset"]["cos_phi_loss_coef"] = 0.0 + config["setup"]["trainable"] = "regression" + config["setup"]["batch_size"] = 10*config["setup"]["batch_size"] + return config + +customization_functions = { + "gun_sample": customize_gun_sample +} + @main.command() @click.help_option("-h", "--help") @click.option("-t", "--train_dir", help="training directory", type=click.Path()) @@ -308,5 +437,322 @@ def delete_all_but_best_ckpt(train_dir, dry_run): delete_all_but_best_checkpoint(train_dir, dry_run) +@main.command() +@click.help_option("-h", "--help") +@click.option("-c", "--config", help="configuration file", type=click.Path()) +@click.option("-o", "--outdir", help="output dir", type=click.Path()) +@click.option("--ntrain", default=None, help="override the number of training events", type=int) +@click.option("--ntest", default=None, help="override the number of testing events", type=int) +@click.option("-r", "--recreate", help="overwrite old hypertune results", is_flag=True, default=False) +def hypertune(config, outdir, ntrain, ntest, recreate): + config_file_path = config + config, _, global_batch_size, n_train, n_test, n_epochs, _ = parse_config(config, ntrain, ntest) + + # Override number of epochs with max_epochs from Hyperband config if specified + if config["hypertune"]["algorithm"] == "hyperband": + n_epochs = config["hypertune"]["hyperband"]["max_epochs"] + + strategy, maybe_global_batch_size = get_strategy(global_batch_size) + if maybe_global_batch_size is not None: + global_batch_size = maybe_global_batch_size + total_steps = n_epochs * n_train // global_batch_size + + model_builder, optim_callbacks = hypertuning.get_model_builder(config, 
total_steps) + + dataset_def = get_dataset_def(config) + ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) + + #FIXME: split up training/test and validation dataset and parameters + dataset_def.padded_num_elem_size = 6400 + + X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=True) + + validation_particles = None + if config["dataset"]["target_particles"] == "cand": + validation_particles = ycand_val + elif config["dataset"]["target_particles"] == "gen": + validation_particles = ygen_val + + callbacks = prepare_callbacks( + config["callbacks"], + outdir, + X_val, + validation_particles, + dataset_transform, + config["dataset"]["num_output_classes"], + dataset_def, + ) + + callbacks.append(optim_callbacks) + callbacks.append(tf.keras.callbacks.EarlyStopping(patience=20, monitor='val_loss')) + + tuner = get_tuner(config["hypertune"], model_builder, outdir, recreate, strategy) + tuner.search_space_summary() + + tuner.search( + ds_train_r, + epochs=n_epochs, + validation_data=ds_test_r, + steps_per_epoch=n_train // global_batch_size, + validation_steps=n_test // global_batch_size, + #callbacks=[tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss')] + callbacks=callbacks, + ) + print("Hyperparameter search complete.") + shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference + + tuner.results_summary() + for trial in tuner.oracle.get_best_trials(num_trials=10): + print(trial.hyperparameters.values, trial.score) + + +def set_raytune_search_parameters(search_space, config): + config["parameters"]["combined_graph_layer"]["layernorm"] = search_space["layernorm"] + config["parameters"]["combined_graph_layer"]["hidden_dim"] = search_space["hidden_dim"] + config["parameters"]["combined_graph_layer"]["distance_dim"] = search_space["distance_dim"] + config["parameters"]["combined_graph_layer"]["num_node_messages"] = search_space["num_node_messages"] + config["parameters"]["combined_graph_layer"]["node_message"]["normalize_degrees"] = search_space["normalize_degrees"] + config["parameters"]["combined_graph_layer"]["node_message"]["output_dim"] = search_space["output_dim"] + config["parameters"]["num_graph_layers_common"] = search_space["num_graph_layers_common"] + config["parameters"]["num_graph_layers_energy"] = search_space["num_graph_layers_energy"] + config["parameters"]["combined_graph_layer"]["dropout"] = search_space["dropout"] + config["parameters"]["combined_graph_layer"]["bin_size"] = search_space["bin_size"] + config["parameters"]["combined_graph_layer"]["kernel"]["clip_value_low"] = search_space["clip_value_low"] + + + config["setup"]["lr"] = search_space["lr"] + config["setup"]["batch_size"] = search_space["batch_size"] + + config["exponentialdecay"]["decay_steps"] = search_space["expdecay_decay_steps"] + return config + + +def build_model_and_train(config, checkpoint_dir=None, full_config=None): + full_config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = parse_config(full_config) + + if config is not None: + full_config = set_raytune_search_parameters(search_space=config, config=full_config) + + strategy, maybe_global_batch_size = get_strategy(global_batch_size) + if maybe_global_batch_size is not None: + global_batch_size = maybe_global_batch_size + total_steps = n_epochs * n_train // global_batch_size + + ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(full_config, global_batch_size, 
n_train, n_test) + + dataset_def = get_dataset_def(full_config) + + #FIXME: split up training/test and validation dataset and parameters + dataset_def.padded_num_elem_size = 6400 + + X_val, ygen_val, ycand_val = prepare_val_data(full_config, dataset_def, single_file=True) + + validation_particles = None + if full_config["dataset"]["target_particles"] == "cand": + validation_particles = ycand_val + elif full_config["dataset"]["target_particles"] == "gen": + validation_particles = ygen_val + + callbacks = prepare_callbacks( + full_config["callbacks"], + tune.get_trial_dir(), + X_val, + validation_particles, + dataset_transform, + full_config["dataset"]["num_output_classes"], + dataset_def, + ) + + with strategy.scope(): + lr_schedule, optim_callbacks = get_lr_schedule(full_config, steps=total_steps) + callbacks.append(optim_callbacks) + opt = get_optimizer(full_config, lr_schedule) + + model = make_model(full_config, dtype=tf.dtypes.float32) + + # Run model once to build the layers + model.build((1, full_config["dataset"]["padded_num_elem_size"], full_config["dataset"]["num_input_features"])) + + full_config = set_config_loss(full_config, full_config["setup"]["trainable"]) + configure_model_weights(model, full_config["setup"]["trainable"]) + model.build((1, full_config["dataset"]["padded_num_elem_size"], full_config["dataset"]["num_input_features"])) + + loss_dict, loss_weights = get_loss_dict(full_config) + model.compile( + loss=loss_dict, + optimizer=opt, + sample_weight_mode="temporal", + loss_weights=loss_weights, + metrics={ + "cls": [ + FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), + FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), + ] + }, + ) + model.summary() + + + callbacks.append(TuneReportCheckpointCallback( + metrics=[ + "adam_beta_1", + 'charge_loss', + "cls_acc_unweighted", + "cls_loss", + "cos_phi_loss", + "energy_loss", + "eta_loss", + "learning_rate", + "loss", + "pt_loss", + "sin_phi_loss", + "val_charge_loss", + "val_cls_acc_unweighted", + "val_cls_acc_weighted", + "val_cls_loss", + "val_cos_phi_loss", + "val_energy_loss", + "val_eta_loss", + "val_loss", + "val_pt_loss", + "val_sin_phi_loss", + ], + ), + ) + + fit_result = model.fit( + ds_train_r, + validation_data=ds_test_r, + epochs=n_epochs, + callbacks=callbacks, + steps_per_epoch=n_train // global_batch_size, + validation_steps=n_test // global_batch_size, + ) + + +def get_hp_str(result): + def func(key): + if "config" in key: + return key.split("config/")[-1] + s = "" + for ii, hp in enumerate(list(filter(None.__ne__, [func(key) for key in result.keys()]))): + if ii % 6 == 0: + s += "\n" + s += "{}={}; ".format(hp, result["config/{}".format(hp)].values[0]) + return s + +def plot_ray_analysis(analysis, save=False, skip=0): + to_plot = [ + #'adam_beta_1', + 'charge_loss', 'cls_acc_unweighted', 'cls_loss', + 'cos_phi_loss', 'energy_loss', 'eta_loss', 'learning_rate', 'loss', + 'pt_loss', 'sin_phi_loss', 'val_charge_loss', + 'val_cls_acc_unweighted', 'val_cls_acc_weighted', 'val_cls_loss', + 'val_cos_phi_loss', 'val_energy_loss', 'val_eta_loss', 'val_loss', + 'val_pt_loss', 'val_sin_phi_loss', + ] + + dfs = analysis.fetch_trial_dataframes() + result_df = analysis.dataframe() + for key in tqdm(dfs.keys(), desc="Creating Ray analysis plots", total=len(dfs.keys())): + result = result_df[result_df["logdir"] == key] + + fig, axs = plt.subplots(5, 4, figsize=(12, 9), tight_layout=True) + for var, ax in zip(to_plot, axs.flat): + # Skip first `skip` values so loss 
plots don't include the very large losses which occur at start of training + ax.plot(dfs[key].index.values[skip:], dfs[key][var][skip:], alpha=0.8) + ax.set_xlabel("Epoch") + ax.set_ylabel(var) + ax.grid(alpha=0.3) + plt.suptitle(get_hp_str(result)) + + if save: + plt.savefig(key + "/trial_summary.jpg") + plt.close() + if not save: + plt.show() + else: + print("Saved plots in trial dirs.") + + +@main.command() +@click.help_option("-h", "--help") +@click.option("-c", "--config", help="configuration file", type=click.Path()) +@click.option("-n", "--name", help="experiment name", type=str, default="test_exp") +@click.option("-l", "--local", help="run locally", is_flag=True) +@click.option("--cpus", help="number of cpus per worker", type=int, default=1) +@click.option("--gpus", help="number of gpus per worker", type=int, default=0) +@click.option("--tune_result_dir", help="Tune result dir", type=str, default=None) +@click.option("-r", "--resume", help="resume run from local_dir", is_flag=True) +def raytune(config, name, local, cpus, gpus, tune_result_dir, resume): + cfg = load_config(config) + config_file_path = config + + if tune_result_dir is not None: + os.environ["TUNE_RESULT_DIR"] = tune_result_dir + else: + trd = cfg["raytune"]["local_dir"] + "/tune_result_dir" + os.environ["TUNE_RESULT_DIR"] = trd + + if not local: + ray.init(address='auto') + + search_space = { + # Optimizer parameters + "lr": tune.grid_search(cfg["raytune"]["parameters"]["lr"]), + "batch_size": tune.grid_search(cfg["raytune"]["parameters"]["batch_size"]), + "expdecay_decay_steps": tune.grid_search(cfg["raytune"]["parameters"]["expdecay_decay_steps"]), + + # Model parameters + "layernorm": tune.grid_search(cfg["raytune"]["parameters"]["combined_graph_layer"]["layernorm"]), + "hidden_dim": tune.grid_search(cfg["raytune"]["parameters"]["combined_graph_layer"]["hidden_dim"]), + "distance_dim": tune.grid_search(cfg["raytune"]["parameters"]["combined_graph_layer"]["distance_dim"]), + "num_node_messages": tune.grid_search(cfg["raytune"]["parameters"]["combined_graph_layer"]["num_node_messages"]), + "num_graph_layers_common": tune.grid_search(cfg["raytune"]["parameters"]["num_graph_layers_common"]), + "num_graph_layers_energy": tune.grid_search(cfg["raytune"]["parameters"]["num_graph_layers_energy"]), + "dropout": tune.grid_search(cfg["raytune"]["parameters"]["combined_graph_layer"]["dropout"]), + "bin_size": tune.grid_search(cfg["raytune"]["parameters"]["combined_graph_layer"]["bin_size"]), + "clip_value_low": tune.grid_search(cfg["raytune"]["parameters"]["combined_graph_layer"]["kernel"]["clip_value_low"]), + "normalize_degrees": tune.grid_search(cfg["raytune"]["parameters"]["combined_graph_layer"]["node_message"]["normalize_degrees"]), + "output_dim": tune.grid_search(cfg["raytune"]["parameters"]["combined_graph_layer"]["node_message"]["output_dim"]), + } + + sched = get_raytune_schedule(cfg["raytune"]) + + distributed_trainable = DistributedTrainableCreator( + partial(build_model_and_train, full_config=config_file_path), + num_workers=1, # Number of hosts that each trial is expected to use. + num_cpus_per_worker=cpus, + num_gpus_per_worker=gpus, + num_workers_per_host=1, # Number of workers to colocate per host. None if not specified. 
+ ) + + analysis = tune.run( + distributed_trainable, + config=search_space, + name=name, + scheduler=sched, + num_samples=1, + local_dir=cfg["raytune"]["local_dir"], + callbacks=[TBXLoggerCallback()], + log_to_file=True, + resume=resume, + ) + print("Best hyperparameters found were: ", analysis.get_best_config("val_loss", "min")) + + plot_ray_analysis(analysis, save=True, skip=20) + ray.shutdown() + + +@main.command() +@click.help_option("-h", "--help") +@click.option("-d", "--exp_dir", help="experiment dir", type=click.Path()) +@click.option("-s", "--save", help="save plots in trial dirs", is_flag=True) +@click.option("-k", "--skip", help="skip first values to avoid large losses at start of training", type=int) +def raytune_analysis(exp_dir, save, skip): + analysis = Analysis(exp_dir, default_metric="val_loss", default_mode="min") + plot_ray_analysis(analysis, save=save, skip=skip) + + if __name__ == "__main__": main() diff --git a/mlpf/plotting/__init__.py b/mlpf/plotting/__init__.py new file mode 100644 index 000000000..61a447ad4 --- /dev/null +++ b/mlpf/plotting/__init__.py @@ -0,0 +1,2 @@ +from plotting.plot_utils import plot_confusion_matrix +from plotting.plots import make_plots, plot_regression, plot_distributions_pid, plot_distributions_all, plot_pt_eta, plot_num_particles_pid, draw_efficiency_fakerate, get_eff, get_fake, plot_reso diff --git a/mlpf/plotting/plot_utils.py b/mlpf/plotting/plot_utils.py index 6a1bb20ee..6e6400ac8 100644 --- a/mlpf/plotting/plot_utils.py +++ b/mlpf/plotting/plot_utils.py @@ -1,6 +1,6 @@ import matplotlib.pyplot as plt import numpy as np -import mplhep +import mplhep as hep import os.path as osp pid_to_text = { @@ -147,8 +147,8 @@ def sample_label(ax, y=0.98): def particle_label(ax, pid): plt.text(0.03, 0.92, pid_to_text[pid], va="top", ha="left", size=10, transform=ax.transAxes) -def plot_confusion_matrix(cm, - target_names, +def plot_confusion_matrix(cm, target_names, + fname, epoch, title='Confusion matrix', cmap=None, normalize=True): @@ -187,9 +187,11 @@ def plot_confusion_matrix(cm, import matplotlib.pyplot as plt import numpy as np import itertools + plt.style.use('default') - accuracy = np.trace(cm) / float(np.sum(cm)) - misclass = 1 - accuracy + # # only true if it weren't normalized: + # accuracy = np.trace(cm) / float(np.sum(cm)) + # misclass = 1 - accuracy if cmap is None: cmap = plt.get_cmap('Blues') @@ -201,7 +203,7 @@ def plot_confusion_matrix(cm, fig = plt.figure(figsize=(5, 4)) ax = plt.axes() plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) + plt.title(title + ' at epoch ' + str(epoch)) plt.colorbar() if target_names is not None: @@ -220,12 +222,16 @@ def plot_confusion_matrix(cm, horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") - plt.ylabel('True label') plt.xlim(-1, len(target_names)) plt.ylim(-1, len(target_names)) - plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass)) + plt.xlabel('Predicted label') + # plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass)) plt.tight_layout() + + plt.savefig(fname + '.png') + plt.close(fig) + return fig, ax @@ -244,7 +250,7 @@ def plot_E_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='tar [bins["E_val"][0], bins["E_val"][-1]], color="black", ls="--", lw=0.5) plt.savefig(osp.join(outpath,"energy_2d_pid{}.pdf".format(pid)), bbox_inches="tight") - + plt.figure(figsize=(4,4)) ax = plt.axes() plt.hist(v0[msk_true, 0], bins=bins["E_val"], density=1.0, 
histtype="step", lw=2, label=bins["true_val"]); @@ -257,7 +263,7 @@ def plot_E_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='tar particle_label(ax, pid) ax.set_ylim(ax.get_ylim()[0], 1.5*ax.get_ylim()[1]) plt.savefig(osp.join(outpath,"energy_hist_pid{}.pdf".format(pid)), bbox_inches="tight") - + ax.set_ylim(ax.get_ylim()[0], 1.2*ax.get_ylim()[1]) res = (v0[msk_both, 1] - v0[msk_both, 0])/v0[msk_both, 0] @@ -273,7 +279,7 @@ def plot_E_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='tar sample_label(ax) particle_label(ax, pid) plt.savefig(osp.join(outpath,"energy_ratio_pid{}.pdf".format(pid)), bbox_inches="tight") - + #efficiency vs fake rate plt.figure(figsize=(4,4)) ax = plt.axes() @@ -328,7 +334,7 @@ def plot_eta_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='t [bins["eta_val"][0], bins["eta_val"][-1]], color="black", ls="--", lw=0.5) plt.savefig(osp.join(outpath,"eta_2d_pid{}.pdf".format(pid)), bbox_inches="tight") - + plt.figure(figsize=(4,4)) ax = plt.axes() plt.hist(v0[msk_true, 0], bins=bins["eta_val"], density=1.0, histtype="step", lw=2, label=bins["true_val"]); @@ -341,7 +347,7 @@ def plot_eta_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='t particle_label(ax, pid) ax.set_ylim(ax.get_ylim()[0], 1.5*ax.get_ylim()[1]) plt.savefig(osp.join(outpath,"eta_hist_pid{}.pdf".format(pid)), bbox_inches="tight") - + ax.set_ylim(ax.get_ylim()[0], 1.2*ax.get_ylim()[1]) res = (v0[msk_both, 1] - v0[msk_both, 0]) @@ -357,7 +363,7 @@ def plot_eta_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='t sample_label(ax) particle_label(ax, pid) plt.savefig(osp.join(outpath,"eta_ratio_pid{}.pdf".format(pid)), bbox_inches="tight") - + #efficiency vs fake rate plt.figure(figsize=(4,4)) ax = plt.axes() @@ -412,7 +418,7 @@ def plot_phi_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='t [bins["phi_val"][0], bins["phi_val"][-1]], color="black", ls="--", lw=0.5) plt.savefig(osp.join(outpath,"phi_2d_pid{}.pdf".format(pid)), bbox_inches="tight") - + plt.figure(figsize=(4,4)) ax = plt.axes() plt.hist(v0[msk_true, 0], bins=bins["phi_val"], density=1.0, histtype="step", lw=2, label=bins["true_val"]); @@ -439,7 +445,7 @@ def plot_phi_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='t sample_label(ax) particle_label(ax, pid) plt.savefig(osp.join(outpath,"phi_ratio_pid{}.pdf".format(pid)), bbox_inches="tight") - + #efficiency vs fake rate plt.figure(figsize=(4,4)) ax = plt.axes() diff --git a/mlpf/plotting/plots.py b/mlpf/plotting/plots.py new file mode 100644 index 000000000..09488d247 --- /dev/null +++ b/mlpf/plotting/plots.py @@ -0,0 +1,730 @@ +import sklearn +import sklearn.metrics +import numpy as np +import pandas, mplhep +import pickle as pkl +import time, math + +import sys +import os.path as osp + +import torch +import torch_geometric + +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv +from torch_geometric.nn import TopKPooling, SAGPooling, SGConv +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch.utils.data import random_split +import matplotlib +import matplotlib.pyplot as plt +import mpl_toolkits +import mplhep as hep + 
+import plotting + +plt.style.use(hep.style.ROOT) + +elem_labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] +class_labels = [0, 1, 2, 3, 4, 5] + +#map these to ids 0...Nclass +class_to_id = {r: class_labels[r] for r in range(len(class_labels))} +# map these to ids 0...Nclass +elem_to_id = {r: elem_labels[r] for r in range(len(elem_labels))} + +sample_title_qcd = "QCD, 14 TeV, PU200" +sample_title_ttbar = "$t\\bar{t}$, 14 TeV, PU200" + +ranges = { + "pt": np.linspace(0, 10, 61), + "eta": np.linspace(-5, 5, 61), + "sphi": np.linspace(-1, 1, 61), + "cphi": np.linspace(-1, 1, 61), + "energy": np.linspace(0, 100, 61) +} +pid_names = { + 0: "Null", + 1: "Charged hadrons", + 2: "Neutral hadrons", + 3: "Photons", + 4: "Electrons", + 5: "Muons", +} +key_to_pid = { + "null": 0, + "chhadron": 1, + "nhadron": 2, + "photon": 3, + "electron": 4, + "muon": 5, +} +var_names = { + "pt": r"$p_\mathrm{T}$ [GeV]", + "eta": r"$\eta$", + "sphi": r"$\mathrm{sin} \phi$", + "cphi": r"$\mathrm{cos} \phi$", + "energy": r"$E$ [GeV]" +} +var_names_nounit = { + "pt": r"$p_\mathrm{T}$", + "eta": r"$\eta$", + "sphi": r"$\mathrm{sin} \phi$", + "cphi": r"$\mathrm{cos} \phi$", + "energy": r"$E$" +} +var_names_bare = { + "pt": "p_\mathrm{T}", + "eta": "\eta", + "energy": "E", +} +var_indices = { + "pt": 2, + "eta": 3, + "sphi": 4, + "cphi": 5, + "energy": 6 +} + +def deltaphi(phi1, phi2): + return np.fmod(phi1 - phi2 + np.pi, 2*np.pi) - np.pi + +def mse_unreduced(true, pred): + return torch.square(true-pred) + +# computes accuracy of PID predictions given a one_hot_embedding: truth & pred +def accuracy(true_id, pred_id): + # revert one_hot_embedding + _, true_id = torch.max(true_id, -1) + _, pred_id = torch.max(pred_id, -1) + + is_true = (true_id !=0) + is_same = (true_id == pred_id) + + acc = (is_same&is_true).sum() / is_true.sum() + return acc + +# computes the resolution given a one_hot_embedding truth & pred + p4 of truth & pred +def energy_resolution(true_id, true_p4, pred_id, pred_p4): + # revert one_hot_embedding + _,true_id= torch.max(true_id, -1) + _,pred_id = torch.max(pred_id, -1) + + msk = (true_id!=0) + + return mse_unreduced(true_p4[msk], pred_p4[msk]) + +def plot_regression(val_x, val_y, var_name, rng, target, fname): + fig = plt.figure(figsize=(5,5)) + plt.hist2d( + val_x, + val_y, + bins=(rng, rng), + cmap="Blues", + #norm=matplotlib.colors.LogNorm() + ); + + if target=='cand': + plt.xlabel("Cand {}".format(var_name)) + elif target=='gen': + plt.xlabel("Gen {}".format(var_name)) + + plt.ylabel("MLPF {}".format(var_name)) + + plt.savefig(fname + '.png') + plt.close(fig) + + return fig + +def plot_particles(fname, true_id, true_p4, pred_id, pred_p4, pid=1): + #Ground truth vs model prediction particles + fig = plt.figure(figsize=(10,10)) + + true_p4 = true_p4.detach().cpu().numpy() + pred_p4 = pred_p4.detach().cpu().numpy() + + msk = (true_id == pid) + plt.scatter(true_p4[msk, 2], np.arctan2(true_p4[msk, 3], true_p4[msk, 4]), s=2*true_p4[msk, 2], marker="o", alpha=0.5) + + msk = (pred_id == pid) + plt.scatter(pred_p4[msk, 2], np.arctan2(pred_p4[msk, 3], pred_p4[msk, 4]), s=2*pred_p4[msk, 2], marker="o", alpha=0.5) + + plt.xlabel("eta") + plt.ylabel("phi") + plt.xlim(-5,5) + plt.ylim(-4,4) + + plt.savefig(fname + '.png') + plt.close(fig) + + return fig + +def plot_distribution(pid, val_x, val_y, var_name, rng, target, fname, legend_title=""): + plt.style.use(mplhep.style.CMS) + + fig = plt.figure(figsize=(10,10)) + + if target=='cand': + plt.hist(val_x, bins=rng, density=True, histtype="step", lw=2, 
label="cand"); + elif target=='gen': + plt.hist(val_x, bins=rng, density=True, histtype="step", lw=2, label="gen"); + + plt.hist(val_y, bins=rng, density=True, histtype="step", lw=2, label="MLPF"); + plt.xlabel(var_name) + + if pid!=-1: + plt.legend(frameon=False, title=legend_title+pid_names[pid]) + else: + plt.legend(frameon=False, title=legend_title) + + plt.ylim(0,1.5) + plt.savefig(fname + '.png') + plt.close(fig) + + return fig + +def plot_distributions_pid(pid, true_id, true_p4, pred_id, pred_p4, pf_id, cand_p4, target, epoch, outpath, legend_title=""): + plt.style.use("default") + + ch_true = true_p4[true_id==pid, 0].flatten().detach().cpu().numpy() + ch_pred = pred_p4[pred_id==pid, 0].flatten().detach().cpu().numpy() + + pt_true = true_p4[true_id==pid, 1].flatten().detach().cpu().numpy() + pt_pred = pred_p4[pred_id==pid, 1].flatten().detach().cpu().numpy() + + eta_true = true_p4[true_id==pid, 2].flatten().detach().cpu().numpy() + eta_pred = pred_p4[pred_id==pid, 2].flatten().detach().cpu().numpy() + + sphi_true = true_p4[true_id==pid, 3].flatten().detach().cpu().numpy() + sphi_pred = pred_p4[pred_id==pid, 3].flatten().detach().cpu().numpy() + + cphi_true = true_p4[true_id==pid, 4].flatten().detach().cpu().numpy() + cphi_pred = pred_p4[pred_id==pid, 4].flatten().detach().cpu().numpy() + + e_true = true_p4[true_id==pid, 5].flatten().detach().cpu().numpy() + e_pred = pred_p4[pred_id==pid, 5].flatten().detach().cpu().numpy() + + figure = plot_distribution(pid, ch_true, ch_pred, "charge", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_charge_distribution', legend_title=legend_title) + figure = plot_distribution(pid, pt_true, pt_pred, "pt", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_pt_distribution', legend_title=legend_title) + figure = plot_distribution(pid, e_true, e_pred, "E", np.linspace(-1, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_energy_distribution', legend_title=legend_title) + figure = plot_distribution(pid, eta_true, eta_pred, "eta", np.linspace(-5, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_eta_distribution', legend_title=legend_title) + figure = plot_distribution(pid, sphi_true, sphi_pred, "sin phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_sphi_distribution', legend_title=legend_title) + figure = plot_distribution(pid, cphi_true, cphi_pred, "cos phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_cphi_distribution', legend_title=legend_title) + +def plot_distributions_all(true_id, true_p4, pred_id, pred_p4, pf_id, cand_p4, target, epoch, outpath, legend_title=""): + plt.style.use("default") + + msk = (pred_id!=0) & (true_id!=0) + + ch_true = true_p4[msk, 0].flatten().detach().cpu().numpy() + ch_pred = pred_p4[msk, 0].flatten().detach().cpu().numpy() + + pt_true = true_p4[msk, 1].flatten().detach().cpu().numpy() + pt_pred = pred_p4[msk, 1].flatten().detach().cpu().numpy() + + eta_true = true_p4[msk, 2].flatten().detach().cpu().numpy() + eta_pred = pred_p4[msk, 2].flatten().detach().cpu().numpy() + + sphi_true = true_p4[msk, 3].flatten().detach().cpu().numpy() + sphi_pred = pred_p4[msk, 3].flatten().detach().cpu().numpy() + + cphi_true = true_p4[msk, 4].flatten().detach().cpu().numpy() + cphi_pred = pred_p4[msk, 4].flatten().detach().cpu().numpy() + + e_true = true_p4[msk, 5].flatten().detach().cpu().numpy() + e_pred 
= pred_p4[msk, 5].flatten().detach().cpu().numpy() + + figure = plot_distribution(-1, ch_true, ch_pred, "charge", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/all_charge_distribution', legend_title=legend_title) + figure = plot_distribution(-1, pt_true, pt_pred, "pt", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/all_pt_distribution', legend_title=legend_title) + figure = plot_distribution(-1, e_true, e_pred, "E", np.linspace(-1, 5, 100), target, fname = outpath+'/distribution_plots/all_energy_distribution', legend_title=legend_title) + figure = plot_distribution(-1, eta_true, eta_pred, "eta", np.linspace(-5, 5, 100), target, fname = outpath+'/distribution_plots/all_eta_distribution', legend_title=legend_title) + figure = plot_distribution(-1, sphi_true, sphi_pred, "sin phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/all_sphi_distribution', legend_title=legend_title) + figure = plot_distribution(-1, cphi_true, cphi_pred, "cos phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/all_cphi_distribution', legend_title=legend_title) + +def midpoints(x): + return x[:-1] + np.diff(x)/2 + +def mask_empty(hist): + h0 = hist[0].astype(np.float64) + h0[h0<50] = 0 + return (h0, hist[1]) + +def divide_zero(a, b): + a = a.astype(np.float64) + b = b.astype(np.float64) + out = np.zeros_like(a) + np.divide(a, b, where=b>0, out=out) + return out + +def plot_pt_eta(ygen, legend_title=""): + plt.style.use(hep.style.ROOT) + + b = np.linspace(0, 100, 41) + + msk_pid1 = (ygen[:, 0]==1) + msk_pid2 = (ygen[:, 0]==2) + msk_pid3 = (ygen[:, 0]==3) + msk_pid4 = (ygen[:, 0]==4) + msk_pid5 = (ygen[:, 0]==5) + + h1 = np.histogram(ygen[msk_pid1, 2], bins=b) + h2 = np.histogram(ygen[msk_pid2, 2], bins=b) + h3 = np.histogram(ygen[msk_pid3, 2], bins=b) + h4 = np.histogram(ygen[msk_pid4, 2], bins=b) + h5 = np.histogram(ygen[msk_pid5, 2], bins=b) + + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8)) + + xs = midpoints(h1[1]) + width = np.diff(h1[1]) + + hep.histplot([h5[0], h4[0], h3[0], h2[0], h1[0]], bins=h1[1], ax=ax1, stack=True, histtype="fill", + label=["Muons", "Electrons", "Photons", "Neutral hadrons", "Charged hadrons"]) + + ax1.legend(loc="best", frameon=False, title=legend_title) + ax1.set_yscale("log") + ax1.set_ylim(1e1, 1e9) + ax1.set_xlabel(r"Truth particle $p_\mathrm{T}$ [GeV]") + ax1.set_ylabel("Truth particles") + + b = np.linspace(-8, 8, 41) + h1 = np.histogram(ygen[msk_pid1, 3], bins=b) + h2 = np.histogram(ygen[msk_pid2, 3], bins=b) + h3 = np.histogram(ygen[msk_pid3, 3], bins=b) + h4 = np.histogram(ygen[msk_pid4, 3], bins=b) + h5 = np.histogram(ygen[msk_pid5, 3], bins=b) + xs = midpoints(h1[1]) + width = np.diff(h1[1]) + + hep.histplot([h5[0], h4[0], h3[0], h2[0], h1[0]], bins=h1[1], ax=ax2, stack=True, histtype="fill", + label=["Muons", "Electrons", "Photons", "Neutral hadrons", "Charged hadrons"]) + leg = ax2.legend(loc="best", frameon=False, ncol=2, title=legend_title) + leg._legend_box.align = "left" + ax2.set_yscale("log") + ax2.set_ylim(1e1, 1e9) + ax2.set_xlabel("Truth particle $\eta$") + ax2.set_ylabel("Truth particles") + return ax1, ax2 + +def plot_num_particles_pid(list, key, ax=None, legend_title=""): + plt.style.use(hep.style.ROOT) + + pid = key_to_pid[key] + if not ax: + plt.figure(figsize=(4,4)) + ax = plt.axes() + + cand_list = list[0] + target_list = list[1] + pf_list = list[2] + + a = np.array(pf_list[key]) + b = np.array(target_list[key]) + + ratio_dpf = (a - b) / b + ratio_dpf[ratio_dpf 
> 10] = 10 + ratio_dpf[ratio_dpf < -10] = -10 + mu_dpf = np.mean(ratio_dpf) + sigma_dpf = np.std(ratio_dpf) + + ax.scatter( + target_list[key], + cand_list[key], + marker="o", + label="Rule-based PF, $r={0:.3f}$\n$\mu={1:.3f}\\ \sigma={2:.3f}$".format( + np.corrcoef(a, b)[0,1], mu_dpf, sigma_dpf + ), + alpha=0.5 + ) + + c = np.array(cand_list[key]) + b = np.array(target_list[key]) + + ratio_mlpf = (c - b) / b + ratio_mlpf[ratio_mlpf > 10] = 10 + ratio_mlpf[ratio_mlpf < -10] = -10 + mu_mlpf = np.mean(ratio_mlpf) + sigma_mlpf = np.std(ratio_mlpf) + + ax.scatter( + target_list[key], + cand_list[key], + marker="^", + label="MLPF, $r={0:.3f}$\n$\mu={1:.3f}\\ \sigma={2:.3f}$".format( + np.corrcoef(a, b)[0,1], mu_mlpf, sigma_mlpf + ), + alpha=0.5 + ) + + lims = [ + np.min([ax.get_xlim(), ax.get_ylim()]), # min of both axes + np.max([ax.get_xlim(), ax.get_ylim()]), # max of both axes + ] + # now plot both limits against each other + ax.plot(lims, lims, '--', alpha=0.75, zorder=0) + ax.set_aspect('equal') + ax.set_xlim(lims) + ax.set_ylim(lims) + plt.tight_layout() + ax.legend(frameon=False, title=legend_title+pid_names[pid]) + ax.set_xlabel("Truth particles / event") + ax.set_ylabel("Reconstructed particles / event") + plt.title("Particle multiplicity") + +def draw_efficiency_fakerate(ygen, ypred, ycand, pid, var, bins, outpath, both=True, legend_title=""): + var_idx = var_indices[var] + + msk_gen = ygen[:, 0]==pid + msk_pred = ypred[:, 0]==pid + msk_cand = ycand[:, 0]==pid + + hist_gen = np.histogram(ygen[msk_gen, var_idx], bins=bins); + hist_cand = np.histogram(ygen[msk_gen & msk_cand, var_idx], bins=bins); + hist_pred = np.histogram(ygen[msk_gen & msk_pred, var_idx], bins=bins); + + hist_gen = mask_empty(hist_gen) + hist_cand = mask_empty(hist_cand) + hist_pred = mask_empty(hist_pred) + + #efficiency plot + if both: + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8)) + else: + fig, ax1 = plt.subplots(1, 1, figsize=(8, 1*8)) + ax2 = None + + #ax1.set_title("reco efficiency for {}".format(pid_names[pid])) + ax1.errorbar( + midpoints(hist_gen[1]), + divide_zero(hist_cand[0], hist_gen[0]), + divide_zero(np.sqrt(hist_gen[0]), hist_gen[0]) * divide_zero(hist_cand[0], hist_gen[0]), + lw=0, label="Rule-based PF", elinewidth=2, marker=".",markersize=10) + ax1.errorbar( + midpoints(hist_gen[1]), + divide_zero(hist_pred[0], hist_gen[0]), + divide_zero(np.sqrt(hist_gen[0]), hist_gen[0]) * divide_zero(hist_pred[0], hist_gen[0]), + lw=0, label="MLPF", elinewidth=2, marker=".",markersize=10) + ax1.legend(frameon=False, loc=0, title=legend_title+pid_names[pid]) + ax1.set_ylim(0,1.2) + # if var=="energy": + # ax1.set_xlim(0,30) + ax1.set_xlabel(var_names[var]) + ax1.set_ylabel("Efficiency") + + hist_cand2 = np.histogram(ygen[msk_cand & (ygen[:, 0]!=0), var_idx], bins=bins); + hist_pred2 = np.histogram(ygen[msk_pred & (ygen[:, 0]!=0), var_idx], bins=bins); + hist_cand_gen2 = np.histogram(ygen[msk_cand & ~msk_gen & (ygen[:, 0]!=0), var_idx], bins=bins); + hist_pred_gen2 = np.histogram(ygen[msk_pred & ~msk_gen & (ygen[:, 0]!=0), var_idx], bins=bins); + + hist_cand2 = mask_empty(hist_cand2) + hist_cand_gen2 = mask_empty(hist_cand_gen2) + hist_pred2 = mask_empty(hist_pred2) + hist_pred_gen2 = mask_empty(hist_pred_gen2) + + if both: + #fake rate plot + #ax2.set_title("reco fake rate for {}".format(pid_names[pid])) + ax2.errorbar( + midpoints(hist_cand2[1]), + divide_zero(hist_cand_gen2[0], hist_cand2[0]), + divide_zero(np.sqrt(hist_cand_gen2[0]), hist_cand2[0]), + lw=0, label="Rule-based PF", elinewidth=2, 
marker=".",markersize=10) + ax2.errorbar( + midpoints(hist_pred2[1]), + divide_zero(hist_pred_gen2[0], hist_pred2[0]), + divide_zero(np.sqrt(hist_pred_gen2[0]), hist_pred2[0]), + lw=0, label="MLPF", elinewidth=2, marker=".",markersize=10) + ax2.legend(frameon=False, loc=0, title=legend_title+pid_names[pid]) + ax2.set_ylim(0, 1.0) + #plt.yscale("log") + ax2.set_xlabel(var_names[var]) + ax2.set_ylabel("Fake rate") + + plt.savefig(outpath, bbox_inches="tight") + plt.close(fig) + + return ax1, ax2 + +def get_eff(ygen, ypred, ycand): + msk_gen = (ygen[:, 0]==pid) & (ygen[:, var_indices["pt"]]>5.0) + msk_pred = ypred[:, 0]==pid + msk_cand = ycand[:, 0]==pid + + hist_gen = np.histogram(ygen[msk_gen, var_idx], bins=bins); + hist_cand = np.histogram(ygen[msk_gen & msk_cand, var_idx], bins=bins); + hist_pred = np.histogram(ygen[msk_gen & msk_pred, var_idx], bins=bins); + + hist_gen = mask_empty(hist_gen) + hist_cand = mask_empty(hist_cand) + hist_pred = mask_empty(hist_pred) + + return { + "x": midpoints(hist_gen[1]), + "y": divide_zero(hist_pred[0], hist_gen[0]), + "yerr": divide_zero(np.sqrt(hist_gen[0]), hist_gen[0]) * divide_zero(hist_pred[0], hist_gen[0]) + } + +def get_fake(ygen, ypred, ycand): + msk_gen = ygen[:, 0]==pid + msk_pred = ypred[:, 0]==pid + msk_cand = ycand[:, 0]==pid + + hist_cand2 = np.histogram(ygen[msk_cand & (ygen[:, 0]!=0), var_idx], bins=bins); + hist_pred2 = np.histogram(ygen[msk_pred & (ygen[:, 0]!=0), var_idx], bins=bins); + hist_cand_gen2 = np.histogram(ygen[msk_cand & ~msk_gen & (ygen[:, 0]!=0), var_idx], bins=bins); + hist_pred_gen2 = np.histogram(ygen[msk_pred & ~msk_gen & (ygen[:, 0]!=0), var_idx], bins=bins); + + hist_cand2 = mask_empty(hist_cand2) + hist_cand_gen2 = mask_empty(hist_cand_gen2) + hist_pred2 = mask_empty(hist_pred2) + hist_pred_gen2 = mask_empty(hist_pred_gen2) + + return { + "x": midpoints(hist_pred2[1]), + "y": divide_zero(hist_pred_gen2[0], hist_pred2[0]), + "yerr": divide_zero(np.sqrt(hist_pred_gen2[0]), hist_pred2[0]) + } + +def plot_reso(ygen, ypred, ycand, pid, var, rng, ax=None, legend_title=""): + plt.style.use(hep.style.ROOT) + + var_idx = var_indices[var] + msk = (ygen[:, 0]==pid) & (ycand[:, 0]==pid) + bins = np.linspace(-rng, rng, 100) + yg = ygen[msk, var_idx] + yp = ypred[msk, var_idx] + + yc = ycand[msk, var_idx] + ratio_mlpf = (yp - yg) / yg + ratio_dpf = (yc - yg) / yg + + #remove outliers for std value computation + outlier = 10 + ratio_mlpf[ratio_mlpf<-outlier] = -outlier + ratio_mlpf[ratio_mlpf>outlier] = outlier + ratio_dpf[ratio_dpf<-outlier] = -outlier + ratio_dpf[ratio_dpf>outlier] = outlier + + res_dpf = np.mean(ratio_dpf), np.std(ratio_dpf) + res_mlpf = np.mean(ratio_mlpf), np.std(ratio_mlpf) + + if ax is None: + plt.figure(figsize=(4, 4)) + ax = plt.axes() + + #plt.title("{} resolution for {}".format(var_names_nounit[var], pid_names[pid])) + ax.hist(ratio_dpf, bins=bins, histtype="step", lw=2, label="Rule-based PF\n$\mu={:.2f},\\ \sigma={:.2f}$".format(*res_dpf)); + ax.hist(ratio_mlpf, bins=bins, histtype="step", lw=2, label="MLPF\n$\mu={:.2f},\\ \sigma={:.2f}$".format(*res_mlpf)); + ax.legend(frameon=False, title=legend_title+pid_names[pid]) + ax.set_xlabel("{nounit} resolution, $({bare}^\prime - {bare})/{bare}$".format(nounit=var_names_nounit[var],bare=var_names_bare[var])) + ax.set_ylabel("Particles") + #plt.ylim(0, ax.get_ylim()[1]*2) + ax.set_ylim(1, 1e10) + ax.set_yscale("log") + + return {"dpf": res_dpf, "mlpf": res_mlpf} + +def make_plots(model, test_loader, outpath, target, device, epoch, which_data): + + 
print('Making plots on ' + which_data) + t0=time.time() + + # load the necessary predictions to make the plots + gen_ids = torch.load(outpath + f'/gen_ids.pt', map_location=device) + gen_p4 = torch.load(outpath + f'/gen_p4.pt', map_location=device) + pred_ids = torch.load(outpath + f'/pred_ids.pt', map_location=device) + pred_p4 = torch.load(outpath + f'/pred_p4.pt', map_location=device) + cand_ids = torch.load(outpath + f'/cand_ids.pt', map_location=device) + cand_p4 = torch.load(outpath + f'/cand_p4.pt', map_location=device) + + list_for_multiplicities = torch.load(outpath + f'/list_for_multiplicities.pt', map_location=device) + + predictions = torch.load(outpath + f'/predictions.pt', map_location=device) + + # reformat a bit + ygen = predictions["ygen"].reshape(-1,7) + ypred = predictions["ypred"].reshape(-1,7) + ycand = predictions["ycand"].reshape(-1,7) + + # make confusion matrix for MLPF + conf_matrix_mlpf = sklearn.metrics.confusion_matrix(gen_ids.cpu(), + pred_ids.cpu(), labels=range(6), normalize="true") + + plotting.plot_confusion_matrix(conf_matrix_mlpf, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/conf_matrix_mlpf' + str(epoch), epoch=epoch) + torch.save(conf_matrix_mlpf, outpath + '/conf_matrix_mlpf' + str(epoch) + '.pt') + + # make confusion matrix for rule based PF + conf_matrix_cand = sklearn.metrics.confusion_matrix(gen_ids.cpu(), + cand_ids.cpu(), labels=range(6), normalize="true") + + plotting.plot_confusion_matrix(conf_matrix_cand, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/conf_matrix_cand' + str(epoch), epoch=epoch) + torch.save(conf_matrix_cand, outpath + '/conf_matrix_cand' + str(epoch) + '.pt') + + # making all the other plots + if 'test' in which_data: + sample = "QCD, 14 TeV, PU200" + else: + sample = "$t\\bar{t}$, 14 TeV, PU200" + + # make distribution plots + plot_distributions_pid(1, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for chhadrons + target, epoch, outpath, legend_title=sample+"\n") + plot_distributions_pid(2, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for nhadrons + target, epoch, outpath, legend_title=sample+"\n") + plot_distributions_pid(3, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for photons + target, epoch, outpath, legend_title=sample+"\n") + plot_distributions_pid(4, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for electrons + target, epoch, outpath, legend_title=sample+"\n") + plot_distributions_pid(5, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for muons + target, epoch, outpath, legend_title=sample+"\n") + + plot_distributions_all(gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for all together + target, epoch, outpath, legend_title=sample+"\n") + + # make pt, eta plots to visualize dataset + ax, _ = plot_pt_eta(ygen) + plt.savefig(outpath+"/gen_pt_eta.png", bbox_inches="tight") + + # plot particle multiplicity plots + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_null = plot_num_particles_pid(list_for_multiplicities, "null", ax) + plt.savefig(outpath+"/multiplicity_plots/num_null.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_chhad = plot_num_particles_pid(list_for_multiplicities, "chhadron", ax) + plt.savefig(outpath+"/multiplicity_plots/num_chhadron.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = 
plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_nhad = plot_num_particles_pid(list_for_multiplicities, "nhadron", ax) + plt.savefig(outpath+"/multiplicity_plots/num_nhadron.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_photon = plot_num_particles_pid(list_for_multiplicities, "photon", ax) + plt.savefig(outpath+"/multiplicity_plots/num_photon.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_electron = plot_num_particles_pid(list_for_multiplicities, "electron", ax) + plt.savefig(outpath+"/multiplicity_plots/num_electron.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_muon = plot_num_particles_pid(list_for_multiplicities, "muon", ax) + plt.savefig(outpath+"/multiplicity_plots/num_muon.png", bbox_inches="tight") + plt.close(fig) + + # make efficiency and fake rate plots for charged hadrons + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "pt", np.linspace(0, 3, 61), outpath+"/efficiency_plots/eff_fake_pid1_pt.png", both=True, legend_title=sample+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "eta", np.linspace(-3, 3, 61), outpath+"/efficiency_plots/eff_fake_pid1_eta.png", both=True, legend_title=sample+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "energy", np.linspace(0, 50, 75), outpath+"/efficiency_plots/eff_fake_pid1_energy.png", both=True, legend_title=sample+"\n") + + # make efficiency and fake rate plots for neutral hadrons + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "pt", np.linspace(0, 3, 61), outpath+"/efficiency_plots/eff_fake_pid2_pt.png", both=True, legend_title=sample+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "eta", np.linspace(-3, 3, 61), outpath+"/efficiency_plots/eff_fake_pid2_eta.png", both=True, legend_title=sample+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "energy", np.linspace(0, 50, 75), outpath+"/efficiency_plots/eff_fake_pid2_energy.png", both=True, legend_title=sample+"\n") + + # make resolution plots for chhadrons: pid=1 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_chhad_pt = plot_reso(ygen, ypred, ycand, 1, "pt", 2, ax=ax1, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid1_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_chhad_eta = plot_reso(ygen, ypred, ycand, 1, "eta", 0.2, ax=ax2, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid1_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_chhad_E = plot_reso(ygen, ypred, ycand, 1, "energy", 0.2, ax=ax3, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid1_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + # make resolution plots for nhadrons: pid=2 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_nhad_pt = plot_reso(ygen, ypred, ycand, 2, "pt", 2, ax=ax1, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid2_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_nhad_eta = plot_reso(ygen, ypred, ycand, 2, "eta", 0.2, ax=ax2, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid2_eta.png", bbox_inches="tight") + plt.tight_layout() + 
plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_nhad_E = plotting.plot_reso(ygen, ypred, ycand, 2, "energy", 0.2, ax=ax3, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid2_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + # make resolution plots for photons: pid=3 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_photon_pt = plot_reso(ygen, ypred, ycand, 3, "pt", 2, ax=ax1, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid3_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_photon_eta = plot_reso(ygen, ypred, ycand, 3, "eta", 0.2, ax=ax2, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid3_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_photon_E = plot_reso(ygen, ypred, ycand, 3, "energy", 0.2, ax=ax3, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid3_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + # make resolution plots for electrons: pid=4 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_electron_pt = plot_reso(ygen, ypred, ycand, 4, "pt", 2, ax=ax1, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid4_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_electron_eta = plot_reso(ygen, ypred, ycand, 4, "eta", 0.2, ax=ax2, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid4_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_electron_E = plot_reso(ygen, ypred, ycand, 4, "energy", 0.2, ax=ax3, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid4_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + # make resolution plots for muons: pid=5 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_muon_pt = plot_reso(ygen, ypred, ycand, 5, "pt", 2, ax=ax1, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid5_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_muon_eta = plot_reso(ygen, ypred, ycand, 5, "eta", 0.2, ax=ax2, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid5_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_muon_E = plot_reso(ygen, ypred, ycand, 5, "energy", 0.2, ax=ax3, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid5_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + t1=time.time() + print('Time taken to make plots is:', round(((t1-t0)/60),2), 'min') diff --git a/mlpf/plotting/plots_delphes.py b/mlpf/plotting/plots_delphes.py old mode 100755 new mode 100644 diff --git a/mlpf/pytorch/README.md b/mlpf/pytorch/README.md deleted file mode 100644 index 01cd33211..000000000 --- a/mlpf/pytorch/README.md +++ /dev/null @@ -1,14 +0,0 @@ -Short instructions to train on cms data: -```bash -cd ../.. -./scripts/local_test_cms.sh -``` - -Short instructions to train on delphes data: -```bash -cd ../.. -./scripts/local_test_delphes.sh -``` - -### Delphes dataset -The dataset is available from zenodo: https://doi.org/10.5281/zenodo.4452283. 
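The per-PID resolution figures written by make_plots in mlpf/plotting/plots.py above all reduce to the same quantity computed in plot_reso: for particles where both the generator particle and the rule-based PF candidate carry the target PID, the relative residual (x' - x)/x of the regressed variable is clipped to +/-10 and summarised by its mean and standard deviation. Below is a minimal NumPy-only sketch of that metric, assuming the (N, 7) array layout used above (column 0 = PID, columns 2-6 = pt, eta, sin phi, cos phi, energy, following var_indices); the function name is illustrative and not part of the repository.

```python
import numpy as np

# Column layout as in plots.py: 0 = PID, then charge, pt, eta, sin(phi), cos(phi), energy.
VAR_IDX = {"pt": 2, "eta": 3, "sphi": 4, "cphi": 5, "energy": 6}

def relative_resolution(ygen, ypred, ycand, pid, var, clip=10.0):
    """Mean and width of (x' - x)/x for particles where both the generator
    particle and the rule-based candidate carry the target PID, with outliers
    clipped to +/- clip before the moments are taken, mirroring plot_reso."""
    idx = VAR_IDX[var]
    msk = (ygen[:, 0] == pid) & (ycand[:, 0] == pid)
    yg = ygen[msk, idx]
    out = {}
    for name, arr in (("mlpf", ypred[msk, idx]), ("dpf", ycand[msk, idx])):
        ratio = np.clip((arr - yg) / yg, -clip, clip)
        out[name] = (float(np.mean(ratio)), float(np.std(ratio)))
    return out
```

With arrays shaped like `predictions["ygen"].reshape(-1, 7)` above, `relative_resolution(ygen, ypred, ycand, 1, "pt")` should reproduce the mean and sigma quoted in the charged-hadron pT resolution legend.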
diff --git a/mlpf/pytorch/data_preprocessing.py b/mlpf/pytorch/data_preprocessing.py deleted file mode 100755 index 2a3c3d1a8..000000000 --- a/mlpf/pytorch/data_preprocessing.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np -import torch -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch - -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - -# define a function that casts the dataset into a dataloader for efficient NN training -def from_data_to_loader(full_dataset, n_train, n_val, batch_size): - - train_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=0, stop=n_train)) - valid_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=n_train, stop=n_train+n_val)) - - # preprocessing the train_dataset in a good format for passing correct batches of events to the GNN - train_dataset_batched=[] - for i in range(len(train_dataset)): - train_dataset_batched += train_dataset[i] - train_dataset_batched = [[i] for i in train_dataset_batched] - - # preprocessing the valid_dataset in a good format for passing correct batches of events to the GNN - valid_dataset_batched=[] - for i in range(len(valid_dataset)): - valid_dataset_batched += valid_dataset[i] - valid_dataset_batched = [[i] for i in valid_dataset_batched] - - #hack for multi-gpu training - if not multi_gpu: - def collate(items): - l = sum(items, []) - return Batch.from_data_list(l) - else: - def collate(items): - l = sum(items, []) - return l - - train_loader = DataListLoader(train_dataset_batched, batch_size, pin_memory=True, shuffle=True) - train_loader.collate_fn = collate - valid_loader = DataListLoader(valid_dataset_batched, batch_size, pin_memory=True, shuffle=False) - valid_loader.collate_fn = collate - - return train_loader, valid_loader diff --git a/mlpf/pytorch/eval_end2end_delphes.py b/mlpf/pytorch/eval_end2end_delphes.py deleted file mode 100755 index 2d3dc4ecb..000000000 --- a/mlpf/pytorch/eval_end2end_delphes.py +++ /dev/null @@ -1,183 +0,0 @@ -#import setGPU -import torch -import torch_geometric -import sklearn -import numpy as np -import matplotlib.pyplot as plt -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -import pandas -import mplhep -import pickle - -import graph_data_delphes -from graph_data_delphes import PFGraphDataset -from data_preprocessing import from_data_to_loader -import train_end2end_delphes -import time -import math - -import sys -sys.path.insert(1, '../plotting/') -sys.path.insert(1, '../mlpf/plotting/') - -import plots_delphes -from plots_delphes import make_plots - -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - -def collate(items): - l = sum(items, []) - return Batch.from_data_list(l) - -def prepare_test_data(full_dataset, start, stop, batch_size): - - test_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=start, stop=stop)) - - # preprocessing the test_dataset in a good format for passing correct batches of events to the GNN - test_dataset_batched=[] - for i in range(len(test_dataset)): - test_dataset_batched += test_dataset[i] - test_dataset_batched = [[i] for i in test_dataset_batched] - - #hack for multi-gpu training - if not multi_gpu: - def collate(items): - l = sum(items, []) - return Batch.from_data_list(l) - 
else: - def collate(items): - l = sum(items, []) - return l - - test_loader = DataListLoader(test_dataset_batched, batch_size, pin_memory=True, shuffle=True) - test_loader.collate_fn = collate - - return test_loader - -def parse_args(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str, choices=sorted(train_end2end_delphes.model_classes.keys()), help="type of model to use", default="PFNet6") - parser.add_argument("--path", type=str, help="path to model", default="data/PFNet7_TTbar_14TeV_TuneCUETP8M1_cfi_gen__npar_221073__cfg_ee19d91068__user_jovyan__ntrain_400__lr_0.0001__1588215695") - parser.add_argument("--epoch", type=str, default=0, help="Epoch to use") - parser.add_argument("--dataset", type=str, help="Input dataset", required=True) - parser.add_argument("--start", type=int, default=3800, help="first file index to evaluate") - parser.add_argument("--stop", type=int, default=4000, help="last file index to evaluate") - parser.add_argument("--target", type=str, choices=["cand", "gen"], default="cand", help="type of data the model trained on (cand or gen)") - - args = parser.parse_args() - return args - -if __name__ == "__main__": - args = parse_args() - device = torch.device("cpu") - - # # the next part initializes some args values (to run the script not from terminal) - # class objectview(object): - # def __init__(self, d): - # self.__dict__ = d - # - # args = objectview({'model': 'PFNet7', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'epoch' : 1, 'target': 'cand', 'start':1, 'stop':2, - # 'path': '../../test_tmp_delphes/experiments/PFNet7_pythia8_ttbar_gen__npar_41414__cfg_fca529f313__user_fmokhtar__ntrain_1__lr_0.0001__1611654293'}) - - epoch = args.epoch - model = args.model - path = args.path - weights = torch.load("{}/epoch_{}_weights.pth".format(path, epoch), map_location=device) - weights = {k.replace("module.", ""): v for k, v in weights.items()} - - with open('{}/model_kwargs.pkl'.format(path),'rb') as f: - model_kwargs = pickle.load(f) - - model_class = train_end2end_delphes.model_classes[args.model] - model = model_class(**model_kwargs) - model.load_state_dict(weights) - model = model.to(device) - model.eval() - - # prepare some test_data - print('Creating the test data and feeding it to the model..') - full_dataset = PFGraphDataset(root=args.dataset) - loader = prepare_test_data(full_dataset, start=args.start, stop=args.stop, batch_size=10) - - # TODO: here we only evaluate a forward pass of only one batch of the allocated test data - for batch in loader: - pred_id, pred_p4, new_edges_ = model(batch) - break - - print('Making plots for evaluation..') - - if args.target=='cand': - make_plots(batch.ycand_id, batch.ycand, pred_id, pred_p4, out=path +'/') - elif args.target=='gen': - make_plots(batch.ygen_id, batch.ygen, pred_id, pred_p4, out=path +'/') - - -# def prepare_dataframe(model, loader, multi_gpu, device, target_type="cand"): -# model.eval() -# dfs = [] -# dfs_edges = [] -# eval_time = 0 -# -# for i, data in enumerate(loader): -# if not multi_gpu: -# data = data.to(device) -# pred_id_onehot, pred_momentum, new_edges = model(data) -# _, pred_id = torch.max(pred_id_onehot, -1) -# pred_momentum[pred_id==0] = 0 -# data = [data] -# -# x = torch.cat([d.x.to("cpu") for d in data]) -# gen_id = torch.cat([d.ygen_id.to("cpu") for d in data]) -# gen_p4 = torch.cat([d.ygen[:, :].to("cpu") for d in data]) -# cand_id = torch.cat([d.ycand_id.to("cpu") for d in data]) -# cand_p4 = torch.cat([d.ycand[:, :].to("cpu") for d in 
data]) -# -# # reverting the one_hot_embedding -# gen_id_flat = torch.max(gen_id, -1)[1] -# cand_id_flat = torch.max(cand_id, -1)[1] -# -# df = pandas.DataFrame() -# gen_p4.shape -# gen_id.shape -# -# # Recall: -# # [pid] takes from 1 to 6 -# # [charge, pt (GeV), eta, sin phi, cos phi, E (GeV)] -# -# df["elem_type"] = [int(elem_labels[i]) for i in torch.argmax(x[:, :len(elem_labels)], axis=-1).numpy()] -# -# if target_type == "gen": -# df["gen_pid"] = [int(class_labels[i]) for i in gen_id_flat.numpy()] -# df["gen_charge"] = gen_p4[:, 0].numpy() -# df["gen_eta"] = gen_p4[:, 2].numpy() -# df["gen_sphi"] = gen_p4[:, 3].numpy() -# df["gen_cphi"] = gen_p4[:, 4].numpy() -# df["gen_e"] = gen_p4[:, 5].numpy() -# -# elif target_type == "cand": -# df["cand_pid"] = [int(class_labels[i]) for i in cand_id_flat.numpy()] -# df["cand_charge"] = cand_p4[:, 0].numpy() -# df["cand_eta"] = cand_p4[:, 2].numpy() -# df["cand_sphi"] = cand_p4[:, 3].numpy() -# df["cand_cphi"] = cand_p4[:, 4].numpy() -# df["cand_e"] = cand_p4[:, 5].numpy() -# -# df["pred_pid"] = [int(class_labels[i]) for i in pred_id.detach().cpu().numpy()] -# df["pred_charge"] = pred_momentum[:, 0].detach().cpu().numpy() -# df["pred_eta"] = pred_momentum[:, 2].detach().cpu().numpy() -# df["pred_sphi"] = pred_momentum[:, 3].detach().cpu().numpy() -# df["pred_cphi"] = pred_momentum[:, 4].detach().cpu().numpy() -# df["pred_e"] = pred_momentum[:, 5].detach().cpu().numpy() -# -# dfs.append(df) -# -# df = pandas.concat(dfs, ignore_index=True) -# return df diff --git a/mlpf/pytorch/model.py b/mlpf/pytorch/model.py deleted file mode 100755 index 1d4c6b242..000000000 --- a/mlpf/pytorch/model.py +++ /dev/null @@ -1,130 +0,0 @@ -import numpy as np -import mplhep - -import torch -import torch_geometric - -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv -from torch_geometric.nn import TopKPooling, SAGPooling, SGConv -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch_geometric.data import Data, DataListLoader, Batch -from torch.utils.data import random_split - -from gravnet import GravNetConv - -#Model with gravnet clustering -class PFNet7(nn.Module): - def __init__(self, - input_dim=12, hidden_dim=32, encoding_dim=256, - output_dim_id=6, - output_dim_p4=6, - convlayer="gravnet-radius", - convlayer2="none", - space_dim=2, nearest=3, dropout_rate=0.0, activation="leaky_relu", return_edges=False, radius=0.1, input_encoding=0): - - super(PFNet7, self).__init__() - - self.input_dim = input_dim - self.hidden_dim = hidden_dim - self.return_edges = return_edges - self.convlayer = convlayer - self.input_encoding = input_encoding - - self.act = nn.LeakyReLU - self.act_f = torch.nn.functional.leaky_relu - - # (1) GNN layer - if convlayer == "gravnet-knn": - self.conv1 = GravNetConv(input_dim, encoding_dim, space_dim, hidden_dim, nearest, neighbor_algo="knn") - elif convlayer == "gravnet-radius": - self.conv1 = GravNetConv(input_dim, encoding_dim, space_dim, hidden_dim, nearest, neighbor_algo="radius", radius=radius) - else: - raise Exception("Unknown convolution layer: {}".format(convlayer)) - - # (2) another GNN layer if you want - self.convlayer2 = convlayer2 - if convlayer2 == "none": - self.conv2_1 
= None - self.conv2_2 = None - - # (3) dropout layer if you want - self.dropout1 = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() - - # (4) DNN layer: classifying PID - self.nn2 = nn.Sequential( - nn.Linear(encoding_dim, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Linear(hidden_dim, output_dim_id), - ) - - # (5) DNN layer: regressing p4 - self.nn3 = nn.Sequential( - nn.Linear(encoding_dim + output_dim_id, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Linear(hidden_dim, output_dim_p4), - ) - - def forward(self, data): - - #encode the inputs (x is of shape [~5000*batch_size, input_dim]) - x = data.x - - #Run a clustering of the inputs that returns the new_edge_index.. this is the KNN step.. - # new_edge_index is of shape [2, big#] - # x & x1 are of shape [~5000*batch_size, encoding_dim] - new_edge_index, x = self.conv1(x) - x1 = self.act_f(x) # act by nonlinearity - - #Decode convolved graph nodes to PID (after a dropout) - # cand_ids is of shape [~5000*batch_size, 6] - cand_ids = self.nn2(self.dropout1(x1)) - - #Decode convolved graph nodes to p4 - # (1) add the predicted PID along as it may help (why we concatenate) - nn3_input = torch.cat([x1, cand_ids], axis=-1) - # (2) pass them both to the NN - cand_p4 = self.nn3(self.dropout1(nn3_input)) - - return cand_ids, cand_p4, new_edge_index - - -# ------------------------------------------------------------------------------------- -# # test a forward pass -# from graph_data_delphes import PFGraphDataset -# from data_preprocessing import from_data_to_loader -# -# full_dataset = PFGraphDataset('../../test_tmp_delphes/data/delphes_cfi') -# -# train_loader, valid_loader = from_data_to_loader(full_dataset, n_train=2, n_val=1, batch_size=1 ) -# -# print(next(iter(train_loader))) -# -# model = PFNet7() -# -# for batch in train_loader: -# cand_id_onehot, cand_momentum, new_edge_index = model(batch) -# break -# -# batch -# print(cand_id_onehot.shape) -# print(cand_momentum.shape) -# print(new_edge_index.shape) -# print(new_edge_index) diff --git a/mlpf/pytorch/model_general.py b/mlpf/pytorch/model_general.py deleted file mode 100755 index 6f25f25ce..000000000 --- a/mlpf/pytorch/model_general.py +++ /dev/null @@ -1,177 +0,0 @@ -import sys -import os -import math - -from comet_ml import Experiment - -import torch -import torch_geometric - -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv -from torch_geometric.nn import TopKPooling, SAGPooling, SGConv -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from gravnet import GravNetConv -from torch_geometric.data import Data, DataListLoader, Batch -from torch.utils.data import random_split - -import torch_cluster - -from glob import glob -import numpy as np -import os.path as osp -import pickle - 
-import math -import time -import numba -import tqdm -import sklearn -import pandas - -import mplhep - -from sklearn.metrics import accuracy_score - -import graph_data -from graph_data import PFGraphDataset, elem_to_id, class_to_id, class_labels -from sklearn.metrics import confusion_matrix - - -#Model with gravnet clustering -class PFNet7(nn.Module): - def __init__(self, - input_dim=12, hidden_dim=32, encoding_dim=256, - output_dim_id=6, - output_dim_p4=6, - convlayer="gravnet-radius", - convlayer2="none", - space_dim=2, nearest=3, dropout_rate=0.0, activation="leaky_relu", return_edges=False, radius=0.1, input_encoding=0): - - super(PFNet7, self).__init__() - - self.input_dim = input_dim - self.hidden_dim = hidden_dim - self.return_edges = return_edges - self.convlayer = convlayer - self.input_encoding = input_encoding - - if activation == "leaky_relu": - self.act = nn.LeakyReLU - self.act_f = torch.nn.functional.leaky_relu - elif activation == "selu": - self.act = nn.SELU - self.act_f = torch.nn.functional.selu - elif activation == "relu": - self.act = nn.ReLU - self.act_f = torch.nn.functional.relu - - # if you want to add an initial encoding of the input - conv_in_dim = input_dim - if self.input_encoding>0: - self.nn1 = nn.Sequential( - nn.Linear(input_dim, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Linear(hidden_dim, encoding_dim), - ) - conv_in_dim = encoding_dim - - # (1) GNN layer - if convlayer == "gravnet-knn": - self.conv1 = GravNetConv(conv_in_dim, encoding_dim, space_dim, hidden_dim, nearest, neighbor_algo="knn") - elif convlayer == "gravnet-radius": - self.conv1 = GravNetConv(conv_in_dim, encoding_dim, space_dim, hidden_dim, nearest, neighbor_algo="radius", radius=radius) - else: - raise Exception("Unknown convolution layer: {}".format(convlayer)) - - #decoding layer receives the raw inputs and the gravnet output - num_decode_in = input_dim + encoding_dim - - # (2) another GNN layer if you want - self.convlayer2 = convlayer2 - if convlayer2 == "none": - self.conv2_1 = None - self.conv2_2 = None - elif convlayer2 == "sgconv": - self.conv2_1 = SGConv(num_decode_in, hidden_dim, K=1) - self.conv2_2 = SGConv(num_decode_in, hidden_dim, K=1) - num_decode_in += hidden_dim - elif convlayer2 == "graphunet": - self.conv2_1 = GraphUNet(num_decode_in, hidden_dim, hidden_dim, 2, pool_ratios=0.1) - self.conv2_2 = GraphUNet(num_decode_in, hidden_dim, hidden_dim, 2, pool_ratios=0.1) - num_decode_in += hidden_dim - elif convlayer2 == "gatconv": - self.conv2_1 = GATConv(num_decode_in, hidden_dim, 4, concat=False, dropout=dropout_rate) - self.conv2_2 = GATConv(num_decode_in, hidden_dim, 4, concat=False, dropout=dropout_rate) - num_decode_in += hidden_dim - else: - raise Exception("Unknown convolution layer: {}".format(convlayer2)) - - # (3) dropout layer if you want - self.dropout1 = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() - - # (4) DNN layer: classifying PID - self.nn2 = nn.Sequential( - nn.Linear(num_decode_in, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Linear(hidden_dim, output_dim_id), - ) - - # (5) DNN layer: regressing p4 - self.nn3 = nn.Sequential( - nn.Linear(num_decode_in + output_dim_id, hidden_dim), - self.act(), - 
nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Linear(hidden_dim, output_dim_p4), - ) - - def forward(self, data): - - #encode the inputs - x = data.x - - if self.input_encoding: - x = self.nn1(x) - - #Run a clustering of the inputs that returns the new_edge_index.. this is the KNN step.. - new_edge_index, x = self.conv1(x) - x1 = self.act_f(x) - - #run a second convolution - if self.convlayer2 != "none": - conv2_input = torch.cat([data.x, x1], axis=-1) - x2_1 = self.act_f(self.conv2_1(conv2_input, new_edge_index)) - x2_2 = self.act_f(self.conv2_2(conv2_input, new_edge_index)) - nn2_input = torch.cat([data.x, x1, x2_1], axis=-1) - else: - nn2_input = torch.cat([data.x, x1], axis=-1) - - #Decode convolved graph nodes to pdgid and p4 - cand_ids = self.nn2(self.dropout1(nn2_input)) - - if self.convlayer2 != "none": - nn3_input = torch.cat([data.x, x1, x2_2, cand_ids], axis=-1) - else: - nn3_input = torch.cat([data.x, x1, cand_ids], axis=-1) - - cand_p4 = data.x[:, len(elem_to_id):len(elem_to_id)+4] + self.nn3(self.dropout1(nn3_input)) - return cand_ids, cand_p4, new_edge_index diff --git a/mlpf/pytorch/train_end2end_delphes.py b/mlpf/pytorch/train_end2end_delphes.py deleted file mode 100755 index 706373d72..000000000 --- a/mlpf/pytorch/train_end2end_delphes.py +++ /dev/null @@ -1,437 +0,0 @@ -import sys -import os - -from comet_ml import Experiment - -#Check if the GPU configuration has been provided -try: - if not ("CUDA_VISIBLE_DEVICES" in os.environ): - import setGPU -except Exception as e: - print("Could not import setGPU, running CPU-only") - -import torch -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - -import torch_geometric - -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv -from torch_geometric.nn import TopKPooling, SAGPooling, SGConv -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from gravnet import GravNetConv -from torch_geometric.data import Data, DataListLoader, Batch -from torch.utils.data import random_split - -import torch_cluster - -from glob import glob -import numpy as np -import os.path as osp -import pickle -import math -import time -import tqdm -import sklearn -import pandas - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from sklearn.metrics import accuracy_score -from sklearn.metrics import confusion_matrix - -from model import PFNet7 -from graph_data_delphes import PFGraphDataset, one_hot_embedding -from data_preprocessing import from_data_to_loader - -#Ignore divide by 0 errors -np.seterr(divide='ignore', invalid='ignore') - -#Get a unique directory name for the model -def get_model_fname(dataset, model, n_train, lr, target_type): - model_name = type(model).__name__ - model_params = sum(p.numel() for p in model.parameters()) - import hashlib - model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] - model_user = 
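The p4 head of the deleted `model_general.py` above does not predict the four-momentum directly; it predicts a correction that is added to the p4 columns already present in `data.x`. A small illustration of that residual-regression idea, using dummy tensors and made-up column positions:

```python
import torch
import torch.nn as nn

n_elem = 5000
x = torch.randn(n_elem, 12)            # per-element input features (dummy)
decoded = torch.randn(n_elem, 300)     # stand-in for the decoder input built in forward()
p4_head = nn.Linear(300, 4)            # stand-in for self.nn3

# hypothetical column positions: four p4-like features following the element-id block
p4_cols = slice(6, 10)
cand_p4 = x[:, p4_cols] + p4_head(decoded)   # the head learns a correction, not the absolute value
print(cand_p4.shape)                         # torch.Size([5000, 4])
```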
os.environ['USER'] - - model_fname = '{}_{}_{}__npar_{}__cfg_{}__user_{}__ntrain_{}__lr_{}__{}'.format( - model_name, - dataset.split("/")[-1], - target_type, - model_params, - model_cfghash, - model_user, - n_train, - lr, int(time.time())) - return model_fname - -model_classes = { - "PFNet7": PFNet7, -} - -def mse_loss(input, target): - return torch.sum((input - target) ** 2) - -def weighted_mse_loss(input, target, weight): - return torch.sum(weight * (input - target).sum(axis=1) ** 2) - -def compute_weights(target_ids, device): - vs, cs = torch.unique(target_ids, return_counts=True) - weights = torch.zeros(output_dim_id).to(device=device) - for k, v in zip(vs, cs): - weights[k] = 1.0/math.sqrt(float(v)) - return weights - -@torch.no_grad() -def test(model, loader, epoch, l1m, l2m, l3m, target_type): - with torch.no_grad(): - ret = train(model, loader, epoch, None, l1m, l2m, l3m, target_type, None) - return ret - - -def train(model, loader, epoch, optimizer, l1m, l2m, l3m, target_type, scheduler): - - is_train = not (optimizer is None) - - if is_train: - model.train() - else: - model.eval() - - #loss values for each batch: classification, regression - losses = np.zeros((len(loader), 3)) - - #accuracy values for each batch (monitor classification performance) - accuracies_batch = np.zeros(len(loader)) - - #correlation values for each batch (monitor regression performance) - corrs_batch = np.zeros(len(loader)) - - #epoch confusion matrix - conf_matrix = np.zeros((output_dim_id, output_dim_id)) - - #keep track of how many data points were processed - num_samples = 0 - - for i, batch in enumerate(loader): - t0 = time.time() - - if not multi_gpu: - batch = batch.to(device) - - if is_train: - optimizer.zero_grad() - - # forward pass - cand_id_onehot, cand_momentum, new_edge_index = model(batch) - - _dev = cand_id_onehot.device # store the device in dev - _, indices = torch.max(cand_id_onehot, -1) # picks the maximum PID location and stores the index (opposite of one_hot_embedding) - - num_samples += len(cand_id_onehot) - - # concatenate ygen/ycand over the batch to compare with the truth label - # now: ygen/ycand is of shape [~5000*batch_size, 6] corresponding to the output of the forward pass - if args.target == "gen": - target_ids = batch.ygen_id - target_p4 = batch.ygen - elif args.target == "cand": - target_ids = batch.ycand_id - target_p4 = batch.ycand - - #Predictions where both the predicted and true class label was nonzero - #In these cases, the true candidate existed and a candidate was predicted - # target_ids_msk reverts the one_hot_embedding - # msk is a list of booleans of shape [~5000*batch_size] where each boolean correspond to whether a candidate was predicted - _, target_ids_msk = torch.max(target_ids, -1) - msk = ((indices != 0) & (target_ids_msk != 0)).detach().cpu() - msk2 = ((indices != 0) & (indices == target_ids_msk)) - - accuracies_batch[i] = accuracy_score(target_ids_msk[msk].detach().cpu().numpy(), indices[msk].detach().cpu().numpy()) - - # a manual rescaling weight given to each class - weights = compute_weights(torch.max(target_ids,-1)[1], _dev) - - #Loss for output candidate id (multiclass) - l1 = l1m * torch.nn.functional.cross_entropy(target_ids, indices, weight=weights) - - #Loss for candidate p4 properties (regression) - l2 = l2m * torch.nn.functional.mse_loss(target_p4[msk2], cand_momentum[msk2]) - - batch_loss = l1 + l2 - losses[i, 0] = l1.item() - losses[i, 1] = l2.item() - - if is_train: - batch_loss.backward() - - batch_loss_item = batch_loss.item() - t1 = 
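`get_model_fname` above builds a unique output directory name from the model class, a short hash of its configuration, and a timestamp. A stand-alone illustration of the same idea; the stand-in model and the name template below are simplified, not the repository's exact format:

```python
import hashlib
import time
import torch.nn as nn

model = nn.Linear(12, 6)                                   # stand-in for PFNet7
n_params = sum(p.numel() for p in model.parameters())
cfg_hash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10]

run_name = "{}__npar_{}__cfg_{}__{}".format(
    type(model).__name__, n_params, cfg_hash, int(time.time()))
print(run_name)   # e.g. Linear__npar_78__cfg_ab12cd34ef__1620000000
```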
time.time() - - print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), batch_loss_item, t1-t0), end='\r', flush=True) - if is_train: - optimizer.step() - if not scheduler is None: - scheduler.step() - - #Compute correlation of predicted and true pt values for monitoring - corr_pt = 0.0 - if msk.sum()>0: - corr_pt = np.corrcoef( - cand_momentum[msk, 0].detach().cpu().numpy(), - target_p4[msk, 0].detach().cpu().numpy())[0,1] - - corrs_batch[i] = corr_pt - - conf_matrix += confusion_matrix(target_ids_msk.detach().cpu().numpy(), - np.argmax(cand_id_onehot.detach().cpu().numpy(),axis=1), labels=range(6)) - - corr = np.mean(corrs_batch) - acc = np.mean(accuracies_batch) - losses = losses.mean(axis=0) - return num_samples, losses, corr, acc, conf_matrix - -def train_loop(): - t0_initial = time.time() - - losses_train = np.zeros((args.n_epochs, 3)) - losses_val = np.zeros((args.n_epochs, 3)) - - corrs = [] - corrs_v = [] - accuracies = [] - accuracies_v = [] - best_val_loss = 99999.9 - stale_epochs = 0 - - print("Training over {} epochs".format(args.n_epochs)) - for epoch in range(args.n_epochs): - t0 = time.time() - - if stale_epochs > patience: - print("breaking due to stale epochs") - break - - with experiment.train(): - model.train() - - num_samples_train, losses, c, acc, conf_matrix = train(model, train_loader, epoch, optimizer, - args.l1, args.l2, args.l3, args.target, scheduler) - - experiment.log_metric('lr', optimizer.param_groups[0]['lr'], step=epoch) - l = sum(losses) - losses_train[epoch] = losses - corrs += [c] - accuracies += [acc] - experiment.log_metric('loss',l, step=epoch) - experiment.log_metric('loss1',losses[0], step=epoch) - experiment.log_metric('loss2',losses[1], step=epoch) - experiment.log_metric('loss3',losses[2], step=epoch) - experiment.log_metric('corrs',c, step=epoch) - experiment.log_metric('accuracy',acc, step=epoch) - experiment.log_confusion_matrix(matrix=conf_matrix, step=epoch, - title='Confusion Matrix Full', - file_name='confusion-matrix-full-train-%03d.json' % epoch, - labels = [str(c) for c in range(output_dim_id)]) - - with experiment.validate(): - model.eval() - num_samples_val, losses_v, c_v, acc_v, conf_matrix_v = test(model, valid_loader, epoch, - args.l1, args.l2, args.l3, args.target) - l_v = sum(losses_v) - losses_val[epoch] = losses_v - corrs_v += [c_v] - accuracies_v += [acc_v] - experiment.log_metric('loss',l_v, step=epoch) - experiment.log_metric('loss1',losses_v[0], step=epoch) - experiment.log_metric('loss2',losses_v[1], step=epoch) - experiment.log_metric('loss3',losses_v[2], step=epoch) - experiment.log_metric('corrs',c_v, step=epoch) - experiment.log_metric('accuracy',acc_v, step=epoch) - experiment.log_confusion_matrix(matrix=conf_matrix_v, step=epoch, - title='Confusion Matrix Full', - file_name='confusion-matrix-full-val-%03d.json' % epoch, - labels = [str(c) for c in range(output_dim_id)]) - - if l_v < best_val_loss: - best_val_loss = l_v - stale_epochs = 0 - else: - stale_epochs += 1 - - t1 = time.time() - epochs_remaining = args.n_epochs - epoch - time_per_epoch = (t1 - t0_initial)/(epoch + 1) - experiment.log_metric('time_per_epoch', time_per_epoch, step=epoch) - eta = epochs_remaining*time_per_epoch/60 - - spd = (num_samples_val+num_samples_train)/time_per_epoch - losses_str = "[" + ",".join(["{:.4f}".format(x) for x in losses_v]) + "]" - - torch.save(model.state_dict(), "{0}/epoch_{1}_weights.pth".format(outpath, epoch)) - - print("epoch={}/{} dt={:.2f}s loss_train={:.5f} loss_valid={:.5f} c={:.2f}/{:.2f} a={:.6f}/{:.6f} 
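For monitoring, the deleted training script above tracks the Pearson correlation between predicted and target pT, restricted to elements where both the prediction and the target are non-null. A self-contained sketch of that check with dummy tensors and a dummy mask:

```python
import numpy as np
import torch

cand_momentum = torch.randn(1000, 6)   # dummy predictions, column 0 plays the role of pT
target_p4 = torch.randn(1000, 6)
msk = torch.rand(1000) > 0.5           # stand-in for the (pred != 0) & (true != 0) mask

corr_pt = 0.0
if msk.sum() > 0:
    corr_pt = np.corrcoef(
        cand_momentum[msk, 0].numpy(),
        target_p4[msk, 0].numpy())[0, 1]
print(corr_pt)
```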
partial_losses={} stale={} eta={:.1f}m spd={:.2f} samples/s lr={}".format( - epoch+1, args.n_epochs, - t1 - t0, l, l_v, c, c_v, acc, acc_v, - losses_str, stale_epochs, eta, spd, optimizer.param_groups[0]['lr'])) - - print('Done with training.') - -def parse_args(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--n_train", type=int, default=2, help="number of data files to use for training.. each file contains 100 events") - parser.add_argument("--n_val", type=int, default=1, help="number of data files to use for validation.. each file contains 100 events") - parser.add_argument("--n_epochs", type=int, default=100, help="number of training epochs") - parser.add_argument("--patience", type=int, default=100, help="patience before early stopping") - parser.add_argument("--hidden_dim", type=int, default=32, help="hidden dimension") - parser.add_argument("--encoding_dim", type=int, default=256, help="encoded element dimension") - parser.add_argument("--batch_size", type=int, default=1, help="Number of .pt files to load in parallel") - parser.add_argument("--model", type=str, choices=sorted(model_classes.keys()), help="type of model to use", default="PFNet6") - parser.add_argument("--target", type=str, choices=["cand", "gen"], help="Regress to PFCandidates or GenParticles", default="cand") - parser.add_argument("--dataset", type=str, help="Input dataset", required=True) - parser.add_argument("--outpath", type=str, default = 'experiments/', help="Output folder") - parser.add_argument("--activation", type=str, default='leaky_relu', choices=["selu", "leaky_relu", "relu"], help="activation function") - parser.add_argument("--optimizer", type=str, default='adam', choices=["adam", "adamw"], help="optimizer to use") - parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") - parser.add_argument("--l1", type=float, default=1.0, help="Loss multiplier for pdg-id classification") - parser.add_argument("--l2", type=float, default=0.001, help="Loss multiplier for momentum regression") - parser.add_argument("--l3", type=float, default=1.0, help="Loss multiplier for clustering") - parser.add_argument("--dropout", type=float, default=0.5, help="Dropout rate") - parser.add_argument("--radius", type=float, default=0.1, help="Radius-graph radius") - parser.add_argument("--convlayer", type=str, choices=["gravnet-knn", "gravnet-radius", "sgconv", "gatconv"], help="Convolutional layer", default="gravnet") - parser.add_argument("--convlayer2", type=str, choices=["sgconv", "graphunet", "gatconv", "none"], help="Convolutional layer", default="none") - parser.add_argument("--space_dim", type=int, default=2, help="Spatial dimension for clustering in gravnet layer") - parser.add_argument("--nearest", type=int, default=3, help="k nearest neighbors in gravnet layer") - parser.add_argument("--overwrite", action='store_true', help="overwrite if model output exists") - parser.add_argument("--disable_comet", action='store_true', help="disable comet-ml") - parser.add_argument("--input_encoding", type=int, help="use an input encoding layer", default=0) - parser.add_argument("--load", type=str, help="Load the weight file", required=False, default=None) - parser.add_argument("--scheduler", type=str, help="LR scheduler", required=False, default="none", choices=["none", "onecycle"]) - args = parser.parse_args() - return args - -if __name__ == "__main__": - - args = parse_args() - - # # the next part initializes some args values (to run the script not from terminal) - # class 
objectview(object): - # def __init__(self, d): - # self.__dict__ = d - # - # args = objectview({'n_train': 2, 'n_val': 1, 'n_epochs': 3, 'patience': 100, 'hidden_dim':32, 'encoding_dim': 256, - # 'batch_size': 1, 'model': 'PFNet7', 'target': 'cand', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', - # 'outpath': 'experiments/', 'activation': 'leaky_relu', 'optimizer': 'adam', 'lr': 1e-4, 'l1': 1, 'l2': 0.001, 'l3': 1, 'dropout': 0.5, - # 'radius': 0.1, 'convlayer': 'gravnet-radius', 'convlayer2': 'none', 'space_dim': 2, 'nearest': 3, 'overwrite': True, - # 'disable_comet': True, 'input_encoding': 0, 'load': None, 'scheduler': 'none'}) - - # define the dataset - full_dataset = PFGraphDataset(args.dataset) - - # constructs a loader from the data to iterate over batches - train_loader, valid_loader = from_data_to_loader(full_dataset, args.n_train, args.n_val, batch_size=args.batch_size) - - # element parameters - input_dim = 12 - - #one-hot particle ID and momentum - output_dim_id = 6 - output_dim_p4 = 6 - - patience = args.patience - - model_class = model_classes[args.model] - model_kwargs = {'input_dim': input_dim, - 'hidden_dim': args.hidden_dim, - 'encoding_dim': args.encoding_dim, - 'output_dim_id': output_dim_id, - 'output_dim_p4': output_dim_p4, - 'dropout_rate': args.dropout, - 'convlayer': args.convlayer, - 'convlayer2': args.convlayer2, - 'radius': args.radius, - 'space_dim': args.space_dim, - 'activation': args.activation, - 'nearest': args.nearest, - 'input_encoding': args.input_encoding} - - #instantiate the model - model = model_class(**model_kwargs) - if args.load: - s1 = torch.load(args.load, map_location=torch.device('cpu')) - s2 = {k.replace("module.", ""): v for k, v in s1.items()} - model.load_state_dict(s2) - - if multi_gpu: - model = torch_geometric.nn.DataParallel(model) - - model.to(device) - - model_fname = get_model_fname(args.dataset, model, args.n_train, args.lr, args.target) - - # need your api key in a .comet.config file: see https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables - experiment = Experiment(project_name="particleflow", disabled=args.disable_comet) - experiment.set_model_graph(repr(model)) - experiment.log_parameters(dict(model_kwargs, **{'model': args.model, 'lr':args.lr, 'model_fname': model_fname, - 'l1': args.l1, 'l2':args.l2, - 'n_train':args.n_train, 'target':args.target, 'optimizer': args.optimizer})) - outpath = osp.join(args.outpath, model_fname) - if osp.isdir(outpath): - if args.overwrite: - print("model output {} already exists, deleting it".format(outpath)) - import shutil - shutil.rmtree(outpath) - else: - print("model output {} already exists, please delete it".format(outpath)) - sys.exit(0) - try: - os.makedirs(outpath) - except Exception as e: - pass - - with open('{}/model_kwargs.pkl'.format(outpath), 'wb') as f: - pickle.dump(model_kwargs, f, protocol=pickle.HIGHEST_PROTOCOL) - - if args.optimizer == "adam": - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) - elif args.optimizer == "adamw": - optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr) - - scheduler = None - if args.scheduler == "onecycle": - scheduler = torch.optim.lr_scheduler.OneCycleLR( - optimizer, - max_lr=args.lr, - steps_per_epoch=int(len(train_loader)), - epochs=args.n_epochs + 1, - anneal_strategy='linear', - ) - - print(model) - print(model_fname) - model_parameters = filter(lambda p: p.requires_grad, model.parameters()) - params = sum([np.prod(p.size()) for p in model_parameters]) - print("params", params) - - 
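When `--scheduler onecycle` is selected above, the learning rate is stepped once per batch, so `OneCycleLR` needs the number of steps per epoch and the epoch count up front. A minimal sketch of that wiring with a dummy model and loop sizes:

```python
import torch

model = torch.nn.Linear(12, 6)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
steps_per_epoch, n_epochs = 100, 10

scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=1e-4,
    steps_per_epoch=steps_per_epoch, epochs=n_epochs + 1,
    anneal_strategy="linear",
)

for epoch in range(n_epochs):
    for step in range(steps_per_epoch):
        optimizer.step()      # after loss.backward() in a real training loop
        scheduler.step()      # stepped per batch, as in the deleted script
```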
model.train() - - train_loop() - # with torch.autograd.profiler.profile(use_cuda=True) as prof: - # train_loop() - - # print(prof.key_averages().table(sort_by="cuda_time_total")) diff --git a/mlpf/pytorch_cms/README.md b/mlpf/pytorch_cms/README.md new file mode 100644 index 000000000..19ab76c6e --- /dev/null +++ b/mlpf/pytorch_cms/README.md @@ -0,0 +1,5 @@ +Short instructions to train on cms data: +```bash +cd ../.. +./scripts/local_test_cms.sh +``` diff --git a/mlpf/pytorch/eval_end2end_cms.py b/mlpf/pytorch_cms/eval_end2end_cms.py similarity index 100% rename from mlpf/pytorch/eval_end2end_cms.py rename to mlpf/pytorch_cms/eval_end2end_cms.py diff --git a/mlpf/pytorch/graph_data_cms.py b/mlpf/pytorch_cms/graph_data_cms.py similarity index 100% rename from mlpf/pytorch/graph_data_cms.py rename to mlpf/pytorch_cms/graph_data_cms.py diff --git a/mlpf/pytorch/gravnet.py b/mlpf/pytorch_cms/gravnet.py similarity index 100% rename from mlpf/pytorch/gravnet.py rename to mlpf/pytorch_cms/gravnet.py diff --git a/mlpf/pytorch/train_end2end_cms.py b/mlpf/pytorch_cms/train_end2end_cms.py old mode 100755 new mode 100644 similarity index 100% rename from mlpf/pytorch/train_end2end_cms.py rename to mlpf/pytorch_cms/train_end2end_cms.py diff --git a/mlpf/pytorch_delphes/README.md b/mlpf/pytorch_delphes/README.md new file mode 100644 index 000000000..3a25d6f4b --- /dev/null +++ b/mlpf/pytorch_delphes/README.md @@ -0,0 +1,23 @@ +Short instructions to do a quick training on delphes data: +```bash +cd ../.. +./scripts/local_test_delphes_pytorch.sh +``` + +### Delphes dataset +The dataset is available from zenodo: https://doi.org/10.5281/zenodo.4452283. + +Instructions to download and process the full Delphes dataset: +```bash +cd ../../scripts/ +./get_all_data_delphes.sh +``` + +This script will download and process the data under a directory called "test_tmp_delphes/" in particleflow. 
There are will be two subdirectories under test_tmp_delphes/ (1) data/: which contains the data (2) experiments/: which will contain any trained model + + +Instructions to explain using LRP (you must have an already trained model in test_tmp_delphes/experiments): +```bash +cd LRP/ +python -u main_reg.py --LRP_load_model= --LRP_load_epoch= +``` diff --git a/mlpf/pytorch_delphes/__init__.py b/mlpf/pytorch_delphes/__init__.py new file mode 100644 index 000000000..48f208f03 --- /dev/null +++ b/mlpf/pytorch_delphes/__init__.py @@ -0,0 +1,10 @@ +from pytorch_delphes.args import parse_args +from pytorch_delphes.graph_data_delphes import PFGraphDataset, one_hot_embedding +from pytorch_delphes.data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd + +from pytorch_delphes.model import PFNet7, PFNet7_opt +from pytorch_delphes.gravnet import GravNetConv +from pytorch_delphes.gravnet_optimized import GravNetConv_optimized + +from pytorch_delphes.training import train_loop +from pytorch_delphes.evaluate import make_predictions diff --git a/mlpf/pytorch_delphes/args.py b/mlpf/pytorch_delphes/args.py new file mode 100644 index 000000000..9c44104fe --- /dev/null +++ b/mlpf/pytorch_delphes/args.py @@ -0,0 +1,114 @@ +import argparse +from math import inf + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--dataset", type=str, default='../test_tmp_delphes/data/pythia8_ttbar', help="training dataset path", required=True) + parser.add_argument("--dataset_qcd", type=str, default='../test_tmp_delphes/data/pythia8_qcd', help="testing dataset path", required=True) + parser.add_argument("--outpath", type=str, default = '../test_tmp_delphes/experiments/', help="output folder", required=True) + parser.add_argument("--title", type=str, default='', help="Appends this title to the model's name") + + parser.add_argument("--overwrite", action=BoolArg, default=False, help="Overwrites the model if True") + parser.add_argument("--optimized", action=BoolArg, default=False, help="Uses the optimized version of knn") + + # for training + parser.add_argument("--train", action=BoolArg, default=True, help="Trains the model") + parser.add_argument("--n_train", type=int, default=3, help="number of data files to use for training.. each file contains 100 events") + parser.add_argument("--n_valid", type=int, default=1, help="number of data files to use for validation.. each file contains 100 events") + parser.add_argument("--n_test", type=int, default=2, help="number of data files to use for testing.. 
each file contains 100 events") + parser.add_argument("--n_epochs", type=int, default=1, help="number of training epochs") + parser.add_argument("--batch_size", type=int, default=1, help="Number of .pt files to load in parallel") + + parser.add_argument("--hidden_dim", type=int, default=256, help="hidden dimension") + parser.add_argument("--hidden_dim_nn1", type=int, default=64, help="hidden dimension") + parser.add_argument("--input_encoding", type=int, default=12, help="use an input encoding layer") + parser.add_argument("--encoding_dim", type=int, default=64, help="encoded element dimension") + parser.add_argument("--space_dim", type=int, default=4, help="Spatial dimension for clustering in gravnet layer") + parser.add_argument("--propagate_dimensions", type=int, default=22, help="The number of features to be propagated between the vertices") + parser.add_argument("--nearest", type=int, default=16, help="k nearest neighbors in gravnet layer") + parser.add_argument("--nn1", action=BoolArg, default=True, help="Adds an encoder/decoder step before gravnet..") + parser.add_argument("--nn3", action=BoolArg, default=True, help="Adds the network to regress p4..") + parser.add_argument("--nn4", action=BoolArg, default=True, help="Adds an extra network for the dnn model..") + + parser.add_argument("--patience", type=int, default=100, help="patience before early stopping") + parser.add_argument("--target", type=str, choices=["cand", "gen"], help="Regress to PFCandidates or GenParticles", default="gen") + parser.add_argument("--optimizer", type=str, default='adam', choices=["adam", "adamw"], help="optimizer to use") + parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") + parser.add_argument("--alpha", type=float, default=2e-4, help="Loss multiplier for pdg-id classification.. recall: loss = clf + alpha*reg") + + parser.add_argument("--classification_only", action=BoolArg, default=False, help="Check to train for classification only (no regression)") + parser.add_argument("--regression_only", action=BoolArg, default=False, help="Check to train for regression only (no classification)") + + # for loading a pre-trained model + parser.add_argument("--load", action=BoolArg, default=False, help="Load the model (no training)") + parser.add_argument("--load_model", type=str, help="Which model to load", default="/PFNet7_cand_ntrain_2") + parser.add_argument("--load_epoch", type=int, default=0, help="Which epoch of the model to load for evaluation") + + # for evaluation: making predictions & making plots + parser.add_argument("--make_predictions_train", action=BoolArg, default=False, help="make predictions on training data..") + parser.add_argument("--make_predictions_valid", action=BoolArg, default=False, help="make predictions on validation data..") + parser.add_argument("--make_predictions_test", action=BoolArg, default=True, help="make predictions on testing data..") + parser.add_argument("--make_plots_train", action=BoolArg, default=False, help="make plots on training data..") + parser.add_argument("--make_plots_valid", action=BoolArg, default=False, help="make plots on validation data..") + parser.add_argument("--make_plots_test", action=BoolArg, default=True, help="make plots on testing data..") + + args = parser.parse_args() + + return args + + +class BoolArg(argparse.Action): + """ + Take an argparse argument that is either a boolean or a string and return a boolean. 
+ """ + def __init__(self, default=None, nargs=None, *args, **kwargs): + if nargs is not None: + raise ValueError("nargs not allowed") + + # Set default + if default is None: + raise ValueError("Default must be set!") + + default = _arg_to_bool(default) + + super().__init__(*args, default=default, nargs='?', **kwargs) + + def __call__(self, parser, namespace, argstring, option_string): + + if argstring is not None: + # If called with an argument, convert to bool + argval = _arg_to_bool(argstring) + else: + # BoolArg will invert default option + argval = True + + setattr(namespace, self.dest, argval) + +def _arg_to_bool(arg): + # Convert argument to boolean + + if type(arg) is bool: + # If argument is bool, just return it + return arg + + elif type(arg) is str: + # If string, convert to true/false + arg = arg.lower() + if arg in ['true', 't', '1']: + return True + elif arg in ['false', 'f', '0']: + return False + else: + return ValueError('Could not parse a True/False boolean') + else: + raise ValueError('Input must be boolean or string! {}'.format(type(arg))) + + +# From https://stackoverflow.com/questions/12116685/how-can-i-require-my-python-scripts-argument-to-be-a-float-between-0-0-1-0-usin +class Range(object): + def __init__(self, start, end): + self.start = start + self.end = end + def __eq__(self, other): + return self.start <= other <= self.end diff --git a/mlpf/pytorch_delphes/data_preprocessing.py b/mlpf/pytorch_delphes/data_preprocessing.py new file mode 100644 index 000000000..342456cd7 --- /dev/null +++ b/mlpf/pytorch_delphes/data_preprocessing.py @@ -0,0 +1,52 @@ +import numpy as np +import torch +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch + +# if not multigpu we have to pass batches that are stacked as "batch.type() = Batch" (not list) so that pytorch can access attributes like ygen_id through batch.ygen_id +# if multigpu we have to pass list of "Data" elements.. 
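`BoolArg` above lets flags such as `--nn1` or `--train` be passed either bare or with an explicit true/false string. The snippet below is a simplified stand-alone re-implementation of the same behaviour using a plain `type=` converter, not the repository's class, just to show how such flags parse:

```python
import argparse

def str2bool(v):
    # accept bools and the usual true/false strings
    if isinstance(v, bool):
        return v
    if v.lower() in ("true", "t", "1"):
        return True
    if v.lower() in ("false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean")

parser = argparse.ArgumentParser()
parser.add_argument("--nn1", type=str2bool, nargs="?", const=True, default=True)

print(parser.parse_args(["--nn1", "False"]).nn1)   # False
print(parser.parse_args(["--nn1"]).nn1)            # True (bare flag)
print(parser.parse_args([]).nn1)                   # True (default)
```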
then behind the scene, pytorch DP will convert the list to appropriate Batches to fit on the gpus available so that batch.ygen_id works out of the box +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +# define a function that casts the ttbar dataset into a dataloader for efficient NN training +def data_to_loader_ttbar(full_dataset, n_train, n_valid, batch_size): + + # https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html + train_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=0, stop=n_train)) + valid_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=n_train, stop=n_train+n_valid)) + + # preprocessing the train_dataset in a good format for passing correct batches of events to the GNN + train_data=[] + for i in range(len(train_dataset)): + train_data = train_data + train_dataset[i] + + # preprocessing the valid_dataset in a good format for passing correct batches of events to the GNN + valid_data=[] + for i in range(len(valid_dataset)): + valid_data = valid_data + valid_dataset[i] + + if not multi_gpu: + train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True) + valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True) + else: + #https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/data_parallel.html + train_loader = DataListLoader(train_data, batch_size=batch_size, shuffle=True) + valid_loader = DataListLoader(valid_data, batch_size=batch_size, shuffle=True) + + return train_loader, valid_loader + +def data_to_loader_qcd(full_dataset, n_test, batch_size): + + test_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=0, stop=n_test)) + + # preprocessing the test_dataset in a good format for passing correct batches of events to the GNN + test_data=[] + for i in range(len(test_dataset)): + test_data = test_data + test_dataset[i] + + if not multi_gpu: + test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True) + else: + #https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/data_parallel.html + test_loader = DataListLoader(test_data, batch_size=batch_size, shuffle=True) + + return test_loader diff --git a/mlpf/pytorch_delphes/evaluate.py b/mlpf/pytorch_delphes/evaluate.py new file mode 100644 index 000000000..c5273c733 --- /dev/null +++ b/mlpf/pytorch_delphes/evaluate.py @@ -0,0 +1,113 @@ +import pickle as pkl +import math, time, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import mplhep as hep + +import torch + +import pytorch_delphes + +def make_predictions(model, multi_gpu, test_loader, outpath, target, device, epoch, which_data): + + print('Making predictions on ' + which_data) + t0=time.time() + + gen_list = {"null":[], "chhadron":[], "nhadron":[], "photon":[], "electron":[], "muon":[]} + pred_list = {"null":[], "chhadron":[], "nhadron":[], "photon":[], "electron":[], "muon":[]} + cand_list = {"null":[], "chhadron":[], "nhadron":[], "photon":[], "electron":[], "muon":[]} + + for i, batch in enumerate(test_loader): + if multi_gpu: + X = batch + else: + X = batch.to(device) + + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + + _, gen_ids = torch.max(gen_ids_one_hot.detach(), -1) + _, pred_ids = torch.max(pred_ids_one_hot.detach(), -1) + _, cand_ids = torch.max(cand_ids_one_hot.detach(), -1) + + # to make 
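`data_to_loader_ttbar` and `data_to_loader_qcd` above switch between two loader types: a plain `DataLoader` that collates events into a single `Batch`, and a `DataListLoader` that keeps a list of `Data` objects for `torch_geometric.nn.DataParallel` to scatter across GPUs. A minimal sketch with dummy events, assuming the torch_geometric version used here, where both loaders are importable from `torch_geometric.data`:

```python
import torch
from torch_geometric.data import Data, DataLoader, DataListLoader

dataset = [Data(x=torch.randn(10, 12)) for _ in range(8)]   # dummy events
multi_gpu = torch.cuda.device_count() > 1

if not multi_gpu:
    loader = DataLoader(dataset, batch_size=2, shuffle=True)       # yields Batch objects
else:
    loader = DataListLoader(dataset, batch_size=2, shuffle=True)   # yields lists of Data

batch = next(iter(loader))
print(type(batch))
```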
"num_gen vs num_pred" plots + gen_list["null"].append((gen_ids==0).sum().item()) + gen_list["chhadron"].append((gen_ids==1).sum().item()) + gen_list["nhadron"].append((gen_ids==2).sum().item()) + gen_list["photon"].append((gen_ids==3).sum().item()) + gen_list["electron"].append((gen_ids==4).sum().item()) + gen_list["muon"].append((gen_ids==5).sum().item()) + + pred_list["null"].append((pred_ids==0).sum().item()) + pred_list["chhadron"].append((pred_ids==1).sum().item()) + pred_list["nhadron"].append((pred_ids==2).sum().item()) + pred_list["photon"].append((pred_ids==3).sum().item()) + pred_list["electron"].append((pred_ids==4).sum().item()) + pred_list["muon"].append((pred_ids==5).sum().item()) + + cand_list["null"].append((cand_ids==0).sum().item()) + cand_list["chhadron"].append((cand_ids==1).sum().item()) + cand_list["nhadron"].append((cand_ids==2).sum().item()) + cand_list["photon"].append((cand_ids==3).sum().item()) + cand_list["electron"].append((cand_ids==4).sum().item()) + cand_list["muon"].append((cand_ids==5).sum().item()) + + gen_p4 = gen_p4.detach() + pred_p4 = pred_p4.detach() + cand_p4 = cand_p4.detach() + + if i==0: + gen_ids_all = gen_ids + gen_p4_all = gen_p4 + + pred_ids_all = pred_ids + pred_p4_all = pred_p4 + + cand_ids_all = cand_ids + cand_p4_all = cand_p4 + else: + gen_ids_all = torch.cat([gen_ids_all,gen_ids]) + gen_p4_all = torch.cat([gen_p4_all,gen_p4]) + + pred_ids_all = torch.cat([pred_ids_all,pred_ids]) + pred_p4_all = torch.cat([pred_p4_all,pred_p4]) + + cand_ids_all = torch.cat([cand_ids_all,cand_ids]) + cand_p4_all = torch.cat([cand_p4_all,cand_p4]) + + if len(test_loader)<5000: + print(f'event #: {i+1}/{len(test_loader)}') + else: + print(f'event #: {i+1}/{5000}') + + if i==4999: + break + + t1=time.time() + + print('Time taken to make predictions is:', round(((t1-t0)/60),2), 'min') + + # store the 3 list dictionaries in a list (this is done only to compute the particle multiplicity plots) + list = [pred_list, gen_list, cand_list] + + torch.save(list, outpath + '/list_for_multiplicities.pt') + + torch.save(gen_ids_all, outpath + '/gen_ids.pt') + torch.save(gen_p4_all, outpath + '/gen_p4.pt') + torch.save(pred_ids_all, outpath + '/pred_ids.pt') + torch.save(pred_p4_all, outpath + '/pred_p4.pt') + torch.save(cand_ids_all, outpath + '/cand_ids.pt') + torch.save(cand_p4_all, outpath + '/cand_p4.pt') + + ygen = torch.cat([gen_ids_all.reshape(-1,1).float(),gen_p4_all], axis=1) + ypred = torch.cat([pred_ids_all.reshape(-1,1).float(),pred_p4_all], axis=1) + ycand = torch.cat([cand_ids_all.reshape(-1,1).float(),cand_p4_all], axis=1) + + # store the actual predictions to make all the other plots + predictions = {"ygen":ygen.reshape(1,-1,7).detach().cpu().numpy(), "ycand":ycand.reshape(1,-1,7).detach().cpu().numpy(), "ypred":ypred.detach().reshape(1,-1,7).cpu().numpy()} + + torch.save(predictions, outpath + '/predictions.pt') diff --git a/mlpf/pytorch/graph_data_delphes.py b/mlpf/pytorch_delphes/graph_data_delphes.py similarity index 85% rename from mlpf/pytorch/graph_data_delphes.py rename to mlpf/pytorch_delphes/graph_data_delphes.py index e7110038b..ae14461f9 100644 --- a/mlpf/pytorch/graph_data_delphes.py +++ b/mlpf/pytorch_delphes/graph_data_delphes.py @@ -5,15 +5,9 @@ import torch_geometric import torch_geometric.utils from torch_geometric.data import Dataset, Data, Batch -import itertools from glob import glob -from numpy.lib.recfunctions import append_fields -import bz2 import pickle -import scipy -import scipy.sparse -import math import multiprocessing # 
assumes pkl files exist in /test_tmp_delphes/data/pythia8_ttbar/raw @@ -53,8 +47,7 @@ def __init__(self, root, transform=None, pre_transform=None): @property def raw_file_names(self): - raw_list = list(glob(osp.join(self.raw_dir, '*.pkl'))) - raw_list += list(glob(osp.join(self.raw_dir, '*.pkl.bz2'))) + raw_list = glob(osp.join(self.raw_dir, '*.pkl')) print("PFGraphDataset nfiles={}".format(len(raw_list))) return sorted([l.replace(self.raw_dir, '.') for l in raw_list]) @@ -81,13 +74,8 @@ def download(self): pass def process_single_file(self, raw_file_name): - if raw_file_name.endswith(".pkl"): - with open(osp.join(self.raw_dir, raw_file_name), "rb") as fi: - data = pickle.load(fi, encoding='iso-8859-1') - elif raw_file_name.endswith(".pkl.bz2"): - data = pickle.load(bz2.BZ2File(osp.join(self.raw_dir, raw_file_name), "rb"), encoding='iso-8859-1') - else: - raise Exception("Unknown file format") + with open(osp.join(self.raw_dir, raw_file_name), "rb") as fi: + data = pickle.load(fi, encoding='iso-8859-1') x=[] ygen=[] @@ -152,22 +140,25 @@ def get(self, idx): def __getitem__(self, idx): return self.get(idx) -if __name__ == "__main__": +def parse_args(): import argparse parser = argparse.ArgumentParser() - parser.add_argument("--dataset", type=str, help="dataset path", required=True) + parser.add_argument("--dataset", type=str, required=True, help="Input data path") parser.add_argument("--processed_dir", type=str, help="processed", required=False, default=None) parser.add_argument("--num-files-merge", type=int, default=10, help="number of files to merge") parser.add_argument("--num-proc", type=int, default=24, help="number of processes") args = parser.parse_args() + return args + + +if __name__ == "__main__": + + args = parse_args() pfgraphdataset = PFGraphDataset(root=args.dataset) if args.processed_dir: pfgraphdataset._processed_dir = args.processed_dir - - if not os.path.isdir(pfgraphdataset._processed_dir): - os.makedirs(pfgraphdataset._processed_dir) pfgraphdataset.process_parallel(args.num_files_merge,args.num_proc) #pfgraphdataset.process(args.num_files_merge) diff --git a/mlpf/pytorch_delphes/gravnet.py b/mlpf/pytorch_delphes/gravnet.py new file mode 100644 index 000000000..5b26d3954 --- /dev/null +++ b/mlpf/pytorch_delphes/gravnet.py @@ -0,0 +1,122 @@ +from typing import Optional, Union +from torch_geometric.typing import OptTensor, PairTensor, PairOptTensor +import time + +import torch +from torch import Tensor +from torch.nn import Linear +from torch_scatter import scatter +from torch_geometric.nn.conv import MessagePassing + +try: + from torch_cluster import knn +except ImportError: + knn = None + +# copied it from pytorch_geometric source code +# ADDED: retrieve edge_index, retrieve edge_weight +# CHANGED: self.lin -> self.lin_p +# CHANGED: used reduce='sum' instead of reduce='mean' in the message passing +# REMOVED: skip connection + +class GravNetConv(MessagePassing): + r"""The GravNet operator from the `"Learning Representations of Irregular + Particle-detector Geometry with Distance-weighted Graph + Networks" `_ paper, where the graph is + dynamically constructed using nearest neighbors. + The neighbors are constructed in a learnable low-dimensional projection of + the feature space. + A second projection of the input feature space is then propagated from the + neighbors to each vertex using distance weights that are derived by + applying a Gaussian function to the distances. + + Args: + in_channels (int): The number of input channels. 
+ out_channels (int): The number of output channels. + space_dimensions (int): The dimensionality of the space used to + construct the neighbors; referred to as :math:`S` in the paper. + propagate_dimensions (int): The number of features to be propagated + between the vertices; referred to as :math:`F_{\textrm{LR}}` in the + paper. + k (int): The number of nearest neighbors. + num_workers (int): Number of workers to use for k-NN computation. + Has no effect in case :obj:`batch` is not :obj:`None`, or the input + lies on the GPU. (default: :obj:`1`) + **kwargs (optional): Additional arguments of + :class:`torch_geometric.nn.conv.MessagePassing`. + """ + def __init__(self, in_channels: int, out_channels: int, + space_dimensions: int, propagate_dimensions: int, k: int, + num_workers: int = 1, **kwargs): + super(GravNetConv, self).__init__(flow='target_to_source', **kwargs) + + if knn is None: + raise ImportError('`GravNetConv` requires `torch-cluster`.') + + self.in_channels = in_channels + self.out_channels = out_channels + self.k = k + self.num_workers = num_workers + + self.lin_s = Linear(in_channels, space_dimensions) + self.lin_h = Linear(in_channels, propagate_dimensions) + self.lin_p = Linear(propagate_dimensions, out_channels) + + self.reset_parameters() + + def reset_parameters(self): + self.lin_s.reset_parameters() + self.lin_h.reset_parameters() + self.lin_p.reset_parameters() + + + def forward( + self, x: Union[Tensor, PairTensor], + batch: Union[OptTensor, Optional[PairTensor]] = None) -> Tensor: + """""" + + is_bipartite: bool = True + if isinstance(x, Tensor): + x: PairTensor = (x, x) + is_bipartite = False + assert x[0].dim() == 2, 'Static graphs not supported in `GravNetConv`.' + + b: PairOptTensor = (None, None) + if isinstance(batch, Tensor): + b = (batch, batch) + elif isinstance(batch, tuple): + assert batch is not None + b = (batch[0], batch[1]) + + h_l: Tensor = self.lin_h(x[0]) + + s_l: Tensor = self.lin_s(x[0]) + s_r: Tensor = self.lin_s(x[1]) if is_bipartite else s_l + + edge_index = knn(s_l, s_r, self.k, b[0], b[1], + num_workers=self.num_workers) + + edge_weight = (s_l[edge_index[1]] - s_r[edge_index[0]]).pow(2).sum(-1) + edge_weight = torch.exp(-10. 
* edge_weight) # 10 gives a better spread + + # propagate_type: (x: OptPairTensor, edge_weight: OptTensor) + out = self.propagate(edge_index, x=(h_l, None), + edge_weight=edge_weight, + size=(s_l.size(0), s_r.size(0))) + + return self.lin_p(out), edge_index, edge_weight + + + def message(self, x_j: Tensor, edge_weight: Tensor) -> Tensor: + return x_j * edge_weight.unsqueeze(1) + + def aggregate(self, inputs: Tensor, index: Tensor, + dim_size: Optional[int] = None) -> Tensor: + out_mean = scatter(inputs, index, dim=self.node_dim, dim_size=dim_size, + reduce='sum') + return out_mean + + def __repr__(self): + return '{}({}, {}, k={})'.format(self.__class__.__name__, + self.in_channels, self.out_channels, + self.k) diff --git a/mlpf/pytorch_delphes/gravnet_optimized.py b/mlpf/pytorch_delphes/gravnet_optimized.py new file mode 100644 index 000000000..1d4e17bd2 --- /dev/null +++ b/mlpf/pytorch_delphes/gravnet_optimized.py @@ -0,0 +1,124 @@ +from typing import Optional, Union +from torch_geometric.typing import OptTensor, PairTensor, PairOptTensor +import time + +import torch +from torch import Tensor +from torch.nn import Linear +from torch_scatter import scatter +from torch_geometric.nn.conv import MessagePassing + +########### (1) clone this repo: https://github.com/mandylee900125/pytorch_cmspepr.git ############### +########### (2) do from inside ############### + +try: + from torch_cmspepr import knn_graph +except ImportError: + knn_graph = None + +# copied it from pytorch_geometric source code +# ADDED: retrieve edge_index, retrieve edge_weight +# CHANGED: self.lin -> self.lin_p +# CHANGED: used reduce='sum' instead of reduce='mean' in the message passing +# REMOVED: skip connection +# REPLACED: knn with knn_graph + +class GravNetConv_optimized(MessagePassing): + r"""The GravNet operator from the `"Learning Representations of Irregular + Particle-detector Geometry with Distance-weighted Graph + Networks" `_ paper, where the graph is + dynamically constructed using nearest neighbors. + The neighbors are constructed in a learnable low-dimensional projection of + the feature space. + A second projection of the input feature space is then propagated from the + neighbors to each vertex using distance weights that are derived by + applying a Gaussian function to the distances. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + space_dimensions (int): The dimensionality of the space used to + construct the neighbors; referred to as :math:`S` in the paper. + propagate_dimensions (int): The number of features to be propagated + between the vertices; referred to as :math:`F_{\textrm{LR}}` in the + paper. + k (int): The number of nearest neighbors. + num_workers (int): Number of workers to use for k-NN computation. + Has no effect in case :obj:`batch` is not :obj:`None`, or the input + lies on the GPU. (default: :obj:`1`) + **kwargs (optional): Additional arguments of + :class:`torch_geometric.nn.conv.MessagePassing`. 
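The core of the `GravNetConv` above is the distance weighting: neighbours are found with kNN in the learned S-dimensional space, and each edge gets a Gaussian weight `exp(-10 * d^2)` before the weighted-sum aggregation. A small sketch of just that step, assuming `torch_cluster` is installed and using dummy coordinates with the same k=16 default as the arguments above:

```python
import torch
from torch_cluster import knn

s = torch.randn(100, 4)                       # learned spatial coordinates, S = 4
edge_index = knn(s, s, 16)                    # for every point, its 16 nearest neighbours
# edge_index[0] indexes the query points, edge_index[1] their neighbours
d2 = (s[edge_index[1]] - s[edge_index[0]]).pow(2).sum(-1)
edge_weight = torch.exp(-10.0 * d2)           # same Gaussian with the factor 10 noted above
print(edge_index.shape, edge_weight.shape)    # torch.Size([2, 1600]) torch.Size([1600])
```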
+ """ + def __init__(self, in_channels: int, out_channels: int, + space_dimensions: int, propagate_dimensions: int, k: int, + num_workers: int = 1, **kwargs): + super(GravNetConv_optimized, self).__init__(flow='target_to_source', **kwargs) + + if knn_graph is None: + raise ImportError('`GravNetConv_optimized` requires `torch_cmspepr`.') + + self.in_channels = in_channels + self.out_channels = out_channels + self.k = k + self.num_workers = num_workers + + self.lin_s = Linear(in_channels, space_dimensions) + self.lin_h = Linear(in_channels, propagate_dimensions) + self.lin_p = Linear(propagate_dimensions, out_channels) + + self.reset_parameters() + + def reset_parameters(self): + self.lin_s.reset_parameters() + self.lin_h.reset_parameters() + self.lin_p.reset_parameters() + + + def forward( + self, x: Union[Tensor, PairTensor], + batch: Union[OptTensor, Optional[PairTensor]] = None) -> Tensor: + """""" + + is_bipartite: bool = True + if isinstance(x, Tensor): + x: PairTensor = (x, x) + is_bipartite = False + assert x[0].dim() == 2, 'Static graphs not supported in `GravNetConv`.' + + b: PairOptTensor = (None, None) + if isinstance(batch, Tensor): + b = (batch, batch) + elif isinstance(batch, tuple): + assert batch is not None + b = (batch[0], batch[1]) + + h_l: Tensor = self.lin_h(x[0]) + + s_l: Tensor = self.lin_s(x[0]) + s_r: Tensor = self.lin_s(x[1]) if is_bipartite else s_l + edge_index = knn_graph(s_l, self.k, b[0])##########################CHANGED################################### + + edge_weight = (s_l[edge_index[1]] - s_r[edge_index[0]]).pow(2).sum(-1) + edge_weight = torch.exp(-10. * edge_weight) # 10 gives a better spread + + # propagate_type: (x: OptPairTensor, edge_weight: OptTensor) + out = self.propagate(edge_index, x=(h_l, None), + edge_weight=edge_weight, + size=(s_l.size(0), s_r.size(0))) + + return self.lin_p(out), edge_index, edge_weight + + + def message(self, x_j: Tensor, edge_weight: Tensor) -> Tensor: + return x_j * edge_weight.unsqueeze(1) + + def aggregate(self, inputs: Tensor, index: Tensor, + dim_size: Optional[int] = None) -> Tensor: + out_mean = scatter(inputs, index, dim=self.node_dim, dim_size=dim_size, + reduce='sum') + return out_mean + + def __repr__(self): + return '{}({}, {}, k={})'.format(self.__class__.__name__, + self.in_channels, self.out_channels, + self.k) diff --git a/mlpf/pytorch_delphes/model.py b/mlpf/pytorch_delphes/model.py new file mode 100644 index 000000000..608db6eae --- /dev/null +++ b/mlpf/pytorch_delphes/model.py @@ -0,0 +1,172 @@ +import numpy as np +import mplhep + +import torch +import torch_geometric + +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv +from torch_geometric.nn import TopKPooling, SAGPooling, SGConv +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch.utils.data import random_split + +import pytorch_delphes + +#Model with gravnet clustering +class PFNet7(nn.Module): + def __init__(self, + input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, + output_dim_id=6, + output_dim_p4=6, + space_dim=4, propagate_dimensions=22, nearest=16, + target="gen", nn1=True, nn3=True): + + super(PFNet7, self).__init__() + + self.target = 
target + self.nn1 = nn1 + self.nn3 = nn3 + + self.act = nn.LeakyReLU + self.act_f = torch.nn.functional.leaky_relu + self.act_tanh = torch.nn.Tanh + self.elu = nn.ELU + + # (1) DNN: encoding/decoding of all tracks and clusters + if self.nn1: + self.nn1 = nn.Sequential( + nn.Linear(input_dim, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, input_encoding), + ) + # (2) CNN: Gravnet layer + self.conv1 = pytorch_delphes.GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) + # (3) DNN layer: classifying PID + self.nn2 = nn.Sequential( + nn.Linear(encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_id), + ) + # (4) DNN layer: regressing p4 + if self.nn3: + self.nn3 = nn.Sequential( + nn.Linear(input_dim + output_dim_id + encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_p4), + ) + def forward(self, data): + x0 = data.x + + # Encoder/Decoder step + if self.nn1: + x = self.nn1(x0) + else: + x=x0 + + # Gravnet step + x, edge_index, edge_weight = self.conv1(x) + x = self.act_f(x) # act by nonlinearity + + # DNN to predict PID + pred_ids = self.nn2(x) + + # DNN to predict p4 + if self.nn3: + nn3_input = torch.cat([x0, pred_ids, x], axis=-1) + pred_p4 = self.nn3(nn3_input) + else: + pred_p4 = torch.zeros_like(data.ycand) + + return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand + + +class PFNet7_opt(nn.Module): + def __init__(self, + input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, + output_dim_id=6, + output_dim_p4=6, + space_dim=4, propagate_dimensions=22, nearest=16, + target="gen", nn1=True, nn3=True): + + super(PFNet7_opt, self).__init__() + + self.target = target + self.nn1 = nn1 + self.nn3 = nn3 + + self.act = nn.LeakyReLU + self.act_f = torch.nn.functional.leaky_relu + self.act_tanh = torch.nn.Tanh + self.elu = nn.ELU + + # (1) DNN: encoding/decoding of all tracks and clusters + if self.nn1: + self.nn1 = nn.Sequential( + nn.Linear(input_dim, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, input_encoding), + ) + # (2) CNN: Gravnet layer + self.conv1 = pytorch_delphes.GravNetConv_optimized(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) + # (3) DNN layer: classifying PID + self.nn2 = nn.Sequential( + nn.Linear(encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_id), + ) + # (4) DNN layer: regressing p4 + if self.nn3: + self.nn3 = nn.Sequential( + nn.Linear(input_dim + output_dim_id + encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_p4), + ) + def forward(self, data): + x0 = data.x + + # Encoder/Decoder step + if self.nn1: + x = self.nn1(x0) + else: + x=x0 + + # Gravnet step + x, edge_index, edge_weight = self.conv1(x) + x = self.act_f(x) # act by nonlinearity + + # DNN to predict PID + pred_ids = self.nn2(x) + + # DNN to predict p4 + if self.nn3: + nn3_input = torch.cat([x0, pred_ids, x], axis=-1) + pred_p4 = self.nn3(nn3_input) + else: + pred_p4 
= torch.zeros_like(data.ycand) + + return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand diff --git a/mlpf/pytorch_delphes/training.py b/mlpf/pytorch_delphes/training.py new file mode 100644 index 000000000..6127696f0 --- /dev/null +++ b/mlpf/pytorch_delphes/training.py @@ -0,0 +1,231 @@ +import os +import pickle as pkl +import math, time, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import mplhep as hep + +import torch + +import pytorch_delphes + +#Ignore divide by 0 errors +np.seterr(divide='ignore', invalid='ignore') + +def compute_weights(gen_ids_one_hot, device, output_dim_id): + vs, cs = torch.unique(gen_ids_one_hot, return_counts=True) + weights = torch.zeros(output_dim_id).to(device=device) + for k, v in zip(vs, cs): + weights[k] = 1.0/math.sqrt(float(v)) + return weights + +def make_plot_from_list(l, label, xlabel, ylabel, outpath, save_as): + plt.style.use(hep.style.ROOT) + + if not os.path.exists(outpath + '/training_plots/'): + os.makedirs(outpath + '/training_plots/') + + fig, ax = plt.subplots() + ax.plot(range(len(l)), l, label=label) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + ax.legend(loc='best') + plt.savefig(outpath + '/training_plots/' + save_as + '.png') + plt.close(fig) + + with open(outpath + '/training_plots/' + save_as + '.pkl', 'wb') as f: + pkl.dump(l, f) + +@torch.no_grad() +def test(model, multi_gpu, loader, epoch, alpha, target_type, device, output_dim_id, classification_only, outpath): + with torch.no_grad(): + ret = train(model, multi_gpu, loader, epoch, None, alpha, target_type, device, output_dim_id, classification_only, outpath) + return ret + +def train(model, multi_gpu, loader, epoch, optimizer, alpha, target_type, device, output_dim_id, classification_only, outpath): + + is_train = not (optimizer is None) + + if is_train: + model.train() + else: + model.eval() + + #loss values for each batch: classification, regression, total + losses_1, losses_2, losses_tot = [], [], [] + + #accuracy values for each batch (monitor classification performance) + accuracies_batch, accuracies_batch_msk = [], [] + + #setup confusion matrix + conf_matrix = np.zeros((output_dim_id, output_dim_id)) + + # to compute average inference time + t=[] + + for i, batch in enumerate(loader): + t0 = time.time() + + if multi_gpu: + X = batch + else: + X = batch.to(device) + + ## make like tensorflow model, 0-padding events to 6k elements + # if X.x.shape[0]<6000: + # new_X = torch.cat([X.x,torch.zeros_like(X.x)[:6000-X.x.shape[0],:]]) + # new_ygen_id = torch.cat([X.ygen_id,torch.zeros_like(X.ygen_id)[:6000-X.x.shape[0],:]]) + # new_ygen_id[X.x.shape[0]:,0]=new_ygen_id[X.x.shape[0]:,0]+1 + # + # X.x = new_X + # X.ygen_id=new_ygen_id + + # Forwardprop + if i<100: + ti = time.time() + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + tf = time.time() + if i!=0: + t.append(round((tf-ti),2)) + else: + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + + _, gen_ids = torch.max(gen_ids_one_hot, -1) + _, pred_ids = torch.max(pred_ids_one_hot, -1) + _, cand_ids = torch.max(cand_ids_one_hot, -1) # rule-based result + + # masking + msk = ((pred_ids != 0) & (gen_ids != 0)) + msk2 = ((pred_ids != 0) & (pred_ids == gen_ids)) + + # computing loss + weights = compute_weights(torch.max(gen_ids_one_hot,-1)[1], device, output_dim_id) + l1 = 
torch.nn.functional.cross_entropy(pred_ids_one_hot, gen_ids, weight=weights) # for classifying PID + l2 = alpha * torch.nn.functional.mse_loss(pred_p4[msk2], gen_p4[msk2]) # for regressing p4 + + if classification_only: + loss = l1 + else: + loss = l1+l2 + + if is_train: + # BACKPROP + #print(list(model.parameters())[1].grad) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + losses_1.append(l1.detach().cpu().item()) + losses_2.append(l2.detach().cpu().item()) + losses_tot.append(loss.detach().cpu().item()) + + t1 = time.time() + + accuracies_batch.append(accuracy_score(gen_ids.detach().cpu().numpy(), pred_ids.detach().cpu().numpy())) + accuracies_batch_msk.append(accuracy_score(gen_ids[msk].detach().cpu().numpy(), pred_ids[msk].detach().cpu().numpy())) + + conf_matrix += sklearn.metrics.confusion_matrix(gen_ids.detach().cpu().numpy(), + np.argmax(pred_ids_one_hot.detach().cpu().numpy(),axis=1), labels=range(6)) + + print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), loss.detach().cpu().item(), t1-t0), end='\r', flush=True) + + print("Average Inference time per event is: ", round((sum(t) / len(t)),2), 's') + + losses_1 = np.mean(losses_1) + losses_2 = np.mean(losses_2) + losses_tot = np.mean(losses_tot) + + acc = np.mean(accuracies_batch) + acc_msk = np.mean(accuracies_batch_msk) + + conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis] + + return losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm + + +def train_loop(model, device, multi_gpu, train_loader, valid_loader, test_loader, n_epochs, patience, optimizer, alpha, target, output_dim_id, classification_only, outpath): + t0_initial = time.time() + + losses_1_train, losses_2_train, losses_tot_train = [], [], [] + losses_1_valid, losses_2_valid, losses_tot_valid = [], [], [] + + accuracies_train, accuracies_msk_train = [], [] + accuracies_valid, accuracies_msk_valid = [], [] + + best_val_loss = 99999.9 + stale_epochs = 0 + + print("Training over {} epochs".format(n_epochs)) + for epoch in range(n_epochs): + t0 = time.time() + + if stale_epochs > patience: + print("breaking due to stale epochs") + break + + # training epoch + model.train() + losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm = train(model, multi_gpu, train_loader, epoch, optimizer, alpha, target, device, output_dim_id, classification_only, outpath) + + losses_tot_train.append(losses_tot) + losses_1_train.append(losses_1) + losses_2_train.append(losses_2) + + accuracies_train.append(acc) + accuracies_msk_train.append(acc_msk) + + # validation step + model.eval() + losses_tot_v, losses_1_v, losses_2_v, acc_v, acc_msk_v, conf_matrix_v, conf_matrix_norm_v = test(model, multi_gpu, valid_loader, epoch, alpha, target, device, output_dim_id, classification_only, outpath) + + losses_tot_valid.append(losses_tot_v) + losses_1_valid.append(losses_1_v) + losses_2_valid.append(losses_2_v) + + accuracies_valid.append(acc_v) + accuracies_msk_valid.append(acc_msk_v) + + # early-stopping + if losses_tot_v < best_val_loss: + best_val_loss = losses_tot_v + stale_epochs = 0 + else: + stale_epochs += 1 + + t1 = time.time() + + epochs_remaining = n_epochs - (epoch+1) + time_per_epoch = (t1 - t0_initial)/(epoch + 1) + eta = epochs_remaining*time_per_epoch/60 + + print("epoch={}/{} dt={:.2f}min train_loss={:.5f} valid_loss={:.5f} train_acc={:.5f} valid_acc={:.5f} train_acc_msk={:.5f} valid_acc_msk={:.5f} stale={} eta={:.1f}m".format( + epoch+1, n_epochs, + (t1-t0)/60, losses_tot_train[epoch], 
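The loss assembled in `train` above combines a class-weighted cross-entropy for the PID with an `alpha`-scaled MSE on p4, where the regression term only sees elements that were classified correctly and are not null. A self-contained sketch with dummy tensors:

```python
import math
import torch

output_dim_id, alpha = 6, 2e-4
pred_ids_one_hot = torch.randn(5000, output_dim_id)     # classifier logits
pred_p4 = torch.randn(5000, 6)
gen_ids = torch.randint(0, output_dim_id, (5000,))      # target class indices
gen_p4 = torch.randn(5000, 6)

# inverse-sqrt-frequency class weights, as in compute_weights above
vs, cs = torch.unique(gen_ids, return_counts=True)
weights = torch.zeros(output_dim_id)
for k, v in zip(vs, cs):
    weights[k] = 1.0 / math.sqrt(float(v))

pred_ids = pred_ids_one_hot.argmax(-1)
msk2 = (pred_ids != 0) & (pred_ids == gen_ids)          # correctly classified, non-null

l1 = torch.nn.functional.cross_entropy(pred_ids_one_hot, gen_ids, weight=weights)
l2 = alpha * torch.nn.functional.mse_loss(pred_p4[msk2], gen_p4[msk2])
loss = l1 + l2
print(l1.item(), l2.item(), loss.item())
```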
losses_tot_valid[epoch], accuracies_train[epoch], accuracies_valid[epoch], + accuracies_msk_train[epoch], accuracies_msk_valid[epoch], stale_epochs, eta)) + + torch.save(model.state_dict(), "{0}/epoch_{1}_weights.pth".format(outpath, epoch)) + + torch.save(conf_matrix_norm, outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch) + '.pt') + torch.save(conf_matrix_norm_v, outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch) + '.pkl') + + make_plot_from_list(losses_tot_train, 'train loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_train') + make_plot_from_list(losses_1_train, 'train loss_1', 'Epochs', 'Loss', outpath, 'losses_1_train') + make_plot_from_list(losses_2_train, 'train loss_2', 'Epochs', 'Loss', outpath, 'losses_2_train') + + make_plot_from_list(losses_tot_valid, 'valid loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_valid') + make_plot_from_list(losses_1_valid, 'valid loss_1', 'Epochs', 'Loss', outpath, 'losses_1_valid') + make_plot_from_list(losses_2_valid, 'valid loss_2', 'Epochs', 'Loss', outpath, 'losses_2_valid') + + make_plot_from_list(accuracies_train, 'train accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_train') + make_plot_from_list(accuracies_msk_train, 'train accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_train') + + make_plot_from_list(accuracies_valid, 'valid accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_valid') + make_plot_from_list(accuracies_msk_valid, 'valid accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_valid') + + print('Done with training.') + return diff --git a/mlpf/pytorch_pipeline.py b/mlpf/pytorch_pipeline.py new file mode 100644 index 000000000..476832fea --- /dev/null +++ b/mlpf/pytorch_pipeline.py @@ -0,0 +1,267 @@ +from glob import glob +import sys, os +import os.path as osp +import pickle as pkl +import math, time, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import mplhep as hep + +#Check if the GPU configuration has been provided +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + import setGPU + if multi_gpu: + print('Will use multi_gpu..') + print("Let's use", torch.cuda.device_count(), "GPUs!") + else: + print('Will use single_gpu..') +except Exception as e: + print("Could not import setGPU, running CPU-only") + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') + print("GPU model:", torch.cuda.get_device_name(0)) +else: + device = torch.device('cpu') + +import torch_geometric + +from pytorch_delphes import parse_args, PFGraphDataset, data_to_loader_ttbar, data_to_loader_qcd, PFNet7, PFNet7_opt, train_loop, make_predictions +from plotting import make_plots + +#Ignore divide by 0 errors +np.seterr(divide='ignore', invalid='ignore') + +#Get a unique directory name for the model +def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, alpha, task, title): + model_name = type(model).__name__ + model_params = sum(p.numel() for p in model.parameters()) + import hashlib + model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] + model_user = os.environ['USER'] + + model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_alpha_{}_{}_{}'.format( + model_name, + target_type, + n_train, + n_epochs, + batch_size, + lr, + alpha, + task, + title) + return 
model_fname + +def make_directories_for_plots(outpath, which_data): + if not osp.isdir(outpath+'/' + which_data + '_loader'): + os.makedirs(outpath+'/' + which_data + '_loader') + if not osp.isdir(outpath+'/' + which_data + '_loader/resolution_plots'): + os.makedirs(outpath+'/' + which_data + '_loader/resolution_plots') + if not osp.isdir(outpath+'/' + which_data + '_loader/distribution_plots'): + os.makedirs(outpath+'/' + which_data + '_loader/distribution_plots') + if not osp.isdir(outpath+'/' + which_data + '_loader/multiplicity_plots'): + os.makedirs(outpath+'/' + which_data + '_loader/multiplicity_plots') + if not osp.isdir(outpath+'/' + which_data + '_loader/efficiency_plots'): + os.makedirs(outpath+'/' + which_data + '_loader/efficiency_plots') + + +if __name__ == "__main__": + + args = parse_args() + + # # the next part initializes some args values (to run the script not from terminal) + # class objectview(object): + # def __init__(self, d): + # self.__dict__ = d + # + # args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 5, 'batch_size': 1, + # 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, + # 'patience': 100, 'target': 'gen', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, + # 'dataset': '../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../test_tmp_delphes/data/pythia8_qcd', + # 'outpath': '../test_tmp_delphes/experiments/yee/', 'title': 'noembeddings', + # 'classification_only': False, 'nn1': True, 'nn3': True, + # 'load': True, 'load_epoch': 14, 'load_model': 'PFNet7_opt_gen_ntrain_1_nepochs_15_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + # 'make_predictions_train': False, 'make_plots_train': False, + # 'make_predictions_valid': False, 'make_plots_valid': False, + # 'make_predictions_test': True, 'make_plots_test': True, + # 'optimized': False, 'overwrite': False}) + + # define the dataset (assumes the data exists as .pt files in "processed") + print('Processing the data..') + full_dataset_ttbar = PFGraphDataset(args.dataset) + full_dataset_qcd = PFGraphDataset(args.dataset_qcd) + + # constructs a loader from the data to iterate over batches + print('Constructing data loaders..') + train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) + test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) + + # element parameters + input_dim = 12 + + #one-hot particle ID and momentum + output_dim_id = 6 + output_dim_p4 = 6 + + if args.optimized: + model_class = PFNet7_opt + else: + model_class = PFNet7 + + if args.load: + outpath = args.outpath + args.load_model + PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' + + print('Loading a previously trained model..') + with open(outpath + '/model_kwargs.pkl', 'rb') as f: + model_kwargs = pkl.load(f) + + model = model_class(**model_kwargs) + + state_dict = torch.load(PATH, map_location=device) + + if "DataParallel" in args.load_model: # if the model was trained using DataParallel then we do this + state_dict = torch.load(PATH, map_location=device) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove module. 
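+                # DataParallel wraps the network in a `.module` attribute, so every key in a
+                # checkpoint saved from the wrapped model starts with "module."; dropping the
+                # first 7 characters (len("module.")) recovers the key of the bare model so
+                # the weights can be loaded without the wrapper.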
+ new_state_dict[name] = v + # print('name is:', name) + state_dict=new_state_dict + + model.load_state_dict(state_dict) + + if multi_gpu: + model = torch_geometric.nn.DataParallel(model) + #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP + + model.to(device) + + if args.train: + print("Training a previously trained model..") + + elif args.train: + #instantiate the model + print('Instantiating a model..') + model_kwargs = {'input_dim': input_dim, + 'hidden_dim': args.hidden_dim, + 'hidden_dim_nn1': args.hidden_dim_nn1, + 'input_encoding': args.input_encoding, + 'encoding_dim': args.encoding_dim, + 'output_dim_id': output_dim_id, + 'output_dim_p4': output_dim_p4, + 'space_dim': args.space_dim, + 'propagate_dimensions': args.propagate_dimensions, + 'nearest': args.nearest, + 'target': args.target, + 'nn1': args.nn1, + 'nn3': args.nn3} + + model = model_class(**model_kwargs) + + if multi_gpu: + print("Parallelizing the training..") + model = torch_geometric.nn.DataParallel(model) + #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP + + model.to(device) + + if args.train: + if args.nn1: + args.title=args.title+'_nn1' + if args.nn3: + args.title=args.title+'_nn3' + if args.load: + args.title=args.title+'_retrain' + + if args.classification_only: + model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "clf", args.title) + else: + model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "both", args.title) + + outpath = osp.join(args.outpath, model_fname) + if osp.isdir(outpath): + if args.overwrite: + print("model output {} already exists, deleting it".format(outpath)) + import shutil + shutil.rmtree(outpath) + else: + print("model output {} already exists, please delete it".format(outpath)) + sys.exit(0) + try: + os.makedirs(outpath) + except Exception as e: + pass + + with open('{}/model_kwargs.pkl'.format(outpath), 'wb') as f: + pkl.dump(model_kwargs, f, protocol=pkl.HIGHEST_PROTOCOL) + + if not os.path.exists(outpath + '/confusion_matrix_plots/'): + os.makedirs(outpath + '/confusion_matrix_plots/') + + if args.optimizer == "adam": + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + elif args.optimizer == "adamw": + optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr) + + print(model) + print(model_fname) + + model.train() + train_loop(model, device, multi_gpu, + train_loader, valid_loader, test_loader, + args.n_epochs, args.patience, optimizer, args.alpha, args.target, + output_dim_id, args.classification_only, outpath) + + model.eval() + + # evaluate on training data.. + make_directories_for_plots(outpath, 'train') + if args.make_predictions_train: + make_predictions(model, multi_gpu, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + if args.make_plots_train: + make_plots(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + + # evaluate on validation data.. 
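+    # (same three steps as for the training data above: create the plot directories,
+    # optionally dump the predictions with make_predictions, then render the plots with
+    # make_plots; for the test data below, the epoch used in the output names is
+    # args.load_epoch when a pretrained model was loaded, otherwise args.n_epochs)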
+ make_directories_for_plots(outpath, 'valid') + if args.make_predictions_valid: + make_predictions(model, multi_gpu, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + if args.make_plots_valid: + make_plots(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + + # evaluate on testing data.. + make_directories_for_plots(outpath, 'test') + if args.make_predictions_test: + if args.load: + make_predictions(model, multi_gpu, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + else: + make_predictions(model, multi_gpu, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + if args.make_plots_test: + if args.load: + make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + else: + make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + + +## ----------------------------------------------------------- +# to retrieve a stored variable in pkl file +# import pickle as pkl +# with open('../../test_tmp_delphes/experiments/PFNet7_gen_ntrain_2_nepochs_3_batch_size_3_lr_0.0001/confusion_matrix_plots/cmT_normed_epoch_0.pkl', 'rb') as f: # Python 3: open(..., 'rb') +# a = pkl.load(f) +# +# with open('../../data/pythia8_qcd/raw/tev14_pythia8_qcd_10_0.pkl', 'rb') as pickle_file: +# data = pkl.load(pickle_file) +# +# data.keys() diff --git a/mlpf/tallinn/cms-gnn-dense.sh b/mlpf/tallinn/cms-dev.sh similarity index 81% rename from mlpf/tallinn/cms-gnn-dense.sh rename to mlpf/tallinn/cms-dev.sh index 423ccf8e2..4061d4b85 100755 --- a/mlpf/tallinn/cms-gnn-dense.sh +++ b/mlpf/tallinn/cms-dev.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH -p gpu -#SBATCH --gpus 5 +#SBATCH --gpus 4 #SBATCH --mem-per-gpu=8G IMG=/home/software/singularity/base.simg:latest cd ~/particleflow #TF training -singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/cms-gnn-dense.yaml +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/cms-dev.yaml diff --git a/mlpf/tallinn/cms-gen-gnn-skipconn-v2.sh b/mlpf/tallinn/cms-gen-gnn-skipconn-v2.sh deleted file mode 100755 index 4d01fc3be..000000000 --- a/mlpf/tallinn/cms-gen-gnn-skipconn-v2.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec -B /home -B /scratch-persistent --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gen-gnn-skipconn-v2.yaml --action train diff --git a/mlpf/tallinn/cms-gnn-dense-transfer.sh b/mlpf/tallinn/cms-gen.sh similarity index 53% rename from mlpf/tallinn/cms-gnn-dense-transfer.sh rename to mlpf/tallinn/cms-gen.sh index 203a79635..d2eddde9f 100755 --- a/mlpf/tallinn/cms-gnn-dense-transfer.sh +++ b/mlpf/tallinn/cms-gen.sh @@ -7,4 +7,4 @@ IMG=/home/software/singularity/base.simg:latest cd ~/particleflow #TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gnn-dense-transfer.yaml --action train --recreate +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/cms-gen.yaml diff --git a/mlpf/tallinn/cms-gnn-dense-focal-retrain.sh b/mlpf/tallinn/cms-gnn-dense-focal-retrain.sh deleted file mode 100755 index dfc675c1a..000000000 --- a/mlpf/tallinn/cms-gnn-dense-focal-retrain.sh +++ 
/dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gnn-dense-focal.yaml --action train --modifier retrain_energy --recreate --weights experiments/cms-gnn-dense-focal-285ae825.gpu0.local/weights-300-1.175282.hdf5 diff --git a/mlpf/tallinn/cms-gnn-dense-focal.sh b/mlpf/tallinn/cms-gnn-dense-focal.sh deleted file mode 100755 index 765fcec67..000000000 --- a/mlpf/tallinn/cms-gnn-dense-focal.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gnn-dense-focal.yaml --action train diff --git a/mlpf/tallinn/cms-gnn-skipconn.sh b/mlpf/tallinn/cms-gnn-skipconn.sh deleted file mode 100755 index 15205e9ce..000000000 --- a/mlpf/tallinn/cms-gnn-skipconn.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gnn-skipconn.yaml --action train --weights experiments/cms-gnn-skipconn-9f17890f/weights-500-0.994515.hdf5 -#CUDA_VISIBLE_DEVICES=0 singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gnn-skipconn.yaml --action eval --weights experiments/cms-gnn-skipconn-6cfe8834/weights-328-1.010852.hdf5 diff --git a/mlpf/tallinn/cms-transformer-skipconn-gun.sh b/mlpf/tallinn/cms-transformer-skipconn-gun.sh deleted file mode 100755 index 43a8bab6f..000000000 --- a/mlpf/tallinn/cms-transformer-skipconn-gun.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 2 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-transformer-skipconn-gun.yaml --action train diff --git a/mlpf/tallinn/cms-transformer-skipconn.sh b/mlpf/tallinn/cms-transformer-skipconn.sh deleted file mode 100755 index 854905880..000000000 --- a/mlpf/tallinn/cms-transformer-skipconn.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-transformer-skipconn.yaml --action train diff --git a/mlpf/tallinn/cms-gnn-skipconn-v2.sh b/mlpf/tallinn/cms.sh similarity index 50% rename from mlpf/tallinn/cms-gnn-skipconn-v2.sh rename to mlpf/tallinn/cms.sh index 5b3a5563f..9e614cc5d 100755 --- a/mlpf/tallinn/cms-gnn-skipconn-v2.sh +++ b/mlpf/tallinn/cms.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH -p gpu -#SBATCH --gpus 5 +#SBATCH --gpus 4 #SBATCH --mem-per-gpu=8G IMG=/home/software/singularity/base.simg:latest cd ~/particleflow #TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gnn-skipconn-v2.yaml --action train +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/cms.yaml diff --git a/mlpf/tallinn/delphes-dense.sh b/mlpf/tallinn/delphes-dense.sh deleted file mode 100755 index 9e4a497de..000000000 --- a/mlpf/tallinn/delphes-dense.sh +++ /dev/null @@ -1,10 
+0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow/delphes - -#TF training -singularity exec --nv $IMG python3 ../mlpf/tensorflow/delphes_model.py --model-spec parameters/delphes-dense.yaml --action train diff --git a/mlpf/tallinn/delphes-gnn-skipconn.sh b/mlpf/tallinn/delphes-gnn-skipconn.sh deleted file mode 100755 index fa1857855..000000000 --- a/mlpf/tallinn/delphes-gnn-skipconn.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/delphes-gnn-skipconn.yaml --action train diff --git a/mlpf/tallinn/delphes-gnn.sh b/mlpf/tallinn/delphes-gnn.sh deleted file mode 100755 index db2f3739e..000000000 --- a/mlpf/tallinn/delphes-gnn.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 2 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow/delphes - -#TF training -singularity exec --nv $IMG python3 ../mlpf/tensorflow/delphes_model.py parameters/delphes-gnn.yaml diff --git a/mlpf/tallinn/delphes-transformer-skipconn.sh b/mlpf/tallinn/delphes-transformer-skipconn.sh deleted file mode 100755 index cd096c986..000000000 --- a/mlpf/tallinn/delphes-transformer-skipconn.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/delphes-transformer-skipconn.yaml --action train diff --git a/mlpf/tallinn/delphes-transformer.sh b/mlpf/tallinn/delphes-transformer.sh deleted file mode 100755 index 6ffbb9675..000000000 --- a/mlpf/tallinn/delphes-transformer.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 4 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow/delphes - -#TF training -singularity exec --nv $IMG python3 ../mlpf/tensorflow/delphes_model.py parameters/delphes-transformer.yaml diff --git a/mlpf/tallinn/opt_master.sh b/mlpf/tallinn/opt_master.sh deleted file mode 100755 index 383de9ad6..000000000 --- a/mlpf/tallinn/opt_master.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 0 - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -export SINGULARITYENV_KERASTUNER_TUNER_ID="chief" -export SINGULARITYENV_KERASTUNER_ORACLE_IP="127.0.0.1" -export SINGULARITYENV_KERASTUNER_ORACLE_PORT="8000" -singularity exec -B /scratch $IMG python3 mlpf/tensorflow/opt.py diff --git a/mlpf/tallinn/opt_tuner.sh b/mlpf/tallinn/opt_tuner.sh deleted file mode 100755 index 80aedae22..000000000 --- a/mlpf/tallinn/opt_tuner.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 1 - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#export SINGULARITYENV_KERASTUNER_TUNER_ID="tuner-${SLURM_JOB_ID}" -#export SINGULARITYENV_KERASTUNER_ORACLE_IP="127.0.0.1" -#export SINGULARITYENV_KERASTUNER_ORACLE_PORT="8000" - -singularity exec -B /scratch --nv $IMG python3 mlpf/tensorflow/opt.py diff --git a/mlpf/tallinn/test-gnn.sh b/mlpf/tallinn/test-gnn.sh new file mode 100755 index 000000000..15017b885 --- /dev/null +++ b/mlpf/tallinn/test-gnn.sh @@ -0,0 +1,14 @@ +#!/bin/bash +#SBATCH -p gpu 
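+# runs the five parameters/test-gnn/ configurations below sequentially on a single GPU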
+#SBATCH --gpus 1 +#SBATCH --mem-per-gpu=8G + +IMG=/home/software/singularity/base.simg:latest +cd ~/particleflow + +#TF training +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/test-gnn/cms-0l.yaml --plot-freq 10 +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/test-gnn/cms-lsh-1l.yaml --plot-freq 10 +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/test-gnn/cms-lsh-2l.yaml --plot-freq 10 +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/test-gnn/cms-lsh-3l.yaml --plot-freq 10 +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/test-gnn/cms-nolsh-1l.yaml --plot-freq 10 diff --git a/mlpf/tallinn/train.sh b/mlpf/tallinn/train.sh deleted file mode 100755 index bb0f5cddf..000000000 --- a/mlpf/tallinn/train.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 1 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/tensorflow/tf_model.py \ - --datapath ./data/TTbar_14TeV_TuneCUETP8M1_cfi \ - --target cand --ntrain 70000 --ntest 20000 --convlayer ghconv \ - --lr 1e-5 --nepochs 100 --num-neighbors 10 \ - --num-hidden-id-enc 1 --num-hidden-id-dec 2 \ - --num-hidden-reg-enc 1 --num-hidden-reg-dec 2 \ - --bin-size 100 --hidden-dim-id 256 --hidden-dim-reg 256 \ - --batch-size 5 --distance-dim 256 \ - --dropout 0.0 \ - --num-convs-id 3 --num-convs-reg 3 --load experiments/run_13/weights.27-*.hdf5 - -#Pytorch training -#singularity exec -B /home --nv $IMG \ -# python3 test/train_end2end.py \ -# --dataset /home/joosep/particleflow/data/TTbar_14TeV_TuneCUETP8M1_cfi \ -# --n_train 400 --n_val 100 \ -# --model PFNet7 --convlayer gravnet-radius --lr 0.005 \ -# --hidden_dim 32 --n_epochs 100 \ -# --l1 1000.0 --l2 100.0 --l3 1000.0 --space_dim 2 --nearest 5 --convlayer2 sgconv \ -# --target cand --batch_size 1 --activation leaky_relu \ -# --dropout 0.0 --encoding_dim 256 --optimizer adamw --radius 0.01 --input-encoding 0 diff --git a/mlpf/tfmodel/callbacks.py b/mlpf/tfmodel/callbacks.py index 6edfddcda..2545a0e7f 100644 --- a/mlpf/tfmodel/callbacks.py +++ b/mlpf/tfmodel/callbacks.py @@ -4,7 +4,7 @@ from tensorflow.keras.callbacks import ModelCheckpoint from pathlib import Path import numpy as np - +import json class CustomTensorBoard(TensorBoard): """ @@ -16,6 +16,9 @@ class CustomTensorBoard(TensorBoard): Also logs momemtum for supported optimizers that use momemtum. 
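+
+    When constructed with dump_history=True, the per-epoch logs are additionally written
+    as JSON files to <log_dir>/history/history_<epoch>.json by on_epoch_end.
+
+    Illustrative usage (the log_dir value and the Keras model/dataset are placeholders,
+    not something defined in this module):
+
+        tensorboard_callback = CustomTensorBoard(log_dir="./logs", dump_history=True)
+        model.fit(dataset, epochs=10, callbacks=[tensorboard_callback])
+        # -> ./logs/history/history_0.json, ./logs/history/history_1.json, ...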
""" + def __init__(self, *args, **kwargs): + self.dump_history = kwargs.pop("dump_history") + super().__init__(*args, **kwargs) def _collect_learning_rate(self, logs): logs = logs or {} @@ -40,6 +43,13 @@ def _collect_learning_rate(self, logs): def on_epoch_end(self, epoch, logs): logs = logs or {} logs.update(self._collect_learning_rate(logs)) + if self.dump_history: + history_path = Path(self.log_dir) / "history" + history_path.mkdir(parents=True, exist_ok=True) + history_path = str(history_path) + with open("{}/history_{}.json".format(history_path, epoch), "w") as fi: + converted_logs = {k: float(v) for k, v in logs.items()} + json.dump(converted_logs, fi) super().on_epoch_end(epoch, logs) def on_train_batch_end(self, batch, logs): diff --git a/mlpf/tfmodel/data.py b/mlpf/tfmodel/data.py index 9cdb56dbc..b9e0c0f57 100644 --- a/mlpf/tfmodel/data.py +++ b/mlpf/tfmodel/data.py @@ -51,10 +51,14 @@ def __init__(self, **kwargs): print("val files: {}".format(len(self.val_filelist))) self.schema = kwargs.get("schema") + + #FIXME: refactor this if self.schema == "delphes": self.prepare_data = self.prepare_data_delphes + self.get_X_eta_phi_energy = self.get_X_eta_phi_energy_delphes elif self.schema == "cms": self.prepare_data = self.prepare_data_cms + self.get_X_eta_phi_energy = self.get_X_eta_phi_energy_cms # NONE = 0, # TRACK = 1, @@ -233,7 +237,7 @@ def serialize_chunk(self, path, files, ichunk): Xs = np.concatenate(Xs) ys = np.concatenate(ys) - #set weights for each sample to be equal to the number of samples of this type + #set weights for each sample to be equal to the number of target particles of this type #in the training script, this can be used to compute either inverse or class-balanced weights uniq_vals, uniq_counts = np.unique(np.concatenate([y[:, 0] for y in ys]), return_counts=True) for i in range(len(ys)): @@ -259,3 +263,17 @@ def process(self, num_files_per_tfr): for ichunk, files in enumerate(chunks(self.raw_filelist, num_files_per_tfr)): print(files) self.serialize_chunk(processed_path, files, ichunk) + + #FIXME: schema + def get_X_eta_phi_energy_delphes(self, X): + eta = X[:, :, 2] + sphi = X[:, :, 3] + cphi = X[:, :, 4] + energy = X[:, :, 5] + return eta, np.arctan2(sphi, cphi), energy + + def get_X_eta_phi_energy_cms(self, X): + eta = X[:, :, 2] + phi = X[:, :, 3] + energy = X[:, :, 4] + return eta, phi, energy \ No newline at end of file diff --git a/mlpf/tfmodel/hypertuning.py b/mlpf/tfmodel/hypertuning.py new file mode 100644 index 000000000..c86c97fc3 --- /dev/null +++ b/mlpf/tfmodel/hypertuning.py @@ -0,0 +1,55 @@ +from tensorboard.plugins.hparams import api as hp +import tensorflow as tf +import keras_tuner as kt + +from tfmodel.model_setup import make_model, FlattenedCategoricalAccuracy +from tfmodel.model import PFNetDense + +from tfmodel.utils import ( + get_lr_schedule, + get_optimizer, + load_config, + set_config_loss, + get_loss_dict, + parse_config, +) + + +def get_model_builder(config, total_steps): + _, optim_callbacks = get_lr_schedule(config, steps=total_steps) + def model_builder(hp): + # config["parameters"]["combined_graph_layer"]["hidden_dim"] = hp.Choice("hidden_dim", values=[128]) + # config["parameters"]["combined_graph_layer"]["distance_dim"] = hp.Choice("distance_dim", values=[128]) + # config["parameters"]["combined_graph_layer"]["num_node_messages"] = hp.Choice("num_node_messages", [1, 2]) + config["parameters"]["num_graph_layers_common"] = hp.Choice("num_graph_layers_common", [2, 3]) + config["parameters"]["num_graph_layers_energy"] = 
hp.Choice("num_graph_layers_energy", [2, 3]) + # config["parameters"]["combined_graph_layer"]["dropout"] = hp.Choice("dropout", values=[0.2]) + # config["parameters"]["combined_graph_layer"]["bin_size"] = hp.Choice("bin_size", values=[640]) + + # config["setup"]["lr"] = hp.Choice("lr", values=[1e-4, 3e-4]) + # config["setup"]["lr_schedule"] = hp.Choice("lr_schedule", values=["exponentialdecay"]) + # config["setup"]["optimizer"] = hp.Choice("optimizer", values=["adam"]) + + + model = make_model(config, dtype="float32") + model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) + + lr_schedule, _ = get_lr_schedule(config, steps=total_steps) + opt = get_optimizer(config, lr_schedule) + + loss_dict, loss_weights = get_loss_dict(config) + model.compile( + loss=loss_dict, + optimizer=opt, + sample_weight_mode="temporal", + loss_weights=loss_weights, + metrics={ + "cls": [ + FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), + FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), + ] + }, + ) + return model + + return model_builder, optim_callbacks diff --git a/mlpf/tfmodel/lr_finder.py b/mlpf/tfmodel/lr_finder.py index 152b69417..cba366eb6 100644 --- a/mlpf/tfmodel/lr_finder.py +++ b/mlpf/tfmodel/lr_finder.py @@ -14,7 +14,7 @@ class LRFinder(Callback): paper: https://arxiv.org/pdf/1803.09820.pdf. """ - def __init__(self, start_lr: float = 1e-7, end_lr: float = 3, max_steps: int = 200, smoothing=0.9): + def __init__(self, start_lr: float = 1e-7, end_lr: float = 1e-2, max_steps: int = 200, smoothing=0.9): super(LRFinder, self).__init__() self.start_lr, self.end_lr = start_lr, end_lr self.max_steps = max_steps @@ -46,7 +46,7 @@ def on_train_batch_end(self, batch, logs=None): if step == 0 or loss < self.best_loss: self.best_loss = loss - if smooth_loss > 4 * self.best_loss or tf.math.is_nan(smooth_loss): + if smooth_loss > 100 * self.best_loss or tf.math.is_nan(smooth_loss): self.model.stop_training = True print("Loss reached predefined maximum... 
stopping") if step >= self.max_steps: diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 9e148a6ef..7bf0db3ae 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -5,12 +5,10 @@ import tensorflow as tf -from .fast_attention import Attention, SelfAttention - import numpy as np from numpy.lib.recfunctions import append_fields -regularizer_weight = 1e-8 +regularizer_weight = 0.0 def split_indices_to_bins(cmul, nbins, bin_size): bin_idx = tf.argmax(cmul, axis=-1) @@ -36,6 +34,23 @@ def pairwise_gaussian_dist(A, B): D = tf.sqrt(tf.maximum(na - 2*tf.matmul(A, B, False, True) + nb, 1e-6)) return D +def pairwise_learnable_dist(A, B, ffn, training=False): + shp = tf.shape(A) + + #stack node feature vectors of src[i], dst[j] into a matrix res[i,j] = (src[i], dst[j]) + mg = tf.meshgrid(tf.range(shp[0]), tf.range(shp[1]), tf.range(shp[2]), tf.range(shp[2]), indexing="ij") + inds1 = tf.stack([mg[0],mg[1],mg[2]], axis=-1) + inds2 = tf.stack([mg[0],mg[1],mg[3]], axis=-1) + res = tf.concat([ + tf.gather_nd(A, inds1), + tf.gather_nd(B, inds2)], axis=-1 + ) #(batch, bin, elem, elem, feat) + + #run a feedforward net on (src, dst) -> 1 + res_transformed = ffn(res, training=training) + + return res_transformed + def pairwise_sigmoid_dist(A, B): return tf.nn.sigmoid(tf.matmul(A, tf.transpose(B, perm=[0,2,1]))) @@ -124,16 +139,29 @@ def __init__(self, num_input_classes): @tf.function def call(self, X): + log_energy = tf.expand_dims(tf.math.log(X[:, :, 4]+1.0), axis=-1) + #X[:, :, 0] - categorical index of the element type Xid = tf.cast(tf.one_hot(tf.cast(X[:, :, 0], tf.int32), self.num_input_classes), dtype=X.dtype) #Xpt = tf.expand_dims(tf.math.log1p(X[:, :, 1]), axis=-1) Xpt = tf.expand_dims(tf.math.log(X[:, :, 1] + 1.0), axis=-1) - Xeta1 = tf.expand_dims(tf.sinh(X[:, :, 2]), axis=-1) - Xeta2 = tf.expand_dims(tf.cosh(X[:, :, 2]), axis=-1) + + Xpt_0p5 = tf.math.sqrt(Xpt) + Xpt_2 = tf.math.pow(Xpt, 2) + + Xeta1 = tf.clip_by_value(tf.expand_dims(tf.sinh(X[:, :, 2]), axis=-1), -10, 10) + Xeta2 = tf.clip_by_value(tf.expand_dims(tf.cosh(X[:, :, 2]), axis=-1), -10, 10) + Xabs_eta = tf.expand_dims(tf.math.abs(X[:, :, 2]), axis=-1) Xphi1 = tf.expand_dims(tf.sin(X[:, :, 3]), axis=-1) Xphi2 = tf.expand_dims(tf.cos(X[:, :, 3]), axis=-1) + #Xe = tf.expand_dims(tf.math.log1p(X[:, :, 4]), axis=-1) - Xe = tf.expand_dims(tf.math.log(X[:, :, 4]+1.0), axis=-1) + Xe = log_energy + Xe_0p5 = tf.math.sqrt(log_energy) + Xe_2 = tf.math.pow(log_energy, 2) + + Xe_transverse = log_energy - tf.math.log(Xeta2) + Xlayer = tf.expand_dims(X[:, :, 5]*10.0, axis=-1) Xdepth = tf.expand_dims(X[:, :, 6]*10.0, axis=-1) @@ -143,53 +171,22 @@ def call(self, X): Xphi_hcal2 = tf.expand_dims(tf.cos(X[:, :, 12]), axis=-1) return tf.concat([ - Xid, Xpt, + Xid, + Xpt, Xpt_0p5, Xpt_2, Xeta1, Xeta2, + Xabs_eta, Xphi1, Xphi2, - Xe, Xlayer, Xdepth, - Xphi_ecal1, Xphi_ecal2, Xphi_hcal1, Xphi_hcal2, + Xe, Xe_0p5, Xe_2, + Xe_transverse, + Xlayer, Xdepth, + Xphi_ecal1, Xphi_ecal2, + Xphi_hcal1, Xphi_hcal2, X], axis=-1 ) -#https://arxiv.org/pdf/2004.04635.pdf -#https://github.com/gcucurull/jax-ghnet/blob/master/models.py -class GHConv(tf.keras.layers.Layer): - def __init__(self, *args, **kwargs): - self.activation = kwargs.pop("activation") - - super(GHConv, self).__init__(*args, **kwargs) - - def build(self, input_shape): - self.hidden_dim = input_shape[0][-1] - self.nelem = input_shape[0][-2] - self.W_t = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="w_t", initializer="random_normal", trainable=True, 
regularizer=tf.keras.regularizers.L1(regularizer_weight)) - self.b_t = self.add_weight(shape=(self.hidden_dim,), name="b_t", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - self.W_h = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="w_h", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - self.theta = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="theta", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - - #@tf.function - def call(self, inputs): - x, adj = inputs - - #compute the normalization of the adjacency matrix - in_degrees = tf.sparse.reduce_sum(tf.abs(adj), axis=-1) - in_degrees = tf.reshape(in_degrees, (tf.shape(x)[0], tf.shape(x)[1])) - - #add epsilon to prevent numerical issues from 1/sqrt(x) - norm = tf.expand_dims(tf.pow(in_degrees + 1e-6, -0.5), -1) - - f_hom = tf.linalg.matmul(x, self.theta) - f_hom = sparse_dense_matmult_batch(adj, f_hom*norm)*norm - - f_het = tf.linalg.matmul(x, self.W_h) - gate = tf.nn.sigmoid(tf.linalg.matmul(x, self.W_t) + self.b_t) - - out = gate*f_hom + (1-gate)*f_het - return self.activation(out) - class GHConvDense(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): - self.activation = kwargs.pop("activation") + self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) self.output_dim = kwargs.pop("output_dim") self.normalize_degrees = kwargs.pop("normalize_degrees", True) @@ -206,6 +203,9 @@ def build(self, input_shape): #@tf.function def call(self, inputs): x, adj, msk = inputs + + adj = tf.squeeze(adj) + #compute the normalization of the adjacency matrix if self.normalize_degrees: in_degrees = tf.clip_by_value(tf.reduce_sum(tf.abs(adj), axis=-1), 0, 1000) @@ -225,190 +225,137 @@ def call(self, inputs): out = gate*f_hom + (1.0-gate)*f_het return self.activation(out)*msk -class SGConv(tf.keras.layers.Layer): +class NodeMessageLearnable(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): - self.activation = kwargs.pop("activation") - self.k = kwargs.pop("k") - super(SGConv, self).__init__(*args, **kwargs) - - def build(self, input_shape): - hidden_dim = input_shape[0][-1] - self.W = self.add_weight(shape=(hidden_dim, hidden_dim), name="w", initializer="random_normal", trainable=True) - self.b = self.add_weight(shape=(hidden_dim,), name="b", initializer="random_normal", trainable=True) - #@tf.function - def call(self, inputs): - x, adj = inputs - #compute the normalization of the adjacency matrix - in_degrees = tf.sparse.reduce_sum(tf.abs(adj), axis=-1) - - #add epsilon to prevent numerical issues from 1/sqrt(x) - norm = tf.expand_dims(tf.pow(in_degrees + 1e-6, -0.5), -1) - norm_k = tf.pow(norm, self.k) - - support = tf.linalg.matmul(x, self.W) - - #k-th power of the normalized adjacency matrix is nearly equivalent to k consecutive GCN layers - #adj_k = tf.pow(adj, self.k) - out = sparse_dense_matmult_batch(adj, support*norm)*norm - - return self.activation(out + self.b) - -def point_wise_feed_forward_network(d_model, dff, num_layers=1, activation='elu', dtype=tf.dtypes.float32, name=None): - bias_regularizer = tf.keras.regularizers.L1(regularizer_weight) - kernel_regularizer = tf.keras.regularizers.L1(regularizer_weight) - return tf.keras.Sequential( - [tf.keras.layers.Dense(dff, activation=activation, bias_regularizer=bias_regularizer, kernel_regularizer=kernel_regularizer) for i in range(num_layers)] + - 
[tf.keras.layers.Dense(d_model, dtype=dtype)], - name=name - ) - -class SparseHashedNNDistance(tf.keras.layers.Layer): - def __init__(self, distance_dim=128, max_num_bins=200, bin_size=500, num_neighbors=5, dist_mult=0.1, **kwargs): - super(SparseHashedNNDistance, self).__init__(**kwargs) - self.num_neighbors = tf.constant(num_neighbors) - self.dist_mult = dist_mult - self.distance_dim = distance_dim - - #generate the codebook for LSH hashing at model instantiation for up to this many bins - #set this to a high-enough value at model generation to take into account the largest possible input - self.max_num_bins = tf.constant(max_num_bins) - - #each bin will receive this many input elements, in total we can accept max_num_bins*bin_size input elements - #in each bin, we will do a dense top_k evaluation - self.bin_size = bin_size - self.layer_encoding = point_wise_feed_forward_network(distance_dim, 128) - self.layer_edge = point_wise_feed_forward_network(1, 128) - - def build(self, input_shape): - #(n_batch, n_points, n_features) - - #generate the LSH codebook for random rotations (num_features, max_num_bins/2) - self.codebook_random_rotations = self.add_weight( - shape=(self.distance_dim, self.max_num_bins//2), initializer="random_normal", trainable=False, name="lsh_projections" - ) - - #@tf.function - def call(self, inputs, training=True): - - #(n_batch, n_points, n_features) - point_embedding = self.layer_encoding(inputs) - - n_batches = tf.shape(point_embedding)[0] - n_points = tf.shape(point_embedding)[1] - #points_neighbors = n_points * self.num_neighbors - - #cannot concat sparse tensors directly as that incorrectly destroys the gradient, see - #https://github.com/tensorflow/tensorflow/blob/df3a3375941b9e920667acfe72fb4c33a8f45503/tensorflow/python/ops/sparse_grad.py#L33 - def func(args): - ibatch, points_batch = args[0], args[1] - bins_split, (inds, vals) = self.construct_sparse_dm_batch(points_batch) - inds = tf.concat([tf.expand_dims(tf.cast(ibatch, tf.int64)*tf.ones(tf.shape(inds)[0], dtype=tf.int64), -1), inds], axis=-1) - return inds, vals, bins_split - - elems = (tf.range(0, n_batches, delta=1, dtype=tf.int64), point_embedding) - ret = tf.map_fn(func, elems, - fn_output_signature=( - tf.TensorSpec((None, 3), tf.int64), - tf.TensorSpec((None, ), inputs.dtype), - tf.TensorSpec((None, self.bin_size), tf.int32), - ), - parallel_iterations=2, back_prop=True - ) - - # #now create a new SparseTensor that is a concatenation of the per-batch tensor indices and values - shp = tf.shape(ret[0]) - dms = tf.SparseTensor( - tf.reshape(ret[0], (shp[0]*shp[1], shp[2])), - tf.reshape(ret[1], (shp[0]*shp[1],)), - (n_batches, n_points, n_points) + self.output_dim = kwargs.pop("output_dim") + self.hidden_dim = kwargs.pop("hidden_dim") + self.num_layers = kwargs.pop("num_layers") + self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) + self.aggregation_direction = kwargs.pop("aggregation_direction") + + if self.aggregation_direction == "dst": + self.agg_dim = -2 + elif self.aggregation_direction == "src": + self.agg_dim = -3 + + self.ffn = point_wise_feed_forward_network( + self.output_dim, + self.hidden_dim, + num_layers=self.num_layers, + activation=self.activation, + name=kwargs.get("name")+"_ffn" ) + super(NodeMessageLearnable, self).__init__(*args, **kwargs) - dm = tf.sparse.reorder(dms) - - i1 = tf.transpose(tf.stack([dm.indices[:, 0], dm.indices[:, 1]])) - i2 = tf.transpose(tf.stack([dm.indices[:, 0], dm.indices[:, 2]])) - x1 = tf.gather_nd(inputs, i1) - x2 = 
tf.gather_nd(inputs, i2) - - #run an edge net on (src node, dst node, edge) - edge_vals = tf.nn.sigmoid(self.layer_edge(tf.concat([x1, x2, tf.expand_dims(dm.values, axis=-1)], axis=-1))) - dm2 = tf.sparse.SparseTensor(indices=dm.indices, values=edge_vals[:, 0], dense_shape=dm.dense_shape) - - return dm2, ret[2] + def call(self, inputs): + x, adj, msk = inputs + avg_message = tf.reduce_mean(adj, axis=self.agg_dim) + max_message = tf.reduce_max(adj, axis=self.agg_dim) + x2 = tf.concat([x, avg_message, max_message], axis=-1)*msk + return self.activation(self.ffn(x2)) + +def point_wise_feed_forward_network(d_model, dff, name, num_layers=1, activation='elu', dtype=tf.dtypes.float32, dim_decrease=False, dropout=0.0): + + if regularizer_weight > 0: + bias_regularizer = tf.keras.regularizers.L1(regularizer_weight) + kernel_regularizer = tf.keras.regularizers.L1(regularizer_weight) + else: + bias_regularizer = None + kernel_regularizer = None + + layers = [] + for ilayer in range(num_layers): + _name = name + "_dense_{}".format(ilayer) + + layers.append(tf.keras.layers.Dense( + dff, activation=activation, bias_regularizer=bias_regularizer, + kernel_regularizer=kernel_regularizer, name=_name)) + + if dropout>0.0: + layers.append(tf.keras.layers.Dropout(dropout)) + + if dim_decrease: + dff = dff // 2 + + layers.append(tf.keras.layers.Dense(d_model, dtype=dtype, name="{}_dense_{}".format(name, ilayer+1))) + return tf.keras.Sequential(layers, name=name) + +def get_message_layer(config_dict, name): + config_dict = config_dict.copy() + class_name = config_dict.pop("type") + classes = { + "NodeMessageLearnable": NodeMessageLearnable, + "GHConvDense": GHConvDense + } + conv_cls = classes[class_name] + + return conv_cls(name=name, **config_dict) + +class NodePairGaussianKernel(tf.keras.layers.Layer): + def __init__(self, clip_value_low=0.0, dist_mult=0.1, **kwargs): + self.clip_value_low = clip_value_low + self.dist_mult = dist_mult + super(NodePairGaussianKernel, self).__init__(**kwargs) - #@tf.function - def subpoints_to_sparse_matrix(self, subindices, subpoints): + """ + x_msg_binned: (n_batch, n_bins, n_points, n_msg_features) - #find the distance matrix between the given points in all the LSH bins - dm = pairwise_gaussian_dist(subpoints, subpoints) #(LSH_bins, points_per_bin, points_per_bin) + returns: (n_batch, n_bins, n_points, n_points, 1) message matrix + """ + def call(self, x_msg_binned, training=False): + dm = tf.expand_dims(pairwise_gaussian_dist(x_msg_binned, x_msg_binned), axis=-1) dm = tf.exp(-self.dist_mult*dm) + dm = tf.clip_by_value(dm, self.clip_value_low, 1) + return dm - #dm = pairwise_sigmoid_dist(subpoints, subpoints) #(LSH_bins, points_per_bin, points_per_bin) - - dmshape = tf.shape(dm) - nbins = dmshape[0] - nelems = dmshape[1] - - #run KNN in the dense distance matrix, accumulate each index pair into a sparse distance matrix - top_k = tf.nn.top_k(dm, k=self.num_neighbors) - top_k_vals = tf.reshape(top_k.values, (nbins*nelems, self.num_neighbors)) - - indices_gathered = tf.map_fn( - lambda i: tf.gather_nd(subindices, top_k.indices[:, :, i:i+1], batch_dims=1), - tf.range(self.num_neighbors, dtype=tf.int32), fn_output_signature=tf.TensorSpec(None, tf.int32) +class NodePairTrainableKernel(tf.keras.layers.Layer): + def __init__(self, output_dim=32, hidden_dim=32, num_layers=2, activation="elu", **kwargs): + self.output_dim = output_dim + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.activation = getattr(tf.keras.activations, activation) + + self.ffn_kernel = 
point_wise_feed_forward_network( + self.output_dim, + self.hidden_dim, + kwargs.get("name") + "_" + "ffn", + num_layers=self.num_layers, + activation=self.activation ) - indices_gathered = tf.transpose(indices_gathered, [1,2,0]) - - def func(i): - dst_ind = indices_gathered[:, :, i] #(nbins, nelems) - dst_ind = tf.reshape(dst_ind, (nbins*nelems, )) - src_ind = tf.reshape(tf.stack(subindices), (nbins*nelems, )) - src_dst_inds = tf.cast(tf.transpose(tf.stack([src_ind, dst_ind])), dtype=tf.int64) - return src_dst_inds, top_k_vals[:, i] - - ret = tf.map_fn(func, tf.range(0, self.num_neighbors, delta=1, dtype=tf.int32), fn_output_signature=(tf.int64, subpoints.dtype)) - - shp = tf.shape(ret[0]) - inds = tf.reshape(ret[0], (shp[0]*shp[1], 2)) - vals = tf.reshape(ret[1], (shp[0]*shp[1],)) - return inds, vals - def construct_sparse_dm_batch(self, points): - #points: (n_points, n_features) input elements for graph construction - n_points = tf.shape(points)[0] - n_features = tf.shape(points)[1] - - #compute the number of LSH bins to divide the input points into on the fly - #n_points must be divisible by bin_size exactly due to the use of reshape - n_bins = tf.math.floordiv(n_points, self.bin_size) + super(NodePairTrainableKernel, self).__init__(**kwargs) - #put each input item into a bin defined by the softmax output across the LSH embedding - mul = tf.linalg.matmul(points, self.codebook_random_rotations[:, :n_bins//2]) - cmul = tf.concat([mul, -mul], axis=-1) + """ + x_msg_binned: (n_batch, n_bins, n_points, n_msg_features) - #cmul is now an integer in [0..nbins) for each input point - #bins_split: (n_bins, bin_size) of integer bin indices, which puts each input point into a bin of size (n_points/n_bins) - bins_split = split_indices_to_bins(cmul, n_bins, self.bin_size) + returns: (n_batch, n_bins, n_points, n_points, output_dim) message matrix + """ + def call(self, x_msg_binned, training=False): + dm = pairwise_learnable_dist(x_msg_binned, x_msg_binned, self.ffn_kernel, training=training) + dm = self.activation(dm) + return dm - #parts: (n_bins, bin_size, n_features), the input points divided up into bins - parts = tf.gather(points, bins_split) +def build_kernel_from_conf(kernel_dict, name): + kernel_dict = kernel_dict.copy() - #sparse_distance_matrix: (n_points, n_points) sparse distance matrix - #where higher values (closer to 1) are associated with points that are closely related - sparse_distance_matrix = self.subpoints_to_sparse_matrix(bins_split, parts) + cls_type = kernel_dict.pop("type") + clss = { + "NodePairGaussianKernel": NodePairGaussianKernel, + "NodePairTrainableKernel": NodePairTrainableKernel + } - return bins_split, sparse_distance_matrix + return clss[cls_type](name=name, **kernel_dict) -class ExponentialLSHDistanceDense(tf.keras.layers.Layer): - def __init__(self, clip_value_low=0.0, distance_dim=128, max_num_bins=200, bin_size=128, dist_mult=0.1, **kwargs): - super(ExponentialLSHDistanceDense, self).__init__(**kwargs) - self.dist_mult = dist_mult +class MessageBuildingLayerLSH(tf.keras.layers.Layer): + def __init__(self, distance_dim=128, max_num_bins=200, bin_size=128, kernel=NodePairGaussianKernel(), **kwargs): self.distance_dim = distance_dim self.max_num_bins = max_num_bins self.bin_size = bin_size - self.clip_value_low = clip_value_low - + self.kernel = kernel + + super(MessageBuildingLayerLSH, self).__init__(**kwargs) + def build(self, input_shape): #(n_batch, n_points, n_features) @@ -417,461 +364,331 @@ def build(self, input_shape): shape=(self.distance_dim, 
self.max_num_bins//2), initializer="random_normal", trainable=False, name="lsh_projections" ) - - def call(self, x_dist, x_features, msk): - msk_f = tf.expand_dims(tf.cast(msk, x_dist.dtype), -1) - n_batches = tf.shape(x_dist)[0] - n_points = tf.shape(x_dist)[1] - n_features = tf.shape(x_dist)[2] + + """ + x_msg: (n_batch, n_points, n_msg_features) + x_node: (n_batch, n_points, n_node_features) + """ + def call(self, x_msg, x_node, msk, training=False): + msk_f = tf.expand_dims(tf.cast(msk, x_msg.dtype), -1) + + shp = tf.shape(x_msg) + n_batches = shp[0] + n_points = shp[1] + n_message_features = shp[2] #compute the number of LSH bins to divide the input points into on the fly #n_points must be divisible by bin_size exactly due to the use of reshape n_bins = tf.math.floordiv(n_points, self.bin_size) #put each input item into a bin defined by the argmax output across the LSH embedding - mul = tf.linalg.matmul(x_dist, self.codebook_random_rotations[:, :n_bins//2]) + #FIXME: this needs n_bins to be at least 2 to work correctly! + mul = tf.linalg.matmul(x_msg, self.codebook_random_rotations[:, :n_bins//2]) cmul = tf.concat([mul, -mul], axis=-1) bins_split = split_indices_to_bins_batch(cmul, n_bins, self.bin_size, msk) - x_dist_binned = tf.gather(x_dist, bins_split, batch_dims=1) - x_features_binned = tf.gather(x_features, bins_split, batch_dims=1) + x_msg_binned = tf.gather(x_msg, bins_split, batch_dims=1) + x_features_binned = tf.gather(x_node, bins_split, batch_dims=1) msk_f_binned = tf.gather(msk_f, bins_split, batch_dims=1) - dm = pairwise_gaussian_dist(x_dist_binned, x_dist_binned) - dm = tf.exp(-self.dist_mult*dm) - - #set the distance matrix to 0 for masked elements - dm *= msk_f_binned - shp = tf.shape(msk_f_binned) - dm *= tf.reshape(msk_f_binned, (shp[0], shp[1], shp[3], shp[2])) + #Run the node-to-node kernel (distance computation / graph building / attention) + dm = self.kernel(x_msg_binned, training=training) - dm = tf.clip_by_value(dm, self.clip_value_low, 1) + #remove the masked points row-wise and column-wise + dm = tf.einsum("abijk,abi->abijk", dm, tf.squeeze(msk_f_binned, axis=-1)) + dm = tf.einsum("abijk,abj->abijk", dm, tf.squeeze(msk_f_binned, axis=-1)) return bins_split, x_features_binned, dm, msk_f_binned -class EncoderDecoderGNN(tf.keras.layers.Layer): - def __init__(self, encoders, decoders, dropout, activation, conv, **kwargs): - super(EncoderDecoderGNN, self).__init__(**kwargs) - name = kwargs.get("name") - - #assert(encoders[-1] == decoders[0]) - self.encoders = encoders - self.decoders = decoders - - self.encoding_layers = [] - for ilayer, nunits in enumerate(encoders): - self.encoding_layers.append( - tf.keras.layers.Dense(nunits, activation=activation, - kernel_regularizer=tf.keras.regularizers.L1(regularizer_weight), - bias_regularizer=tf.keras.regularizers.L1(regularizer_weight), - name="encoding_{}_{}".format(name, ilayer))) - if dropout > 0.0: - self.encoding_layers.append(tf.keras.layers.Dropout(dropout)) - - self.conv = conv - - self.decoding_layers = [] - for ilayer, nunits in enumerate(decoders): - self.decoding_layers.append( - tf.keras.layers.Dense(nunits, activation=activation, - kernel_regularizer=tf.keras.regularizers.L1(regularizer_weight), - bias_regularizer=tf.keras.regularizers.L1(regularizer_weight), - name="decoding_{}_{}".format(name, ilayer))) - if dropout > 0.0: - self.decoding_layers.append(tf.keras.layers.Dropout(dropout)) - - @tf.function - def call(self, inputs, distance_matrix, training=True): - x = inputs - - for layer in 
self.encoding_layers: - x = layer(x) +class MessageBuildingLayerFull(tf.keras.layers.Layer): + def __init__(self, distance_dim=128, kernel=NodePairGaussianKernel(), **kwargs): + self.distance_dim = distance_dim + self.kernel = kernel - for convlayer in self.conv: - x = convlayer([x, distance_matrix]) + super(MessageBuildingLayerFull, self).__init__(**kwargs) + + """ + x_msg: (n_batch, n_points, n_msg_features) + """ + def call(self, x_msg, msk, training=False): + msk_f = tf.expand_dims(tf.cast(msk, x_msg.dtype), -1) - for layer in self.decoding_layers: - x = layer(x) + shp = tf.shape(x_msg) + n_batches = shp[0] + n_points = shp[1] + n_message_features = shp[2] - return x + #Run the node-to-node kernel (distance computation / graph building / attention) + dm = self.kernel(x_msg, training=training) -class AddSparse(tf.keras.layers.Layer): - def __init__(self, **kwargs): - super(AddSparse, self).__init__(**kwargs) + #remove the masked points row-wise and column-wise + dm = tf.einsum("bijk,bi->bijk", dm, tf.squeeze(msk_f, axis=-1)) + dm = tf.einsum("bijk,bj->bijk", dm, tf.squeeze(msk_f, axis=-1)) - def call(self, matrices): - ret = matrices[0] - for mat in matrices[1:]: - ret = tf.sparse.add(ret, mat) - return ret + return dm -#Simple message passing based on a matrix multiplication -class PFNet(tf.keras.Model): +class OutputDecoding(tf.keras.Model): def __init__(self, - multi_output=False, - num_input_classes=8, - num_output_classes=3, - num_momentum_outputs=3, - activation=tf.nn.selu, - hidden_dim_id=256, - hidden_dim_reg=256, - distance_dim=256, - convlayer="ghconv", - dropout=0.1, - bin_size=10, - num_convs_id=1, - num_convs_reg=1, - num_hidden_id_enc=1, - num_hidden_id_dec=1, - num_hidden_reg_enc=1, - num_hidden_reg_dec=1, - num_neighbors=5, - dist_mult=0.1, - skip_connection=False, - return_matrix=False): - - super(PFNet, self).__init__() - self.activation = activation - self.num_dists = 1 - self.num_momentum_outputs = num_momentum_outputs - self.skip_connection = skip_connection - self.multi_output = multi_output - self.return_matrix = return_matrix + activation="elu", + regression_use_classification=True, + num_output_classes=8, + schema="cms", + dropout=0.0, + + pt_skip_gate=True, + eta_skip_gate=True, + phi_skip_gate=True, + energy_skip_gate=True, + + id_dim_decrease=True, + charge_dim_decrease=True, + pt_dim_decrease=False, + eta_dim_decrease=False, + phi_dim_decrease=False, + energy_dim_decrease=False, + + id_hidden_dim=128, + charge_hidden_dim=128, + pt_hidden_dim=128, + eta_hidden_dim=128, + phi_hidden_dim=128, + energy_hidden_dim=128, + + id_num_layers=4, + charge_num_layers=2, + pt_num_layers=3, + eta_num_layers=3, + phi_num_layers=3, + energy_num_layers=3, + + layernorm=False, + mask_reg_cls0=True, + **kwargs): + + super(OutputDecoding, self).__init__(**kwargs) + + self.regression_use_classification = regression_use_classification + self.schema = schema + self.dropout = dropout + + self.pt_skip_gate = pt_skip_gate + self.eta_skip_gate = eta_skip_gate + self.phi_skip_gate = phi_skip_gate + + self.mask_reg_cls0 = mask_reg_cls0 + + self.do_layernorm = layernorm + if self.do_layernorm: + self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, name="output_layernorm") + + self.ffn_id = point_wise_feed_forward_network( + num_output_classes, id_hidden_dim, + "ffn_cls", + dtype=tf.dtypes.float32, + num_layers=id_num_layers, + activation=activation, + dim_decrease=id_dim_decrease, + dropout=dropout + ) + self.ffn_charge = point_wise_feed_forward_network( + 1, charge_hidden_dim, + 
"ffn_charge", + dtype=tf.dtypes.float32, + num_layers=charge_num_layers, + activation=activation, + dim_decrease=charge_dim_decrease, + dropout=dropout + ) + + self.ffn_pt = point_wise_feed_forward_network( + 2, pt_hidden_dim, "ffn_pt", + dtype=tf.dtypes.float32, num_layers=pt_num_layers, activation=activation, dim_decrease=pt_dim_decrease, + dropout=dropout + ) - encoding_id = [] - decoding_id = [] - encoding_reg = [] - decoding_reg = [] + self.ffn_eta = point_wise_feed_forward_network( + 2, eta_hidden_dim, "ffn_eta", + dtype=tf.dtypes.float32, num_layers=eta_num_layers, activation=activation, dim_decrease=eta_dim_decrease, + dropout=dropout + ) - #the encoder outputs and decoder inputs have to have the hidden dim (convlayer size) - for ihidden in range(num_hidden_id_enc): - encoding_id.append(hidden_dim_id) + self.ffn_phi = point_wise_feed_forward_network( + 4, phi_hidden_dim, "ffn_phi", + dtype=tf.dtypes.float32, num_layers=phi_num_layers, activation=activation, dim_decrease=phi_dim_decrease, + dropout=dropout + ) - for ihidden in range(num_hidden_id_dec): - decoding_id.append(hidden_dim_id) + self.ffn_energy = point_wise_feed_forward_network( + num_output_classes, energy_hidden_dim, "ffn_energy", + dtype=tf.dtypes.float32, num_layers=energy_num_layers, activation=activation, dim_decrease=energy_dim_decrease, + dropout=dropout) - for ihidden in range(num_hidden_reg_enc): - encoding_reg.append(hidden_dim_reg) + self.ffn_energy_classwise = point_wise_feed_forward_network( + 1, energy_hidden_dim, "ffn_energy_classwise_shift", + dtype=tf.dtypes.float32, num_layers=energy_num_layers, activation=activation, dim_decrease=energy_dim_decrease, + dropout=dropout) - for ihidden in range(num_hidden_reg_dec): - decoding_reg.append(hidden_dim_reg) + """ + X_input: (n_batch, n_elements, n_input_features) raw node input features + X_encoded: (n_batch, n_elements, n_encoded_features) encoded/transformed node features + msk_input: (n_batch, n_elements) boolean mask of active nodes + """ + def call(self, args, training=False): - self.enc = InputEncoding(num_input_classes) - #self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6) - - self.dist = SparseHashedNNDistance(distance_dim=distance_dim, bin_size=bin_size, num_neighbors=num_neighbors, dist_mult=dist_mult) - - convs_id = [] - convs_reg = [] - if convlayer == "sgconv": - for iconv in range(num_convs_id): - convs_id.append(SGConv(k=1, activation=activation, name="conv_id{}".format(iconv))) - for iconv in range(num_convs_reg): - convs_reg.append(SGConv(k=1, activation=activation, name="conv_reg{}".format(iconv))) - elif convlayer == "ghconv": - for iconv in range(num_convs_id): - convs_id.append(GHConv(activation=activation, name="conv_id{}".format(iconv))) - for iconv in range(num_convs_reg): - convs_reg.append(GHConv(activation=activation, name="conv_reg{}".format(iconv))) - - self.gnn_id = EncoderDecoderGNN(encoding_id, decoding_id, dropout, activation, convs_id, name="gnn_id") - self.layer_id = point_wise_feed_forward_network(num_output_classes, hidden_dim_id, num_layers=3, activation=activation) - self.layer_charge = point_wise_feed_forward_network(1, hidden_dim_id, num_layers=3, activation=activation) - - self.gnn_reg = EncoderDecoderGNN(encoding_reg, decoding_reg, dropout, activation, convs_reg, name="gnn_reg") - self.layer_momentum = point_wise_feed_forward_network(num_momentum_outputs, hidden_dim_reg, num_layers=3, activation=activation) + X_input, X_encoded, X_encoded_energy, msk_input = args - # def create_model(self, 
num_max_elems, num_input_features, training=True): - # inputs = tf.keras.Input(shape=(num_max_elems, num_input_features,)) - # return tf.keras.Model(inputs=[inputs], outputs=self.call(inputs, training), name="MLPFNet") + if self.do_layernorm: + X_encoded = self.layernorm(X_encoded) - def call(self, inputs, training=True): - X = inputs - msk_input = tf.expand_dims(tf.cast(X[:, :, 0] != 0, tf.dtypes.float32), -1) + out_id_logits = self.ffn_id(X_encoded, training=training)*msk_input - enc = self.enc(inputs) + out_id_softmax = tf.nn.softmax(out_id_logits, axis=-1) + out_id_hard_softmax = tf.stop_gradient(tf.nn.softmax(100*out_id_logits, axis=-1)) + out_charge = self.ffn_charge(X_encoded, training=training)*msk_input - #create a graph structure from the encoded nodes - dm, bins = self.dist(enc, training) + orig_eta = X_input[:, :, 2:3] - #run graph net for multiclass id prediction - x_id = self.gnn_id(enc, dm, training) - - if self.skip_connection: - to_decode = tf.concat([enc, x_id], axis=-1) - else: - to_decode = tf.concat([x_id], axis=-1) + #FIXME: better schema propagation + #skip connection from raw input values + if self.schema == "cms": + orig_sin_phi = tf.math.sin(X_input[:, :, 3:4])*msk_input + orig_cos_phi = tf.math.cos(X_input[:, :, 3:4])*msk_input + orig_energy = X_input[:, :, 4:5]*msk_input + elif self.schema == "delphes": + orig_sin_phi = X_input[:, :, 3:4]*msk_input + orig_cos_phi = X_input[:, :, 4:5]*msk_input + orig_energy = X_input[:, :, 5:6]*msk_input - out_id_logits = self.layer_id(to_decode)*msk_input - out_charge = self.layer_charge(to_decode)*msk_input + if self.regression_use_classification: + X_encoded = tf.concat([X_encoded, tf.stop_gradient(out_id_logits)], axis=-1) - #run graph net for regression output prediction, taking as an additonal input the ID predictions - x_reg = self.gnn_reg(tf.concat([enc, tf.cast(out_id_logits, X.dtype)], axis=-1), dm, training) + pred_eta_corr = self.ffn_eta(X_encoded, training=training)*msk_input + pred_phi_corr = self.ffn_phi(X_encoded, training=training)*msk_input - if self.skip_connection: - to_decode = tf.concat([enc, tf.cast(out_id_logits, X.dtype), x_reg], axis=-1) + if self.eta_skip_gate: + eta_gate = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) + pred_eta = orig_eta + pred_eta_corr[:, :, 1:2] else: - to_decode = tf.concat([tf.cast(out_id_logits, X.dtype), x_reg], axis=-1) - - pred_momentum = self.layer_momentum(to_decode)*msk_input - - out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) - out_charge = tf.clip_by_value(out_charge, -2, 2) - - if self.multi_output: - ret = { - "cls": out_id_softmax, - "charge": out_charge, - "pt": pred_momentum[:, :, 0:1], - "eta": pred_momentum[:, :, 1:2], - "sin_phi": pred_momentum[:, :, 2:3], - "cos_phi": pred_momentum[:, :, 3:4], - "energy": pred_momentum[:, :, 4:5] - } - if self.return_matrix: - ret["dm"] = dm - ret["bins"] = bins - return ret + pred_eta = orig_eta*pred_eta_corr[:, :, 0:1] + pred_eta_corr[:, :, 1:2] + + if self.phi_skip_gate: + sin_phi_gate = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 0:1]) + cos_phi_gate = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 2:3]) + pred_sin_phi = orig_sin_phi + pred_phi_corr[:, :, 1:2] + pred_cos_phi = orig_cos_phi + pred_phi_corr[:, :, 3:4] else: - return tf.concat([out_id_softmax, out_charge, pred_momentum], axis=-1) - - def set_trainable_classification(self): - for layer in self.layers: - layer.trainable = False - self.gnn_id.trainable = True - self.layer_id.trainable = True - - def 
set_trainable_regression(self): - for layer in self.layers: - layer.trainable = False - self.gnn_reg.trainable = True - self.layer_momentum.trainable = True - - - -#Transformer code from the TF example -class EncoderLayer(tf.keras.layers.Layer): - def __init__(self, d_model, num_heads, dff, rate=0.1, support=8, dtype=tf.dtypes.float32): - super(EncoderLayer, self).__init__() - - self.mha = SelfAttention(d_model, num_heads, rate, projection_matrix_type=True, nb_random_features=support) - self.ffn = point_wise_feed_forward_network(d_model, dff) - - self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - - self.dropout1 = tf.keras.layers.Dropout(rate) - self.dropout2 = tf.keras.layers.Dropout(rate) - - def call(self, x, training): - - attn_output = self.mha(x, None, training=training) # (batch_size, input_seq_len, d_model) - attn_output = self.dropout1(attn_output, training=training) - out1 = self.layernorm1(x + attn_output) # (batch_size, input_seq_len, d_model) - - ffn_output = self.ffn(out1) # (batch_size, input_seq_len, d_model) - ffn_output = self.dropout2(ffn_output, training=training) - out2 = self.layernorm2(out1 + ffn_output) # (batch_size, input_seq_len, d_model) - - return out2 - -class DecoderLayer(tf.keras.layers.Layer): - def __init__(self, d_model, num_heads, dff, rate=0.1, support=8, dtype=tf.dtypes.float32): - super(DecoderLayer, self).__init__() - - self.mha1 = SelfAttention(d_model, num_heads, rate, projection_matrix_type=True, nb_random_features=support) - self.mha2 = Attention(d_model, num_heads, rate, projection_matrix_type=True, nb_random_features=support) - self.ffn = point_wise_feed_forward_network(d_model, dff) - - self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - - self.dropout1 = tf.keras.layers.Dropout(rate) - self.dropout2 = tf.keras.layers.Dropout(rate) - self.dropout3 = tf.keras.layers.Dropout(rate) - - - def call(self, x, enc_output, training): - # enc_output.shape == (batch_size, input_seq_len, d_model) - - attn1 = self.mha1(x, None, training=training) # (batch_size, target_seq_len, d_model) - attn1 = self.dropout1(attn1, training=training) - out1 = self.layernorm1(attn1 + x) - - attn2 = self.mha2(enc_output, out1, None, training=training) # (batch_size, target_seq_len, d_model) - attn2 = self.dropout2(attn2, training=training) - out2 = self.layernorm2(attn2 + out1) # (batch_size, target_seq_len, d_model) - - ffn_output = self.ffn(out2) # (batch_size, target_seq_len, d_model) - ffn_output = self.dropout3(ffn_output, training=training) - out3 = self.layernorm3(ffn_output + out2) # (batch_size, target_seq_len, d_model) - - return out3 - -class Encoder(tf.keras.layers.Layer): - def __init__(self, num_layers, d_model, num_heads, dff, support=32, rate=0.1, dtype=tf.dtypes.float32): - super(Encoder, self).__init__() - - self.d_model = d_model - self.num_layers = num_layers - - self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate, support=support, dtype=dtype) - for _ in range(num_layers)] - - self.dropout = tf.keras.layers.Dropout(rate) - - def call(self, x, training): - - for i in range(self.num_layers): - x = self.enc_layers[i](x, training) - - x = self.dropout(x, training=training) - return x # (batch_size, input_seq_len, d_model) - -class Decoder(tf.keras.layers.Layer): - def __init__(self, num_layers, d_model, 
num_heads, dff, support=32, rate=0.1, dtype=tf.dtypes.float32): - super(Decoder, self).__init__() - - self.d_model = d_model - self.num_layers = num_layers - - self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate, support=support, dtype=dtype) - for _ in range(num_layers)] - self.dropout = tf.keras.layers.Dropout(rate) - - def call(self, x, enc_output, training): - - for i in range(self.num_layers): - x = self.dec_layers[i](x, enc_output, training) - - x = self.dropout(x, training=training) - - # x.shape == (batch_size, target_seq_len, d_model) - return x - - - -class Transformer(tf.keras.Model): - def __init__(self, - num_layers, d_model, num_heads, dff, - dropout=0.1, - support=32, - num_input_classes=8, - num_output_classes=3, - num_momentum_outputs=3, - dtype=tf.dtypes.float32, - skip_connection=False, - multi_output=False): - super(Transformer, self).__init__() - - self.skip_connection = skip_connection - self.multi_output = multi_output - self.num_momentum_outputs = num_momentum_outputs - - self.enc = InputEncoding(num_input_classes) - self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - - self.ffn_embed_id = point_wise_feed_forward_network(d_model, dff) - self.ffn_embed_reg = point_wise_feed_forward_network(d_model, dff) - - self.encoder_id = Encoder(num_layers, d_model, num_heads, dff, support, dropout, dtype) - self.decoder_id = Decoder(num_layers, d_model, num_heads, dff, support, dropout, dtype) - - self.encoder_reg = Encoder(num_layers, d_model, num_heads, dff, support, dropout, dtype) - self.decoder_reg = Decoder(num_layers, d_model, num_heads, dff, support, dropout, dtype) - - self.ffn_id = point_wise_feed_forward_network(num_output_classes, dff, dtype=tf.dtypes.float32) - self.ffn_charge = point_wise_feed_forward_network(1, dff, dtype=tf.dtypes.float32) - self.ffn_momentum = point_wise_feed_forward_network(num_momentum_outputs, dff, dtype=tf.dtypes.float32) + pred_sin_phi = orig_sin_phi*pred_phi_corr[:, :, 0:1] + pred_phi_corr[:, :, 1:2] + pred_cos_phi = orig_cos_phi*pred_phi_corr[:, :, 2:3] + pred_phi_corr[:, :, 3:4] - def call(self, inputs, training): - X = inputs - msk_input = tf.expand_dims(tf.cast(X[:, :, 0] != 0, inputs.dtype), -1) - - enc = self.enc(X) - enc = self.layernorm1(enc) + X_encoded_energy = tf.concat([X_encoded, X_encoded_energy], axis=-1) + if self.regression_use_classification: + X_encoded_energy = tf.concat([X_encoded_energy, tf.stop_gradient(out_id_logits)], axis=-1) - enc_id = self.ffn_embed_id(enc) - enc_reg = self.ffn_embed_reg(enc) + pred_energy_corr = self.ffn_energy(X_encoded_energy, training=training)*msk_input + pred_energy = tf.reduce_sum(out_id_hard_softmax*pred_energy_corr, axis=-1, keepdims=True) - enc_output_id = self.encoder_id(enc_id, training) - enc_output_id = self.layernorm2(enc_output_id) - dec_output_id = self.decoder_id(enc_id, enc_output_id, training) + pred_energy += tf.reduce_sum( + out_id_hard_softmax*self.ffn_energy_classwise( + tf.concat([orig_energy, tf.stop_gradient(out_id_logits)], axis=-1), training=training), + axis=-1, keepdims=True) - if self.skip_connection: - dec_output_id = tf.concat([enc_id, dec_output_id], axis=-1) + pred_energy = tf.math.exp(tf.clip_by_value(pred_energy, -3, 8)) - enc_output_reg = self.encoder_reg(enc_reg, training) - enc_output_reg = self.layernorm3(enc_output_reg) - dec_output_reg = self.decoder_reg(enc_reg, enc_output_reg, training) + 
#prediction is pred_log_energy=log(energy + 1.0), energy=exp(pred_log_energy) - 1.0 + #pred_energy = tf.math.exp(tf.clip_by_value(pred_log_energy, -6, 6)) - 1.0 - out_id_logits = self.ffn_id(dec_output_id)*msk_input - out_charge = self.ffn_charge(dec_output_id)*msk_input + #compute pt=E/cosh(eta) + orig_pt = tf.stop_gradient(pred_energy/tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8))) + #orig_log_pt = tf.math.log(orig_pt + 1.0) - if self.skip_connection: - dec_output_reg = tf.concat([enc_reg, tf.cast(out_id_logits, X.dtype), dec_output_reg], axis=-1) + pred_pt_corr = self.ffn_pt(X_encoded_energy, training=training)*msk_input + if self.pt_skip_gate: + pt_gate = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) + pred_pt = orig_pt + pt_gate*pred_pt_corr[:, :, 1:2] else: - dec_output_reg = tf.concat([tf.cast(out_id_logits, X.dtype), dec_output_reg], axis=-1) - pred_momentum = self.ffn_momentum(dec_output_reg)*msk_input + pred_pt = orig_pt*pred_pt_corr[:, :, 0:1] + pred_pt_corr[:, :, 1:2] + + if self.mask_reg_cls0: + msk_output = tf.expand_dims(tf.cast(tf.argmax(out_id_hard_softmax, axis=-1)!=0, tf.float32), axis=-1) + out_charge = out_charge*msk_output + pred_pt = pred_pt*msk_output + pred_eta = pred_eta*msk_output + pred_sin_phi = pred_sin_phi*msk_output + pred_cos_phi = pred_cos_phi*msk_output + pred_energy = pred_energy*msk_output + + ret = { + "cls": out_id_softmax, + "charge": out_charge*msk_input, + "pt": pred_pt*msk_input, + "eta": pred_eta*msk_input, + "sin_phi": pred_sin_phi*msk_input, + "cos_phi": pred_cos_phi*msk_input, + "energy": pred_energy*msk_input, + } - out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) - out_charge = tf.clip_by_value(out_charge, -2, 2) - if self.multi_output: - return { - "cls": out_id_softmax, "charge": out_charge, - "pt": pred_momentum[:, :, 0:1], - "eta": pred_momentum[:, :, 1:2], - "sin_phi": pred_momentum[:, :, 2:3], - "cos_phi": pred_momentum[:, :, 3:4], - "energy": pred_momentum[:, :, 4:5], - } - else: - return tf.concat([out_id_softmax, out_charge, pred_momentum], axis=-1) - - def set_trainable_classification(self): - for layer in self.layers: - layer.trainable = True - self.encoder_reg.trainable = False - self.decoder_reg.trainable = False - self.ffn_momentum.trainable = False + return ret def set_trainable_regression(self): - for layer in self.layers: - layer.trainable = False - self.encoder_id.trainable = False - self.decoder_id.trainable = False self.ffn_id.trainable = False self.ffn_charge.trainable = False + self.ffn_phi.trainable = False + self.ffn_eta.trainable = False + self.ffn_pt.trainable = False + self.ffn_energy.trainable = True + self.ffn_energy_classwise.trainable = True + def set_trainable_classification(self): + self.ffn_id.trainable = True + self.ffn_charge.trainable = True + self.ffn_phi.trainable = False + self.ffn_eta.trainable = False + self.ffn_pt.trainable = False + self.ffn_energy.trainable = False + self.ffn_energy_classwise.trainable = False class CombinedGraphLayer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): self.max_num_bins = kwargs.pop("max_num_bins") self.bin_size = kwargs.pop("bin_size") - self.dist_mult = kwargs.pop("dist_mult") - self.distance_dim = kwargs.pop("distance_dim") - self.output_dim = kwargs.pop("output_dim") - self.do_layernorm = kwargs.pop("layernorm") - self.clip_value_low = kwargs.pop("clip_value_low") - self.num_conv = kwargs.pop("num_conv") - self.normalize_degrees = kwargs.pop("normalize_degrees") + self.num_node_messages = kwargs.pop("num_node_messages") 
self.dropout = kwargs.pop("dropout") + self.kernel = kwargs.pop("kernel") + self.node_message = kwargs.pop("node_message") + self.hidden_dim = kwargs.pop("hidden_dim") + self.do_lsh = kwargs.pop("do_lsh", True) + self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) + self.dist_activation = getattr(tf.keras.activations, kwargs.pop("dist_activation", "linear")) if self.do_layernorm: - self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6) - - self.ffn_dist = point_wise_feed_forward_network(self.distance_dim, self.distance_dim) - self.dist = ExponentialLSHDistanceDense(clip_value_low=self.clip_value_low, distance_dim=self.distance_dim, max_num_bins=self.max_num_bins , bin_size=self.bin_size, dist_mult=self.dist_mult) - self.convs = [GHConvDense( - activation=tf.keras.activations.elu, - output_dim=self.output_dim, - normalize_degrees=self.normalize_degrees - ) for iconv in range(self.num_conv) + self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6, name=kwargs.get("name")+"_layernorm") + + #self.gaussian_noise = tf.keras.layers.GaussianNoise(0.01) + self.ffn_dist = point_wise_feed_forward_network( + self.distance_dim, + self.hidden_dim, + kwargs.get("name") + "_ffn_dist", + num_layers=2, activation=self.activation, + dropout=self.dropout + ) + + if self.do_lsh: + self.message_building_layer = MessageBuildingLayerLSH( + distance_dim=self.distance_dim, + max_num_bins=self.max_num_bins, + bin_size=self.bin_size, + kernel=build_kernel_from_conf(self.kernel, kwargs.get("name")+"_kernel") + ) + else: + self.message_building_layer = MessageBuildingLayerFull( + distance_dim=self.distance_dim, + kernel=build_kernel_from_conf(self.kernel, kwargs.get("name")+"_kernel") + ) + + self.message_passing_layers = [ + get_message_layer(self.node_message, "{}_msg_{}".format(kwargs.get("name"), iconv)) for iconv in range(self.num_node_messages) ] self.dropout_layer = None if self.dropout: @@ -879,177 +696,149 @@ def __init__(self, *args, **kwargs): super(CombinedGraphLayer, self).__init__(*args, **kwargs) - def call(self, x, msk, training): + def call(self, x, msk, training=False): if self.do_layernorm: - x = self.layernorm(x) + x = self.layernorm(x, training=training) + + #compute node features for graph building + x_dist = self.dist_activation(self.ffn_dist(x, training=training)) + + #x_dist = self.gaussian_noise(x_dist, training=training) + + #compute the element-to-element messages / distance matrix / graph structure + if self.do_lsh: + bins_split, x, dm, msk_f = self.message_building_layer(x_dist, x, msk) + else: + dm = self.message_building_layer(x_dist, msk) + msk_f = tf.expand_dims(tf.cast(msk, x.dtype), axis=-1) + bins_split = None + + #run the node update with message passing + for msg in self.message_passing_layers: + x = msg((x, dm, msk_f)) + + #x = self.gaussian_noise(x, training=training) - x_dist = self.ffn_dist(x) - bins_split, x_binned, dm, msk_binned = self.dist(x_dist, x, msk) - for conv in self.convs: - x_binned = conv((x_binned, dm, msk_binned)) if self.dropout_layer: - x_binned = self.dropout_layer(x_binned, training) + x = self.dropout_layer(x, training=training) - x_enc = reverse_lsh(bins_split, x_binned) + #undo the binning according to the element-to-bin indices + if self.do_lsh: + x = reverse_lsh(bins_split, x) - return {"enc": x_enc, "dist": x_dist, "bins": bins_split, "dm": dm} + return {"enc": x, "dist": x_dist, "bins": bins_split, "dm": dm} class PFNetDense(tf.keras.Model): def __init__(self, + do_node_encoding=False, + 
hidden_dim=128, + dropout=0.0, + activation="gelu", multi_output=False, num_input_classes=8, num_output_classes=3, - num_momentum_outputs=3, - max_num_bins=200, - bin_size=320, - dist_mult=0.1, - distance_dim=128, - hidden_dim=256, - layernorm=False, - clip_value_low=0.0, - activation=tf.keras.activations.elu, - num_conv=2, - num_gsl=1, - normalize_degrees=False, - dropout=0.0, - separate_momentum=True, + num_graph_layers_common=1, + num_graph_layers_energy=1, input_encoding="cms", - focal_loss_from_logits=False, - debug=False + skip_connection=True, + graph_kernel={}, + combined_graph_layer={}, + node_message={}, + output_decoding={}, + debug=False, + schema="cms" ): super(PFNetDense, self).__init__() self.multi_output = multi_output - self.num_momentum_outputs = num_momentum_outputs - self.activation = activation - self.separate_momentum = separate_momentum - self.focal_loss_from_logits = focal_loss_from_logits self.debug = debug - self.num_conv = num_conv - self.num_gsl = num_gsl + self.skip_connection = skip_connection + + self.do_node_encoding = do_node_encoding + self.hidden_dim = hidden_dim + self.dropout = dropout + self.activation = getattr(tf.keras.activations, activation) + + if self.do_node_encoding: + self.node_encoding = point_wise_feed_forward_network( + self.hidden_dim, + self.hidden_dim, + "node_encoding", + num_layers=2, + activation=self.activation, + dropout=self.dropout + ) if input_encoding == "cms": self.enc = InputEncodingCMS(num_input_classes) elif input_encoding == "default": self.enc = InputEncoding(num_input_classes) - dff = hidden_dim - self.ffn_enc_id = point_wise_feed_forward_network(dff, dff, activation=activation, name="ffn_enc_id") - self.ffn_enc_reg = point_wise_feed_forward_network(dff, dff, activation=activation, name="ffn_enc_reg") - - kwargs_cg = { - "output_dim": dff, - "max_num_bins": max_num_bins, - "bin_size": bin_size, - "dist_mult": dist_mult, - "distance_dim": distance_dim, - "layernorm": layernorm, - "clip_value_low": clip_value_low, - "num_conv": num_conv, - "normalize_degrees": normalize_degrees, - "dropout": dropout - } - self.cg_id = [CombinedGraphLayer(**kwargs_cg) for i in range(num_gsl)] - self.cg_reg = [CombinedGraphLayer(**kwargs_cg) for i in range(num_gsl)] + self.cg = [CombinedGraphLayer(name="cg_{}".format(i), **combined_graph_layer) for i in range(num_graph_layers_common)] + self.cg_energy = [CombinedGraphLayer(name="cg_energy_{}".format(i), **combined_graph_layer) for i in range(num_graph_layers_energy)] - self.ffn_id = point_wise_feed_forward_network(num_output_classes, dff, name="ffn_cls", dtype=tf.dtypes.float32, num_layers=3, activation=activation) - self.ffn_charge = point_wise_feed_forward_network(1, dff, name="ffn_charge", dtype=tf.dtypes.float32, num_layers=3, activation=activation) - - if self.separate_momentum: - self.ffn_momentum = [ - point_wise_feed_forward_network( - 1, dff, name="ffn_momentum{}".format(imomentum), - dtype=tf.dtypes.float32, num_layers=3, activation=activation - ) for imomentum in range(num_momentum_outputs) - ] - else: - self.ffn_momentum = point_wise_feed_forward_network(num_momentum_outputs, dff, name="ffn_momentum", dtype=tf.dtypes.float32, num_layers=3, activation=activation) + output_decoding["schema"] = schema + output_decoding["num_output_classes"] = num_output_classes + self.output_dec = OutputDecoding(**output_decoding) def call(self, inputs, training=False): X = inputs + debugging_data = {} + + #mask padded elements msk = X[:, :, 0] != 0 msk_input = tf.expand_dims(tf.cast(msk, 
tf.float32), -1) + #encode the elements for classification (id) enc = self.enc(X) - enc_id = self.activation(self.ffn_enc_id(enc)) - encs_id = [] - debugging_data = {} - for cg in self.cg_id: - enc_id_all = cg(enc_id, msk, training) - enc_id = enc_id_all["enc"] - if self.debug: - debugging_data[cg.name] = enc_id_all - encs_id.append(enc_id) - - enc_reg = self.activation(self.ffn_enc_reg(enc)) - encs_reg = [] - for cg in self.cg_reg: - enc_reg_all = cg(enc_reg, msk, training) - enc_reg = enc_reg_all["enc"] + + encs = [] + if self.skip_connection: + encs.append(enc) + enc_cg = enc + if self.do_node_encoding: + enc_cg = self.node_encoding(enc_cg, training=training) + for cg in self.cg: + enc_all = cg(enc_cg, msk, training=training) + enc_cg = enc_all["enc"] if self.debug: - debugging_data[cg.name] = enc_reg_all - encs_reg.append(enc_reg) + debugging_data[cg.name] = enc_all + encs.append(enc_cg) - dec_output_id = tf.concat([enc] + encs_id, axis=-1) + dec_input = [] + dec_input += encs + dec_output = tf.concat(dec_input, axis=-1)*msk_input + if self.debug: + debugging_data["dec_output"] = dec_output - out_id_logits = self.ffn_id(dec_output_id)*msk_input + enc_cg = enc + encs_energy = [] + if self.skip_connection: + encs_energy.append(enc) + for cg in self.cg_energy: + enc_all = cg(enc_cg, msk, training=training) + enc_cg = enc_all["enc"] + if self.debug: + debugging_data[cg.name] = enc_all + encs_energy.append(enc_cg) - if self.focal_loss_from_logits: - out_id_softmax = out_id_logits - else: - out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) - - out_charge = self.ffn_charge(dec_output_id)*msk_input - dec_output_reg = tf.concat([enc, tf.cast(out_id_logits, X.dtype)] + encs_reg, axis=-1) - - if self.separate_momentum: - pred_momentum = [ffn(dec_output_reg) for ffn in self.ffn_momentum] - pred_momentum = tf.concat(pred_momentum, axis=-1)*msk_input - else: - pred_momentum = self.ffn_momentum(dec_output_reg)*msk_input + dec_output_energy = tf.concat(encs_energy, axis=-1)*msk_input + if self.debug: + debugging_data["dec_output_energy"] = dec_output_energy + + ret = self.output_dec([X, dec_output, dec_output_energy, msk_input], training=training) - out_charge = tf.clip_by_value(out_charge, -2, 2) + if self.debug: + for k in debugging_data.keys(): + ret[k] = debugging_data[k] if self.multi_output: - ret = { - "cls": out_id_softmax, - "charge": out_charge, - "pt": pred_momentum[:, :, 0:1], - "eta": pred_momentum[:, :, 1:2], - "sin_phi": pred_momentum[:, :, 2:3], - "cos_phi": pred_momentum[:, :, 3:4], - "energy": pred_momentum[:, :, 4:5], - } - if self.debug: - for k in debugging_data.keys(): - ret[k] = debugging_data[k] return ret else: - return tf.concat([out_id_softmax, out_charge, pred_momentum], axis=-1) - - def set_trainable_classification(self): - self.trainable = True - for layer in self.layers: - layer.trainable = True - - self.ffn_enc_reg.trainable = False - for cg in self.cg_reg: - cg.trainable = False - self.ffn_momentum.trainable = False - - def set_trainable_regression(self): - self.trainable = True - for layer in self.layers: - layer.trainable = True - - self.ffn_enc_id.trainable = False - for cg in self.cg_id: - cg.trainable = False - self.ffn_id.trainable = False - self.ffn_charge.trainable = False + return tf.concat([ret["cls"], ret["charge"], ret["pt"], ret["eta"], ret["sin_phi"], ret["cos_phi"], ret["energy"]], axis=-1) def set_trainable_named(self, layer_names): self.trainable = True @@ -1057,8 +846,102 @@ def set_trainable_named(self, layer_names): for layer in 
self.layers: layer.trainable = False - for layer in layer_names: - self.get_layer(layer).trainable = True + self.output_dec.set_trainable_named(layer_names) + + # def train_step(self, data): + # # Unpack the data. Its structure depends on your model and + # # on what you pass to `fit()`. + # x, y, sample_weights = data + + # if not hasattr(self, "step"): + # self.step = 0 + + # with tf.GradientTape() as tape: + # y_pred = self(x, training=True) # Forward pass + + # #regression losses computed only for correctly classified particles + # pred_cls = tf.argmax(y_pred["cls"], axis=-1) + # true_cls = tf.argmax(y["cls"], axis=-1) + # #msk_loss = tf.cast((pred_cls==true_cls) & (true_cls!=0), tf.float32) + # #sample_weights["energy"] *= msk_loss + # #sample_weights["pt"] *= msk_loss + # #sample_weights["eta"] *= msk_loss + # #sample_weights["sin_phi"] *= msk_loss + # #sample_weights["cos_phi"] *= msk_loss + + # for icls in [3, ]: + # msk1 = (true_cls==icls) + # msk2 = (pred_cls==icls) + # import matplotlib + # import matplotlib.pyplot as plt + + # plt.figure(figsize=(4,4)) + # minval = np.min(y["energy"][msk1].numpy().flatten()) + # maxval = np.max(y["energy"][msk1].numpy().flatten()) + # plt.scatter( + # y["energy"][msk1&msk2].numpy().flatten(), + # y_pred["energy"][msk1&msk2].numpy().flatten(), + # marker=".", alpha=0.5 + # ) + # plt.xlabel("true") + # plt.ylabel("pred") + # plt.plot([minval,maxval], [minval,maxval], color="black", ls="--", lw=1.0) + # plt.savefig("train_cls{}_{}.png".format(icls, self.step), bbox_inches="tight") + # plt.close("all") + + # loss = self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) + + # # Compute gradients + # trainable_vars = self.trainable_variables + # gradients = tape.gradient(loss, trainable_vars) + # # Update weights + # self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + # # Update metrics (includes the metric that tracks the loss) + # self.compiled_metrics.update_state(y, y_pred) + # # Return a dict mapping metric names to current value + + # self.step += 1 + # return {m.name: m.result() for m in self.metrics} + + # def test_step(self, data): + # # Unpack the data + # x, y, sample_weights = data + # # Compute predictions + # y_pred = self(x, training=False) + + # pred_cls = tf.argmax(y_pred["cls"], axis=-1) + # true_cls = tf.argmax(y["cls"], axis=-1) + + # for icls in [3, ]: + # msk1 = (true_cls==icls) + # msk2 = (pred_cls==icls) + # import matplotlib + # import matplotlib.pyplot as plt + + # plt.figure(figsize=(4,4)) + # minval = np.min(y["energy"][msk1].numpy().flatten()) + # maxval = np.max(y["energy"][msk1].numpy().flatten()) + # plt.scatter( + # y["energy"][msk1&msk2].numpy().flatten(), + # y_pred["energy"][msk1&msk2].numpy().flatten(), + # marker=".", alpha=0.5 + # ) + # plt.xlabel("true") + # plt.ylabel("pred") + # plt.plot([minval,maxval], [minval,maxval], color="black", ls="--", lw=1.0) + # plt.savefig("test_cls{}_{}.png".format(icls, self.step), bbox_inches="tight") + # plt.close("all") + + + # # Updates the metrics tracking the loss + # self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) + # # Update the metrics. + # self.compiled_metrics.update_state(y, y_pred) + # # Return a dict mapping metric names to current value. + # # Note that it will include the loss (tracked in self.metrics). 
+ + # self.step += 1 + # return {m.name: m.result() for m in self.metrics} class DummyNet(tf.keras.Model): def __init__(self, diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index e2a7a4af4..5383587be 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -1,7 +1,6 @@ -from .model import PFNet, Transformer, DummyNet, PFNetDense +from .model import DummyNet, PFNetDense import tensorflow as tf -import tensorflow_probability import tensorflow_addons as tfa import pickle import numpy as np @@ -20,12 +19,14 @@ import time import json import random +import math import platform +import mplhep from tqdm import tqdm from pathlib import Path from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler from tfmodel.callbacks import CustomTensorBoard -from tfmodel.utils import get_lr_schedule, make_weight_function, targets_multi_output +from tfmodel.utils import get_lr_schedule, get_optimizer, make_weight_function, targets_multi_output from tensorflow.keras.metrics import Recall, CategoricalAccuracy @@ -57,12 +58,22 @@ def plot_to_image(figure): return image + class CustomCallback(tf.keras.callbacks.Callback): - def __init__(self, outpath, X, y, dataset_transform, num_output_classes): + def __init__(self, dataset_def, outpath, X, y, dataset_transform, num_output_classes, plot_freq=1, comet_experiment=None): super(CustomCallback, self).__init__() self.X = X self.y = y - self.dataset_transform = dataset_transform + self.plot_freq = plot_freq + self.comet_experiment = comet_experiment + + self.dataset_def = dataset_def + + #transform the prediction target from an array into a dictionary for easier access + self.ytrue = dataset_transform(self.X, self.y, None)[1] + self.ytrue = {k: np.array(v) for k, v in self.ytrue.items()} + self.ytrue_id = np.argmax(self.ytrue["cls"], axis=-1) + self.outpath = outpath self.num_output_classes = num_output_classes @@ -81,117 +92,353 @@ def __init__(self, outpath, X, y, dataset_transform, num_output_classes): 11: "gray" } - def on_epoch_end(self, epoch, logs=None): - - with open("{}/history_{}.json".format(self.outpath, epoch), "w") as fi: - json.dump(logs, fi) + self.reg_bins = { + "pt": np.linspace(0, 100, 100), + "eta": np.linspace(-6, 6, 100), + "sin_phi": np.linspace(-1,1,100), + "cos_phi": np.linspace(-1,1,100), + "energy": None, + } - ypred = self.model(self.X, training=False) - #ypred["cls"] = np.clip(ypred["cls"], 0.5, 1.0) - - ypred_id = np.argmax(ypred["cls"], axis=-1) + def plot_cm(self, epoch, outpath, ypred_id, msk): - ibatch = 0 - - msk = self.X[:, :, 0] != 0 - # cm = sklearn.metrics.confusion_matrix( - # self.y[msk][:, 0].astype(np.int64).flatten(), - # ypred_id[msk].flatten(), labels=list(range(self.num_output_classes)) - # ) - # figure = plot_confusion_matrix(cm) - # plt.savefig("{}/cm_{}.pdf".format(self.outpath, epoch), bbox_inches="tight") - # plt.close("all") + ytrue_id_flat = self.ytrue_id[msk].astype(np.int64).flatten() + ypred_id_flat = ypred_id[msk].flatten() cm = sklearn.metrics.confusion_matrix( - self.y[msk][:, 0].astype(np.int64).flatten(), - ypred_id[msk].flatten(), labels=list(range(self.num_output_classes)), normalize="true" + ytrue_id_flat, + ypred_id_flat, labels=list(range(self.num_output_classes)), normalize="true" ) + if self.comet_experiment: + self.comet_experiment.log_confusion_matrix( + file_name="confusion-matrix-epoch{}.json".format(epoch), matrix=cm, epoch=epoch + ) + figure = plot_confusion_matrix(cm) acc = sklearn.metrics.accuracy_score( - self.y[msk][:, 
0].astype(np.int64).flatten(), - ypred_id[msk].flatten() + ytrue_id_flat, + ypred_id_flat ) balanced_acc = sklearn.metrics.balanced_accuracy_score( - self.y[msk][:, 0].astype(np.int64).flatten(), - ypred_id[msk].flatten() + ytrue_id_flat, + ypred_id_flat ) plt.title("acc={:.3f} bacc={:.3f}".format(acc, balanced_acc)) - plt.savefig("{}/cm_normed_{}.pdf".format(self.outpath, epoch), bbox_inches="tight") + + image_path = str(outpath / "cm_normed.png") + plt.savefig(image_path, bbox_inches="tight") plt.close("all") + if self.comet_experiment: + self.comet_experiment.log_image(image_path, step=epoch) + + def plot_event_visualization(self, epoch, outpath, ypred, ypred_id, msk, ievent=0): - # for icls in range(self.num_output_classes): - # fig = plt.figure(figsize=(4,4)) - # msk = self.y[:, :, 0] == icls - # msk = msk.flatten() - # b = np.linspace(0,1,21) - # ids = ypred["cls"][:, :, icls].numpy().flatten() - # plt.hist(ids[msk], bins=b, density=True, histtype="step", lw=2) - # plt.hist(ids[~msk], bins=b, density=True, histtype="step", lw=2) - # plt.savefig("{}/cls{}_{}.pdf".format(self.outpath, icls, epoch), bbox_inches="tight") - # for icls in range(self.num_output_classes): - # n_pred = np.sum(self.y[:, :, 0]==icls, axis=1) - # n_true = np.sum(ypred_id==icls, axis=1) - # figure = plot_num_particle(n_pred, n_true, icls) - # plt.savefig("{}/num_cls{}_{}.pdf".format(self.outpath, icls, epoch), bbox_inches="tight") + X_eta, X_phi, X_energy = self.dataset_def.get_X_eta_phi_energy(self.X) fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(3*5, 5)) + #Plot the input PFElements plt.axes(ax1) - msk = self.X[ibatch, :, 0] != 0 - eta = self.X[ibatch][msk][:, 2] - phi = self.X[ibatch][msk][:, 3] - energy = self.X[ibatch][msk][:, 4] - typ = self.X[ibatch][msk][:, 0] + msk = self.X[ievent, :, 0] != 0 + eta = X_eta[ievent][msk] + phi = X_phi[ievent][msk] + energy = X_energy[ievent][msk] + typ = self.X[ievent][msk][:, 0] plt.scatter(eta, phi, marker="o", s=energy, c=[self.color_map[p] for p in typ], alpha=0.5, linewidths=0) plt.xlim(-8,8) plt.ylim(-4,4) - plt.axes(ax3) #Plot the predicted particles - msk = ypred_id[ibatch] != 0 - eta = ypred["eta"][ibatch][msk] - sphi = ypred["sin_phi"][ibatch][msk] - cphi = ypred["cos_phi"][ibatch][msk] + plt.axes(ax3) + msk = ypred_id[ievent] != 0 + eta = ypred["eta"][ievent][msk] + sphi = ypred["sin_phi"][ievent][msk] + cphi = ypred["cos_phi"][ievent][msk] phi = np.arctan2(sphi, cphi) - energy = ypred["energy"][ibatch][msk] - pdgid = ypred_id[ibatch][msk] + energy = ypred["energy"][ievent][msk] + pdgid = ypred_id[ievent][msk] plt.scatter(eta, phi, marker="o", s=energy, c=[self.color_map[p] for p in pdgid], alpha=0.5, linewidths=0) plt.xlim(-8,8) plt.ylim(-4,4) - # Xconcat = np.concatenate([self.X[ibatch], ypred["cls"][ibatch]], axis=-1) - # np.savez(self.outpath + "/event_{}.npz".format(epoch), Xconcat[Xconcat[:, 0]!=0]) - #Plot the target particles plt.axes(ax2) - y = self.dataset_transform(self.X, self.y, None)[1] - y_id = np.argmax(y["cls"], axis=-1) - msk = y_id[ibatch] != 0 - eta = y["eta"][ibatch][msk] - sphi = y["sin_phi"][ibatch][msk] - cphi = y["cos_phi"][ibatch][msk] + + msk = self.ytrue_id[ievent] != 0 + eta = self.ytrue["eta"][ievent][msk] + sphi = self.ytrue["sin_phi"][ievent][msk] + cphi = self.ytrue["cos_phi"][ievent][msk] phi = np.arctan2(sphi, cphi) - energy = y["energy"][ibatch][msk] - pdgid = y_id[ibatch][msk] + energy = self.ytrue["energy"][ievent][msk] + pdgid = self.ytrue_id[ievent][msk] plt.scatter(eta, phi, marker="o", s=energy, c=[self.color_map[p] 
for p in pdgid], alpha=0.5, linewidths=0) plt.xlim(-8,8) plt.ylim(-4,4) - plt.savefig("{}/event_{}.pdf".format(self.outpath, epoch), bbox_inches="tight") + image_path = str(outpath / "event_iev{}.png".format(ievent)) + plt.savefig(image_path, bbox_inches="tight") + plt.close("all") + if self.comet_experiment: + self.comet_experiment.log_image(image_path, step=epoch) + + def plot_reg_distribution(self, epoch, outpath, ypred, ypred_id, icls, reg_variable): + + if icls==0: + vals_pred = ypred[reg_variable][ypred_id!=icls].flatten() + vals_true = self.ytrue[reg_variable][self.ytrue_id!=icls].flatten() + else: + vals_pred = ypred[reg_variable][ypred_id==icls].flatten() + vals_true = self.ytrue[reg_variable][self.ytrue_id==icls].flatten() + + bins = self.reg_bins[reg_variable] + if bins is None: + bins = 100 + + plt.figure() + plt.hist(vals_true, bins=bins, histtype="step", lw=2, label="true") + plt.hist(vals_pred, bins=bins, histtype="step", lw=2, label="predicted") + + if reg_variable in ["pt", "energy"]: + plt.yscale("log") + plt.ylim(bottom=1e-2) + + plt.xlabel(reg_variable) + plt.ylabel("Number of particles") + plt.legend(loc="best") + plt.title("Regression output, cls {}".format(icls)) + image_path = str(outpath / "{}_cls{}.png".format(reg_variable, icls)) + plt.savefig(image_path, bbox_inches="tight") + plt.close("all") + if self.comet_experiment: + self.comet_experiment.log_image(image_path, step=epoch) + + def plot_corr(self, epoch, outpath, ypred, ypred_id, icls, reg_variable): + + if icls==0: + sel = (ypred_id!=0) & (self.ytrue_id!=0) + else: + sel = (ypred_id==icls) & (self.ytrue_id==icls) + + vals_pred = ypred[reg_variable][sel].flatten() + vals_true = self.ytrue[reg_variable][sel].flatten() + + loss = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE) + loss_vals = loss(np.expand_dims(vals_true, -1), np.expand_dims(vals_pred, axis=-1)).numpy() + + #save scatterplot of raw values + plt.figure() + bins = self.reg_bins[reg_variable] + if bins is None: + bins = 100 + plt.scatter(vals_true, vals_pred, marker=".", alpha=0.4) + + if len(vals_true) > 0: + minval = np.min(vals_true) + maxval = np.max(vals_true) + if not (math.isnan(minval) or math.isnan(maxval) or math.isinf(minval) or math.isinf(maxval)): + plt.plot([minval, maxval], [minval, maxval], color="black", ls="--", lw=0.5) + plt.xlabel("true") + plt.ylabel("predicted") + plt.title("{}, particle weighted, L={:.4f}".format(reg_variable, np.sum(loss_vals))) + image_path = str(outpath / "{}_cls{}_corr.png".format(reg_variable, icls)) + plt.savefig(image_path, bbox_inches="tight") + if self.comet_experiment: + self.comet_experiment.log_image(image_path, step=epoch) plt.close("all") - np.savez("{}/pred_{}.npz".format(self.outpath, epoch), X=self.X, ytrue=self.y, **ypred) + #save loss-weighted correlation histogram + plt.figure() + plt.hist2d(vals_true, vals_pred, bins=(bins, bins), weights=loss_vals, cmap="Blues") + plt.colorbar() + if len(vals_true) > 0: + minval = np.min(vals_true) + maxval = np.max(vals_true) + if not (math.isnan(minval) or math.isnan(maxval) or math.isinf(minval) or math.isinf(maxval)): + plt.plot([minval, maxval], [minval, maxval], color="black", ls="--", lw=0.5) + plt.xlabel("true") + plt.ylabel("predicted") + plt.title("{}, loss weighted, L={:.4f}".format(reg_variable, np.sum(loss_vals))) + image_path = str(outpath / "{}_cls{}_corr_weighted.png".format(reg_variable, icls)) + plt.savefig(image_path, bbox_inches="tight") + if self.comet_experiment: + 
+                self.comet_experiment.log_image(image_path, step=epoch)
+
+        #Also plot the residuals, as we have the true and predicted values already available here
+        plt.figure()
+        residual = vals_true - vals_pred
+        residual[np.isnan(residual)] = 0
+        residual[np.isinf(residual)] = 0
+        plt.hist(residual, bins=100)
+        plt.yscale("log")
+        plt.xlabel("true - pred")
+        plt.title("{} residual, m={:.4f} s={:.4f}".format(reg_variable, np.mean(residual), np.std(residual)))
+
+        image_path = str(outpath / "{}_cls{}_residual.png".format(reg_variable, icls))
+        plt.savefig(image_path, bbox_inches="tight")
+        if self.comet_experiment:
+            self.comet_experiment.log_image(image_path, step=epoch)
+        plt.close("all")
+
+        if self.comet_experiment:
+            self.comet_experiment.log_metric('residual_{}_cls{}_mean'.format(reg_variable, icls), np.mean(residual), step=epoch)
+            self.comet_experiment.log_metric('residual_{}_cls{}_std'.format(reg_variable, icls), np.std(residual), step=epoch)
+            self.comet_experiment.log_metric('val_loss_{}_cls{}'.format(reg_variable, icls), np.sum(loss_vals), step=epoch)
+
+    def plot_elem_to_pred(self, epoch, cp_dir, msk, ypred_id):
+        X_id = self.X[msk][:, 0]
+        max_elem = int(np.max(X_id))
+        cand_id = self.ytrue_id[msk]
+        pred_id = ypred_id[msk]
+        cm1 = sklearn.metrics.confusion_matrix(X_id, cand_id, labels=range(max_elem))
+        cm2 = sklearn.metrics.confusion_matrix(X_id, pred_id, labels=range(max_elem))
+
+        plt.figure(figsize=(10,4))
+
+        ax = plt.subplot(1,2,1)
+        plt.title("Targets")
+        plt.imshow(cm1, cmap="Blues", norm=matplotlib.colors.LogNorm())
+        plt.xticks(range(12));
+        plt.yticks(range(12));
+        plt.xlabel("Particle id")
+        plt.ylabel("PFElement id")
+        plt.colorbar()
+
+        ax = plt.subplot(1,2,2)
+        plt.title("Predictions")
+        plt.imshow(cm2, cmap="Blues", norm=matplotlib.colors.LogNorm())
+        plt.xticks(range(12));
+        plt.yticks(range(12));
+        plt.xlabel("Particle id")
+        plt.ylabel("PFElement id")
+        plt.colorbar()
+
+        image_path = str(cp_dir / "elem_to_pred.png")
+        plt.savefig(image_path, bbox_inches="tight")
+        plt.close("all")
+
+        if self.comet_experiment:
+            self.comet_experiment.log_image(image_path, step=epoch)
+
+    def plot_eff_and_fake_rate(
+        self,
+        epoch,
+        icls,
+        msk,
+        ypred_id,
+        cp_dir,
+        ivar=4,
+        bins=np.linspace(0, 200, 100),
+        xlabel="PFElement E",
+        log_var=False,
+        do_log_y=True
+    ):
+
+        values = self.X[msk][:, ivar]
+        cand_id = self.ytrue_id[msk]
+        pred_id = ypred_id[msk]
+
+        if log_var:
+            values = np.log(values)
+
+        hist_cand = np.histogram(values[(cand_id==icls)], bins=bins);
+        hist_cand_true = np.histogram(values[(cand_id==icls) & (pred_id==icls)], bins=bins);
+
+        hist_pred = np.histogram(values[(pred_id==icls)], bins=bins);
+        hist_pred_fake = np.histogram(values[(cand_id!=icls) & (pred_id==icls)], bins=bins);
+
+        eff = hist_cand_true[0]/hist_cand[0]
+        fake = hist_pred_fake[0]/hist_pred[0]
+
+        plt.figure(figsize=(8,8))
+        ax = plt.subplot(2,1,1)
+        mplhep.histplot(hist_cand, label="PF")
+        mplhep.histplot(hist_pred, label="MLPF")
+        plt.legend()
+        plt.xlabel(xlabel)
+        plt.ylabel("Number of particles")
+        if do_log_y:
+            ax.set_yscale("log")
+
+        ax = plt.subplot(2,1,2, sharex=ax)
+        mplhep.histplot(eff, bins=hist_cand[1], label="efficiency", color="black")
+        mplhep.histplot(fake, bins=hist_cand[1], label="fake rate", color="red")
+        plt.legend(frameon=False)
+        plt.ylim(0, 1.4)
+        plt.xlabel(xlabel)
+        plt.ylabel("Fraction of particles / bin")
+
+        image_path = str(cp_dir / "eff_fake_cls{}.png".format(icls))
+        plt.savefig(image_path, bbox_inches="tight")
+        plt.close("all")
+
+        if self.comet_experiment:
+            self.comet_experiment.log_image(image_path, step=epoch)
+
+    def on_epoch_end(self, epoch, logs=None):
+
+        #first epoch is 1, not 0
+        epoch = epoch + 1
+
+        #save the training logs (losses) for this epoch
+        with open("{}/history_{}.json".format(self.outpath, epoch), "w") as fi:
+            json.dump(logs, fi)
+
+        if self.plot_freq>1:
+            if epoch%self.plot_freq!=0 or epoch==1:
+                return
+
+        cp_dir = Path(self.outpath) / "epoch_{}".format(epoch)
+        cp_dir.mkdir(parents=True, exist_ok=True)
+
+        #run the model inference on the validation dataset
+        ypred = self.model.predict(self.X, batch_size=1)
+        #ypred = self.model(self.X, training=False)
+        #ypred = {k: v.numpy() for k, v in ypred.items()}
+
+        #choose the class with the highest probability as the prediction
+        #this is a shortcut; in actual inference, we may want to apply additional per-class thresholds
+        ypred_id = np.argmax(ypred["cls"], axis=-1)
+
+        #exclude padded elements from the plotting
+        msk = self.X[:, :, 0] != 0
+
+        self.plot_elem_to_pred(epoch, cp_dir, msk, ypred_id)
+
+        self.plot_cm(epoch, cp_dir, ypred_id, msk)
+        for ievent in range(min(5, self.X.shape[0])):
+            self.plot_event_visualization(epoch, cp_dir, ypred, ypred_id, msk, ievent=ievent)
+
+        for icls in range(self.num_output_classes):
+            cp_dir_cls = cp_dir / "cls_{}".format(icls)
+            cp_dir_cls.mkdir(parents=True, exist_ok=True)
+
+            if icls!=0:
+                self.plot_eff_and_fake_rate(epoch, icls, msk, ypred_id, cp_dir_cls)
+
+            for variable in ["pt", "eta", "sin_phi", "cos_phi", "energy"]:
+                self.plot_reg_distribution(epoch, cp_dir_cls, ypred, ypred_id, icls, variable)
+                self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, icls, variable)
+
+        np.savez(str(cp_dir/"pred.npz"), X=self.X, ytrue=self.y, **ypred)

-def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output_classes):
+def prepare_callbacks(
+    callbacks_cfg, outdir,
+    X_val, y_val,
+    dataset_transform,
+    num_output_classes,
+    dataset_def,
+    comet_experiment=None):
     callbacks = []
     tb = CustomTensorBoard(
-        log_dir=outdir + "/tensorboard_logs", histogram_freq=1, write_graph=False, write_images=False,
+        log_dir=outdir + "/logs", histogram_freq=callbacks_cfg["tensorboard"]["hist_freq"], write_graph=False, write_images=False,
         update_freq='epoch',
         #profile_batch=(10,90),
         profile_batch=0,
+        dump_history=callbacks_cfg["tensorboard"]["dump_history"],
     )
-    tb.set_model(model)
+    # Change the class name of CustomTensorBoard to TensorBoard so that keras_tuner recognises it
+    tb.__class__.__name__ = "TensorBoard"
     callbacks += [tb]

     terminate_cb = tf.keras.callbacks.TerminateOnNaN()
@@ -201,17 +448,24 @@ def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output
     cp_dir.mkdir(parents=True, exist_ok=True)
     cp_callback = tf.keras.callbacks.ModelCheckpoint(
         filepath=str(cp_dir / "weights-{epoch:02d}-{val_loss:.6f}.hdf5"),
-        save_weights_only=True,
-        verbose=0
+        save_weights_only=callbacks_cfg["checkpoint"]["save_weights_only"],
+        verbose=0,
+        monitor=callbacks_cfg["checkpoint"]["monitor"],
+        save_best_only=callbacks_cfg["checkpoint"]["save_best_only"],
     )
-    cp_callback.set_model(model)
     callbacks += [cp_callback]

     history_path = Path(outdir) / "history"
     history_path.mkdir(parents=True, exist_ok=True)
     history_path = str(history_path)
-    cb = CustomCallback(history_path, X_val, y_val, dataset_transform, num_output_classes)
-    cb.set_model(model)
+    cb = CustomCallback(
+        dataset_def, history_path,
+        X_val, y_val,
+        dataset_transform,
+        num_output_classes,
+        plot_freq=callbacks_cfg["plot_freq"],
+        comet_experiment=comet_experiment
+    )
callbacks += [cb] @@ -239,104 +493,55 @@ def scale_outputs(X,y,w): def make_model(config, dtype): model = config['parameters']['model'] - if model == 'gnn': - return make_gnn(config, dtype) - elif model == 'transformer': - return make_transformer(config, dtype) - elif model == 'dense': + + if model == 'dense': return make_dense(config, dtype) elif model == 'gnn_dense': return make_gnn_dense(config, dtype) - raise KeyError("Unknown model type {}".format(model)) - -def make_gnn(config, dtype): - activation = getattr(tf.nn, config['parameters']['activation']) - parameters = [ - 'bin_size', - 'num_convs_id', - 'num_convs_reg', - 'num_hidden_id_enc', - 'num_hidden_id_dec', - 'num_hidden_reg_enc', - 'num_hidden_reg_dec', - 'num_neighbors', - 'hidden_dim_id', - 'hidden_dim_reg', - 'dist_mult', - 'distance_dim', - 'dropout', - 'skip_connection' - ] - kwargs = {par: config['parameters'][par] for par in parameters} - - model = PFNet( - multi_output=config["setup"]["multi_output"], - num_input_classes=config["dataset"]["num_input_classes"], - num_output_classes=config["dataset"]["num_output_classes"], - num_momentum_outputs=config["dataset"]["num_momentum_outputs"], - activation=activation, - **kwargs - ) - - return model + raise KeyError("Unknown model type {}".format(model)) def make_gnn_dense(config, dtype): parameters = [ - "layernorm", + "do_node_encoding", "hidden_dim", - "bin_size", - "clip_value_low", - "num_conv", - "num_gsl", - "normalize_degrees", - "distance_dim", "dropout", - "separate_momentum", + "activation", + "num_graph_layers_common", + "num_graph_layers_energy", "input_encoding", + "skip_connection", + "output_decoding", + "combined_graph_layer", "debug" ] - kwargs = {par: config['parameters'][par] for par in parameters} + kwargs = {} + for par in parameters: + if par in config['parameters'].keys(): + kwargs[par] = config['parameters'][par] model = PFNetDense( multi_output=config["setup"]["multi_output"], num_input_classes=config["dataset"]["num_input_classes"], num_output_classes=config["dataset"]["num_output_classes"], - num_momentum_outputs=config["dataset"]["num_momentum_outputs"], + schema=config["dataset"]["schema"], **kwargs ) return model -def make_transformer(config, dtype): - parameters = [ - 'num_layers', 'd_model', 'num_heads', 'dff', 'support', 'dropout' - ] - kwargs = {par: config['parameters'][par] for par in parameters} - - model = Transformer( - multi_output=config["setup"]["multi_output"], - num_input_classes=config["dataset"]["num_input_classes"], - num_output_classes=config["dataset"]["num_output_classes"], - num_momentum_outputs=config["dataset"]["num_momentum_outputs"], - dtype=dtype, - **kwargs - ) - return model - def make_dense(config, dtype): model = DummyNet( num_input_classes=config["dataset"]["num_input_classes"], num_output_classes=config["dataset"]["num_output_classes"], - num_momentum_outputs=config["dataset"]["num_momentum_outputs"], ) return model def eval_model(X, ygen, ycand, model, config, outdir, global_batch_size): import scipy - for ibatch in tqdm(range(X.shape[0]//global_batch_size), desc="Evaluating model"): + for ibatch in tqdm(range(max(1, X.shape[0]//global_batch_size)), desc="Evaluating model"): nb1 = ibatch*global_batch_size nb2 = (ibatch+1)*global_batch_size @@ -446,14 +651,25 @@ def on_epoch_end(self, epoch, numpy_logs): def configure_model_weights(model, trainable_layers): print("setting trainable layers: {}".format(trainable_layers)) + if (trainable_layers is None): trainable_layers = "all" + if trainable_layers == "all": 
model.trainable = True - elif trainable_layers == "classification": - model.set_trainable_classification() elif trainable_layers == "regression": - model.set_trainable_regression() + for cg in model.cg: + cg.trainable = False + for cg in model.cg_energy: + cg.trainable = True + model.output_dec.set_trainable_regression() + elif trainable_layers == "classification": + for cg in model.cg: + cg.trainable = True + for cg in model.cg_energy: + cg.trainable = False + + model.output_dec.set_trainable_classification() else: if isinstance(trainable_layers, str): trainable_layers = [trainable_layers] @@ -472,285 +688,3 @@ def loss(x,y): from_logits=bool(config["setup"].get("focal_loss_from_logits", False)) ) return loss - -def main(args, yaml_path, config): - #tf.debugging.enable_check_numerics() - - #Switch off multi-output for the evaluation for backwards compatibility - multi_output = True - if args.action == "eval": - multi_output = False - - tf.config.run_functions_eagerly(config['tensorflow']['eager']) - - from tfmodel.data import Dataset - cds = config["dataset"] - - raw_path = cds.get("raw_path", None) - if args.raw_path: - raw_path = args.raw_path - - processed_path = cds.get("processed_path", None) - if args.processed_path: - processed_path = args.processed_path - - dataset_def = Dataset( - num_input_features=int(cds["num_input_features"]), - num_output_features=int(cds["num_output_features"]), - padded_num_elem_size=int(cds["padded_num_elem_size"]), - raw_path=raw_path, - raw_files=cds.get("raw_files", None), - processed_path=processed_path, - validation_file_path=cds["validation_file_path"], - schema=cds["schema"] - ) - - if args.action == "data": - dataset_def.process( - config["dataset"]["num_files_per_chunk"] - ) - return - - global_batch_size = config['setup']['batch_size'] - config['setup']['multi_output'] = multi_output - - model_name = os.path.splitext(os.path.basename(yaml_path))[0] + "-" + str(uuid.uuid4())[:8] + "." 
+ platform.node() - print("model_name=", model_name) - - tfr_files = sorted(glob.glob(dataset_def.processed_path)) - if len(tfr_files) == 0: - raise Exception("Could not find any files in {}".format(dataset_def.processed_path)) - - random.shuffle(tfr_files) - dataset = tf.data.TFRecordDataset(tfr_files).map(dataset_def.parse_tfr_element, num_parallel_calls=tf.data.experimental.AUTOTUNE) - - num_events = 0 - for i in dataset: - num_events += 1 - print("dataset loaded, len={}".format(num_events)) - - n_train = config['setup']['num_events_train'] - n_test = config['setup']['num_events_test'] - - if args.ntrain: - n_train = args.ntrain - if args.ntest: - n_test = args.ntest - - n_epochs = config['setup']['num_epochs'] - weight_func = make_weight_function(config) - assert(n_train + n_test <= num_events) - - ps = ( - tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_input_features]), - tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_output_features]), - { - "cls": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "charge": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "energy": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "pt": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "eta": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "sin_phi": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "cos_phi": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - } - ) - - ds_train = dataset.take(n_train).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps) - ds_test = dataset.skip(n_train).take(n_test).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps) - - dataset_transform = None - if multi_output: - dataset_transform = targets_multi_output(config['dataset']['num_output_classes']) - ds_train = ds_train.map(dataset_transform) - ds_test = ds_test.map(dataset_transform) - - ds_train_r = ds_train.repeat(n_epochs) - ds_test_r = ds_test.repeat(n_epochs) - - #small test dataset used in the callback for making monitoring plots - #X_test = np.concatenate(list(ds_test.take(100).map(lambda x,y,w: x).as_numpy_iterator())) - #y_test = np.concatenate(list(ds_test.take(100).map(lambda x,y,w: tf.concat(y, axis=-1)).as_numpy_iterator())) - - weights = config['setup']['weights'] - if args.weights: - weights = args.weights - - if args.recreate or (weights is None): - outdir = 'experiments/{}'.format(model_name) - if os.path.isdir(outdir): - print("Output directory exists: {}".format(outdir), file=sys.stderr) - sys.exit(1) - else: - outdir = str(Path(weights).parent.parent) - - try: - gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")] - num_gpus = len(gpus) - print("num_gpus=", num_gpus) - if num_gpus > 1: - strategy = tf.distribute.MirroredStrategy() - global_batch_size = num_gpus * global_batch_size - else: - strategy = tf.distribute.OneDeviceStrategy("gpu:0") - except Exception as e: - print("fallback to CPU", e) - strategy = tf.distribute.OneDeviceStrategy("cpu") - num_gpus = 0 - - Xs = [] - ygens = [] - ycands = [] - #for faster loading - if args.action == "train": - val_filelist = dataset_def.val_filelist[:1] - else: - val_filelist = dataset_def.val_filelist - if config['setup']['num_val_files']>0: - val_filelist = val_filelist[:config['setup']['num_val_files']] - - for fi in val_filelist: - X, ygen, ycand = dataset_def.prepare_data(fi) - - Xs.append(np.concatenate(X)) - ygens.append(np.concatenate(ygen)) - ycands.append(np.concatenate(ycand)) - - assert(len(Xs) > 0) - X_val = 
np.concatenate(Xs) - ygen_val = np.concatenate(ygens) - ycand_val = np.concatenate(ycands) - - lr = float(config['setup']['lr']) - with strategy.scope(): - total_steps = n_epochs * n_train // global_batch_size - lr_schedule, optim_callbacks = get_lr_schedule(config, lr, steps=total_steps) - opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule) - if config['setup']['dtype'] == 'float16': - model_dtype = tf.dtypes.float16 - from tensorflow.keras import mixed_precision - policy = mixed_precision.Policy('mixed_float16') - mixed_precision.set_global_policy(policy) - opt = mixed_precision.LossScaleOptimizer(opt) - else: - model_dtype = tf.dtypes.float32 - - if args.action=="train" or args.action=="eval": - model = make_model(config, model_dtype) - - #Evaluate model once to build the layers - print(X_val.shape) - model(tf.cast(X_val[:1], model_dtype)) - - initial_epoch = 0 - if weights: - #need to load the weights in the same trainable configuration as the model was set up - configure_model_weights(model, config["setup"].get("weights_config", "all")) - model.load_weights(weights, by_name=True) - initial_epoch = int(weights.split("/")[-1].split("-")[1]) - model(tf.cast(X_val[:1], model_dtype)) - - if config["setup"]["trainable"] == "classification": - config["dataset"]["pt_loss_coef"] = 0.0 - config["dataset"]["eta_loss_coef"] = 0.0 - config["dataset"]["sin_phi_loss_coef"] = 0.0 - config["dataset"]["cos_phi_loss_coef"] = 0.0 - config["dataset"]["energy_loss_coef"] = 0.0 - elif config["setup"]["trainable"] == "regression": - config["dataset"]["classification_loss_coef"] = 0.0 - config["dataset"]["charge_loss_coef"] = 0.0 - - #now set the desirable layers as trainable for the optimization - configure_model_weights(model, config["setup"]["trainable"]) - model(tf.cast(X_val[:1], model_dtype)) - - if config["setup"]["classification_loss_type"] == "categorical_cross_entropy": - cls_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False) - elif config["setup"]["classification_loss_type"] == "sigmoid_focal_crossentropy": - cls_loss = make_focal_loss(config) - else: - raise KeyError("Unknown classification loss type: {}".format(config["setup"]["classification_loss_type"])) - - model.compile( - loss={ - "cls": cls_loss, - "charge": getattr(tf.keras.losses, config["dataset"].get("charge_loss", "MeanSquaredError"))(), - "pt": getattr(tf.keras.losses, config["dataset"].get("pt_loss", "MeanSquaredError"))(), - "eta": getattr(tf.keras.losses, config["dataset"].get("eta_loss", "MeanSquaredError"))(), - "sin_phi": getattr(tf.keras.losses, config["dataset"].get("sin_phi_loss", "MeanSquaredError"))(), - "cos_phi": getattr(tf.keras.losses, config["dataset"].get("cos_phi_loss", "MeanSquaredError"))(), - "energy": getattr(tf.keras.losses, config["dataset"].get("energy_loss", "MeanSquaredError"))(), - }, - optimizer=opt, - sample_weight_mode='temporal', - loss_weights={ - "cls": config["dataset"]["classification_loss_coef"], - "charge": config["dataset"]["charge_loss_coef"], - "pt": config["dataset"]["pt_loss_coef"], - "eta": config["dataset"]["eta_loss_coef"], - "sin_phi": config["dataset"]["sin_phi_loss_coef"], - "cos_phi": config["dataset"]["cos_phi_loss_coef"], - "energy": config["dataset"]["energy_loss_coef"], - }, - metrics={ - "cls": [ - FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), - FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), - ] + [ - SingleClassRecall( - icls, - name="rec_cls{}".format(icls), - dtype=tf.float64) for icls in 
range(config["dataset"]["num_output_classes"]) - ] - } - ) - model.summary() - - if args.action=="train": - #file_writer_cm = tf.summary.create_file_writer(outdir + '/val_extra') - callbacks = prepare_callbacks( - model, outdir, X_val[:config['setup']['batch_size']], ycand_val[:config['setup']['batch_size']], - dataset_transform, config["dataset"]["num_output_classes"] - ) - callbacks.append(optim_callbacks) - - fit_result = model.fit( - ds_train_r, validation_data=ds_test_r, epochs=initial_epoch+n_epochs, callbacks=callbacks, - steps_per_epoch=n_train//global_batch_size, validation_steps=n_test//global_batch_size, - initial_epoch=initial_epoch - ) - history_path = Path(outdir) / "history" - history_path = str(history_path) - with open("{}/history.json".format(history_path), "w") as fi: - json.dump(fit_result.history, fi) - model.save(outdir + "/model_full", save_format="tf") - - if args.action=="eval": - eval_model(X_val, ygen_val, ycand_val, model, config, outdir, global_batch_size) - freeze_model(model, config, outdir) - - if args.action=="time": - synthetic_timing_data = [] - for iteration in range(config["timing"]["num_iter"]): - numev = config["timing"]["num_ev"] - for evsize in [128*10, 128*20, 128*30, 128*40, 128*50, 128*60, 128*70, 128*80, 128*90, 128*100]: - for batch_size in [1,2,3,4]: - x = np.random.randn(batch_size, evsize, config["dataset"]["num_input_features"]).astype(np.float32) - - model = make_model(config, model_dtype) - model(x) - - if weights: - model.load_weights(weights) - - t0 = time.time() - for i in range(numev//batch_size): - model(x) - t1 = time.time() - dt = t1 - t0 - - time_per_event = 1000.0*(dt / numev) - synthetic_timing_data.append( - [{"iteration": iteration, "batch_size": batch_size, "event_size": evsize, "time_per_event": time_per_event}]) - print("Synthetic random data: batch_size={} event_size={}, time={:.2f} ms/ev".format(batch_size, evsize, time_per_event)) - with open("{}/synthetic_timing.json".format(outdir), "w") as fi: - json.dump(synthetic_timing_data, fi) diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 909e8c2f3..bfac0907b 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -11,10 +11,13 @@ import tensorflow as tf import tensorflow_addons as tfa +import keras_tuner as kt from tfmodel.data import Dataset from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler +from ray.tune.schedulers import AsyncHyperBandScheduler, HyperBandScheduler + def load_config(config_file_path): with open(config_file_path, "r") as ymlfile: @@ -48,14 +51,15 @@ def parse_config(config, ntrain=None, ntest=None, weights=None): def create_experiment_dir(prefix=None, suffix=None): if prefix is None: - train_dir = Path("experiments") / datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + train_dir = Path("experiments") / datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f") else: - train_dir = Path("experiments") / (prefix + datetime.datetime.now().strftime("%Y%m%d_%H%M%S")) + train_dir = Path("experiments") / (prefix + datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")) if suffix is not None: train_dir = train_dir.with_name(train_dir.name + "." + platform.node()) train_dir.mkdir(parents=True) + print("Creating experiment dir {}".format(train_dir)) return str(train_dir) @@ -73,7 +77,7 @@ def delete_all_but_best_checkpoint(train_dir, dry_run): if len(checkpoint_list) == 1: raise UserWarning("There is only one checkpoint. 
No deletion was made.") elif len(checkpoint_list) == 0: - raise UserWarning("Couldn't find ant checkpoints. No deletion was made.") + raise UserWarning("Couldn't find any checkpoints. No deletion was made.") else: # Sort the checkpoints according to the loss in their filenames checkpoint_list.sort(key=lambda x: float(re.search("\d+-\d+.\d+", str(x))[0].split("-")[-1])) @@ -86,23 +90,31 @@ def delete_all_but_best_checkpoint(train_dir, dry_run): def get_strategy(global_batch_size): - try: - gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")] + if isinstance(os.environ.get("CUDA_VISIBLE_DEVICES"), type(None)) or len(os.environ.get("CUDA_VISIBLE_DEVICES")) == 0: + gpus = [-1] + print("WARNING: CUDA_VISIBLE_DEVICES variable is empty. \ + If you don't have or intend to use GPUs, this message can be ignored.") + else: + gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "-1").split(",")] + if gpus[0] == -1: + num_gpus = 0 + else: num_gpus = len(gpus) - print("num_gpus=", num_gpus) - if num_gpus > 1: - strategy = tf.distribute.MirroredStrategy() - global_batch_size = num_gpus * global_batch_size - else: - strategy = tf.distribute.OneDeviceStrategy("gpu:0") - except Exception as e: - print("fallback to CPU", e) + print("num_gpus=", num_gpus) + if num_gpus > 1: + strategy = tf.distribute.MirroredStrategy() + global_batch_size = num_gpus * global_batch_size + elif num_gpus == 1: + strategy = tf.distribute.OneDeviceStrategy("gpu:0") + elif num_gpus == 0: + print("fallback to CPU") strategy = tf.distribute.OneDeviceStrategy("cpu") num_gpus = 0 return strategy, global_batch_size -def get_lr_schedule(config, lr, steps): +def get_lr_schedule(config, steps): + lr = float(config["setup"]["lr"]) callbacks = [] schedule = config["setup"]["lr_schedule"] if schedule == "onecycle": @@ -136,6 +148,84 @@ def get_lr_schedule(config, lr, steps): return lr_schedule, callbacks +def get_optimizer(config, lr_schedule=None): + if lr_schedule is None: + lr = float(config["setup"]["lr"]) + else: + lr = lr_schedule + if config["setup"]["optimizer"] == "adam": + cfg_adam = config["optimizer"]["adam"] + return tf.keras.optimizers.Adam(learning_rate=lr, amsgrad=cfg_adam["amsgrad"]) + if config["setup"]["optimizer"] == "adamw": + cfg_adamw = config["optimizer"]["adamw"] + return tfa.optimizers.AdamW(learning_rate=lr, weight_decay=cfg_adamw["weight_decay"], amsgrad=cfg_adamw["amsgrad"]) + elif config["setup"]["optimizer"] == "sgd": + cfg_sgd = config["optimizer"]["sgd"] + return tf.keras.optimizers.SGD(learning_rate=lr, momentum=cfg_sgd["momentum"], nesterov=cfg_sgd["nesterov"]) + else: + raise ValueError("Only 'adam' and 'sgd' are supported optimizers, got {}".format(config["setup"]["optimizer"])) + + +def get_tuner(cfg_hypertune, model_builder, outdir, recreate, strategy): + if cfg_hypertune["algorithm"] == "random": + print("Keras Tuner: Using RandomSearch") + cfg_rand = cfg_hypertune["random"] + return kt.RandomSearch( + model_builder, + objective=cfg_rand["objective"], + max_trials=cfg_rand["max_trials"], + project_name="mlpf", + overwrite=recreate, + ) + elif cfg_hypertune["algorithm"] == "bayesian": + print("Keras Tuner: Using BayesianOptimization") + cfg_bayes = cfg_hypertune["bayesian"] + return kt.BayesianOptimization( + model_builder, + objective=cfg_bayes["objective"], + max_trials=cfg_bayes["max_trials"], + num_initial_points=cfg_bayes["num_initial_points"], + project_name="mlpf", + overwrite=recreate, + ) + elif cfg_hypertune["algorithm"] == "hyperband": + print("Keras Tuner: 
Using Hyperband") + cfg_hb = cfg_hypertune["hyperband"] + return kt.Hyperband( + model_builder, + objective=cfg_hb["objective"], + max_epochs=cfg_hb["max_epochs"], + factor=cfg_hb["factor"], + hyperband_iterations=cfg_hb["iterations"], + directory=outdir + "/tb", + project_name="mlpf", + overwrite=recreate, + executions_per_trial=cfg_hb["executions_per_trial"], + distribution_strategy=strategy, + ) + + +def get_raytune_schedule(raytune_cfg): + if raytune_cfg["sched"] == "asha": + return AsyncHyperBandScheduler( + metric="val_loss", + mode="min", + time_attr="training_iteration", + max_t=raytune_cfg["asha"]["max_t"], + grace_period=raytune_cfg["asha"]["grace_period"], + reduction_factor=raytune_cfg["asha"]["reduction_factor"], + brackets=raytune_cfg["asha"]["brackets"], + ) + if raytune_cfg["sched"] == "hyperband": + return HyperBandScheduler( + metric="val_loss", + mode="min", + time_attr="training_iteration", + max_t=raytune_cfg["hyperband"]["max_t"], + reduction_factor=raytune_cfg["hyperband"]["reduction_factor"], + ) + + def compute_weights_invsqrt(X, y, w): wn = tf.cast(tf.shape(w)[-1], tf.float32) / tf.sqrt(w) wn *= tf.cast(X[:, 0] != 0, tf.float32) @@ -152,7 +242,7 @@ def compute_weights_none(X, y, w): def make_weight_function(config): def weight_func(X,y,w): - w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0) + w_signal_only = tf.where(y[:, 0]==0, 0.0, tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)) w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32) w_none = tf.ones_like(w) @@ -177,23 +267,24 @@ def weight_func(X,y,w): def targets_multi_output(num_output_classes): def func(X, y, w): + + msk = tf.expand_dims(tf.cast(y[:, :, 0]!=0, tf.float32), axis=-1) return ( X, { "cls": tf.one_hot(tf.cast(y[:, :, 0], tf.int32), num_output_classes), - "charge": y[:, :, 1:2], - "pt": y[:, :, 2:3], - "eta": y[:, :, 3:4], - "sin_phi": y[:, :, 4:5], - "cos_phi": y[:, :, 5:6], - "energy": y[:, :, 6:7], + "charge": y[:, :, 1:2]*msk, + "pt": y[:, :, 2:3]*msk, + "eta": y[:, :, 3:4]*msk, + "sin_phi": y[:, :, 4:5]*msk, + "cos_phi": y[:, :, 5:6]*msk, + "energy": y[:, :, 6:7]*msk, }, w, ) return func - def get_dataset_def(config): cds = config["dataset"] @@ -209,7 +300,7 @@ def get_dataset_def(config): ) -def get_train_val_datasets(config, global_batch_size, n_train, n_test): +def get_train_val_datasets(config, global_batch_size, n_train, n_test, repeat=True): dataset_def = get_dataset_def(config) tfr_files = sorted(glob.glob(dataset_def.processed_path)) @@ -255,11 +346,15 @@ def get_train_val_datasets(config, global_batch_size, n_train, n_test): else: dataset_transform = None - ds_train_r = ds_train.repeat(config["setup"]["num_epochs"]) - ds_test_r = ds_test.repeat(config["setup"]["num_epochs"]) - - return ds_train_r, ds_test_r, dataset_transform + # ds_train = ds_train.map(classwise_energy_normalization) + # ds_test = ds_train.map(classwise_energy_normalization) + if repeat: + ds_train_r = ds_train.repeat(config["setup"]["num_epochs"]) + ds_test_r = ds_test.repeat(config["setup"]["num_epochs"]) + return ds_train_r, ds_test_r, dataset_transform + else: + return ds_train, ds_test, dataset_transform def prepare_val_data(config, dataset_def, single_file=False): if single_file: @@ -296,6 +391,10 @@ def set_config_loss(config, trainable): elif trainable == "regression": config["dataset"]["classification_loss_coef"] = 0.0 config["dataset"]["charge_loss_coef"] = 0.0 + config["dataset"]["pt_loss_coef"] = 0.0 + config["dataset"]["eta_loss_coef"] = 0.0 + config["dataset"]["sin_phi_loss_coef"] = 0.0 + 
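A minimal, standalone sketch of how the ASHA scheduler built by `get_raytune_schedule` above is typically handed to Ray Tune. The toy trainable, the `lr` search space and the `max_t`/`grace_period` numbers are assumptions for illustration only; the real pipeline trains the MLPF model and configures Ray Tune through its own `raytune` config section.

```python
from ray import tune
from ray.tune.schedulers import AsyncHyperBandScheduler

# assumed config snippet, mirroring what get_raytune_schedule expects
raytune_cfg = {
    "sched": "asha",
    "asha": {"max_t": 20, "grace_period": 5, "reduction_factor": 3, "brackets": 1},
}

sched = AsyncHyperBandScheduler(
    metric="val_loss",
    mode="min",
    time_attr="training_iteration",
    max_t=raytune_cfg["asha"]["max_t"],
    grace_period=raytune_cfg["asha"]["grace_period"],
    reduction_factor=raytune_cfg["asha"]["reduction_factor"],
    brackets=raytune_cfg["asha"]["brackets"],
)

def toy_trainable(config):
    # stand-in for the real training loop: report a fake validation loss
    for step in range(raytune_cfg["asha"]["max_t"]):
        tune.report(val_loss=1.0 / (step + 1) + config["lr"])

analysis = tune.run(
    toy_trainable,
    config={"lr": tune.loguniform(1e-5, 1e-2)},
    num_samples=4,
    scheduler=sched,
)
print(analysis.get_best_config(metric="val_loss", mode="min"))
```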
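The `targets_multi_output` change above multiplies every regression target by a per-element mask that is zero wherever the true class is 0 ("no particle"), so padded slots do not pull on the regression outputs. A self-contained sketch of the same transform; the `(batch, elements, features)` shapes below are made up.

```python
import tensorflow as tf

num_output_classes = 8
# toy target tensor: feature 0 is the class id, features 1..6 are
# charge, pt, eta, sin_phi, cos_phi, energy
cls = tf.cast(tf.random.uniform((2, 5, 1), 0, num_output_classes, dtype=tf.int32), tf.float32)
reg = tf.random.uniform((2, 5, 6))
y = tf.concat([cls, reg], axis=-1)

# zero out regression targets where there is no true particle
msk = tf.expand_dims(tf.cast(y[:, :, 0] != 0, tf.float32), axis=-1)
targets = {
    "cls": tf.one_hot(tf.cast(y[:, :, 0], tf.int32), num_output_classes),
    "charge": y[:, :, 1:2] * msk,
    "pt": y[:, :, 2:3] * msk,
    "eta": y[:, :, 3:4] * msk,
    "sin_phi": y[:, :, 4:5] * msk,
    "cos_phi": y[:, :, 5:6] * msk,
    "energy": y[:, :, 6:7] * msk,
}
print({k: v.shape for k, v in targets.items()})
```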
config["dataset"]["cos_phi_loss_coef"] = 0.0 elif trainable == "all": pass return config @@ -311,16 +410,24 @@ def get_class_loss(config): return cls_loss +def get_loss_from_params(input_dict): + input_dict = input_dict.copy() + loss_type = input_dict.pop("type") + loss_cls = getattr(tf.keras.losses, loss_type) + return loss_cls(**input_dict) + def get_loss_dict(config): cls_loss = get_class_loss(config) + + default_loss = {"type": "MeanSquaredError"} loss_dict = { "cls": cls_loss, - "charge": getattr(tf.keras.losses, config["dataset"].get("charge_loss", "MeanSquaredError"))(), - "pt": getattr(tf.keras.losses, config["dataset"].get("pt_loss", "MeanSquaredError"))(), - "eta": getattr(tf.keras.losses, config["dataset"].get("eta_loss", "MeanSquaredError"))(), - "sin_phi": getattr(tf.keras.losses, config["dataset"].get("sin_phi_loss", "MeanSquaredError"))(), - "cos_phi": getattr(tf.keras.losses, config["dataset"].get("cos_phi_loss", "MeanSquaredError"))(), - "energy": getattr(tf.keras.losses, config["dataset"].get("energy_loss", "MeanSquaredError"))(), + "charge": get_loss_from_params(config["dataset"].get("charge_loss", default_loss)), + "pt": get_loss_from_params(config["dataset"].get("pt_loss", default_loss)), + "eta": get_loss_from_params(config["dataset"].get("eta_loss", default_loss)), + "sin_phi": get_loss_from_params(config["dataset"].get("sin_phi_loss", default_loss)), + "cos_phi": get_loss_from_params(config["dataset"].get("cos_phi_loss", default_loss)), + "energy": get_loss_from_params(config["dataset"].get("energy_loss", default_loss)), } loss_weights = { "cls": config["dataset"]["classification_loss_coef"], diff --git a/notebooks/clic.ipynb b/notebooks/clic.ipynb index 8d52cbf8a..3f79ffd8e 100644 --- a/notebooks/clic.ipynb +++ b/notebooks/clic.ipynb @@ -22,7 +22,8 @@ "metadata": {}, "outputs": [], "source": [ - "data = json.load(bz2.BZ2File(\"/home/joosep/Downloads/pythia6_ttbar_0001_pandora.json.bz2\", \"r\"))" + "#data = json.load(bz2.BZ2File(\"/home/joosep/Downloads/pythia6_ttbar_0001_pandora.json.bz2\", \"r\"))\n", + "data = json.load(bz2.BZ2File(\"/home/joosep/particleflow/data/clic/gev380ee_pythia6_ttbar_rfull201/raw/pythia6_ttbar_0001_pandora_0.json.bz2\", \"r\"))" ] }, { @@ -55,7 +56,7 @@ "metadata": {}, "outputs": [], "source": [ - "iev = 28\n", + "iev = 0\n", "df_gen = pandas.DataFrame(data[iev][\"genparticles\"])\n", "\n", "df_hit = pandas.DataFrame(data[iev][\"track_hits\"])\n", @@ -71,6 +72,46 @@ "df_tr[\"pz\"] = df_tr[\"tan_lambda\"]*df_tr[\"pt\"]" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6cc1ff5", + "metadata": {}, + "outputs": [], + "source": [ + "df_hit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9089cfae", + "metadata": {}, + "outputs": [], + "source": [ + "df_ecal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2e01940", + "metadata": {}, + "outputs": [], + "source": [ + "df_hcal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efc9be54", + "metadata": {}, + "outputs": [], + "source": [ + "df_gen" + ] + }, { "cell_type": "code", "execution_count": null, @@ -150,6 +191,9 @@ " if filter_gp(gp):\n", " dg.add_node((\"gp\", gp))\n", " gps.add(gp)\n", + " \n", + " #the track is added to the genparticle with a very high weight\n", + " #because we always want to associate the genparticle to a track if it's possible\n", " dg.add_edge((\"gp\", gp), (\"tr\", itr), weight=9999.0)\n", "\n", " \n", @@ -157,22 +201,26 @@ "gps = set(gps)\n", "\n", "#now loop over all the 
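A short usage sketch for the `get_loss_from_params` helper defined above: the loss class is looked up on `tf.keras.losses` by name and the remaining dictionary keys are passed to its constructor, so per-target loss parameters can live in the YAML config. The `Huber`/`delta` entry is an invented example, not a value taken from the repository's parameter files.

```python
import tensorflow as tf

def get_loss_from_params(input_dict):
    # same pattern as in mlpf/tfmodel/utils.py
    input_dict = input_dict.copy()
    loss_type = input_dict.pop("type")
    loss_cls = getattr(tf.keras.losses, loss_type)
    return loss_cls(**input_dict)

default_loss = {"type": "MeanSquaredError"}
dataset_cfg = {"pt_loss": {"type": "Huber", "delta": 1.0}}  # assumed config snippet

pt_loss = get_loss_from_params(dataset_cfg.get("pt_loss", default_loss))
energy_loss = get_loss_from_params(dataset_cfg.get("energy_loss", default_loss))
print(pt_loss, energy_loss)
```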
genparticles\n", - "#for each genparticle, find the neighboring reco elements (clusters and tracks)\n", - "#sort the neighbors by the edge weight (deposited energy)\n", - "#for each genparticle, choose the closest neighbor as the \"key\" reco element\n", - "#remove the reco element from the list\n", "pairs = {}\n", "for gp in gps:\n", " gp_node = (\"gp\", gp)\n", + "\n", + " #find the neighboring reco elements (clusters and tracks)\n", " neighbors = list(dg.neighbors(gp_node))\n", " weights = [dg.edges[gp_node, n][\"weight\"] for n in neighbors]\n", " nw = zip(neighbors, weights)\n", + " \n", + " #sort the neighbors by the edge weight (deposited energy)\n", " nw = sorted(nw, key=lambda x: x[1], reverse=True)\n", " reco_obj = None\n", " if len(nw)>0:\n", + " #choose the closest neighbor as the \"key\" reco element\n", " reco_obj = nw[0][0]\n", - " dg.remove_node(reco_obj)\n", " \n", + " #remove the reco element from the list, so it can't be associated to anything else\n", + " dg.remove_node(reco_obj)\n", + " \n", + " #this genparticle had a unique reco element\n", " if reco_obj:\n", " pf_obj = None\n", " if reco_obj and reco_obj in reco_to_pf:\n", @@ -180,8 +228,11 @@ "\n", " assert(not (reco_obj in pairs))\n", " pairs[reco_obj] = (gp, pf_obj)\n", + " \n", + " #this is a case where a genparticle did not have a key reco element, but instead was smeared between others\n", " else:\n", - " print(\"genparticle {} is merged and cannot be reconstructed\".format(gp))" + " print(\"genparticle {} is merged and cannot be reconstructed\".format(gp))\n", + " print(df_gen.loc[gp])" ] }, { @@ -201,27 +252,27 @@ "metadata": {}, "outputs": [], "source": [ - "def track_as_array(itr):\n", + "def track_as_array(df_tr, itr):\n", " row = df_tr.loc[itr]\n", " return [0, row[\"px\"], row[\"py\"], row[\"pz\"], row[\"nhits\"], row[\"d0\"], row[\"z0\"]]\n", "\n", - "def cluster_as_array(icl):\n", + "def cluster_as_array(df_cl, icl):\n", " row = df_cl.loc[icl]\n", " return [1, row[\"x\"], row[\"y\"], row[\"z\"], row[\"nhits_ecal\"], row[\"nhits_hcal\"], 0.0]\n", "\n", - "def gen_as_array(igen):\n", + "def gen_as_array(df_gen, igen):\n", " if igen:\n", " row = df_gen.loc[igen]\n", - " return np.array([row[\"pdgid\"], row[\"px\"], row[\"py\"], row[\"pz\"], row[\"energy\"]])\n", + " return np.array([abs(row[\"pdgid\"]), row[\"charge\"], row[\"px\"], row[\"py\"], row[\"pz\"], row[\"energy\"]])\n", " else:\n", - " return np.zeros(5)\n", + " return np.zeros(6)\n", " \n", - "def pf_as_array(igen):\n", + "def pf_as_array(df_pfs, igen):\n", " if igen:\n", " row = df_pfs.loc[igen]\n", - " return np.array([row[\"type\"], row[\"px\"], row[\"py\"], row[\"pz\"], row[\"energy\"]])\n", + " return np.array([abs(row[\"type\"]), row[\"charge\"], row[\"px\"], row[\"py\"], row[\"pz\"], row[\"energy\"]])\n", " else:\n", - " return np.zeros(5)" + " return np.zeros(6)" ] }, { @@ -231,37 +282,42 @@ "metadata": {}, "outputs": [], "source": [ - "Xs = []\n", - "ys_gen = []\n", - "ys_cand = []\n", - "for itr in range(len(df_tr)):\n", - " Xs.append(track_as_array(itr))\n", + "def flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs):\n", + " Xs = []\n", + " ys_gen = []\n", + " ys_cand = []\n", " \n", - " k = (\"tr\", itr)\n", - " gp = None\n", - " rp = None\n", - " if k in pairs:\n", - " gp = pairs[k][0]\n", - " rp = pairs[k][1]\n", - " ys_gen.append(gen_as_array(gp))\n", - " ys_cand.append(pf_as_array(rp))\n", + " #find all track-associated particles\n", + " for itr in range(len(df_tr)):\n", + " Xs.append(track_as_array(df_tr, itr))\n", "\n", + " k 
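A toy version of the greedy genparticle-to-reco association above: each genparticle claims its highest-weight neighbouring reco element, which is then removed from the graph so nothing else can claim it. The three-edge graph and its weights are made up; in the notebook the cluster weights are deposited energies and tracks are inserted with a very large weight (9999.0) so they always win.

```python
import networkx as nx

dg = nx.Graph()
dg.add_edge(("gp", 0), ("tr", 0), weight=9999.0)  # genparticle 0 left a track
dg.add_edge(("gp", 0), ("cl", 0), weight=12.5)    # ...and a calorimeter deposit
dg.add_edge(("gp", 1), ("cl", 0), weight=7.0)     # genparticle 1 shares that cluster

pairs = {}
for gp in [0, 1]:
    gp_node = ("gp", gp)
    nw = [(n, dg.edges[gp_node, n]["weight"]) for n in dg.neighbors(gp_node)]
    nw = sorted(nw, key=lambda x: x[1], reverse=True)
    if len(nw) > 0:
        reco_obj = nw[0][0]
        dg.remove_node(reco_obj)  # this reco element is now taken
        pairs[reco_obj] = gp
    else:
        print("genparticle {} is merged and cannot be reconstructed".format(gp))

print(pairs)  # {('tr', 0): 0, ('cl', 0): 1}
```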
= (\"tr\", itr)\n", + " gp = None\n", + " rp = None\n", + " if k in pairs:\n", + " gp = pairs[k][0]\n", + " rp = pairs[k][1]\n", + " ys_gen.append(gen_as_array(df_gen, gp))\n", + " ys_cand.append(pf_as_array(df_pfs, rp))\n", " \n", - "for icl in range(len(df_cl)):\n", - " Xs.append(cluster_as_array(icl))\n", - " \n", - " k = (\"cl\", icl)\n", - " gp = None\n", - " rp = None\n", - " if k in pairs:\n", - " gp = pairs[k][0]\n", - " rp = pairs[k][1]\n", - " ys_gen.append(gen_as_array(gp))\n", - " ys_cand.append(pf_as_array(rp))\n", + " #find all cluster-associated particles\n", + " for icl in range(len(df_cl)):\n", + " Xs.append(cluster_as_array(df_cl, icl))\n", + "\n", + " k = (\"cl\", icl)\n", + " gp = None\n", + " rp = None\n", + " if k in pairs:\n", + " gp = pairs[k][0]\n", + " rp = pairs[k][1]\n", + " ys_gen.append(gen_as_array(df_gen, gp))\n", + " ys_cand.append(pf_as_array(df_pfs, rp))\n", + "\n", + " Xs = np.stack(Xs, axis=-1).T\n", + " ys_gen = np.stack(ys_gen, axis=-1).T\n", + " ys_cand = np.stack(ys_cand, axis=-1).T\n", " \n", - "Xs = np.stack(Xs, axis=-1).T\n", - "ys_gen = np.stack(ys_gen, axis=-1).T\n", - "ys_cand = np.stack(ys_cand, axis=-1).T" + " return Xs, ys_gen, ys_cand" ] }, { @@ -271,58 +327,80 @@ "metadata": {}, "outputs": [], "source": [ - "len(Xs)\n", - "i = 106" + "Xs, ys_gen, ys_cand = flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs)\n", + "len(Xs), len(ys_gen), len(ys_cand)" ] }, { "cell_type": "code", "execution_count": null, - "id": "mexican-immune", + "id": "c022fce0", "metadata": {}, "outputs": [], "source": [ - "Xs[i]" + "import sklearn\n", + "import sklearn.metrics" ] }, { "cell_type": "code", "execution_count": null, - "id": "fossil-cornell", + "id": "16dde9e2", "metadata": {}, "outputs": [], "source": [ - "ys_gen[i]" + "np.unique(ys_gen[:, 0])" ] }, { "cell_type": "code", "execution_count": null, - "id": "medium-armor", + "id": "012ef075", "metadata": {}, "outputs": [], "source": [ - "ys_cand[i]" + "np.unique(ys_cand[:, 0])" ] }, { "cell_type": "code", "execution_count": null, - "id": "confident-publisher", + "id": "e9c5b8cd", "metadata": {}, "outputs": [], "source": [ - "ys_gen[:, 0]" + "labels = [0, 13, 11, 22, 130, 211, 321, 2112, 2212]\n", + "labels_text = {\n", + " 0: \"none\",\n", + " 13: \"mu\",\n", + " 11: \"el\",\n", + " 22: \"$\\gamma$\",\n", + " 130: \"$K^0_L$\",\n", + " 211: \"$\\pi^\\pm$\",\n", + " 321: \"$K^+$\",\n", + " 2112: \"n\",\n", + " 2212: \"p\"\n", + "}\n", + "cm = sklearn.metrics.confusion_matrix(\n", + " ys_gen[:, 0],\n", + " ys_cand[:, 0],\n", + " labels=labels,\n", + " normalize=\"true\"\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "cardiovascular-majority", + "id": "8817f3e5", "metadata": {}, "outputs": [], "source": [ - "ys_cand[:, 0]" + "plt.imshow(cm, cmap=\"Blues\")\n", + "plt.xticks(range(len(labels)), [labels_text[l] for l in labels], rotation=90);\n", + "plt.yticks(range(len(labels)), [labels_text[l] for l in labels]);\n", + "plt.xlabel(\"reco\")\n", + "plt.ylabel(\"gen\")" ] }, { @@ -461,7 +539,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -475,7 +553,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/notebooks/cms-mlpf.ipynb b/notebooks/cms-mlpf.ipynb index 9b0cf0907..d2b76df1e 100644 --- a/notebooks/cms-mlpf.ipynb +++ b/notebooks/cms-mlpf.ipynb @@ -3,6 +3,7 @@ { "cell_type": 
"code", "execution_count": null, + "id": "impressive-ethiopia", "metadata": {}, "outputs": [], "source": [ @@ -16,7 +17,7 @@ "import sklearn.metrics\n", "import matplotlib\n", "import scipy\n", - "import mplhep as hep\n", + "import mplhep\n", "\n", "import pandas" ] @@ -24,6 +25,7 @@ { "cell_type": "code", "execution_count": null, + "id": "statistical-ordering", "metadata": {}, "outputs": [], "source": [ @@ -34,6 +36,7 @@ { "cell_type": "code", "execution_count": null, + "id": "visible-destruction", "metadata": {}, "outputs": [], "source": [ @@ -49,6 +52,7 @@ { "cell_type": "code", "execution_count": null, + "id": "undefined-judges", "metadata": {}, "outputs": [], "source": [ @@ -80,10 +84,12 @@ { "cell_type": "code", "execution_count": null, + "id": "respective-theater", "metadata": {}, "outputs": [], "source": [ "pid_names = {\n", + " 0: \"no ptcl\",\n", " 1: \"ch.had\",\n", " 2: \"n.had\",\n", " 3: \"HFEM\",\n", @@ -94,6 +100,7 @@ "}\n", "\n", "pid_names_long = {\n", + " 0: \"no particle\",\n", " 1: \"charged hadrons\",\n", " 2: \"neutral hadrons\",\n", " 3: \"HFEM\",\n", @@ -115,21 +122,24 @@ "x_labels = [\n", " \"track\", \"PS1\", \"PS2\", \"ECAL\", \"HCAL\", \"GSF\", \"BREM\", \"HFEM\", \"HFHAD\", \"SC\", \"HO\"\n", "]\n", - "y_labels = [pid_names[i] for i in range(1,8)]" + "y_labels = [pid_names[i] for i in range(0,8)]" ] }, { "cell_type": "code", "execution_count": null, + "id": "stone-spanking", "metadata": {}, "outputs": [], "source": [ - "path = \"../experiments/cms-gnn-dense-a301aa09.gpu0.local/\"" + "#path = \"../experiments/cms-dev_20210831_225815_541048.gpu0.local/evaluation/\"\n", + "path = \"../experiments/cms-gen_20210903_114315_805349.joosep-desktop-work/evaluation/\"" ] }, { "cell_type": "code", "execution_count": null, + "id": "blind-promotion", "metadata": {}, "outputs": [], "source": [ @@ -138,7 +148,7 @@ "ycands = []\n", "ypreds = []\n", "ypreds_raw = []\n", - "for fi in glob.glob(path + \"/pred_batch*.npz\")[:100]:\n", + "for fi in glob.glob(path + \"/pred_batch*.npz\"):\n", " dd = np.load(fi)\n", " Xs.append(dd[\"X\"])\n", " ygens.append(dd[\"ygen\"])\n", @@ -152,6 +162,7 @@ "ygen = np.concatenate(ygens)\n", "ycand = np.concatenate(ycands)\n", "ypred = np.concatenate(ypreds)\n", + "\n", "ypred_raw = np.concatenate(ypreds_raw)\n", "\n", "X_f = X.reshape((X.shape[0]*X.shape[1], X.shape[2]))\n", @@ -159,85 +170,95 @@ "ygen_f = ygen.reshape((ygen.shape[0]*ygen.shape[1], ygen.shape[2]))\n", "ycand_f = ycand.reshape((ycand.shape[0]*ycand.shape[1], ycand.shape[2]))\n", "ypred_f = ypred.reshape((ypred.shape[0]*ypred.shape[1], ypred.shape[2]))\n", - "ypred_raw_f = ypred_raw.reshape((ypred_raw.shape[0]*ypred_raw.shape[1], ypred_raw.shape[2]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "glob_iter = 0\n", - "def multiplicity_score(thresholds):\n", - " global glob_iter\n", - " ypred_id = apply_thresholds(ypred_raw, thresholds)\n", - " total_scores = []\n", - " for icls in range(1,8):\n", - " ntrue = np.sum((ycand[:, :, 0]==icls)*msk_X, axis=1)\n", - " npred = np.sum((ypred_id==icls)*msk_X, axis=1)\n", - " diff = np.sqrt(np.sum((ntrue-npred)**2))/np.mean(ntrue)\n", - " total_scores.append(diff)\n", - " #print(\" \", icls, np.mean(ntrue), np.mean(npred), diff)\n", - " glob_iter += 1\n", - " if glob_iter%10 == 0:\n", - " print(glob_iter, np.sum(total_scores))\n", - " print(\",\\t\".join([\"{:.2f}\".format(x) for x in thresholds]))\n", - " print(\",\\t\".join([\"{:.2f}\".format(x) for x 
in total_scores]))\n", - " return np.sum(total_scores)\n", - "\n", - "ret = scipy.optimize.minimize(\n", - " multiplicity_score,\n", - " 0.5*np.ones(7),\n", - " tol=1e-5,\n", - " method=\"Powell\",\n", - " bounds=[(0,1) for i in range(7)],\n", - " #options={\"ftol\": 1e-6, \"disp\":True}\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "thresholds = ret.x" + "\n", + "# ypred_raw[X[:, :, 0]==1, 6] = 0.0\n", + "\n", + "# ypred_raw[X[:, :, 0]==4, 1] = 0.0\n", + "# #ypred_raw[X[:, :, 0]==4, 6] = 0.0\n", + "\n", + "# ypred_raw[X[:, :, 0]==5, 0] += ypred_raw[X[:, :, 0]==5, 1]\n", + "# ypred_raw[X[:, :, 0]==5, 0] += ypred_raw[X[:, :, 0]==5, 7]\n", + "# ypred_raw[X[:, :, 0]==5, 1] = 0.0\n", + "# ypred_raw[X[:, :, 0]==5, 7] = 0.0\n", + "\n", + "# ypred_raw[X[:, :, 0]==8, 3] += ypred_raw[X[:, :, 0]==8, 1]\n", + "# ypred_raw[X[:, :, 0]==8, 3] += ypred_raw[X[:, :, 0]==8, 2]\n", + "# ypred_raw[X[:, :, 0]==8, 1] = 0.0\n", + "# ypred_raw[X[:, :, 0]==8, 2] = 0.0\n", + "\n", + "\n", + "# ypred_raw[X[:, :, 0]==9, 3] += ypred_raw[X[:, :, 0]==9, 1]\n", + "# ypred_raw[X[:, :, 0]==9, 3] += ypred_raw[X[:, :, 0]==9, 2]\n", + "# ypred_raw[X[:, :, 0]==9, 1] = 0.0\n", + "# ypred_raw[X[:, :, 0]==9, 2] = 0.0\n", + "# ypred_raw[X[:, :, 0]==9, 1] = 0.0\n", + "\n", + "# ypred_raw[X[:, :, 0]==8, 2] = 0.0\n", + "# ypred_raw[X[:, :, 0]==9, 2] = 0.0\n", + "\n", + "ypred_raw_f = ypred_raw.reshape((ypred_raw.shape[0]*ypred_raw.shape[1], ypred_raw.shape[2]))\n", + "\n", + "ypred_id = np.argmax(ypred_raw, axis=-1)\n", + "ypred_id_f = ypred_id.flatten()" ] }, { "cell_type": "code", "execution_count": null, + "id": "extensive-kuwait", "metadata": {}, "outputs": [], "source": [ + "thresholds = [0.0, 0.0, 0.0, 0, 0, 0, 0]\n", "ypred_id = apply_thresholds(ypred_raw, thresholds)\n", - "ypred_id_f = apply_thresholds_f(ypred_raw_f, thresholds)" + "ypred_id_f = apply_thresholds_f(ypred_raw_f, thresholds)" ] }, { "cell_type": "code", "execution_count": null, + "id": "interim-chosen", "metadata": {}, "outputs": [], "source": [ - "sklearn.metrics.balanced_accuracy_score(ycand_f[msk_X_f, 0], ypred_f[:, 0][msk_X_f])" + "icls = 2\n", + "ielem = 5\n", + "\n", + "energy_msk = (X_f[:, 4]>0)\n", + "elem_msk = (X_f[:, 0]==ielem)\n", + "\n", + "vals_sig = ypred_raw_f[energy_msk & elem_msk & (ycand_f[:, 0]==icls), icls]\n", + "vals_bkg = ypred_raw_f[energy_msk & elem_msk & (ycand_f[:, 0]!=icls), icls]\n", + "\n", + "bins = np.linspace(0,1,100)\n", + "hsig = np.histogram(vals_sig, bins=bins)[0]\n", + "hbkg = np.histogram(vals_bkg, bins=bins)[0]\n", + "\n", + "a = np.cumsum(hsig)/np.sum(hsig)\n", + "b = np.cumsum(hbkg)/np.sum(hbkg)\n", + "\n", + "plt.figure(figsize=(4,4))\n", + "plt.plot(a, b, marker=\".\")\n", + "plt.plot([0,1], [0,1], color=\"black\", lw=0.5, ls=\"--\")" ] }, { "cell_type": "code", "execution_count": null, + "id": "becoming-application", "metadata": {}, "outputs": [], "source": [ - "sklearn.metrics.balanced_accuracy_score(ycand_f[msk_X_f, 0], ypred_id_f[msk_X_f])" + "b = np.linspace(0,1,100)\n", + "mplhep.histplot(np.histogram(vals_sig, bins=b, density=1), label=\"sig\");\n", + "mplhep.histplot(np.histogram(vals_bkg, bins=b, density=1), label=\"bkg\");\n", + "plt.legend(loc=2)" ] }, { "cell_type": "code", "execution_count": null, + "id": "virgin-nicaragua", "metadata": { "scrolled": false }, @@ -263,6 +284,337 @@ { "cell_type": "code", "execution_count": null, + "id": "funky-destination", + "metadata": {}, + "outputs": [], + "source": [ + 
"energy_bins_classwise = {\n", + " 1: [-2, 5],\n", + " 2: [-2, 6],\n", + " 3: [1, 7],\n", + " 4: [2, 5],\n", + " 5: [2, 5],\n", + " 6: [2, 5],\n", + " 7: [2, 5],\n", + "}\n", + "\n", + "energy_correction_factors = {\n", + " 1: [1, 1],\n", + " 2: [1, 1],\n", + " 3: [1.0, 1.2],\n", + " 4: [1, 1],\n", + " 5: [1, 1],\n", + " 6: [1, 1],\n", + " 7: [1, 1],\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "authorized-greensboro", + "metadata": {}, + "outputs": [], + "source": [ + "b = np.linspace(0,1,101)\n", + "plt.figure(figsize=(4,4))\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==1) & (ycand_f[:, 0]==0), 1], bins=b, histtype=\"step\", lw=2, label=\"no PFCandidate\", density=True);\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==1) & (ycand_f[:, 0]==1), 1], bins=b, histtype=\"step\", lw=2, label=\"charged PFCandidate\", density=True);\n", + "plt.legend(loc=2, frameon=False)\n", + "plt.xlabel(\"Charged hadron probability\")\n", + "plt.title(\"Tracks\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "incorporate-vanilla", + "metadata": {}, + "outputs": [], + "source": [ + "b = np.linspace(0,1,101)\n", + "plt.figure(figsize=(4,4))\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==1) & (ycand_f[:, 0]==0), 0], bins=b, histtype=\"step\", lw=2, label=\"no PFCandidate\", density=True);\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==1) & (ycand_f[:, 0]==1), 0], bins=b, histtype=\"step\", lw=2, label=\"charged PFCandidate\", density=True);\n", + "plt.legend(loc=1, frameon=False)\n", + "plt.xlabel(\"No particle probability\")\n", + "plt.title(\"Tracks\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "comic-privacy", + "metadata": {}, + "outputs": [], + "source": [ + "b = np.linspace(0,1,101)\n", + "plt.figure(figsize=(4,4))\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==5) & (ycand_f[:, 0]==0), 2], bins=b, histtype=\"step\", lw=2, label=\"no PFCandidate\", density=True);\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==5) & (ycand_f[:, 0]==2), 2], bins=b, histtype=\"step\", lw=2, label=\"neutral PFCandidate\", density=True);\n", + "plt.legend(loc=2, frameon=False)\n", + "plt.xlabel(\"Neutral probability\")\n", + "plt.title(\"HCAL clusters\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sustainable-passage", + "metadata": {}, + "outputs": [], + "source": [ + "b = np.linspace(0,1,101)\n", + "plt.figure(figsize=(4,4))\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==5) & (ycand_f[:, 0]==0), 0], bins=b, histtype=\"step\", lw=2, label=\"no PFCandidate\", density=True);\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==5) & (ycand_f[:, 0]==2), 0], bins=b, histtype=\"step\", lw=2, label=\"neutral PFCandidate\", density=True);\n", + "plt.legend(loc=\"best\", frameon=False)\n", + "plt.xlabel(\"No particle probability\")\n", + "plt.title(\"HCAL clusters\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "funny-batch", + "metadata": {}, + "outputs": [], + "source": [ + "elem_type = 5\n", + "icls = 2\n", + "\n", + "def plot_elem_energy_cls_prob(elem_type):\n", + " plt.figure(figsize=(4*5,2*4))\n", + " plt.suptitle(\"PFElement type {}\".format(elem_type))\n", + " \n", + " for icls in range(8):\n", + " plt.subplot(2,4,icls+1)\n", + " plt.hist2d(\n", + " np.log10(X_f[X_f[:, 0]==elem_type, 4]),\n", + " ypred_raw_f[X_f[:, 0]==elem_type, icls],\n", + " bins=(np.linspace(-2,4,100), np.linspace(0,1,100)), cmap=\"Blues\");\n", + " plt.colorbar()\n", + " plt.xlabel(\"PFElement log[E/GeV]\")\n", + " plt.ylabel(\"MLPF probability for class 
{}\".format(icls))\n", + " plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "strange-combine", + "metadata": {}, + "outputs": [], + "source": [ + "plot_elem_energy_cls_prob(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "private-communication", + "metadata": {}, + "outputs": [], + "source": [ + "plot_elem_energy_cls_prob(4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "differential-steal", + "metadata": {}, + "outputs": [], + "source": [ + "plot_elem_energy_cls_prob(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "utility-beverage", + "metadata": {}, + "outputs": [], + "source": [ + "plot_elem_energy_cls_prob(8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "moderate-india", + "metadata": {}, + "outputs": [], + "source": [ + "plot_elem_energy_cls_prob(9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "direct-crowd", + "metadata": {}, + "outputs": [], + "source": [ + "reco_label = X_f[X_f[:, 0]!=0, 0]\n", + "cand_label = ycand_f[X_f[:, 0]!=0, 0]\n", + "pred_label = ypred_id_f[X_f[:, 0]!=0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fitting-thriller", + "metadata": {}, + "outputs": [], + "source": [ + "cm1 = sklearn.metrics.confusion_matrix(reco_label, cand_label, labels=range(12))\n", + "cm2 = sklearn.metrics.confusion_matrix(reco_label, pred_label, labels=range(12))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "frozen-ethnic", + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(cm1, cmap=\"Blues\", norm=matplotlib.colors.LogNorm())\n", + "plt.xticks(range(12));\n", + "plt.yticks(range(12));\n", + "plt.colorbar()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "anticipated-robinson", + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(cm2, cmap=\"Blues\", norm=matplotlib.colors.LogNorm())\n", + "plt.xticks(range(12));\n", + "plt.yticks(range(12));\n", + "plt.colorbar()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "micro-saying", + "metadata": {}, + "outputs": [], + "source": [ + "ycand_id_f = ycand_f[:, 0]\n", + "\n", + "b = np.linspace(-3,6,100)\n", + "\n", + "icls = 2\n", + "\n", + "def plot_eff_and_fake_rate(\n", + " icls,\n", + " ivar=4,\n", + " bins=np.linspace(-3,6,100),\n", + " xlabel=\"PFElement log[E/GeV]\", log=True\n", + " ):\n", + " \n", + " values = X_f[:, ivar]\n", + " if log:\n", + " values = np.log(values)\n", + " \n", + " hist_cand = np.histogram(values[(ycand_id_f==icls)], bins=bins);\n", + " hist_cand_true = np.histogram(values[(ycand_id_f==icls) & (ypred_id_f==icls)], bins=bins);\n", + "\n", + " hist_pred = np.histogram(values[(ypred_id_f==icls)], bins=bins);\n", + " hist_pred_fake = np.histogram(values[(ycand_id_f!=icls) & (ypred_id_f==icls)], bins=bins);\n", + "\n", + " eff = hist_cand_true[0]/hist_cand[0]\n", + " fake = hist_pred_fake[0]/hist_pred[0]\n", + "\n", + " plt.figure(figsize=(8,8))\n", + " ax1 = plt.subplot(2,1,1)\n", + " mplhep.histplot(hist_cand, label=\"with PF candidate\")\n", + " mplhep.histplot(hist_pred, label=\"with MLPF candidate\")\n", + " plt.legend(frameon=False)\n", + " plt.xlabel(xlabel)\n", + " plt.ylabel(\"Number of particles\")\n", + "\n", + " ax2 = plt.subplot(2,1,2, sharex=ax1)\n", + " mplhep.histplot(eff, bins=hist_cand[1], label=\"efficiency\", color=\"black\")\n", + " mplhep.histplot(fake, bins=hist_cand[1], label=\"fake rate\", 
color=\"red\")\n", + " plt.legend(frameon=False)\n", + " plt.ylim(0,1.4)\n", + " plt.xlabel(xlabel)\n", + " plt.ylabel(\"Fraction of particles / bin\")\n", + " \n", + " return ax1, ax2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "inner-christianity", + "metadata": {}, + "outputs": [], + "source": [ + "b = np.linspace(0,100, 100)\n", + "plt.hist(X_f[(X_f[:, 0]==5), 4], bins=b, histtype=\"step\", lw=2, label=\"all clusters\");\n", + "plt.hist(X_f[(X_f[:, 0]==5) & (ycand_f[:, 0]==2), 4], bins=b, histtype=\"step\", lw=2, label=\"with PF candidate\");\n", + "plt.hist(X_f[(X_f[:, 0]==5) & (ypred_id_f==2), 4], bins=b, histtype=\"step\", lw=2, label=\"with MLPF candidate\");\n", + "plt.yscale(\"log\")\n", + "plt.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "automated-quarter", + "metadata": {}, + "outputs": [], + "source": [ + "ax1, ax2 = plot_eff_and_fake_rate(1, bins=np.linspace(0, 300, 100), log=False)\n", + "ax1.set_yscale(\"log\")\n", + "ax1.set_title(\"track, charged hadron predictions\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "military-professor", + "metadata": {}, + "outputs": [], + "source": [ + "ax1, ax2 = plot_eff_and_fake_rate(2, bins=np.linspace(0, 300, 100), log=False)\n", + "ax1.set_yscale(\"log\")\n", + "ax1.set_title(\"HCAL cluster, neutral hadron predictions\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "characteristic-colleague", + "metadata": {}, + "outputs": [], + "source": [ + "ax1, ax2 = plot_eff_and_fake_rate(3, bins=np.linspace(0, 300, 100), log=False)\n", + "ax1.set_yscale(\"log\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "composed-principal", + "metadata": {}, + "outputs": [], + "source": [ + "ax1, ax2 = plot_eff_and_fake_rate(4, bins=np.linspace(0, 300, 100), log=False)\n", + "ax1.set_yscale(\"log\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ready-macedonia", "metadata": {}, "outputs": [], "source": [ @@ -284,28 +636,35 @@ { "cell_type": "code", "execution_count": null, + "id": "formal-county", "metadata": {}, "outputs": [], "source": [ - "history = load_history(path + \"/history_*.json\")" + "history = load_history(path + \"/../history/history_*.json\")" ] }, { "cell_type": "code", "execution_count": null, + "id": "neural-witch", "metadata": {}, "outputs": [], "source": [ - "def loss_plot(train, test, margin=0.05):\n", + "def loss_plot(train, test, margin=0.05, smoothing=False):\n", " fig = plt.figure(figsize=(8,4))\n", " ax = plt.axes()\n", - " p0 = plt.plot(train, alpha=0.2)\n", - " p1 = plt.plot(test, alpha=0.2)\n", " \n", - " train_smooth = np.convolve(train, np.ones(5)/5, mode='valid')\n", - " plt.plot(train_smooth, color=p0[0].get_color(), lw=2, label=\"train\")\n", - " test_smooth = np.convolve(test, np.ones(5)/5, mode='valid')\n", - " plt.plot(test_smooth, color=p1[0].get_color(), lw=2, label=\"test\")\n", + " alpha = 0.2 if smoothing else 1.0\n", + " l0 = None if smoothing else \"train\"\n", + " l1 = None if smoothing else \"test\"\n", + " p0 = plt.plot(train, alpha=alpha, label=l0)\n", + " p1 = plt.plot(test, alpha=alpha, label=l1)\n", + " \n", + " if smoothing:\n", + " train_smooth = np.convolve(train, np.ones(5)/5, mode='valid')\n", + " plt.plot(train_smooth, color=p0[0].get_color(), lw=2, label=\"train\")\n", + " test_smooth = np.convolve(test, np.ones(5)/5, mode='valid')\n", + " plt.plot(test_smooth, color=p1[0].get_color(), lw=2, label=\"test\")\n", " \n", " 
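A numpy-only sketch of the efficiency and fake-rate bookkeeping done in `plot_eff_and_fake_rate` above: per bin of an input variable, efficiency is the fraction of PF candidates of a class that also receive a matching MLPF prediction, and the fake rate is the fraction of MLPF predictions of that class without a matching PF candidate. The random arrays are placeholders standing in for `ycand_f[:, 0]`, `ypred_id_f` and `X_f[:, 4]`.

```python
import numpy as np

rng = np.random.default_rng(0)
n = 10000
cand_id = rng.integers(0, 3, n)                                          # stand-in true class
pred_id = np.where(rng.random(n) < 0.9, cand_id, rng.integers(0, 3, n))  # mostly correct predictions
energy = rng.exponential(20.0, n)                                        # binning variable

icls = 2
bins = np.linspace(0, 100, 21)

hist_cand, _ = np.histogram(energy[cand_id == icls], bins=bins)
hist_cand_true, _ = np.histogram(energy[(cand_id == icls) & (pred_id == icls)], bins=bins)
hist_pred, _ = np.histogram(energy[pred_id == icls], bins=bins)
hist_pred_fake, _ = np.histogram(energy[(cand_id != icls) & (pred_id == icls)], bins=bins)

# guard against empty bins instead of dividing by zero
eff = np.divide(hist_cand_true, hist_cand, out=np.zeros(len(bins) - 1), where=hist_cand > 0)
fake = np.divide(hist_pred_fake, hist_pred, out=np.zeros(len(bins) - 1), where=hist_pred > 0)
print(eff.round(2))
print(fake.round(2))
```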
plt.ylim(test[-1]*(1.0-margin), test[-1]*(1.0+margin))\n", " plt.legend(loc=\"best\", frameon=False)\n", @@ -317,10 +676,11 @@ { "cell_type": "code", "execution_count": null, + "id": "formal-maryland", "metadata": {}, "outputs": [], "source": [ - "p0 = loss_plot(history[\"loss\"].values, history[\"val_loss\"].values)\n", + "p0 = loss_plot(history[\"loss\"].values, history[\"val_loss\"].values, margin=0.02)\n", "plt.ylabel(\"Total loss\")\n", "plt.savefig(\"loss.pdf\", bbox_inches=\"tight\")" ] @@ -328,6 +688,7 @@ { "cell_type": "code", "execution_count": null, + "id": "committed-clothing", "metadata": {}, "outputs": [], "source": [ @@ -339,10 +700,11 @@ { "cell_type": "code", "execution_count": null, + "id": "recreational-enhancement", "metadata": {}, "outputs": [], "source": [ - "p0 = loss_plot(history[\"energy_loss\"].values, history[\"val_energy_loss\"].values, margin=0.05)\n", + "p0 = loss_plot(history[\"energy_loss\"].values, history[\"val_energy_loss\"].values, margin=0.01)\n", "plt.ylabel(\"Energy loss\")\n", "plt.savefig(\"energy_loss.pdf\", bbox_inches=\"tight\")" ] @@ -350,10 +712,11 @@ { "cell_type": "code", "execution_count": null, + "id": "frank-alberta", "metadata": {}, "outputs": [], "source": [ - "p0 = loss_plot(history[\"pt_loss\"].values, history[\"val_pt_loss\"].values, margin=0.1)\n", + "p0 = loss_plot(history[\"pt_loss\"].values, history[\"val_pt_loss\"].values, margin=0.02)\n", "plt.ylabel(\"$p_T$ loss\")\n", "plt.savefig(\"pt_loss.pdf\", bbox_inches=\"tight\")" ] @@ -361,10 +724,11 @@ { "cell_type": "code", "execution_count": null, + "id": "significant-breeding", "metadata": {}, "outputs": [], "source": [ - "p0 = loss_plot(history[\"sin_phi_loss\"].values, history[\"val_sin_phi_loss\"].values, margin=0.01)\n", + "p0 = loss_plot(history[\"sin_phi_loss\"].values, history[\"val_sin_phi_loss\"].values, margin=0.02)\n", "plt.ylabel(\"$\\sin \\phi$ loss\")\n", "plt.savefig(\"sin_phi_loss.pdf\", bbox_inches=\"tight\")" ] @@ -372,6 +736,7 @@ { "cell_type": "code", "execution_count": null, + "id": "living-egyptian", "metadata": {}, "outputs": [], "source": [ @@ -383,10 +748,11 @@ { "cell_type": "code", "execution_count": null, + "id": "healthy-willow", "metadata": {}, "outputs": [], "source": [ - "p0 = loss_plot(history[\"eta_loss\"].values, history[\"val_eta_loss\"].values, margin=0.01)\n", + "p0 = loss_plot(history[\"eta_loss\"].values, history[\"val_eta_loss\"].values, margin=0.005)\n", "plt.ylabel(\"$\\eta$ loss\")\n", "plt.savefig(\"eta_loss.pdf\", bbox_inches=\"tight\")" ] @@ -394,6 +760,7 @@ { "cell_type": "code", "execution_count": null, + "id": "accomplished-brave", "metadata": {}, "outputs": [], "source": [ @@ -405,6 +772,7 @@ { "cell_type": "code", "execution_count": null, + "id": "august-feeding", "metadata": { "scrolled": false }, @@ -416,8 +784,6 @@ " msk = (ycand_f[:, 0] == icls)\n", " plt.hist(ypred_raw_f[msk & (X_f[:, 0] != 0), icls], bins=100, density=1, histtype=\"step\", lw=2, color=\"blue\", label=\"true \"+pid_names[icls]);\n", " plt.hist(ypred_raw_f[~msk & (X_f[:, 0] != 0), icls], bins=100, density=1, histtype=\"step\", lw=2, color=\"red\", label=\"other particles\");\n", - " #plt.axvline(ret.x[icls-1], 0, 0.7, ls=\"--\",\n", - " # color=\"black\", label=\"threshold: {:.2f}\".format(ret.x[icls-1]), lw=1)\n", " plt.yscale(\"log\")\n", " plt.title(\"Particle reconstruction for {}\".format(pid_names[icls]), y=1.05)\n", " plt.xlabel(\"Classification output {}\".format(icls))\n", @@ -431,28 +797,30 @@ { "cell_type": "code", "execution_count": null, + "id": 
"simple-forestry", "metadata": {}, "outputs": [], "source": [ "#perm = np.random.permutation(ycand_f[msk_X].shape[0])[:100000]\n", "\n", "cm_norm = sklearn.metrics.confusion_matrix(\n", - " ycand_f[msk_X_f & (ycand_f[:, 0]!=0), 0],\n", - " ypred_id_f[msk_X_f & (ycand_f[:, 0]!=0)],\n", - " labels=range(1,8),\n", + " ycand_f[msk_X_f, 0],\n", + " ypred_id_f[msk_X_f],\n", + " labels=range(0,8),\n", " normalize=\"true\"\n", ")\n", "\n", "cm = sklearn.metrics.confusion_matrix(\n", - " ycand_f[msk_X_f & (ycand_f[:, 0]!=0), 0],\n", - " ypred_id_f[msk_X_f & (ycand_f[:, 0]!=0)],\n", - " labels=range(1,8),\n", + " ycand_f[msk_X_f, 0],\n", + " ypred_id_f[msk_X_f],\n", + " labels=range(0,8),\n", ")" ] }, { "cell_type": "code", "execution_count": null, + "id": "empirical-network", "metadata": {}, "outputs": [], "source": [ @@ -475,6 +843,7 @@ { "cell_type": "code", "execution_count": null, + "id": "prepared-fruit", "metadata": {}, "outputs": [], "source": [ @@ -496,6 +865,7 @@ { "cell_type": "code", "execution_count": null, + "id": "civilian-diving", "metadata": {}, "outputs": [], "source": [ @@ -511,6 +881,7 @@ { "cell_type": "code", "execution_count": null, + "id": "expressed-samba", "metadata": { "scrolled": false }, @@ -536,60 +907,224 @@ { "cell_type": "code", "execution_count": null, + "id": "paperback-timeline", "metadata": {}, "outputs": [], "source": [ - "fig, axes = plt.subplots(7, 6, figsize=(6*6,7*5))\n", - "\n", - "for axs, icls in zip(axes, range(1,8)): \n", - " axes = axs.flatten()\n", + "def plot_particle_regression(ivar=6, icls=2, particle_label=\"Neutral hadrons\", log=True, minval=-1, maxval=3, norm=matplotlib.colors.LogNorm()):\n", + " plt.figure(figsize=(6,5))\n", + " ax = plt.axes()\n", " \n", - " npred = np.sum(ypred_id == icls, axis=1)\n", - " ncand = np.sum(ycand[:, :, 0] == icls, axis=1)\n", - " ngen = np.sum(ygen[:, :, 0] == icls, axis=1)\n", " \n", - " a = 0.5*min(np.min(npred), np.min(ncand))\n", - " b = 1.5*max(np.max(npred), np.max(ncand))\n", + " bins = np.linspace(minval, maxval, 100)\n", + " msk_both = (ypred_id_f == icls) & (ycand_f[:, 0]==icls)\n", " \n", - " axes[0].scatter(ncand, npred, marker=\".\")\n", + " vals_true = ycand_f[msk_both, ivar]\n", + " vals_pred = ypred_f[msk_both, ivar]\n", " \n", - " axes[0].set_xlim(a,b)\n", - " axes[0].set_ylim(a,b)\n", - " axes[0].plot([a,b],[a,b], color=\"black\", ls=\"--\")\n", - " axes[0].set_title(pid_names[icls])\n", - " axes[0].set_xlabel(\"number of PFCandidates\")\n", - " axes[0].set_ylabel(\"number of MLPFCandidates\")\n", + " if log:\n", + " vals_true = np.log10(vals_true)\n", + " vals_pred = np.log10(vals_pred)\n", " \n", - " msk_both = (ycand_f[:, 0]==icls) & (ypred_id_f==icls)\n", - " print(icls, np.sum(msk_both))\n", + " plt.hist2d(\n", + " vals_true,\n", + " vals_pred,\n", + " bins=(bins, bins),\n", + " cmap=\"Blues\", norm=norm\n", + " )\n", + " \n", + " plt.colorbar()\n", + " plt.plot([minval, maxval], [minval, maxval], color=\"black\", ls=\"--\", lw=0.5)\n", + " plt.xlim(minval, maxval)\n", + " plt.ylim(minval, maxval)\n", + " cms_label(x1=0.2, x2=0.48)\n", + " plt.text(0.02, 0.95, particle_label, transform=ax.transAxes)\n", + " ax.set_xticks(ax.get_yticks());" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ecological-toner", + "metadata": {}, + "outputs": [], + "source": [ + "plot_particle_regression(ivar=6, icls=1, particle_label=\"Charged hadrons\")\n", + "plt.xlabel(\"PFCandidate $\\log_{10}$ E/GeV\")\n", + "plt.ylabel(\"MLPFCandidate $\\log_{10}$ E/GeV\")\n", + 
"plt.savefig(\"energy_corr_cls1.pdf\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "transparent-remedy", + "metadata": {}, + "outputs": [], + "source": [ + "plot_particle_regression(ivar=6, icls=2, particle_label=\"Neutral hadrons\")\n", + "plt.xlabel(\"PFCandidate $\\log_{10}$ E/GeV\")\n", + "plt.ylabel(\"MLPFCandidate $\\log_{10}$ E/GeV\")\n", + "plt.savefig(\"energy_corr_cls2.pdf\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "promotional-checklist", + "metadata": {}, + "outputs": [], + "source": [ + "plot_particle_regression(ivar=3, icls=1, particle_label=\"Charged hadrons\", log=False, minval=-4, maxval=4, norm=None)\n", + "plt.xlabel(\"PFCandidate $\\eta$\")\n", + "plt.ylabel(\"MLPFCandidate $\\eta$\")\n", + "plt.savefig(\"eta_corr_cls1.pdf\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "suitable-kansas", + "metadata": {}, + "outputs": [], + "source": [ + "plot_particle_regression(ivar=3, icls=2, particle_label=\"Neutral hadrons\", log=False, minval=-4, maxval=4, norm=None)\n", + "plt.xlabel(\"PFCandidate $\\eta$\")\n", + "plt.ylabel(\"MLPFCandidate $\\eta$\")\n", + "plt.savefig(\"eta_corr_cls2.pdf\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "restricted-million", + "metadata": {}, + "outputs": [], + "source": [ + "plot_particle_regression(ivar=6, icls=3, particle_label=\"HF\", minval=0.0, maxval=4, norm=None)\n", + "plt.xlabel(\"PFCandidate $\\log_{10}$ E/GeV\")\n", + "plt.ylabel(\"MLPFCandidate $\\log_{10}$ E/GeV\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "raising-first", + "metadata": {}, + "outputs": [], + "source": [ + "plot_particle_regression(ivar=6, icls=4, particle_label=\"HF\", minval=0.0, maxval=4, norm=None)\n", + "plt.xlabel(\"PFCandidate $\\log_{10}$ E/GeV\")\n", + "plt.ylabel(\"MLPFCandidate $\\log_{10}$ E/GeV\")" + ] + }, + { + "cell_type": "markdown", + "id": "4a3ab75a", + "metadata": {}, + "source": [ + "## Gen level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "700c7700", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "for icls in range(1,8):\n", + " npred = np.sum(ypred_id == icls, axis=1)\n", + " ncand = np.sum(ycand[:, :, 0] == icls, axis=1)\n", + " ngen = np.sum(ygen[:, :, 0] == icls, axis=1)\n", + " plt.figure(figsize=(6,6))\n", + " plt.scatter(ngen, ncand, marker=\".\", alpha=0.5, label=\"PF\")\n", + " plt.scatter(ngen, npred, marker=\".\", alpha=0.5, label=\"MLPF\")\n", + " plt.legend(loc=\"best\", frameon=False)\n", + " a = 0.5*min(np.min(ngen), np.min(ngen))\n", + " b = 2*max(np.max(ngen), np.max(ngen))\n", + " plt.xlim(a,b)\n", + " plt.ylim(a,b)\n", + " plt.plot([a,b],[a,b], color=\"black\", ls=\"--\")\n", + " plt.title(pid_names_long[icls],y=1.05)\n", + " plt.xlabel(\"number of gen particles\")\n", + " plt.ylabel(\"number of PFCandidates\")\n", + " cms_label(x2=0.6, y=0.89)\n", + "# plt.savefig(\"num_cls{}.pdf\".format(icls))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5661ff16", + "metadata": {}, + "outputs": [], + "source": [ + "bins = np.linspace(0,500,100)\n", + "mplhep.histplot(np.histogram(ygen_f[ygen_f[:, 0]==2, 6], bins=bins))\n", + "mplhep.histplot(np.histogram(ycand_f[ycand_f[:, 0]==2, 6], bins=bins))\n", + "mplhep.histplot(np.histogram(ypred_f[ypred_f[:, 0]==2, 6], bins=bins))\n", + "plt.yscale(\"log\")" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "82f29ef8", + "metadata": {}, + "outputs": [], + "source": [ + "icls = 4\n", + "bins = np.linspace(-200,200,100)\n", + "particle_label = \"neutral hadrons\"\n", "\n", - " for ivar, ax in zip([2,3,4,5,6], axes[1:]):\n", - " \n", - "# hist = np.histogram2d(\n", - "# ycand_f[msk_both, ivar],\n", - "# ypred_f[msk_both, ivar], bins=(bins[ivar], bins[ivar])\n", - "# )\n", - "# norm = matplotlib.colors.Normalize(vmin=0, vmax=max(10, np.max(hist[0])))\n", - "# if ivar == 2 or ivar == 6:\n", - "# norm = matplotlib.colors.LogNorm(vmin=1, vmax=max(10, 10*np.max(hist[0])))\n", - "# hep.hist2dplot(\n", - "# hist, cmap=\"Blues\",\n", - "# norm=norm,\n", - "# ax=ax\n", - "# )\n", - " ax.scatter(ycand_f[msk_both, ivar], ypred_f[msk_both, ivar], marker=\".\", alpha=0.2)\n", - " ax.plot([bins[ivar][0],bins[ivar][-1]], [bins[ivar][0], bins[ivar][-1]], color=\"black\", ls=\"--\")\n", - " ax.set_title(\"pred. {}, {}\".format(pid_names[icls], var_names[ivar]))\n", - " ax.set_xlabel(\"true value (PFCandidate)\")\n", - " ax.set_ylabel(\"reconstructed value (MLPF)\")\n", - "plt.tight_layout()\n", - "plt.savefig(\"full_performance.png\", bbox_inches=\"tight\", dpi=400)" + "msk_cand = (ygen_f[:, 0]==icls) & (ycand_f[:, 0]==icls)\n", + "msk_pred = (ygen_f[:, 0]==icls) & (ypred_f[:, 0]==icls)\n", + "\n", + "vals_gen1 = ygen_f[msk_cand, 6]\n", + "vals_gen2 = ygen_f[msk_pred, 6]\n", + "vals_cand = ycand_f[msk_cand, 6]\n", + "vals_pred = ypred_f[msk_pred, 6]\n", + "\n", + "res_cand = vals_gen1 - vals_cand\n", + "res_pred = vals_gen2 - vals_pred\n", + "\n", + "plt.figure(figsize=(5,5))\n", + "ax = plt.axes()\n", + "plt.hist(\n", + " res_cand,\n", + " bins=bins, histtype=\"step\", lw=2,\n", + " label=\"PF, $\\mu={:.2f}, \\sigma={:.2f}$\".format(np.mean(res_cand), np.std(res_cand)));\n", + "\n", + "plt.hist(res_pred,\n", + " bins=bins,\n", + " histtype=\"step\", lw=2,\n", + " label=\"MLPF, $\\mu={:.2f}, \\sigma={:.2f}$\".format(np.mean(res_pred), np.std(res_pred))\n", + ");\n", + "\n", + "plt.yscale(\"log\")\n", + "plt.ylabel(\"Number of particles / bin\")\n", + "cms_label(x1=0.21, x2=0.55)\n", + "plt.ylim(top=10**9)\n", + "plt.text(0.02, 0.95, particle_label, transform=ax.transAxes)\n", + "plt.xlabel(\"particle $E_{\\mathrm{gen}} - E_{\\mathrm{reco}}$ [GeV]\")\n", + "plt.legend(frameon=False)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "338f50e9", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -603,7 +1138,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/notebooks/cmssw.ipynb b/notebooks/cmssw.ipynb index da95c7bd2..be3f9855c 100644 --- a/notebooks/cmssw.ipynb +++ b/notebooks/cmssw.ipynb @@ -231,7 +231,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -245,7 +245,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/notebooks/delphes-tf-mlpf-quickstart.ipynb b/notebooks/delphes-tf-mlpf-quickstart.ipynb index beed8fa92..24a81391b 100644 --- a/notebooks/delphes-tf-mlpf-quickstart.ipynb +++ b/notebooks/delphes-tf-mlpf-quickstart.ipynb @@ -2,7 +2,7 @@ "cells": [ { 
"cell_type": "markdown", - "id": "incredible-pressing", + "id": "damaged-gentleman", "metadata": {}, "source": [ "This quickstart notebook allows to test and mess around with the MLPF GNN model in a standalone way. For actual training, we don't use a notebook, please refer to `README.md`.\n", @@ -18,7 +18,7 @@ { "cell_type": "code", "execution_count": null, - "id": "comparative-stockholm", + "id": "happy-presence", "metadata": {}, "outputs": [], "source": [ @@ -26,13 +26,14 @@ "import numpy as np\n", "import tensorflow as tf\n", "import sklearn\n", + "import sklearn.metrics\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, - "id": "limited-prisoner", + "id": "gentle-prompt", "metadata": {}, "outputs": [], "source": [ @@ -43,19 +44,17 @@ { "cell_type": "code", "execution_count": null, - "id": "dominant-thumb", + "id": "imported-nightlife", "metadata": {}, "outputs": [], "source": [ - "import tfmodel\n", - "import tfmodel.model as mlpf_model\n", - "from tfmodel.model_setup import PFNetLoss" + "import tfmodel" ] }, { "cell_type": "code", "execution_count": null, - "id": "billion-rental", + "id": "attached-helen", "metadata": {}, "outputs": [], "source": [ @@ -65,7 +64,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fatal-residence", + "id": "enormous-merchant", "metadata": {}, "outputs": [], "source": [ @@ -75,7 +74,7 @@ { "cell_type": "code", "execution_count": null, - "id": "facial-screening", + "id": "cloudy-warren", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +85,7 @@ { "cell_type": "code", "execution_count": null, - "id": "charged-defense", + "id": "blessed-noise", "metadata": {}, "outputs": [], "source": [ @@ -112,19 +111,20 @@ { "cell_type": "code", "execution_count": null, - "id": "intellectual-trout", + "id": "upset-tractor", "metadata": {}, "outputs": [], "source": [ "#Get the first event\n", "input_classes = np.unique(X[:, :, 0].flatten())\n", - "output_classes = np.unique(y[:, :, 0].flatten())" + "output_classes = np.unique(y[:, :, 0].flatten())\n", + "num_output_classes = len(output_classes)" ] }, { "cell_type": "code", "execution_count": null, - "id": "optimum-automation", + "id": "hundred-cosmetic", "metadata": {}, "outputs": [], "source": [ @@ -134,7 +134,7 @@ { "cell_type": "code", "execution_count": null, - "id": "metropolitan-burton", + "id": "champion-institute", "metadata": {}, "outputs": [], "source": [ @@ -144,67 +144,167 @@ { "cell_type": "code", "execution_count": null, - "id": "systematic-aquarium", + "id": "previous-stranger", "metadata": {}, "outputs": [], "source": [ - "#ygen = (pid, charge, momentum values)\n", - "num_momentum_outputs = data[\"ygen\"][0].shape[1] - 2" + "def transform_target(y):\n", + " return {\n", + " \"cls\": tf.one_hot(tf.cast(y[:, :, 0], tf.int32), num_output_classes),\n", + " \"charge\": y[:, :, 1:2],\n", + " \"pt\": y[:, :, 2:3],\n", + " \"eta\": y[:, :, 3:4],\n", + " \"sin_phi\": y[:, :, 4:5],\n", + " \"cos_phi\": y[:, :, 5:6],\n", + " \"energy\": y[:, :, 6:7],\n", + " }\n", + "yt = transform_target(y)" ] }, { "cell_type": "code", "execution_count": null, - "id": "plain-flooring", + "id": "nasty-staff", "metadata": {}, "outputs": [], "source": [ - "model = mlpf_model.PFNet(\n", + "from tfmodel.model import PFNetDense" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "optical-trinity", + "metadata": {}, + "outputs": [], + "source": [ + "msk_true_particle = y[:, :, 0]!=0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"pleasant-textbook", + "metadata": {}, + "outputs": [], + "source": [ + "np.unique(y[msk_true_particle][:, 0], return_counts=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acute-southwest", + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(yt[\"pt\"][msk_true_particle].flatten(), bins=100);\n", + "plt.xlabel(\"pt\")\n", + "plt.yscale(\"log\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "elementary-hepatitis", + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(yt[\"eta\"][msk_true_particle].flatten(), bins=100);\n", + "plt.xlabel(\"eta\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "white-enhancement", + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(yt[\"sin_phi\"][msk_true_particle].flatten(), bins=100);\n", + "plt.xlabel(\"sin phi\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "appointed-alberta", + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(yt[\"cos_phi\"][msk_true_particle].flatten(), bins=100);\n", + "plt.xlabel(\"cos phi\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "variable-appointment", + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(yt[\"energy\"][msk_true_particle].flatten(), bins=100);\n", + "plt.xlabel(\"energy\")\n", + "plt.yscale(\"log\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "steady-stock", + "metadata": {}, + "outputs": [], + "source": [ + "model = PFNetDense(\n", " num_input_classes=len(input_classes),\n", " num_output_classes=len(output_classes),\n", - " num_momentum_outputs=num_momentum_outputs,\n", " activation=tf.nn.elu,\n", + " hidden_dim=128,\n", " bin_size=128,\n", - " num_neighbors=16\n", + " input_encoding=\"default\",\n", + " multi_output=True\n", ")\n", "\n", - "#combined multiclass + regression loss\n", - "loss = PFNetLoss(\n", - " num_input_classes=len(input_classes),\n", - " num_output_classes=len(output_classes),\n", - " \n", - " #(pt, eta, sin phi, cos phi, E)\n", - " momentum_loss_coefs=[0.001, 1.0, 1.0, 1.0, 0.001]\n", - ")\n", - "\n", - "#temporal weight mode means each input element in the event can get a separate weight\n", - "model.compile(loss=loss.my_loss_full, optimizer=\"adam\", sample_weight_mode=\"temporal\")" + "# #temporal weight mode means each input element in the event can get a separate weight\n", + "model.compile(\n", + " loss={\n", + " \"cls\": tf.keras.losses.CategoricalCrossentropy(from_logits=False),\n", + " \"charge\": tf.keras.losses.MeanSquaredError(),\n", + " \"pt\": tf.keras.losses.MeanSquaredError(),\n", + " \"energy\": tf.keras.losses.MeanSquaredError(),\n", + " \"eta\": tf.keras.losses.MeanSquaredError(),\n", + " \"sin_phi\": tf.keras.losses.MeanSquaredError(),\n", + " \"cos_phi\": tf.keras.losses.MeanSquaredError()\n", + " },\n", + " optimizer=\"adam\",\n", + " sample_weight_mode=\"temporal\"\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "patient-rating", + "id": "explicit-friendship", "metadata": {}, "outputs": [], "source": [ - "X.shape, y.shape" + "model(X[:1])" ] }, { "cell_type": "code", "execution_count": null, - "id": "egyptian-working", + "id": "interim-consciousness", "metadata": {}, "outputs": [], "source": [ - "history = model.fit(X[:80], y[:80], validation_data=(X[80:], y[80:]), batch_size=5, epochs=10)" + "model.fit(X, yt, epochs=2, batch_size=5)" ] }, { "cell_type": "code", "execution_count": null, - "id": "shaped-bryan", + "id": 
"healthy-constraint", "metadata": {}, "outputs": [], "source": [ @@ -214,55 +314,48 @@ { "cell_type": "code", "execution_count": null, - "id": "passive-sitting", + "id": "annoying-fleet", "metadata": {}, "outputs": [], "source": [ "#index of the class prediction output values\n", "pred_id_offset = len(output_classes)\n", - "ypred_ids_raw = ypred[:, :, :pred_id_offset]\n", - "ypred_charge = ypred[:, :, pred_id_offset:pred_id_offset+1]\n", - "ypred_momentum = ypred[:, :, pred_id_offset+1:]" + "ypred_ids_raw = ypred[\"cls\"]" ] }, { "cell_type": "code", "execution_count": null, - "id": "virtual-reflection", + "id": "filled-suspension", "metadata": {}, "outputs": [], "source": [ "sklearn.metrics.confusion_matrix(\n", " np.argmax(ypred_ids_raw, axis=-1).flatten(),\n", - " y[:, :, 0].flatten(), labels=output_classes\n", + " np.argmax(yt[\"cls\"], axis=-1).flatten(), labels=output_classes\n", ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "distinct-sierra", + "id": "valued-better", "metadata": {}, "outputs": [], "source": [ - "#compare the predicted and true charge\n", - "np.stack([ypred_charge[:, :, 0].flatten(), y[:, :, 1].flatten()]).T" + "msk_particles = (X[:, :, 0]!=0)\n", + "plt.scatter(\n", + " ypred[\"eta\"][msk_particles].flatten(),\n", + " yt[\"eta\"][msk_particles].flatten(), marker=\".\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "split-detail", + "id": "spiritual-fancy", "metadata": {}, "outputs": [], - "source": [ - "#first two values in the y array are ID anc charge\n", - "momentum_offset = 2\n", - "\n", - "#momentum eta component\n", - "imomentum = 1\n", - "plt.scatter(ypred_momentum[:, :, imomentum].flatten(), y[:, :, imomentum+momentum_offset].flatten(), marker=\".\")" - ] + "source": [] } ], "metadata": { @@ -281,7 +374,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/notebooks/delphes_model_analysis.ipynb b/notebooks/delphes_model_analysis.ipynb index 8eb936c72..bb5c406a7 100644 --- a/notebooks/delphes_model_analysis.ipynb +++ b/notebooks/delphes_model_analysis.ipynb @@ -415,8 +415,11 @@ "metadata": {}, "source": [ "Once the training is done, we can generate the pred.npz file using the following:\n", + "\n", "```bash\n", - "singularity exec --nv ~/HEP-KBFI/singularity/base.simg python3 ../mlpf/tensorflow/delphes_model.py --action validate --weights weights-300-*.hdf5\n", + "singularity exec --nv ~/HEP-KBFI/singularity/base.simg python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t experiments/delphes_20210821_160504.joosep-desktop -e experiments/delphes_20210821_160504.joosep-desktop/evaluation_ttbar -v \"data/pythia8_ttbar/val/tev14_pythia8_ttbar_*.pkl.bz2\"\n", + "\n", + "singularity exec --nv ~/HEP-KBFI/singularity/base.simg python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t experiments/delphes_20210821_160504.joosep-desktop -e experiments/delphes_20210821_160504.joosep-desktop/evaluation_qcd -v \"data/pythia8_qcd/val/tev14_pythia8_qcd_*.pkl.bz2\"\n", "```" ] }, @@ -426,6 +429,40 @@ "metadata": {}, "outputs": [], "source": [ + "def load_many_preds(path):\n", + " Xs = []\n", + " ygens = []\n", + " ycands = []\n", + " ypreds = []\n", + "\n", + " for fi in glob.glob(path):\n", + " dd = np.load(fi)\n", + " Xs.append(dd[\"X\"])\n", + " ygens.append(dd[\"ygen\"])\n", + " ycands.append(dd[\"ycand\"])\n", + " ypreds.append(dd[\"ypred\"])\n", + " \n", + " X = np.concatenate(Xs)\n", + " msk_X = X[:, :, 0]!=0\n", + 
"\n", + " ygen = np.concatenate(ygens)\n", + " ycand = np.concatenate(ycands)\n", + " ypred = np.concatenate(ypreds)\n", + "\n", + " return X, ygen, ycand, ypred\n", + "\n", + "# For current model\n", + "# X_ttbar, ygen_ttbar, ycand_ttbar, ypred_ttbar = load_many_preds(\"../experiments/delphes_20210821_160504.joosep-desktop/evaluation_ttbar/*.npz\")\n", + "# X, ygen, ycand, ypred = load_many_preds(\"../experiments/delphes_20210821_160504.joosep-desktop/evaluation_qcd/*.npz\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# For the model from the paper\n", "#Load the predictions file from the model (this can take a while, as the file is compressed and pretty large)\n", "fi_qcd = np.load(open(\"pred_qcd.npz\", \"rb\"))\n", "fi_ttbar = np.load(open(\"pred_ttbar.npz\", \"rb\"))\n", @@ -458,18 +495,18 @@ "outputs": [], "source": [ "#Flatten the events\n", - "ygen = flatten(ygen)\n", - "ycand = flatten(ycand)\n", - "ypred = flatten(ypred)\n", - "X = flatten(X)\n", - "msk_X = X[:, 0] != 0\n", + "ygen_f = flatten(ygen)\n", + "ycand_f = flatten(ycand)\n", + "ypred_f = flatten(ypred)\n", + "X_f = flatten(X)\n", + "msk_X_f = X_f[:, 0] != 0\n", "\n", "#Flatten the events\n", - "ygen_ttbar = flatten(ygen_ttbar)\n", - "ycand_ttbar = flatten(ycand_ttbar)\n", - "ypred_ttbar = flatten(ypred_ttbar)\n", - "X_ttbar = flatten(X_ttbar)\n", - "msk_X_ttbar = X[:, 0] != 0" + "ygen_ttbar_f = flatten(ygen_ttbar)\n", + "ycand_ttbar_f = flatten(ycand_ttbar)\n", + "ypred_ttbar_f = flatten(ypred_ttbar)\n", + "X_ttbar_f = flatten(X_ttbar)\n", + "msk_X_ttbar_f = X_ttbar_f[:, 0] != 0" ] }, { @@ -478,9 +515,13 @@ "metadata": {}, "outputs": [], "source": [ - "print(ygen.shape)\n", - "print(ycand.shape)\n", - "print(ypred.shape)" + "print(ygen_f.shape)\n", + "print(ycand_f.shape)\n", + "print(ypred_f.shape)\n", + "\n", + "print(ygen_ttbar_f.shape)\n", + "print(ycand_ttbar_f.shape)\n", + "print(ypred_ttbar_f.shape)" ] }, { @@ -492,17 +533,17 @@ "def plot_pt_eta(ygen, legend_title=\"\"):\n", " b = np.linspace(0, 100, 41)\n", "\n", - " msk_pid1 = (ygen[:, 0]==1)\n", - " msk_pid2 = (ygen[:, 0]==2)\n", - " msk_pid3 = (ygen[:, 0]==3)\n", - " msk_pid4 = (ygen[:, 0]==4)\n", - " msk_pid5 = (ygen[:, 0]==5)\n", + " msk_pid1 = (ygen_f[:, 0]==1)\n", + " msk_pid2 = (ygen_f[:, 0]==2)\n", + " msk_pid3 = (ygen_f[:, 0]==3)\n", + " msk_pid4 = (ygen_f[:, 0]==4)\n", + " msk_pid5 = (ygen_f[:, 0]==5)\n", "\n", - " h1 = np.histogram(ygen[msk_pid1, 2], bins=b)\n", - " h2 = np.histogram(ygen[msk_pid2, 2], bins=b)\n", - " h3 = np.histogram(ygen[msk_pid3, 2], bins=b)\n", - " h4 = np.histogram(ygen[msk_pid4, 2], bins=b)\n", - " h5 = np.histogram(ygen[msk_pid5, 2], bins=b)\n", + " h1 = np.histogram(ygen_f[msk_pid1, 2], bins=b)\n", + " h2 = np.histogram(ygen_f[msk_pid2, 2], bins=b)\n", + " h3 = np.histogram(ygen_f[msk_pid3, 2], bins=b)\n", + " h4 = np.histogram(ygen_f[msk_pid4, 2], bins=b)\n", + " h5 = np.histogram(ygen_f[msk_pid5, 2], bins=b)\n", "\n", " fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8))\n", "\n", @@ -519,11 +560,11 @@ " ax1.set_ylabel(\"Truth particles\")\n", "\n", " b = np.linspace(-8, 8, 41)\n", - " h1 = np.histogram(ygen[msk_pid1, 3], bins=b)\n", - " h2 = np.histogram(ygen[msk_pid2, 3], bins=b)\n", - " h3 = np.histogram(ygen[msk_pid3, 3], bins=b)\n", - " h4 = np.histogram(ygen[msk_pid4, 3], bins=b)\n", - " h5 = np.histogram(ygen[msk_pid5, 3], bins=b)\n", + " h1 = np.histogram(ygen_f[msk_pid1, 3], bins=b)\n", + " h2 = np.histogram(ygen_f[msk_pid2, 3], bins=b)\n", 
+ " h3 = np.histogram(ygen_f[msk_pid3, 3], bins=b)\n", + " h4 = np.histogram(ygen_f[msk_pid4, 3], bins=b)\n", + " h5 = np.histogram(ygen_f[msk_pid5, 3], bins=b)\n", " xs = midpoints(h1[1])\n", " width = np.diff(h1[1])\n", "\n", @@ -628,20 +669,20 @@ "metadata": {}, "outputs": [], "source": [ - "def plot_num_particles_pid(fi, pid=0, ax=None, legend_title=\"\"):\n", + "def plot_num_particles_pid(ygen, ycand, ypred, pid=0, ax=None, legend_title=\"\"):\n", " if not ax:\n", " plt.figure(figsize=(4,4))\n", " ax = plt.axes()\n", " \n", " #compute the number of particles per event\n", " if pid == 0:\n", - " x1 = np.sum(fi[\"ygen\"][:, :, 0]!=pid, axis=1)\n", - " x2 = np.sum(fi[\"ypred\"][:, :, 0]!=pid, axis=1)\n", - " x3 = np.sum(fi[\"ycand\"][:, :, 0]!=pid, axis=1)\n", + " x1 = np.sum(ygen[:, :, 0]!=pid, axis=1)\n", + " x2 = np.sum(ypred[:, :, 0]!=pid, axis=1)\n", + " x3 = np.sum(ycand[:, :, 0]!=pid, axis=1)\n", " else:\n", - " x1 = np.sum(fi[\"ygen\"][:, :, 0]==pid, axis=1)\n", - " x2 = np.sum(fi[\"ypred\"][:, :, 0]==pid, axis=1)\n", - " x3 = np.sum(fi[\"ycand\"][:, :, 0]==pid, axis=1)\n", + " x1 = np.sum(ygen[:, :, 0]==pid, axis=1)\n", + " x2 = np.sum(ypred[:, :, 0]==pid, axis=1)\n", + " x3 = np.sum(ycand[:, :, 0]==pid, axis=1)\n", " \n", " v0 = np.min([np.min(x1), np.min(x2), np.min(x3)])\n", " v1 = np.max([np.max(x1), np.max(x2), np.max(x3)])\n", @@ -693,8 +734,8 @@ " \"x1\": x1, \"x2\": x2, \"x3\": x3}\n", "\n", "fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8))\n", - "ret_num_particles_ch_had = plot_num_particles_pid(fi_qcd, 1, ax1, legend_title=sample_title_qcd+\"\\n\")\n", - "ret_num_particles_n_had = plot_num_particles_pid(fi_qcd, 2, ax2, legend_title=sample_title_qcd+\"\\n\")\n", + "ret_num_particles_ch_had = plot_num_particles_pid(ygen, ycand, ypred, 1, ax1, legend_title=sample_title_qcd+\"\\n\")\n", + "ret_num_particles_n_had = plot_num_particles_pid(ygen, ycand, ypred, 2, ax2, legend_title=sample_title_qcd+\"\\n\")\n", "#sample_string_qcd(ax1)\n", "plt.tight_layout()\n", "plt.savefig(\"plots/num_particles.pdf\", bbox_inches=\"tight\")\n", @@ -710,8 +751,8 @@ "outputs": [], "source": [ "fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8))\n", - "ret_num_particles_ch_had_ttbar = plot_num_particles_pid(fi_ttbar, 1, ax1)\n", - "ret_num_particles_n_had_ttbar = plot_num_particles_pid(fi_ttbar, 2, ax2)\n", + "ret_num_particles_ch_had_ttbar = plot_num_particles_pid(ygen_ttbar, ycand_ttbar, ypred_ttbar, 1, ax1)\n", + "ret_num_particles_n_had_ttbar = plot_num_particles_pid(ygen_ttbar, ycand_ttbar, ypred_ttbar, 2, ax2)\n", "sample_string_ttbar(ax1)\n", "plt.tight_layout()\n", "plt.savefig(\"plots/num_particles_ttbar.pdf\", bbox_inches=\"tight\")\n", @@ -755,13 +796,13 @@ "def draw_efficiency_fakerate(ygen, ypred, ycand, pid, var, bins, both=True, legend_title=\"\"):\n", " var_idx = var_indices[var]\n", "\n", - " msk_gen = ygen[:, 0]==pid\n", - " msk_pred = ypred[:, 0]==pid\n", - " msk_cand = ycand[:, 0]==pid\n", + " msk_gen = ygen_f[:, 0]==pid\n", + " msk_pred = ypred_f[:, 0]==pid\n", + " msk_cand = ycand_f[:, 0]==pid\n", "\n", - " hist_gen = np.histogram(ygen[msk_gen, var_idx], bins=bins);\n", - " hist_cand = np.histogram(ygen[msk_gen & msk_cand, var_idx], bins=bins);\n", - " hist_pred = np.histogram(ygen[msk_gen & msk_pred, var_idx], bins=bins);\n", + " hist_gen = np.histogram(ygen_f[msk_gen, var_idx], bins=bins);\n", + " hist_cand = np.histogram(ygen_f[msk_gen & msk_cand, var_idx], bins=bins);\n", + " hist_pred = np.histogram(ygen_f[msk_gen & msk_pred, var_idx], bins=bins);\n", " \n", 
" hist_gen = mask_empty(hist_gen)\n", " hist_cand = mask_empty(hist_cand)\n", @@ -790,10 +831,10 @@ " ax1.set_xlabel(var_names[var])\n", " ax1.set_ylabel(\"Efficiency\")\n", "\n", - " hist_cand2 = np.histogram(ygen[msk_cand & (ygen[:, 0]!=0), var_idx], bins=bins);\n", - " hist_pred2 = np.histogram(ygen[msk_pred & (ygen[:, 0]!=0), var_idx], bins=bins);\n", - " hist_cand_gen2 = np.histogram(ygen[msk_cand & ~msk_gen & (ygen[:, 0]!=0), var_idx], bins=bins);\n", - " hist_pred_gen2 = np.histogram(ygen[msk_pred & ~msk_gen & (ygen[:, 0]!=0), var_idx], bins=bins);\n", + " hist_cand2 = np.histogram(ygen_f[msk_cand & (ygen_f[:, 0]!=0), var_idx], bins=bins);\n", + " hist_pred2 = np.histogram(ygen_f[msk_pred & (ygen_f[:, 0]!=0), var_idx], bins=bins);\n", + " hist_cand_gen2 = np.histogram(ygen_f[msk_cand & ~msk_gen & (ygen_f[:, 0]!=0), var_idx], bins=bins);\n", + " hist_pred_gen2 = np.histogram(ygen_f[msk_pred & ~msk_gen & (ygen_f[:, 0]!=0), var_idx], bins=bins);\n", "\n", " hist_cand2 = mask_empty(hist_cand2)\n", " hist_cand_gen2 = mask_empty(hist_cand_gen2)\n", @@ -878,7 +919,7 @@ "metadata": {}, "outputs": [], "source": [ - "ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, \"pt\", np.linspace(0, 3, 61), both=False, legend_title=sample_title_qcd+\"\\n\")\n", + "ax, _ = draw_efficiency_fakerate(ygen_f, ypred_f, ycand_f, 1, \"pt\", np.linspace(0, 3, 61), both=False, legend_title=sample_title_qcd+\"\\n\")\n", "#sample_string_qcd(ax)\n", "plt.savefig(\"plots/eff_fake_pid1_pt.pdf\", bbox_inches=\"tight\")\n", "PDF(\"plots/eff_fake_pid1_pt.pdf\", size=(300,300))" @@ -890,7 +931,7 @@ "metadata": {}, "outputs": [], "source": [ - "ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, \"eta\", np.linspace(-3, 3, 61), both=False, legend_title=sample_title_qcd+\"\\n\")\n", + "ax, _ = draw_efficiency_fakerate(ygen_f, ypred_f, ycand_f, 1, \"eta\", np.linspace(-3, 3, 61), both=False, legend_title=sample_title_qcd+\"\\n\")\n", "#sample_string_qcd(ax)\n", "plt.savefig(\"plots/eff_fake_pid1_eta.pdf\", bbox_inches=\"tight\")\n", "PDF(\"plots/eff_fake_pid1_eta.pdf\", size=(300,300))" @@ -903,7 +944,7 @@ "outputs": [], "source": [ "ax, _ = draw_efficiency_fakerate(\n", - " ygen, ypred, ycand,\n", + " ygen_f, ypred_f, ycand_f,\n", " 2, \"energy\", np.linspace(5, 205, 61), legend_title=sample_title_qcd+\"\\n\")\n", "#sample_string_qcd(ax)\n", "plt.savefig(\"plots/eff_fake_pid2_energy.pdf\", bbox_inches=\"tight\")\n", @@ -917,7 +958,7 @@ "outputs": [], "source": [ "ax, _ = draw_efficiency_fakerate(\n", - " ygen_ttbar, ypred_ttbar, ycand_ttbar,\n", + " ygen_ttbar_f, ypred_ttbar_f, ycand_ttbar_f,\n", " 2, \"energy\", np.linspace(5, 205, 61), legend_title=sample_title_ttbar+\"\\n\")\n", "#sample_string_ttbar(ax)\n", "plt.savefig(\"plots/eff_fake_pid2_energy_ttbar.pdf\", bbox_inches=\"tight\")\n", @@ -931,7 +972,7 @@ "outputs": [], "source": [ "ax, _ = draw_efficiency_fakerate(\n", - " ygen, ypred, ycand,\n", + " ygen_f, ypred_f, ycand_f,\n", " 2, \"eta\", np.linspace(-6, 6, 61), legend_title=sample_title_qcd+\"\\n\"\n", ")\n", "#sample_string_qcd(ax)\n", @@ -946,7 +987,7 @@ "outputs": [], "source": [ "ax, _ = draw_efficiency_fakerate(\n", - " ygen, ypred, ycand,\n", + " ygen_f, ypred_f, ycand_f,\n", " 3, \"eta\", np.linspace(-6, 6, 61), legend_title=sample_title_qcd+\"\\n\"\n", ")\n", "#sample_string_qcd(ax)\n", @@ -961,7 +1002,7 @@ "outputs": [], "source": [ "ax, _ = draw_efficiency_fakerate(\n", - " ygen, ypred, ycand,\n", + " ygen_f, ypred_f, ycand_f,\n", " 4, \"eta\", np.linspace(-6, 6, 61), 
legend_title=sample_title_qcd+\"\\n\"\n", ")\n", "#sample_string_qcd(ax)\n", @@ -976,7 +1017,7 @@ "outputs": [], "source": [ "ax, _ = draw_efficiency_fakerate(\n", - " ygen, ypred, ycand,\n", + " ygen_f, ypred_f, ycand_f,\n", " 5, \"eta\", np.linspace(-6, 6, 61), legend_title=sample_title_qcd+\"\\n\"\n", ")\n", "#sample_string_qcd(ax)\n", @@ -1042,8 +1083,8 @@ "source": [ "fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8))\n", "\n", - "res_ch_had_pt = plot_reso(ygen, ypred, ycand, 1, \"pt\", 2, ax=ax1, legend_title=sample_title_qcd+\"\\n\")\n", - "res_ch_had_eta = plot_reso(ygen, ypred, ycand, 1, \"eta\", 0.2, ax=ax2, legend_title=sample_title_qcd+\"\\n\")\n", + "res_ch_had_pt = plot_reso(ygen_f, ypred_f, ycand_f, 1, \"pt\", 2, ax=ax1, legend_title=sample_title_qcd+\"\\n\")\n", + "res_ch_had_eta = plot_reso(ygen_f, ypred_f, ycand_f, 1, \"eta\", 0.2, ax=ax2, legend_title=sample_title_qcd+\"\\n\")\n", "\n", "ax1.set_ylim(100, 10**11)\n", "ax2.set_ylim(100, 10**11)\n", @@ -1061,8 +1102,8 @@ "source": [ "fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8))\n", "\n", - "res_n_had_e = plot_reso(ygen, ypred, ycand, 2, \"energy\", 5, ax=ax1, legend_title=sample_title_qcd+\"\\n\")\n", - "res_n_had_eta = plot_reso(ygen, ypred, ycand, 2, \"eta\", 0.5, ax=ax2, legend_title=sample_title_qcd+\"\\n\")\n", + "res_n_had_e = plot_reso(ygen_f, ypred_f, ycand_f, 2, \"energy\", 5, ax=ax1, legend_title=sample_title_qcd+\"\\n\")\n", + "res_n_had_eta = plot_reso(ygen_f, ypred_f, ycand_f, 2, \"eta\", 0.5, ax=ax2, legend_title=sample_title_qcd+\"\\n\")\n", "\n", "#ax1.set_title(\"Neutral hadrons\")\n", "#sample_string_qcd(ax1)\n", @@ -1081,8 +1122,8 @@ "source": [ "fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8))\n", "\n", - "plot_reso(ygen_ttbar, ypred_ttbar, ycand_ttbar, 2, \"energy\", 5, ax=ax1, legend_title=sample_title_ttbar+\"\\n\")\n", - "plot_reso(ygen_ttbar, ypred_ttbar, ycand_ttbar, 2, \"eta\", 0.5, ax=ax2, legend_title=sample_title_ttbar+\"\\n\")\n", + "plot_reso(ygen_ttbar_f, ypred_ttbar_f, ycand_ttbar_f, 2, \"energy\", 5, ax=ax1, legend_title=sample_title_ttbar+\"\\n\")\n", + "plot_reso(ygen_ttbar_f, ypred_ttbar_f, ycand_ttbar_f, 2, \"eta\", 0.5, ax=ax2, legend_title=sample_title_ttbar+\"\\n\")\n", "\n", "#ax1.set_title(\"Neutral hadrons\")\n", "#sample_string_ttbar(ax1)\n", @@ -1106,20 +1147,20 @@ "outputs": [], "source": [ "confusion = sklearn.metrics.confusion_matrix(\n", - " ygen[msk_X, 0], ycand[msk_X, 0], normalize=\"true\"\n", + " ygen_f[msk_X, 0], ycand_f[msk_X, 0], normalize=\"true\"\n", ")\n", "\n", "confusion2 = sklearn.metrics.confusion_matrix(\n", - " ygen[msk_X, 0], ypred[msk_X, 0], normalize=\"true\"\n", + " ygen_f[msk_X, 0], ypred_f[msk_X, 0], normalize=\"true\"\n", ")\n", "\n", "\n", "confusion_unnorm = sklearn.metrics.confusion_matrix(\n", - " ygen[msk_X, 0], ycand[msk_X, 0],\n", + " ygen_f[msk_X, 0], ycand_f[msk_X, 0],\n", ")\n", "\n", "confusion2_unnorm = sklearn.metrics.confusion_matrix(\n", - " ygen[msk_X, 0], ypred[msk_X, 0],\n", + " ygen_f[msk_X, 0], ypred_f[msk_X, 0],\n", ")" ] }, @@ -1147,7 +1188,7 @@ "metadata": {}, "outputs": [], "source": [ - "sklearn.metrics.accuracy_score(ygen[msk_X, 0], ycand[msk_X, 0])" + "sklearn.metrics.accuracy_score(ygen_f[msk_X, 0], ycand_f[msk_X, 0])" ] }, { @@ -1156,7 +1197,7 @@ "metadata": {}, "outputs": [], "source": [ - "sklearn.metrics.accuracy_score(ygen[msk_X, 0], ypred[msk_X, 0])" + "sklearn.metrics.accuracy_score(ygen_f[msk_X, 0], ypred_f[msk_X, 0])" ] }, { @@ -1259,11 +1300,11 @@ "\n", "axes = axes.flatten()\n", "for iax, i 
in enumerate([1,2,3,4,5]):\n", - " axes[iax].hist(ypred[ypred[:, 0]==i, 2], bins=b, histtype=\"step\", lw=2, color=\"red\", label=\"QCD MLPF\");\n", - " axes[iax].hist(ygen[ygen[:, 0]==i, 2], bins=b, histtype=\"step\", lw=1, color=\"red\", ls=\"--\", label=\"QCD truth\");\n", + " axes[iax].hist(ypred_f[ypred_f[:, 0]==i, 2], bins=b, histtype=\"step\", lw=2, color=\"red\", label=\"QCD MLPF\");\n", + " axes[iax].hist(ygen_f[ygen_f[:, 0]==i, 2], bins=b, histtype=\"step\", lw=1, color=\"red\", ls=\"--\", label=\"QCD truth\");\n", " #axes[iax].hist(ycand[ycand[:, 0]==i, 2], bins=b, histtype=\"step\", lw=1, color=\"pink\", ls=\"-\", label=\"QCD PF\");\n", - " axes[iax].hist(ypred_ttbar[ypred_ttbar[:, 0]==i, 2], bins=b, histtype=\"step\", lw=2, color=\"blue\", label=r\"$t\\bar{t}$ MLPF\");\n", - " axes[iax].hist(ygen_ttbar[ygen_ttbar[:, 0]==i, 2], bins=b, histtype=\"step\", lw=1, color=\"blue\", ls=\"--\", label=r\"$t\\bar{t}$ truth\");\n", + " axes[iax].hist(ypred_ttbar_f[ypred_ttbar_f[:, 0]==i, 2], bins=b, histtype=\"step\", lw=2, color=\"blue\", label=r\"$t\\bar{t}$ MLPF\");\n", + " axes[iax].hist(ygen_ttbar_f[ygen_ttbar_f[:, 0]==i, 2], bins=b, histtype=\"step\", lw=1, color=\"blue\", ls=\"--\", label=r\"$t\\bar{t}$ truth\");\n", " #axes[iax].hist(ycand_ttbar[ycand_ttbar[:, 0]==i, 2], bins=b, histtype=\"step\", lw=1, color=\"cyan\", ls=\"-\", label=r\"$t\\bar{t}$ PF\");\n", " axes[iax].set_yscale(\"log\")\n", " axes[iax].legend(ncol=2)\n", @@ -1288,11 +1329,11 @@ "\n", "axes = axes.flatten()\n", "for iax, i in enumerate([1,2,3,4,5]):\n", - " axes[iax].hist(ypred[ypred[:, 0]==i, 6], bins=b, histtype=\"step\", lw=2, color=\"red\", label=\"QCD MLPF\");\n", - " axes[iax].hist(ygen[ygen[:, 0]==i, 6], bins=b, histtype=\"step\", lw=1, color=\"red\", ls=\"--\", label=\"QCD truth\");\n", + " axes[iax].hist(ypred_f[ypred_f[:, 0]==i, 6], bins=b, histtype=\"step\", lw=2, color=\"red\", label=\"QCD MLPF\");\n", + " axes[iax].hist(ygen_f[ygen_f[:, 0]==i, 6], bins=b, histtype=\"step\", lw=1, color=\"red\", ls=\"--\", label=\"QCD truth\");\n", " #axes[iax].hist(ycand[ycand[:, 0]==i, 6], bins=b, histtype=\"step\", lw=1, color=\"pink\", ls=\"-\", label=\"QCD PF\");\n", - " axes[iax].hist(ypred_ttbar[ypred_ttbar[:, 0]==i, 6], bins=b, histtype=\"step\", lw=2, color=\"blue\", label=r\"$t\\bar{t}$ MLPF\");\n", - " axes[iax].hist(ygen_ttbar[ygen_ttbar[:, 0]==i, 6], bins=b, histtype=\"step\", lw=1, color=\"blue\", ls=\"--\", label=r\"$t\\bar{t}$ truth\");\n", + " axes[iax].hist(ypred_ttbar_f[ypred_ttbar_f[:, 0]==i, 6], bins=b, histtype=\"step\", lw=2, color=\"blue\", label=r\"$t\\bar{t}$ MLPF\");\n", + " axes[iax].hist(ygen_ttbar_f[ygen_ttbar_f[:, 0]==i, 6], bins=b, histtype=\"step\", lw=1, color=\"blue\", ls=\"--\", label=r\"$t\\bar{t}$ truth\");\n", " #axes[iax].hist(ycand_ttbar[ycand_ttbar[:, 0]==i, 6], bins=b, histtype=\"step\", lw=1, color=\"cyan\", ls=\"-\", label=r\"$t\\bar{t}$ PF\");\n", " axes[iax].set_yscale(\"log\")\n", " axes[iax].legend(ncol=2)\n", @@ -1405,9 +1446,9 @@ "metadata": {}, "outputs": [], "source": [ - "msk_pid_gen = ygen[:, 0]==1\n", - "msk_pid_cand = ycand[:, 0]==1\n", - "msk_pid_pred = ypred[:, 0]==1" + "msk_pid_gen = ygen_f[:, 0]==1\n", + "msk_pid_cand = ycand_f[:, 0]==1\n", + "msk_pid_pred = ypred_f[:, 0]==1" ] }, { @@ -1416,7 +1457,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.unique(ycand[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 0], return_counts=True)" + "np.unique(ycand_f[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 0], return_counts=True)" ] }, { @@ -1443,7 +1484,7 
@@ "metadata": {}, "outputs": [], "source": [ - "np.unique(X[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 0], return_counts=True)" + "np.unique(X_f[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 0], return_counts=True)" ] }, { @@ -1452,8 +1493,8 @@ "metadata": {}, "outputs": [], "source": [ - "plt.hist(X[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 1], bins=np.linspace(0,5,100), density=True, histtype=\"step\", label=\"MLPF charged hadron, RBPF no charged hadron\");\n", - "plt.hist(X[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 1], bins=np.linspace(0,5,100), density=True, histtype=\"step\", label=\"MLPF & RBPF charged hadron\");\n", + "plt.hist(X_f[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 1], bins=np.linspace(0,5,100), density=True, histtype=\"step\", label=\"MLPF charged hadron, RBPF no charged hadron\");\n", + "plt.hist(X_f[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 1], bins=np.linspace(0,5,100), density=True, histtype=\"step\", label=\"MLPF & RBPF charged hadron\");\n", "plt.legend()\n", "plt.xlabel(\"track pT\")" ] @@ -1464,9 +1505,8 @@ "metadata": {}, "outputs": [], "source": [ - "plt.hist(X[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 2], bins=np.linspace(-3,3,100), density=True, histtype=\"step\");\n", - "plt.hist(X[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 2], bins=np.linspace(-3,3,100), density=True, histtype=\"step\");\n", - "plt.legend()\n", + "plt.hist(X_f[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 2], bins=np.linspace(-3,3,100), density=True, histtype=\"step\");\n", + "plt.hist(X_f[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 2], bins=np.linspace(-3,3,100), density=True, histtype=\"step\");\n", "plt.xlabel(\"track eta\")" ] }, @@ -1476,9 +1516,8 @@ "metadata": {}, "outputs": [], "source": [ - "plt.hist(X[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 5], bins=np.linspace(0, 10, 100), density=True, histtype=\"step\");\n", - "plt.hist(X[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 5], bins=np.linspace(0, 10, 100), density=True, histtype=\"step\");\n", - "plt.legend()\n", + "plt.hist(X_f[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 5], bins=np.linspace(0, 10, 100), density=True, histtype=\"step\");\n", + "plt.hist(X_f[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 5], bins=np.linspace(0, 10, 100), density=True, histtype=\"step\");\n", "plt.xlabel(\"track energy\")" ] }, @@ -1488,8 +1527,8 @@ "metadata": {}, "outputs": [], "source": [ - "a = X[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 2]\n", - "b = ycand[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 3]" + "a = X_f[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 2]\n", + "b = ycand_f[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 3]" ] }, { diff --git a/notebooks/pfnet-debug.ipynb b/notebooks/pfnet-debug.ipynb index 26cc46ee6..7cb5864fe 100644 --- a/notebooks/pfnet-debug.ipynb +++ b/notebooks/pfnet-debug.ipynb @@ -3,6 +3,7 @@ { "cell_type": "code", "execution_count": null, + "id": "solved-relations", "metadata": {}, "outputs": [], "source": [ @@ -30,10 +31,11 @@ { "cell_type": "code", "execution_count": null, + "id": "unavailable-applicant", "metadata": {}, "outputs": [], "source": [ - "with open(\"/home/joosep/particleflow/parameters/cms-gnn-dense.yaml\") as f:\n", + "with open(\"/home/joosep/particleflow/parameters/cms.yaml\") as f:\n", " config = yaml.load(f)\n", "config[\"setup\"][\"multi_output\"] = True\n", "config[\"parameters\"][\"debug\"] = True" @@ -42,6 +44,7 @@ { "cell_type": "code", "execution_count": null, + "id": "becoming-district", "metadata": {}, "outputs": [], 
"source": [ @@ -51,6 +54,7 @@ { "cell_type": "code", "execution_count": null, + "id": "exact-landing", "metadata": {}, "outputs": [], "source": [ @@ -73,6 +77,7 @@ { "cell_type": "code", "execution_count": null, + "id": "identified-header", "metadata": {}, "outputs": [], "source": [ @@ -80,7 +85,7 @@ "ygens = []\n", "ycands = []\n", "\n", - "for fi in dataset_def.val_filelist[:1]:\n", + "for fi in dataset_def.val_filelist[:2]:\n", " print(fi)\n", " X, ygen, ycand = dataset_def.prepare_data(fi)\n", "\n", @@ -92,43 +97,55 @@ "ygen_val = np.concatenate(ygens)\n", "ycand_val = np.concatenate(ycands)\n", "\n", - "X_val, ycand_val, _ = dataset_transform(X_val, ycand_val, None)\n" + "X_val, ycand_val, _ = dataset_transform(X_val, ycand_val, None)\n", + "X_val, ygen_val, _ = dataset_transform(X_val, ygen_val, None)\n" ] }, { "cell_type": "code", "execution_count": null, + "id": "reduced-collar", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "expensive-incidence", "metadata": {}, "outputs": [], "source": [ - "ret = model(X_val[:1])\n", - "model.set_trainable_classification()\n", - "model.load_weights(\"/home/joosep/particleflow/experiments/cms-gnn-dense-a301aa09.gpu0.local/weights-65-103.547722.hdf5\")\n", - "ret = model.predict(X_val, batch_size=10)" + "np.std(ycand_val[\"energy\"][np.argmax(ycand_val[\"cls\"], axis=-1)==2].numpy().flatten())" ] }, { "cell_type": "code", "execution_count": null, + "id": "painful-delight", "metadata": {}, "outputs": [], "source": [ - "x = X_val[0]\n", - "msk = x[:, 0] == 8" + "plt.hist((ycand_val[\"energy\"][np.argmax(ycand_val[\"cls\"], axis=-1)==2].numpy().flatten()-1/59)/1.3, bins=100);" ] }, { "cell_type": "code", "execution_count": null, + "id": "collective-mounting", "metadata": {}, "outputs": [], "source": [ - "preds = model(X_val[:1], training=False)" + "ret = model(X_val[:1])\n", + "#model.set_trainable_classification()\n", + "model.load_weights(\"/home/joosep/particleflow/experiments/cms_20210828_144012_433706.joosep-desktop//weights/weights-03-28.697701.hdf5\")\n", + "ret = model.predict(X_val, batch_size=1, verbose=1)" ] }, { "cell_type": "code", "execution_count": null, + "id": "western-petersburg", "metadata": {}, "outputs": [], "source": [ @@ -137,7 +154,7 @@ "\n", " for ielem in range(6400):\n", " if X_val[0, ielem, 0] != 0:\n", - " for ibin in range(bs.shape[1]):\n", + " for ibin in range(bs.shape[0]):\n", " if ielem in bs[ibin]:\n", " bin_index.append(ibin)\n", " break\n", @@ -149,89 +166,102 @@ { "cell_type": "code", "execution_count": null, + "id": "possible-prime", "metadata": {}, "outputs": [], "source": [ - "msk = X_val[0][:, 0] != 0\n", - "eta = X_val[0][msk, 2]\n", - "phi = X_val[0][msk, 3]\n", - "typ = X_val[0][msk, 0]\n", - "energy = X_val[0][msk, 4]\n", + "def plot_binning_in_layer(layer_name):\n", + " msk = X_val[0][:, 0] != 0\n", + " eta = X_val[0][msk, 2]\n", + " phi = X_val[0][msk, 3]\n", + " typ = X_val[0][msk, 0]\n", + " energy = X_val[0][msk, 4]\n", "\n", - "evenly_spaced_interval = np.linspace(0, 1, 10)\n", - "colorlist = [cm.rainbow(x) for x in evenly_spaced_interval]\n", - "bin_idx = get_bin_index(preds[\"combined_graph_layer\"][\"bins\"][0].numpy())\n", + " evenly_spaced_interval = np.linspace(0, 1, ret[layer_name][\"bins\"].shape[1])\n", + " colorlist = [cm.Dark2(x) for x in evenly_spaced_interval]\n", + " bin_idx = get_bin_index(ret[layer_name][\"bins\"][0])\n", "\n", - "plt.figure(figsize=(4,4))\n", - "plt.scatter(eta, phi, c=[colorlist[bi] for bi in 
bin_idx], marker=\".\")\n", - "plt.xlabel(\"eta\")\n", - "plt.ylabel(\"phi\")\n", - "plt.title(\"Binning in classification layer 1\")\n", - "plt.savefig(\"bins_cls_layer1.pdf\")" + " plt.figure(figsize=(4,4))\n", + " plt.scatter(eta, phi, c=[colorlist[bi] for bi in bin_idx], marker=\".\", s=energy)\n", + " plt.xlabel(\"eta\")\n", + " plt.ylabel(\"phi\")\n", + " plt.title(\"Binning in {}\".format(layer_name))\n", + " plt.savefig(\"bins_{}.pdf\".format(layer_name))" ] }, { "cell_type": "code", "execution_count": null, + "id": "listed-quarterly", "metadata": {}, "outputs": [], "source": [ - "evenly_spaced_interval = np.linspace(0, 1, 10)\n", - "colorlist = [cm.rainbow(x) for x in evenly_spaced_interval]\n", - "bin_idx = get_bin_index(preds[\"combined_graph_layer_1\"][\"bins\"][0].numpy())\n", - "\n", - "plt.figure(figsize=(4,4))\n", - "plt.scatter(eta, phi, c=[colorlist[bi] for bi in bin_idx], marker=\".\")\n", - "plt.xlabel(\"eta\")\n", - "plt.ylabel(\"phi\")\n", - "plt.title(\"Binning in classification layer 2\")\n", - "plt.savefig(\"bins_cls_layer2.pdf\")" + "plot_binning_in_layer(\"cg_0\")" ] }, { "cell_type": "code", "execution_count": null, + "id": "convenient-winner", "metadata": {}, "outputs": [], "source": [ - "bin_idx = get_bin_index(preds[\"combined_graph_layer_2\"][\"bins\"][0].numpy())\n", - "\n", - "plt.figure(figsize=(4,4))\n", - "plt.scatter(eta, phi, c=[colorlist[bi] for bi in bin_idx], marker=\".\")\n", - "plt.xlabel(\"eta\")\n", - "plt.ylabel(\"phi\")\n", - "plt.title(\"Binning in regression layer 1\")\n", - "plt.savefig(\"bins_reg_layer1.pdf\")" + "plot_binning_in_layer(\"cg_1\")" ] }, { "cell_type": "code", "execution_count": null, + "id": "cardiac-regression", "metadata": {}, "outputs": [], "source": [ - "bin_idx = get_bin_index(preds[\"combined_graph_layer_3\"][\"bins\"][0].numpy())\n", - "\n", - "plt.figure(figsize=(4,4))\n", - "plt.scatter(eta, phi, c=[colorlist[bi] for bi in bin_idx], marker=\".\")\n", - "plt.xlabel(\"eta\")\n", - "plt.ylabel(\"phi\")\n", - "plt.title(\"Binning in regression layer 1\")\n", - "plt.savefig(\"bins_reg_layer2.pdf\")" + "plot_binning_in_layer(\"cg_2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "religious-rendering", + "metadata": {}, + "outputs": [], + "source": [ + "plot_binning_in_layer(\"cg_energy_0\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "weekly-penetration", + "metadata": {}, + "outputs": [], + "source": [ + "plot_binning_in_layer(\"cg_energy_1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "upper-rapid", + "metadata": {}, + "outputs": [], + "source": [ + "plot_binning_in_layer(\"cg_energy_2\")" ] }, { "cell_type": "code", "execution_count": null, + "id": "superior-waterproof", "metadata": {}, "outputs": [], "source": [ "def plot_dms(dms):\n", " fig = plt.figure(figsize=(4*4, 3*4))\n", - " for i in range(dmn.shape[0]):\n", - " ax = plt.subplot(3,4,i+1)\n", + " for i in range(len(dms)):\n", + " ax = plt.subplot(4,4,i+1)\n", " plt.axes(ax)\n", - " plt.imshow(dmn[i], interpolation=\"none\", norm=matplotlib.colors.Normalize(vmin=0, vmax=1), cmap=\"Blues\")\n", + " plt.imshow(dms[i], interpolation=\"none\", norm=matplotlib.colors.Normalize(vmin=0, vmax=1), cmap=\"Blues\")\n", " plt.colorbar()\n", " plt.title(\"bin {}\".format(i))\n", " #plt.xlabel(\"elem index $i$\")\n", @@ -242,58 +272,330 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "id": "harmful-ultimate", + "metadata": {}, + "outputs": [], + 
"source": [ + "for layer in ['cg_0', 'cg_1', 'cg_2']:\n", + " dm_vals = ret[layer]['dm'].flatten()\n", + " plt.hist(dm_vals[dm_vals!=0], bins=np.linspace(0,1,100), density=True, alpha=0.8, lw=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "streaming-license", + "metadata": {}, + "outputs": [], + "source": [ + "for layer in ['cg_energy_0', 'cg_energy_1', 'cg_energy_2']:\n", + " dm_vals = ret[layer]['dm'].flatten()\n", + " plt.hist(dm_vals[dm_vals!=0], bins=np.linspace(0,1,100), density=True, alpha=0.8, lw=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "celtic-techno", + "metadata": {}, "outputs": [], "source": [ - "dmn = preds[\"combined_graph_layer\"][\"dm\"][0].numpy()\n", + "dmn = ret['cg_0']['dm'][0, :, :, :, 0]\n", "plot_dms(dmn)\n", - "plt.suptitle(\"Learned adjacency, classification layer 1\", y=1.01)\n", - "plt.savefig(\"dm_cls1.pdf\")" + "plt.suptitle(\"Learned adjacency, cg_0\", y=1.01)\n", + "plt.savefig(\"dm_cg_0.pdf\")" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "id": "silent-medium", + "metadata": {}, "outputs": [], "source": [ - "dmn = preds[\"combined_graph_layer_1\"][\"dm\"][0].numpy()\n", + "dmn = ret['cg_1']['dm'][0, :, :, :, 0]\n", "plot_dms(dmn)\n", - "plt.suptitle(\"Learned adjacency, classification layer 2\", y=1.01)\n", - "plt.savefig(\"dm_cls2.pdf\")" + "plt.suptitle(\"Learned adjacency, cg_1\", y=1.01)\n", + "plt.savefig(\"dm_cg_1.pdf\")" ] }, { "cell_type": "code", "execution_count": null, + "id": "prostate-spider", "metadata": {}, "outputs": [], "source": [ - "dmn = preds[\"combined_graph_layer_2\"][\"dm\"][0].numpy()\n", + "dmn = ret['cg_2']['dm'][0, :, :, :, 0]\n", "plot_dms(dmn)\n", - "plt.suptitle(\"Learned adjacency, regression layer 1\", y=1.01)\n", - "plt.savefig(\"dm_reg1.pdf\")" + "plt.suptitle(\"Learned adjacency, cg_2\", y=1.01)\n", + "plt.savefig(\"dm_cg_2.pdf\")" ] }, { "cell_type": "code", "execution_count": null, + "id": "certified-enforcement", "metadata": {}, "outputs": [], "source": [ - "dmn = preds[\"combined_graph_layer_3\"][\"dm\"][0].numpy()\n", + "dmn = ret['cg_energy_0']['dm'][0, :, :, :, 0]\n", "plot_dms(dmn)\n", - "plt.suptitle(\"Learned adjacency, regression layer 2\", y=1.01)\n", - "plt.savefig(\"dm_reg2.pdf\")" + "plt.suptitle(\"Learned adjacency, cg_energy_0\", y=1.01)\n", + "plt.savefig(\"dm_cg_energy_0.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "portuguese-automation", + "metadata": {}, + "outputs": [], + "source": [ + "dmn = ret['cg_energy_1']['dm'][0, :, :, :, 0]\n", + "plot_dms(dmn)\n", + "plt.suptitle(\"Learned adjacency, cg_energy_1\", y=1.01)\n", + "plt.savefig(\"dm_cg_energy_1.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "superb-explorer", + "metadata": {}, + "outputs": [], + "source": [ + "dmn = ret['cg_energy_2']['dm'][0, :, :, :, 0]\n", + "plot_dms(dmn)\n", + "plt.suptitle(\"Learned adjacency, cg_energy_2\", y=1.01)\n", + "plt.savefig(\"dm_cg_energy_2.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "organized-unemployment", + "metadata": {}, + "outputs": [], + "source": [ + "msk = X_val[0][:, 0]!=0\n", + "sel = ret['dec_output'][0][msk]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "competitive-flashing", + "metadata": {}, + "outputs": [], + "source": [ + "plt.scatter(sel[:, 40], sel[:, 60], marker=\".\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"maritime-beaver", + "metadata": {}, + "outputs": [], + "source": [ + "np.array(X_val[:1, :, 0]!=0, np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acting-combat", + "metadata": {}, + "outputs": [], + "source": [ + "ret['dec_output_energy'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "governmental-height", + "metadata": {}, + "outputs": [], + "source": [ + "pred_debug1 = model.output_dec([\n", + " X_val,\n", + " ret['dec_output'],\n", + " ret['dec_output_energy'],\n", + " np.array(X_val[:, :, 0:1]!=0, np.float32)],\n", + " training=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "worse-album", + "metadata": {}, + "outputs": [], + "source": [ + "true_id = np.argmax(ycand_val[\"cls\"], axis=-1)\n", + "pred_id1 = np.argmax(pred_debug1[\"cls\"], axis=-1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "combined-convention", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(4,4))\n", + "msk1 = (X_val[:, :, 0]!=0) & (true_id==2)\n", + "plt.scatter(\n", + " pred_debug1[\"energy\"][msk1][:, 0].numpy(),\n", + " ycand_val[\"energy\"][msk1][:, 0].numpy(),\n", + " marker=\".\", alpha=0.4\n", + ")\n", + "\n", + "#plt.plot([-1,1], [-1,1], color=\"black\")\n", + "\n", + "plt.plot([0,6], [0,6], color=\"black\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "caroline-afghanistan", + "metadata": {}, + "outputs": [], + "source": [ + "model.cg[0].trainable = False\n", + "model.cg[1].trainable = False\n", + "model.cg[2].trainable = False\n", + "\n", + "# model.cg_energy[0].trainable = False\n", + "# model.cg_energy[1].trainable = False\n", + "# model.cg_energy[2].trainable = False\n", + "\n", + "model.output_dec.ffn_id.trainable = False\n", + "model.output_dec.ffn_charge.trainable = False\n", + "model.output_dec.ffn_phi.trainable = False\n", + "model.output_dec.ffn_eta.trainable = False\n", + "model.output_dec.ffn_pt.trainable = False\n", + "model.output_dec.ffn_energy.trainable = True\n", + "\n", + "model.output_dec.layernorm.trainable = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "later-hudson", + "metadata": {}, + "outputs": [], + "source": [ + "[w.name for w in model.trainable_weights]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "contemporary-peeing", + "metadata": {}, + "outputs": [], + "source": [ + "class_weights = tf.constant([0.0, 0.01, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "scheduled-proposal", + "metadata": {}, + "outputs": [], + "source": [ + "loss = tf.keras.losses.Huber()\n", + "optimizer = tf.keras.optimizers.Adam(lr=1e-4)\n", + "for epoch in range(100):\n", + " with tf.GradientTape() as tape:\n", + " y_pred = model(X_val[:2], training=True)\n", + " pred_cls = tf.argmax(y_pred[\"cls\"], axis=-1)\n", + " true_cls = tf.argmax(ycand_val[\"cls\"][:2], axis=-1)\n", + " msk_loss = tf.expand_dims(tf.cast((pred_cls==true_cls) & (true_cls!=0), tf.float32), axis=-1)\n", + " sample_weights = tf.keras.activations.softmax(ycand_val[\"cls\"][:2]*100)*class_weights\n", + " sample_weights = tf.reduce_sum(class_weights, axis=-1, keepdims=True)\n", + " loss_val = loss(ycand_val[\"energy\"][:2]*msk_loss, y_pred[\"energy\"][:2]*msk_loss, sample_weight=sample_weights)\n", + " print(loss_val)\n", + " trainable_vars = model.trainable_variables\n", + " gradients = tape.gradient(loss_val, trainable_vars)\n", + " 
optimizer.apply_gradients(zip(gradients, trainable_vars))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acoustic-opening", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = model(X_val[2:6], training=False)\n", + "\n", + "true_id = tf.argmax(ycand_val[\"cls\"][2:6], axis=-1)\n", + "pred_id = tf.argmax(y_pred[\"cls\"], axis=-1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "accompanied-musical", + "metadata": {}, + "outputs": [], + "source": [ + "sklearn.metrics.confusion_matrix(true_id.numpy().flatten(), pred_id.numpy().flatten())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ahead-literature", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(4,4))\n", + "cls = 3\n", + "print(np.sum((true_id==cls) & (pred_id==cls)))\n", + "plt.scatter(\n", + " y_pred[\"energy\"][(true_id==cls) & (pred_id==cls)],\n", + " ycand_val[\"energy\"][2:6][(true_id==cls) & (pred_id==cls)],\n", + " marker=\".\"\n", + ")\n", + "plt.plot([0,6], [0,6], color=\"black\")\n", + "plt.xlim(0,6)\n", + "plt.ylim(0,6)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "progressive-auckland", + "metadata": {}, + "outputs": [], + "source": [ + "vals = y_pred[\"energy\"][(true_id!=0)] - ycand_val[\"energy\"][2:6][(true_id!=0)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "lesser-grant", + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(vals.numpy().flatten(), bins=np.linspace(-2,2,100));\n", + "plt.yscale(\"log\")" ] }, { "cell_type": "code", "execution_count": null, + "id": "deluxe-twenty", "metadata": {}, "outputs": [], "source": [] diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml new file mode 100644 index 000000000..0a1b2f203 --- /dev/null +++ b/parameters/cms-dev.yaml @@ -0,0 +1,153 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 2 + num_events_train: 80000 + num_events_test: 10000 + num_epochs: 100 + num_val_files: 10 + dtype: float32 + trainable: classification + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + 
dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: yes + bin_size: 128 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + dropout: 0.0 + dist_activation: linear + kernel: + type: NodePairTrainableKernel + output_dim: 8 + hidden_dim: 32 + num_layers: 2 + activation: gelu + node_message: + type: NodeMessageLearnable + output_dim: 256 + hidden_dim: 128 + num_layers: 2 + activation: gelu + aggregation_direction: dst + num_node_messages: 1 + hidden_dim: 256 + activation: gelu + num_graph_layers_common: 2 + num_graph_layers_energy: 2 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 2000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml new file mode 100644 index 000000000..f7df7746b --- /dev/null +++ b/parameters/cms-gen.yaml @@ -0,0 +1,144 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: gen + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_gen/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 4 + num_events_train: 80000 + num_events_test: 10000 + num_epochs: 50 + num_val_files: 10 + dtype: float32 + trainable: classification + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + combined_graph_layer: + bin_size: 640 + max_num_bins: 100 + distance_dim: 64 + layernorm: no + dropout: 0.0 + dist_activation: linear + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 2 + node_message: + type: GHConvDense + output_dim: 512 + activation: gelu + normalize_degrees: yes + hidden_dim: 512 + activation: gelu + num_graph_layers_common: 2 + num_graph_layers_energy: 
2 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 512 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 3 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 2000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/cms-gnn-dense-big.yaml b/parameters/cms-gnn-dense-big.yaml deleted file mode 100644 index aa3de4a6f..000000000 --- a/parameters/cms-gnn-dense-big.yaml +++ /dev/null @@ -1,76 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.1 - pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-5 - batch_size: 1 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 500 - num_val_files: 100 - dtype: float32 - sample_weights: inverse_sqrt - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -parameters: - model: gnn_dense - activation: elu - layernorm: no - hidden_dim: 256 - bin_size: 3200 - clip_value_low: 0.0 - num_conv: 3 - num_gsl: 3 - normalize_degrees: yes - distance_dim: 128 - dropout: 0.0 - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-gnn-dense-focal.yaml b/parameters/cms-gnn-dense-focal.yaml deleted file mode 100644 index 5db4d2177..000000000 --- a/parameters/cms-gnn-dense-focal.yaml +++ /dev/null @@ -1,91 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 5.0 - charge_loss_coef: 0.1 - pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.01 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 
- validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 2e-5 - batch_size: 5 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 500 - num_val_files: 100 - dtype: float32 - trainable: classification - classification_loss_type: sigmoid_focal_crossentropy - focal_loss_alpha: 0.25 - focal_loss_gamma: 3.0 - focal_loss_from_logits: False - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - activation: elu - layernorm: no - hidden_dim: 256 - bin_size: 640 - clip_value_low: 0.01 - num_conv: 2 - num_gsl: 2 - normalize_degrees: yes - distance_dim: 128 - dropout: 0.2 - separate_momentum: yes - input_encoding: cms - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml deleted file mode 100644 index ce6fcc2fb..000000000 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ /dev/null @@ -1,97 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - pt_loss: MeanSquaredLogarithmicError - energy_loss: MeanSquaredLogarithmicError - classification_loss_coef: 1.0 - charge_loss_coef: 0.1 - pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - raw_path: ../data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 - processed_path: ../data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: ../data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: all - lr: 3e-4 - batch_size: 32 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 400 - num_val_files: 100 - dtype: float32 - trainable: all - classification_loss_type: categorical_cross_entropy # categorical_cross_entropy, sigmoid_focal_crossentropy - lr_schedule: onecycle # exponentialdecay, onecycle - -sample_weights: - cls: inverse_sqrt - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - activation: elu - layernorm: no - hidden_dim: 256 - bin_size: 640 - clip_value_low: 0.0 - num_conv: 2 - num_gsl: 2 - normalize_degrees: yes - distance_dim: 128 - dropout: 0.0 - separate_momentum: yes - input_encoding: cms - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes - -onecycle: - mom_min: 0.85 - mom_max: 0.95 - warmup_ratio: 0.3 - div_factor: 25.0 - final_div: 100000.0 \ No newline at end of file diff --git a/parameters/cms-gnn-dense-transfer.yaml b/parameters/cms-gnn-dense-transfer.yaml deleted file mode 100644 index 8b735f859..000000000 --- a/parameters/cms-gnn-dense-transfer.yaml +++ /dev/null @@ -1,76 +0,0 @@ 
-backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.1 - pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: experiments/cms-gnn-dense-2cc4e7f9.gpu0.local/weights-500-40.204285.hdf5 - lr: 1e-5 - batch_size: 20 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 100 - num_val_files: 100 - dtype: float32 - sample_weights: inverse_sqrt - trainable: transfer - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -parameters: - model: gnn_dense - activation: elu - layernorm: no - hidden_dim: 256 - bin_size: 640 - clip_value_low: 0.0 - num_conv: 2 - num_gsl: 2 - normalize_degrees: yes - distance_dim: 128 - dropout: 0.0 - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml deleted file mode 100644 index 49d2e1abc..000000000 --- a/parameters/cms-gnn-dense.yaml +++ /dev/null @@ -1,88 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 5.0 - charge_loss_coef: 0.1 - pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.01 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 1e-4 - batch_size: 5 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 500 - num_val_files: 100 - dtype: float32 - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: inverse_sqrt - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - activation: elu - layernorm: no - hidden_dim: 256 - bin_size: 640 - clip_value_low: 0.01 - num_conv: 2 - num_gsl: 2 - normalize_degrees: yes - distance_dim: 128 - dropout: 0.2 - separate_momentum: yes - input_encoding: cms - debug: no - -timing: - num_ev: 100 - num_iter: 3 - 
-exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-gnn-skipconn-v2.yaml b/parameters/cms-gnn-skipconn-v2.yaml deleted file mode 100644 index e69919342..000000000 --- a/parameters/cms-gnn-skipconn-v2.yaml +++ /dev/null @@ -1,81 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.1 - pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-5 - batch_size: 5 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 500 - num_val_files: -1 - dtype: float32 - sample_weights: inverse_sqrt - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -parameters: - model: gnn - bin_size: 640 - num_convs_id: 2 - num_convs_reg: 2 - num_hidden_id_enc: 2 - num_hidden_id_dec: 2 - num_hidden_reg_enc: 2 - num_hidden_reg_dec: 2 - num_neighbors: 16 - hidden_dim_id: 512 - hidden_dim_reg: 512 - distance_dim: 32 - dropout: 0.0 - dist_mult: 1.0 - activation: elu - skip_connection: yes - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-gnn-skipconn.yaml b/parameters/cms-gnn-skipconn.yaml deleted file mode 100644 index b1d2e50f0..000000000 --- a/parameters/cms-gnn-skipconn.yaml +++ /dev/null @@ -1,81 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.1 - pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-5 - batch_size: 5 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 500 - num_val_files: -1 - dtype: float32 - sample_weights: none - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -parameters: - model: gnn - bin_size: 640 - num_convs_id: 2 - num_convs_reg: 2 - num_hidden_id_enc: 2 - num_hidden_id_dec: 2 - num_hidden_reg_enc: 
2 - num_hidden_reg_dec: 2 - num_neighbors: 16 - hidden_dim_id: 512 - hidden_dim_reg: 512 - distance_dim: 32 - dropout: 0.0 - dist_mult: 1.0 - activation: elu - skip_connection: yes - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-transformer-skipconn-gun.yaml b/parameters/cms-transformer-skipconn-gun.yaml deleted file mode 100644 index f1fdd39e9..000000000 --- a/parameters/cms-transformer-skipconn-gun.yaml +++ /dev/null @@ -1,74 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 256 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 10000.0 - momentum_loss_coef: 0.0 - charge_loss_coef: 0.0 - momentum_loss_coefs: - - 1.0 - - 10.0 - - 100.0 - - 100.0 - - 1.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl - processed_path: data/gun/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-5 - batch_size: 100 - num_events_train: 250000 - num_events_test: 50000 - num_epochs: 100 - dtype: float32 - sample_weights: inverse_sqrt - trainable: all - multi_output: yes - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -parameters: - model: transformer - num_layers: 2 - d_model: 256 - num_heads: 2 - dff: 256 - support: 32 - skip_connection: yes - dropout: 0.0 - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-transformer-skipconn.yaml b/parameters/cms-transformer-skipconn.yaml deleted file mode 100644 index 0cb6eeb31..000000000 --- a/parameters/cms-transformer-skipconn.yaml +++ /dev/null @@ -1,72 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.1 - pt_loss_coef: 0.001 - eta_loss_coef: 0.001 - sin_phi_loss_coef: 0.001 - cos_phi_loss_coef: 0.001 - energy_loss_coef: 0.001 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-5 - batch_size: 5 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 1000 - dtype: float32 - sample_weights: none - trainable: cls - multi_output: yes - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -parameters: - model: transformer - num_layers: 2 - d_model: 512 - num_heads: 2 - dff: 512 - support: 32 - skip_connection: yes - dropout: 0.0 - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - 
decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms.yaml b/parameters/cms.yaml new file mode 100644 index 000000000..be31ed14b --- /dev/null +++ b/parameters/cms.yaml @@ -0,0 +1,221 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 5 + num_events_train: 80000 + num_events_test: 10000 + num_epochs: 50 + num_val_files: 10 + dtype: float32 + trainable: + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 + +# LR Schedules +exponentialdecay: + decay_steps: 2000 + decay_rate: 0.99 + staircase: yes +onecycle: + mom_min: 0.85 + mom_max: 0.95 + warmup_ratio: 0.3 + div_factor: 25.0 + final_div: 100000.0 + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + combined_graph_layer: + bin_size: 640 + max_num_bins: 100 + distance_dim: 128 + layernorm: yes + dropout: 0.0 + dist_activation: linear + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 3 + num_graph_layers_energy: 3 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 512 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 3 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: yes + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +callbacks: + checkpoint: + save_weights_only: yes + monitor: "val_loss" + save_best_only: no + plot_freq: 10 + tensorboard: + dump_history: yes + hist_freq: 1 + +hypertune: + algorithm: hyperband # random, bayesian, hyperband + random: + 
objective: val_loss + max_trials: 100 + bayesian: + objective: val_loss + max_trials: 100 + num_initial_points: 2 + hyperband: + objective: val_loss + max_epochs: 100 + factor: 3 + iterations: 1 + executions_per_trial: 1 + +raytune: + local_dir: # Note: please specify an absolute path + sched: "asha" # asha, hyperband + parameters: + # optimizer parameters + lr: [1e-4] + batch_size: [32] + expdecay_decay_steps: [10000] + # model parameters + combined_graph_layer: + layernorm: [False] + hidden_dim: [64, 128, 256] + distance_dim: [128, 256] + num_node_messages: [1] + node_message: + normalize_degrees: [True] + output_dim: [64, 128, 256] + dropout: [0.0] + bin_size: [80, 160, 320] + kernel: + clip_value_low: [0.0] + num_graph_layers_common: [2, 3, 4] + num_graph_layers_energy: [2, 3, 4] + # Tune schedule specific parameters + asha: + max_t: 100 + reduction_factor: 3 + brackets: 1 + grace_period: 5 + hyperband: + max_t: 100 + reduction_factor: 3 diff --git a/parameters/delphes-gnn-skipconn-onecycle.yaml b/parameters/delphes-gnn-skipconn-onecycle.yaml deleted file mode 100644 index 16259d6b6..000000000 --- a/parameters/delphes-gnn-skipconn-onecycle.yaml +++ /dev/null @@ -1,92 +0,0 @@ -backend: tensorflow - -dataset: - schema: delphes - target_particles: gen - num_input_features: 12 - num_output_features: 7 - #(none=0, track=1, cluster=2) - num_input_classes: 3 - num_output_classes: 6 - num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.001 - momentum_loss_coefs: - - 1.0 - - 1.0 - - 1.0 - - 1.0 - - 0.001 - raw_path: ../data/mlpf_zenodo/pythia8_ttbar/raw/*.pkl.bz2 - processed_path: ../data/mlpf_zenodo/pythia8_ttbar/tfr/*.tfrecords - num_files_per_chunk: 5 - validation_file_path: ../data/mlpf_zenodo/pythia8_qcd/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: all - lr: 1e-5 - batch_size: 16 - num_events_train: 40000 - num_events_test: 5000 - num_epochs: 250 - num_val_files: -1 - dtype: float32 - trainable: all - multi_output: yes - classification_loss_type: categorical_cross_entropy - lr_schedule: onecycle # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: none - pt: none - eta: none - sin_phi: none - cos_phi: none - energy: none - -parameters: - model: gnn - bin_size: 128 - num_convs_id: 2 - num_convs_reg: 2 - num_hidden_id_enc: 2 - num_hidden_id_dec: 2 - num_hidden_reg_enc: 2 - num_hidden_reg_dec: 2 - num_neighbors: 16 - hidden_dim_id: 256 - hidden_dim_reg: 256 - distance_dim: 256 - dropout: 0.2 - dist_mult: 1.0 - activation: elu - skip_connection: True - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes - -onecycle: - mom_min: 0.85 - mom_max: 0.95 - warmup_ratio: 0.3 - div_factor: 25.0 - final_div: 100000.0 \ No newline at end of file diff --git a/parameters/delphes-gnn-skipconn.yaml b/parameters/delphes-gnn-skipconn.yaml deleted file mode 100644 index 0f83160a2..000000000 --- a/parameters/delphes-gnn-skipconn.yaml +++ /dev/null @@ -1,79 +0,0 @@ -backend: tensorflow - -dataset: - schema: delphes - target_particles: gen - num_input_features: 12 - num_output_features: 7 - #(none=0, track=1, cluster=2) - num_input_classes: 3 - num_output_classes: 6 - num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 
1.0 - momentum_loss_coefs: - - 1.0 - - 1.0 - - 1.0 - - 1.0 - - 0.001 - raw_path: data/pythia8_ttbar/raw/*.pkl.bz2 - processed_path: data/pythia8_ttbar/tfr/*.tfrecords - num_files_per_chunk: 5 - validation_file_path: data/pythia8_qcd/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-6 - batch_size: 5 - num_events_train: 100 - num_events_test: 100 - num_epochs: 400 - num_val_files: -1 - dtype: float32 - trainable: all - multi_output: no - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: none - pt: none - eta: none - sin_phi: none - cos_phi: none - energy: none - -parameters: - model: gnn - bin_size: 128 - num_convs_id: 2 - num_convs_reg: 2 - num_hidden_id_enc: 2 - num_hidden_id_dec: 2 - num_hidden_reg_enc: 2 - num_hidden_reg_dec: 2 - num_neighbors: 16 - hidden_dim_id: 256 - hidden_dim_reg: 256 - distance_dim: 256 - dropout: 0.2 - dist_mult: 1.0 - activation: elu - skip_connection: True - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/delphes-transformer-skipconn.yaml b/parameters/delphes-transformer-skipconn.yaml deleted file mode 100644 index 9874e5289..000000000 --- a/parameters/delphes-transformer-skipconn.yaml +++ /dev/null @@ -1,65 +0,0 @@ -backend: tensorflow - -dataset: - schema: delphes - target_particles: gen - num_input_features: 12 - num_output_features: 7 - num_input_classes: 3 - num_output_classes: 6 - num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 1.0 - momentum_loss_coefs: - - 1.0 - - 1.0 - - 1.0 - - 1.0 - - 0.001 - raw_path: data/pythia8_ttbar/raw/*.pkl.bz2 - processed_path: data/pythia8_ttbar/tfr/*.tfrecords - num_files_per_chunk: 5 - validation_file_path: data/pythia8_ttbar/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-6 - batch_size: 5 - num_events_train: 40000 - num_events_test: 5000 - num_epochs: 300 - num_val_files: -1 - dtype: float16 - trainable: all - multi_output: no - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: none - pt: none - eta: none - sin_phi: none - cos_phi: none - energy: none - -parameters: - model: transformer - num_layers: 4 - d_model: 128 - num_heads: 4 - dff: 128 - support: 32 - skip_connection: yes - dropout: 0.2 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml new file mode 100644 index 000000000..6d46b54f0 --- /dev/null +++ b/parameters/delphes.yaml @@ -0,0 +1,216 @@ +backend: tensorflow + +dataset: + schema: delphes + target_particles: gen + num_input_features: 12 + num_output_features: 7 + #(none=0, track=1, cluster=2) + num_input_classes: 3 + #(none=0, charged hadron=1, neutral hadron=2, photon=3, electron=4, muon=5) + num_output_classes: 6 + num_momentum_outputs: 5 + padded_num_elem_size: 6400 + classification_loss_coef: 1.0 + charge_loss_coef: 1.0 + pt_loss_coef: 100.0 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 100.0 + cos_phi_loss_coef: 100.0 + energy_loss_coef: 100.0 + raw_path: data/pythia8_ttbar/raw/*.pkl* + processed_path: data/pythia8_ttbar/tfr/*.tfrecords + num_files_per_chunk: 5 + validation_file_path: data/pythia8_qcd/val/*.pkl* + energy_loss: + type: Huber + delta: 1.0 + pt_loss: + type: Huber + delta: 1.0 + sin_phi_loss: + type: Huber + 
delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-4 + batch_size: 4 + num_events_train: 45000 + num_events_test: 5000 + num_epochs: 10 + num_val_files: 5 + dtype: float32 + trainable: + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 + +# LR Schedules +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes +onecycle: + mom_min: 0.85 + mom_max: 0.95 + warmup_ratio: 0.3 + div_factor: 25.0 + final_div: 100000.0 + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: default + combined_graph_layer: + bin_size: 640 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + num_node_messages: 1 + dropout: 0.0 + dist_activation: linear + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + node_message: + type: GHConvDense + output_dim: 256 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 3 + num_graph_layers_energy: 3 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: yes + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 4 + charge_num_layers: 2 + pt_num_layers: 3 + eta_num_layers: 3 + phi_num_layers: 3 + energy_num_layers: 3 + layernorm: yes + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes + +callbacks: + checkpoint: + save_weights_only: yes + monitor: "val_loss" + save_best_only: no + plot_freq: 10 + tensorboard: + dump_history: yes + hist_freq: 1 + +hypertune: + algorithm: hyperband # random, bayesian, hyperband + random: + objective: val_loss + max_trials: 100 + bayesian: + objective: val_loss + max_trials: 100 + num_initial_points: 2 + hyperband: + objective: val_loss + max_epochs: 100 + factor: 3 + iterations: 1 + executions_per_trial: 1 + +raytune: + local_dir: # Note: please specify an absolute path + sched: "asha" # asha, hyperband + parameters: + # optimizer parameters + lr: [1e-4] + batch_size: [32] + expdecay_decay_steps: [10000] + # model parameters + combined_graph_layer: + layernorm: [False] + hidden_dim: [64, 128, 256] + distance_dim: [128, 256] + num_node_messages: [1] + node_message: + normalize_degrees: [True] + output_dim: [64, 128, 256] + dropout: [0.0] + bin_size: [80, 160, 320] + kernel: + clip_value_low: [0.0] + num_graph_layers_common: [2, 3, 4] + num_graph_layers_energy: [2, 3, 4] + # Tune schedule specific parameters + asha: + max_t: 100 + reduction_factor: 3 + brackets: 1 + grace_period: 5 + hyperband: + max_t: 100 + reduction_factor: 3 diff --git a/parameters/test-cms-v2.yaml b/parameters/test-cms-v2.yaml deleted file mode 100644 index 3b14e661a..000000000 --- 
a/parameters/test-cms-v2.yaml +++ /dev/null @@ -1,75 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 - num_input_classes: 12 - #(none, ch.had, n.had, hfem, hfhad, gamma, e, mu) - num_output_classes: 8 - num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.001 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr/*.tfrecords - num_files_per_chunk: 5 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-6 - batch_size: 5 - num_events_train: 5 - num_events_test: 5 - num_epochs: 1 - num_val_files: 1 - dtype: float32 - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - activation: elu - layernorm: no - hidden_dim: 128 - bin_size: 320 - clip_value_low: 0.0 - num_conv: 1 - num_gsl: 1 - normalize_degrees: yes - distance_dim: 128 - dropout: 0.0 - separate_momentum: yes - input_encoding: cms - debug: no - -timing: - num_ev: 1 - num_iter: 1 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/test-cms.yaml b/parameters/test-cms.yaml deleted file mode 100644 index a6e4f1967..000000000 --- a/parameters/test-cms.yaml +++ /dev/null @@ -1,77 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 - num_input_classes: 12 - #(none, ch.had, n.had, hfem, hfhad, gamma, e, mu) - num_output_classes: 8 - num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.001 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr/*.tfrecords - num_files_per_chunk: 5 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-6 - batch_size: 5 - num_events_train: 5 - num_events_test: 5 - num_epochs: 1 - num_val_files: 1 - dtype: float32 - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn - bin_size: 64 - num_convs_id: 1 - num_convs_reg: 1 - num_hidden_id_enc: 1 - num_hidden_id_dec: 0 - num_hidden_reg_enc: 1 - num_hidden_reg_dec: 0 - num_neighbors: 16 - hidden_dim_id: 64 - hidden_dim_reg: 64 - distance_dim: 64 - dropout: 0.0 - dist_mult: 1.0 - activation: elu - skip_connection: True - -timing: - num_ev: 1 - num_iter: 1 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/test-delphes.yaml b/parameters/test-delphes.yaml deleted file mode 100644 index 87c7208fe..000000000 --- 
a/parameters/test-delphes.yaml +++ /dev/null @@ -1,76 +0,0 @@ -backend: tensorflow - -dataset: - schema: delphes - target_particles: gen - num_input_features: 12 - num_output_features: 7 - num_input_classes: 3 - num_output_classes: 6 - num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.001 - raw_path: data/pythia8_ttbar/*.pkl.bz2 - processed_path: data/pythia8_ttbar/tfr/*.tfrecords - num_files_per_chunk: 5 - validation_file_path: data/pythia8_ttbar/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-6 - batch_size: 5 - num_events_train: 5 - num_events_test: 5 - num_epochs: 1 - num_val_files: 1 - dtype: float32 - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: none - pt: none - eta: none - sin_phi: none - cos_phi: none - energy: none - -parameters: - model: gnn - bin_size: 64 - num_convs_id: 1 - num_convs_reg: 1 - num_hidden_id_enc: 1 - num_hidden_id_dec: 0 - num_hidden_reg_enc: 1 - num_hidden_reg_dec: 0 - num_neighbors: 16 - hidden_dim_id: 64 - hidden_dim_reg: 64 - distance_dim: 64 - dropout: 0.0 - dist_mult: 1.0 - activation: elu - skip_connection: True - -timing: - num_ev: 1 - num_iter: 1 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/test-gnn/cms-0l.yaml b/parameters/test-gnn/cms-0l.yaml new file mode 100644 index 000000000..5977abbc6 --- /dev/null +++ b/parameters/test-gnn/cms-0l.yaml @@ -0,0 +1,149 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 20 + num_events_train: 1000 + num_events_test: 1000 + num_epochs: 50 + num_val_files: 20 + dtype: float32 + trainable: classification + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: no + bin_size: 160 + max_num_bins: 100 + distance_dim: 128 + 
layernorm: no + dropout: 0.0 + dist_activation: gelu + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 0 + num_graph_layers_energy: 0 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 1000 + decay_rate: 0.98 + staircase: yes diff --git a/parameters/test-gnn/cms-lsh-1l.yaml b/parameters/test-gnn/cms-lsh-1l.yaml new file mode 100644 index 000000000..c8c4dfb7e --- /dev/null +++ b/parameters/test-gnn/cms-lsh-1l.yaml @@ -0,0 +1,149 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 10 + num_events_train: 1000 + num_events_test: 1000 + num_epochs: 50 + num_val_files: 20 + dtype: float32 + trainable: classification + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: yes + bin_size: 160 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + dropout: 0.0 + dist_activation: gelu + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 1 + num_graph_layers_energy: 1 + output_decoding: + activation: gelu + regression_use_classification: yes + 
dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 1000 + decay_rate: 0.98 + staircase: yes diff --git a/parameters/test-gnn/cms-lsh-2l.yaml b/parameters/test-gnn/cms-lsh-2l.yaml new file mode 100644 index 000000000..5eb0a83f2 --- /dev/null +++ b/parameters/test-gnn/cms-lsh-2l.yaml @@ -0,0 +1,149 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 5 + num_events_train: 1000 + num_events_test: 1000 + num_epochs: 50 + num_val_files: 20 + dtype: float32 + trainable: classification + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: yes + bin_size: 160 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + dropout: 0.0 + dist_activation: gelu + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 2 + num_graph_layers_energy: 2 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + 
eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 1000 + decay_rate: 0.98 + staircase: yes diff --git a/parameters/test-gnn/cms-lsh-3l.yaml b/parameters/test-gnn/cms-lsh-3l.yaml new file mode 100644 index 000000000..6ac8b76c7 --- /dev/null +++ b/parameters/test-gnn/cms-lsh-3l.yaml @@ -0,0 +1,149 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 5 + num_events_train: 1000 + num_events_test: 1000 + num_epochs: 50 + num_val_files: 20 + dtype: float32 + trainable: classification + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: yes + bin_size: 160 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + dropout: 0.0 + dist_activation: gelu + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 3 + num_graph_layers_energy: 3 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 1000 + decay_rate: 0.98 + staircase: yes diff --git a/parameters/test-gnn/cms-lsh-mpnn.yaml b/parameters/test-gnn/cms-lsh-mpnn.yaml new file mode 100644 index 000000000..291cd98a5 --- /dev/null +++ 
b/parameters/test-gnn/cms-lsh-mpnn.yaml @@ -0,0 +1,153 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 4 + num_events_train: 1000 + num_events_test: 1000 + num_epochs: 50 + num_val_files: 20 + dtype: float32 + trainable: classification + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: yes + bin_size: 32 + max_num_bins: 500 + distance_dim: 128 + layernorm: no + dropout: 0.0 + dist_activation: linear + kernel: + type: NodePairTrainableKernel + output_dim: 8 + hidden_dim: 32 + num_layers: 2 + activation: gelu + node_message: + type: NodeMessageLearnable + output_dim: 256 + hidden_dim: 128 + num_layers: 2 + activation: gelu + aggregation_direction: src + num_node_messages: 1 + hidden_dim: 256 + activation: gelu + num_graph_layers_common: 2 + num_graph_layers_energy: 2 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 1000 + decay_rate: 0.98 + staircase: yes diff --git a/parameters/test-gnn/cms-nolsh-1l.yaml b/parameters/test-gnn/cms-nolsh-1l.yaml new file mode 100644 index 000000000..697aac9ed --- /dev/null +++ b/parameters/test-gnn/cms-nolsh-1l.yaml @@ -0,0 +1,149 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + 
num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 2 + num_events_train: 1000 + num_events_test: 1000 + num_epochs: 50 + num_val_files: 20 + dtype: float32 + trainable: classification + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: no + bin_size: 160 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + dropout: 0.0 + dist_activation: gelu + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 1 + num_graph_layers_energy: 1 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 1000 + decay_rate: 0.98 + staircase: yes diff --git a/scripts/get_all_data_delphes.sh b/scripts/get_all_data_delphes.sh new file mode 100644 index 000000000..a5c57d547 --- /dev/null +++ b/scripts/get_all_data_delphes.sh @@ -0,0 +1,53 @@ +# this script assumes you git cloned the repo and are inside the particleflow/scripts directory +# you can run the script using ./get_all_data_delphes.sh + +#!/bin/bash +set -e + +rm -Rf test_tmp_delphes +mkdir test_tmp_delphes +cd test_tmp_delphes + +mkdir -p experiments + +mkdir -p data/pythia8_ttbar +mkdir -p data/pythia8_ttbar/raw +mkdir -p data/pythia8_ttbar/processed + +mkdir -p data/pythia8_qcd +mkdir -p data/pythia8_qcd/raw +mkdir -p data/pythia8_qcd/processed + +# now get the ttbar data for training/testing +cd data/pythia8_ttbar/raw/ + +for j in {0..9} +do + for i in {0..49} + do + wget --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_"$j"_"$i".pkl.bz2 + done 
+done + +bzip2 -d * + +# now get the qcd data for extra validation +cd ../../pythia8_qcd/raw/ + +for i in {0..49} +do + wget --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_qcd_10_"$i".pkl.bz2 +done + +bzip2 -d * + +# be in test_tmp_delphes when you process the files.. so the next cd tries to ensure that.. +cd ../../../ + +#generate pytorch data files from pkl files +python3 ../particleflow/mlpf/pytorch/graph_data_delphes.py --dataset data/pythia8_ttbar \ + --processed_dir data/pythia8_ttbar/processed --num-files-merge 1 --num-proc 1 + +#generate pytorch data files from pkl files +python3 ../particleflow/mlpf/pytorch/graph_data_delphes.py --dataset data/pythia8_qcd \ + --processed_dir data/pythia8_qcd/processed --num-files-merge 1 --num-proc 1 diff --git a/scripts/local_test_cms.sh b/scripts/local_test_cms.sh deleted file mode 100755 index b95b0c9f9..000000000 --- a/scripts/local_test_cms.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -set -e - -rm -Rf test_tmp -mkdir test_tmp -cd test_tmp - -mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi -cd data/TTbar_14TeV_TuneCUETP8M1_cfi - -#download the root input file -wget --no-check-certificate https://login-1.hep.caltech.edu/~jpata/particleflow/2020-07/TTbar_14TeV_TuneCUETP8M1_cfi/pfntuple_1.root -cd ../.. - -mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/raw -mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/processed - -#generate pickle data files from root -python3 ../mlpf/data/postprocessing2.py --input data/TTbar_14TeV_TuneCUETP8M1_cfi/pfntuple_1.root \ - --events-per-file 1 --outpath data/TTbar_14TeV_TuneCUETP8M1_cfi/raw --save-normalized-table - -#generate pytorch data files -python3 ../mlpf/pytorch/graph_data_cms.py --dataset data/TTbar_14TeV_TuneCUETP8M1_cfi \ - --processed_dir data/TTbar_14TeV_TuneCUETP8M1_cfi/processed --num-files-merge 1 --num-proc 1 - -#run the pytorch training -COMET_API_KEY="bla" python3 ../mlpf/pytorch/train_end2end_cms.py \ - --dataset data/TTbar_14TeV_TuneCUETP8M1_cfi --space_dim 2 --n_train 3 \ - --n_val 2 --model PFNet7 --convlayer gravnet-radius --convlayer2 sgconv \ - --lr 0.0001 --hidden_dim 32 --n_epochs 2 --l1 1.0 --l2 0.001 --target cand \ - --batch_size 1 --dropout 0.2 --disable_comet - -# #generate dataframe with predictions from the pytorch model -python3 ../mlpf/pytorch/eval_end2end_cms.py --dataset data/TTbar_14TeV_TuneCUETP8M1_cfi \ - --path data/PFNet* --model PFNet7 --start 3 --stop 5 --epoch 1 - -export OUTFILE=`find data -name df.pkl.bz2 | head -n1` -du $OUTFILE -python3 ../mlpf/plotting/plots_cms.py --pkl $OUTFILE --target cand diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index 2f10ec7e2..4614aef06 100755 --- a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -4,14 +4,13 @@ set -e rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/root -cd data/TTbar_14TeV_TuneCUETP8M1_cfi/root -#Only CMS-internal use is permitted by CMS rules +#Only CMS-internal use is permitted by CMS rules! Do not use these MC simulation files otherwise! wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_1.root wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_2.root wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_3.root -cd ../../.. 
+mv *.root data/TTbar_14TeV_TuneCUETP8M1_cfi/root/ #Create the ntuples rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi/raw @@ -28,19 +27,18 @@ mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/val mv data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/pfntuple_3_0.pkl data/TTbar_14TeV_TuneCUETP8M1_cfi/val/ mkdir -p experiments -rm -Rf experiments/test-* #Run a simple training on a few events -rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr -python3 mlpf/launcher.py --model-spec parameters/test-cms.yaml --action data +rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand +python3 mlpf/pipeline.py data -c parameters/cms.yaml #Run a simple training on a few events -python3 mlpf/pipeline.py train -c parameters/test-cms.yaml -p test-cms- +python3 mlpf/pipeline.py train -c parameters/cms.yaml --nepochs 2 --ntrain 5 --ntest 5 -#Generate the pred.npz file of predictions -python3 mlpf/pipeline.py evaluate -c parameters/test-cms.yaml -t ./experiments/test-cms-* +ls ./experiments/cms_*/weights/ -python3 scripts/test_load_tfmodel.py ./experiments/test-cms-*/model_frozen/frozen_graph.pb +#Generate the pred.npz file of predictions +python3 mlpf/pipeline.py evaluate -c parameters/cms.yaml -t ./experiments/cms_* -python3 mlpf/pipeline.py train -c parameters/test-cms-v2.yaml -p test-cms-v2- -python3 mlpf/pipeline.py evaluate -c parameters/test-cms-v2.yaml -t ./experiments/test-cms-v2-* +#Load the model +python3 scripts/test_load_tfmodel.py ./experiments/cms_*/model_frozen/frozen_graph.pb \ No newline at end of file diff --git a/scripts/local_test_cms_pytorch.sh b/scripts/local_test_cms_pytorch.sh new file mode 100755 index 000000000..bbb6968ea --- /dev/null +++ b/scripts/local_test_cms_pytorch.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# set -e +# +# rm -Rf test_tmp +# mkdir test_tmp +cd test_tmp + +# mkdir -p experiments +# mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi +# cd data/TTbar_14TeV_TuneCUETP8M1_cfi +# +# #download the root input file +# wget --no-check-certificate https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_1.root +# cd ../.. +# +# mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/raw +# mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/processed +# +# #generate pickle data files from root +# python3 ../../mlpf/data/postprocessing2.py --input data/TTbar_14TeV_TuneCUETP8M1_cfi/pfntuple_1.root \ +# --events-per-file 1 --outpath data/TTbar_14TeV_TuneCUETP8M1_cfi/raw --save-normalized-table + +#generate pytorch data files +python3 ../../mlpf/pytorch_cms/graph_data_delphes.py --dataset data/TTbar_14TeV_TuneCUETP8M1_cfi \ + --processed_dir data/TTbar_14TeV_TuneCUETP8M1_cfi/processed --num-files-merge 1 --num-proc 1 +# +# #run the pytorch training +# echo Beginning the training.. 
+# python3 pipeline_cms.py \ +# --n_epochs=10 --n_train=1 --n_valid=1 --n_test=1 --batch_size=4 \ +# --dataset='../../test_tmp/data/TTbar_14TeV_TuneCUETP8M1_cfi' \ +# --outpath='../../test_tmp/experiments' diff --git a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh index 3117f8033..6cf616fef 100755 --- a/scripts/local_test_delphes_pipeline.sh +++ b/scripts/local_test_delphes_pipeline.sh @@ -1,30 +1,25 @@ #!/bin/bash set -e -mkdir -p data/pythia8_ttbar -mkdir -p data/pythia8_ttbar/val -cd data/pythia8_ttbar +mkdir -p data/pythia8_ttbar/raw +mkdir -p data/pythia8_qcd/val #download a test input file (you can also download everything from Zenodo at 10.5281/zenodo.4559324) wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_0.pkl.bz2 -wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_1.pkl.bz2 -mv tev14_pythia8_ttbar_0_1.pkl.bz2 val/ - -cd ../.. +mv tev14_pythia8_ttbar_0_0.pkl.bz2 data/pythia8_ttbar/raw/ +wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_qcd_10_0.pkl.bz2 +mv tev14_pythia8_qcd_10_0.pkl.bz2 data/pythia8_qcd/val/ mkdir -p experiments -rm -Rf experiments/test-* #Run a simple training on a few events rm -Rf data/pythia8_ttbar/tfr -python3 mlpf/launcher.py --model-spec parameters/test-delphes.yaml --action data +python3 mlpf/pipeline.py data -c parameters/delphes.yaml #Run a simple training on a few events -python3 mlpf/pipeline.py train -c parameters/test-delphes.yaml -p test-delphes- - -#Generate the pred.npz file of predictions -python3 mlpf/pipeline.py evaluate -c parameters/test-delphes.yaml -t ./experiments/test-delphes-* +python3 mlpf/pipeline.py train -c parameters/delphes.yaml --nepochs 2 --ntrain 5 --ntest 5 -#Generate the timing file -python3 mlpf/launcher.py --model-spec parameters/test-delphes.yaml --action time --weights ./experiments/test-delphes-*/weights/weights-01-*.hdf5 +ls ./experiments/delphes_*/weights/ +#Generate the pred.npz file of predictions +python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t ./experiments/delphes_* diff --git a/scripts/local_test_delphes_pytorch.sh b/scripts/local_test_delphes_pytorch.sh index f357aead0..4d0fa84be 100755 --- a/scripts/local_test_delphes_pytorch.sh +++ b/scripts/local_test_delphes_pytorch.sh @@ -10,32 +10,54 @@ mkdir -p data/pythia8_ttbar mkdir -p data/pythia8_ttbar/raw mkdir -p data/pythia8_ttbar/processed -cd data/pythia8_ttbar/raw +mkdir -p data/pythia8_qcd +mkdir -p data/pythia8_qcd/raw +mkdir -p data/pythia8_qcd/processed -#download some pickle data files (for this test we download 3 pkl files and allocate 2 for train and 1 for valid) +#download 2 files for training/validation +cd data/pythia8_ttbar/raw +echo Downloading the training/validation data files.. wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_0.pkl.bz2 wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_1.pkl.bz2 -wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_2.pkl.bz2 - +bzip2 -d * cd ../../.. -# # if you have the data in place and want to avoid downloading it you can comment all of the above and uncomment the next line -# cd test_tmp_delphes +#download 1 file for testing +cd data/pythia8_qcd/raw +echo Downloading the testing data files.. 
+wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_qcd_10_0.pkl.bz2
+bzip2 -d *
+cd ../../..
 #generate pytorch data files from pkl files
-python3 ../mlpf/pytorch/graph_data_delphes.py --dataset data/pythia8_ttbar \
+echo Processing the training/validation data files..
+python3 ../mlpf/pytorch_delphes/graph_data_delphes.py --dataset data/pythia8_ttbar \
  --processed_dir data/pythia8_ttbar/processed --num-files-merge 1 --num-proc 1
-# before training a model, first get rid of any previous models stored
-rm -Rf experiments/PFNet*
+#generate pytorch data files from pkl files
+echo Processing the testing data files..
+python3 ../mlpf/pytorch_delphes/graph_data_delphes.py --dataset data/pythia8_qcd/ \
+ --processed_dir data/pythia8_qcd/processed --num-files-merge 1 --num-proc 1
+
+#before training a model, first get rid of any previous models stored
+rm -Rf experiments/*
+
+cd ../mlpf/
 #run the pytorch training
-COMET_API_KEY="bla" python3 ../mlpf/pytorch/train_end2end_delphes.py \
- --dataset data/pythia8_ttbar --space_dim 2 --n_train 1 \
- --n_val 1 --model PFNet7 --convlayer gravnet-radius --convlayer2 "none" \
- --lr 0.0001 --hidden_dim 32 --n_epochs 3 --l1 1.0 --l2 0.001 --target gen \
- --batch_size 1 --dropout 0.2 --disable_comet
-
-# predict on some test data and make plots
-python3 ../mlpf/pytorch/eval_end2end_delphes.py --dataset data/pythia8_ttbar \
- --path experiments/PFNet* --model PFNet7 --start 1 --stop 2 --epoch 1 --target gen
+echo Beginning the training..
+python3 pytorch_pipeline.py \
+ --n_epochs=10 --n_train=1 --n_valid=1 --n_test=1 --batch_size=4 \
+ --dataset='../test_tmp_delphes/data/pythia8_ttbar' \
+ --dataset_qcd='../test_tmp_delphes/data/pythia8_qcd' \
+ --outpath='../test_tmp_delphes/experiments'
+echo Finished the training..
+
+# #to run lrp uncomment the next few lines (note: lrp requires huge amounts of memory ~128Gi)
+# echo Beginning the LRP machinery..
+# python3 lrp_pipeline.py \ +# --n_test=1 --batch_size=4 \ +# --lrp_dataset_qcd='../test_tmp_delphes/data/pythia8_qcd' \ +# --lrp_outpath='../test_tmp_delphes/experiments/' \ +# --lrp_load_model='PFNet7_gen_ntrain_1_nepochs_10_batch_size_4_lr_0.0001_alpha_0.0002_both__nn1_nn3' +# --lrp_load_epoch=9 diff --git a/scripts/plot_nvidiasmi_csv.py b/scripts/plot_nvidiasmi_csv.py new file mode 100644 index 000000000..ae48633d0 --- /dev/null +++ b/scripts/plot_nvidiasmi_csv.py @@ -0,0 +1,99 @@ +import matplotlib.pyplot as plt +import pandas as pd +from pathlib import Path +import numpy as np +from datetime import datetime +import time + + +def parse_args(): + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", "--dir", type=str, default="parameters/delphes-gnn-skipconn.yaml", help="dir containing csv files" + ) + args = parser.parse_args() + return args + + +def plot_gpu_util(df, cuda_device, ax): + ax.plot(df["time"], df["GPU{}_util".format(cuda_device)], alpha=0.8) + ax.set_xlabel("Time [s]") + ax.set_ylabel("GPU utilization [%]") + ax.set_title("GPU{}".format(cuda_device)) + ax.grid(alpha=0.3) + + +def plot_gpu_power(df, cuda_device, ax): + ax.plot(df["time"], df["GPU{}_power".format(cuda_device)], alpha=0.8) + ax.set_xlabel("Time [s]") + ax.set_ylabel("Power consumption [W]") + ax.set_title("GPU{}".format(cuda_device)) + ax.grid(alpha=0.3) + + +def plot_gpu_mem_util(df, cuda_device, ax): + ax.plot(df["time"], df["GPU{}_mem_util".format(cuda_device)], alpha=0.8) + ax.set_xlabel("Time [s]") + ax.set_ylabel("GPU memory utilization [%]") + ax.set_title("GPU{}".format(cuda_device)) + ax.grid(alpha=0.3) + + +def plot_gpu_mem_used(df, cuda_device, ax): + ax.plot(df["time"], df["GPU{}_mem_used".format(cuda_device)], alpha=0.8) + ax.set_xlabel("Time [s]") + ax.set_ylabel("Used GPU memory [MiB]") + ax.set_title("GPU{}".format(cuda_device)) + ax.grid(alpha=0.3) + + +def plot_dfs(dfs, plot_func, suffix): + fig, axs = plt.subplots(2, 2, figsize=(12, 9), tight_layout=True) + for ax in axs.flat: + ax.label_outer() + + for cuda_device, (df, ax) in enumerate(zip(dfs, axs.flat)): + plot_func(df, cuda_device, ax) + plt.suptitle("{}".format(file.stem)) + plt.savefig(args.dir + "/{}_{}.jpg".format(file.stem, suffix)) + + +if __name__ == "__main__": + args = parse_args() + csv_files = list(Path(args.dir).glob("*.csv")) + + for file in csv_files: + print(file) + df = pd.read_csv(str(file)) + start_time = df["timestamp"].iloc[0] + start_t = datetime.strptime(start_time, "%Y/%m/%d %H:%M:%S.%f").timestamp() + dfs = [] + for ii, gpu in enumerate(np.unique(df[" pci.bus_id"].values)): + dfs.append( + pd.DataFrame( + { + "GPU{}_util".format(ii): df[df[" pci.bus_id"] == gpu][" utilization.gpu [%]"].map( + lambda x: int(x.split(" ")[1]) + ), + "GPU{}_power".format(ii): df[df[" pci.bus_id"] == gpu][" power.draw [W]"].map( + lambda x: float(x.split(" ")[1]) + ), + "GPU{}_mem_util".format(ii): df[df[" pci.bus_id"] == gpu][" utilization.memory [%]"].map( + lambda x: int(x.split(" ")[1]) + ), + "GPU{}_mem_used".format(ii): df[df[" pci.bus_id"] == gpu][" memory.used [MiB]"].map( + lambda x: int(x.split(" ")[1]) + ), + "time": df[df[" pci.bus_id"] == gpu]["timestamp"].map( + lambda x: datetime.strptime(x, "%Y/%m/%d %H:%M:%S.%f").timestamp() - start_t + ), + } + ).dropna() + ) + + plot_dfs(dfs, plot_gpu_util, "gpu_util") + plot_dfs(dfs, plot_gpu_power, "gpu_power") + plot_dfs(dfs, plot_gpu_mem_used, "gpu_mem_used") + plot_dfs(dfs, plot_gpu_mem_util, "gpu_mem_util") diff --git 
a/scripts/test_load_tfmodel.py b/scripts/test_load_tfmodel.py
index de854e422..2e30b6346 100644
--- a/scripts/test_load_tfmodel.py
+++ b/scripts/test_load_tfmodel.py
@@ -2,7 +2,7 @@ import sys
 import numpy as np
-bin_size = 128
+bin_size = 640
 num_features = 15
 def load_graph(frozen_graph_filename):
@@ -28,5 +28,5 @@
 graph = load_graph(sys.argv[1])
 with tf.compat.v1.Session(graph=graph) as sess:
-    out = sess.run("Identity:0", feed_dict={"x:0": np.random.randn(1, 39*bin_size, num_features)})
+    out = sess.run("Identity:0", feed_dict={"x:0": np.random.randn(1, 10*bin_size, num_features)})
     print(out)
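For reference, the `exponentialdecay` block together with `setup.lr` in the new parameters/cms.yaml and parameters/delphes.yaml corresponds to the standard Keras exponential-decay schedule. The actual construction happens inside mlpf/pipeline.py, which this diff does not show, so the snippet below is only a minimal sketch of that mapping; the variable names are illustrative.

```python
# Minimal sketch (not the pipeline's actual code): how the exponentialdecay
# block plus setup.lr from parameters/delphes.yaml could be instantiated.
import tensorflow as tf

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-4,  # setup.lr
    decay_steps=10000,           # exponentialdecay.decay_steps
    decay_rate=0.99,             # exponentialdecay.decay_rate
    staircase=True,              # exponentialdecay.staircase
)
# setup.optimizer is "adam" with optimizer.adam.amsgrad: no
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, amsgrad=False)
```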
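The new `raytune` section, together with the `ray[default]`/`ray[tune]` dependencies added to the CI workflow, indicates the hyperparameter scan is driven by Ray Tune. How mlpf/pipeline.py consumes this block is not visible in the diff; the sketch below only illustrates one plausible way to turn the YAML lists into a Tune search space and an ASHA scheduler. `build_search_space` and the direct list-to-`tune.choice` mapping are assumptions for illustration, not the project's API.

```python
# Illustrative sketch only: turning a "raytune" config block like the one in
# parameters/delphes.yaml into a Ray Tune search space and ASHA scheduler.
import yaml
from ray import tune
from ray.tune.schedulers import ASHAScheduler

def build_search_space(raytune_cfg):
    # Assumption: every list in the "parameters" block becomes a categorical choice.
    def to_choices(node):
        if isinstance(node, list):
            return tune.choice(node)
        if isinstance(node, dict):
            return {k: to_choices(v) for k, v in node.items()}
        return node
    return to_choices(raytune_cfg["parameters"])

with open("parameters/delphes.yaml") as f:
    cfg = yaml.safe_load(f)

search_space = build_search_space(cfg["raytune"])
sched = ASHAScheduler(
    max_t=cfg["raytune"]["asha"]["max_t"],
    reduction_factor=cfg["raytune"]["asha"]["reduction_factor"],
    brackets=cfg["raytune"]["asha"]["brackets"],
    grace_period=cfg["raytune"]["asha"]["grace_period"],
)
# The trainable itself lives in the pipeline; a run would look roughly like:
# tune.run(trainable, config=search_space, scheduler=sched,
#          local_dir=cfg["raytune"]["local_dir"])
```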