From bc1d18525ba629d060cc7bb9fe8f8510c1772284 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Fri, 16 Jul 2021 12:29:36 +0200 Subject: [PATCH 001/157] feat: Add hyperparameter optimization Run hyperparameter optimization using the new hypertune command in the pipeline script. --- mlpf/pipeline.py | 56 +++++++++++++++++++++++++++++++++-- mlpf/tfmodel/hypertuning.py | 59 +++++++++++++++++++++++++++++++++++++ mlpf/tfmodel/model_setup.py | 1 - 3 files changed, 112 insertions(+), 4 deletions(-) create mode 100644 mlpf/tfmodel/hypertuning.py diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 157f3d542..edc18b131 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -14,6 +14,7 @@ import tensorflow as tf from tensorflow.keras import mixed_precision import tensorflow_addons as tfa +import keras_tuner as kt from tfmodel.data import Dataset from tfmodel.model_setup import ( @@ -47,6 +48,8 @@ from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler from tfmodel.lr_finder import LRFinder +from tfmodel.callbacks import CustomTensorBoard +from tfmodel import hypertuning @click.group() @@ -102,7 +105,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix): # Run model once to build the layers print(X_val.shape) - model(tf.cast(X_val[:1], model_dtype)) + model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) initial_epoch = 0 if weights: @@ -110,11 +113,11 @@ def train(config, weights, ntrain, ntest, recreate, prefix): configure_model_weights(model, config["setup"].get("weights_config", "all")) model.load_weights(weights, by_name=True) initial_epoch = int(weights.split("/")[-1].split("-")[1]) - model(tf.cast(X_val[:1], model_dtype)) + model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) config = set_config_loss(config, config["setup"]["trainable"]) configure_model_weights(model, config["setup"]["trainable"]) - model(tf.cast(X_val[:1], model_dtype)) + model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) loss_dict, loss_weights = get_loss_dict(config) model.compile( @@ -302,5 +305,52 @@ def delete_all_but_best_ckpt(train_dir, dry_run): delete_all_but_best_checkpoint(train_dir, dry_run) +@main.command() +@click.help_option("-h", "--help") +@click.option("-c", "--config", help="configuration file", type=click.Path()) +@click.option("-o", "--outdir", help="output dir", type=click.Path()) +@click.option("--ntrain", default=None, help="override the number of training events", type=int) +@click.option("--ntest", default=None, help="override the number of testing events", type=int) +@click.option("-r", "--recreate", help="overwrite old hypertune results", is_flag=True, default=False) +def hypertune(config, outdir, ntrain, ntest, recreate): + config, _, global_batch_size, n_train, n_test, n_epochs, _ = parse_config(config, ntrain, ntest) + + ds_train_r, ds_test_r, _ = get_train_val_datasets(config, global_batch_size, n_train, n_test) + + model_builder = hypertuning.get_model_builder(config) + + tb = CustomTensorBoard( + log_dir=outdir + "/tensorboard_logs", histogram_freq=1, write_graph=False, write_images=False, + update_freq=1, + ) + # Change the class name of CustomTensorBoard TensorBoard to make keras_tuner recognise it + tb.__class__.__name__ = "TensorBoard" + + tuner = kt.Hyperband( + model_builder, + objective="val_loss", + max_epochs=n_epochs, + factor=3, + hyperband_iterations=3, + directory=outdir + "/tb", + 
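+        # max_epochs caps how long any single trial may train; factor is the
+        # successive-halving reduction rate (each Hyperband bracket keeps roughly
+        # 1/factor of its trials and trains them for factor times more epochs);
+        # hyperband_iterations repeats the full Hyperband schedule that many times.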
project_name="mlpf", + overwrite=recreate, + executions_per_trial=1, + ) + + tuner.search( + ds_train_r, + validation_data=ds_test_r, + steps_per_epoch=n_train // global_batch_size, + validation_steps=n_test // global_batch_size, + #callbacks=[tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss')] + callbacks=[tb], + ) + + tuner.results_summary() + for trial in tuner.oracle.get_best_trials(num_trials=10): + print(trial.hyperparameters.values, trial.score) + + if __name__ == "__main__": main() diff --git a/mlpf/tfmodel/hypertuning.py b/mlpf/tfmodel/hypertuning.py new file mode 100644 index 000000000..6221ad749 --- /dev/null +++ b/mlpf/tfmodel/hypertuning.py @@ -0,0 +1,59 @@ +from tensorboard.plugins.hparams import api as hp +import tensorflow as tf +import keras_tuner as kt + +from tfmodel.model_setup import make_model, FlattenedCategoricalAccuracy +from tfmodel.model import PFNetDense + +from tfmodel.utils import ( + get_lr_schedule, + load_config, + set_config_loss, + get_loss_dict, + parse_config, +) + + +def get_model_builder(config): + + def model_builder(hp): + # config["parameters"]["hidden_dim"] = hp.Choice("hidden_dim", values=[128, 256]) + # config["parameters"]["distance_dim"] = hp.Choice("distance_dim", values=[64, 128, 256]) + # config["parameters"]["num_conv"] = hp.Choice("num_conv", [2, 3, 4]) + # config["parameters"]["num_gsl"] = hp.Choice("num_gsl", [2, 3, 4, 5]) + # config["parameters"]["dropout"] = hp.Choice("dropout", values=[0.0, 0.1, 0.2, 0.3]) + # config["parameters"]["bin_size"] = hp.Choice("bin_size", values=[200, 640, 800]) + + # config["setup"]["lr"] = hp.Choice("lr", values=[5e-4, 1e-4, 5e-5, 1e-5]) + + + config["parameters"]["hidden_dim"] = hp.Choice("hidden_dim", values=[256]) + config["parameters"]["distance_dim"] = hp.Choice("distance_dim", values=[128]) + config["parameters"]["num_conv"] = hp.Choice("num_conv", [2, 3]) + config["parameters"]["num_gsl"] = hp.Choice("num_gsl", [2, 3]) + config["parameters"]["dropout"] = hp.Choice("dropout", values=[0.2]) + config["parameters"]["bin_size"] = hp.Choice("bin_size", values=[640]) + + config["setup"]["lr"] = hp.Choice("lr", values=[1e-4]) + + model = make_model(config, dtype="float32") + model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) + + opt = tf.keras.optimizers.Adam(learning_rate=config["setup"]["lr"]) + + loss_dict, loss_weights = get_loss_dict(config) + model.compile( + loss=loss_dict, + optimizer=opt, + sample_weight_mode="temporal", + loss_weights=loss_weights, + metrics={ + "cls": [ + FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), + FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), + ] + }, + ) + return model + + return model_builder diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index ee29b95af..c3ebf2c2b 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -16,7 +16,6 @@ import matplotlib import matplotlib.pyplot as plt import sklearn -import kerastuner as kt from argparse import Namespace import time import json From 961d0f53d3b3dfa289d29503f1af9a81e5eb7106 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Thu, 22 Jul 2021 11:40:03 +0200 Subject: [PATCH 002/157] feat: Add distributed training capability in hypertune --- mlpf/pipeline.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index edc18b131..1fba98ac8 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -317,6 +317,10 @@ def 
hypertune(config, outdir, ntrain, ntest, recreate): ds_train_r, ds_test_r, _ = get_train_val_datasets(config, global_batch_size, n_train, n_test) + strategy, maybe_global_batch_size = get_strategy(global_batch_size) + if maybe_global_batch_size is not None: + global_batch_size = maybe_global_batch_size + model_builder = hypertuning.get_model_builder(config) tb = CustomTensorBoard( @@ -336,6 +340,7 @@ def hypertune(config, outdir, ntrain, ntest, recreate): project_name="mlpf", overwrite=recreate, executions_per_trial=1, + distribution_strategy=strategy, ) tuner.search( From d3355eca95eebe13329b8834e2bcdc9c87d6ebde Mon Sep 17 00:00:00 2001 From: Farouk Date: Thu, 22 Jul 2021 14:24:04 -0700 Subject: [PATCH 003/157] pytorch update (training+LRP) --- .gitignore | 9 + mlpf/pytorch/README.md | 14 - mlpf/pytorch/data_preprocessing.py | 47 -- mlpf/pytorch/eval_end2end_delphes.py | 183 ------ mlpf/pytorch/model.py | 130 ----- mlpf/pytorch/model_general.py | 177 ------ mlpf/pytorch/train_end2end_delphes.py | 437 --------------- mlpf/pytorch_cms/README.md | 5 + .../eval_end2end_cms.py | 0 .../graph_data_cms.py | 0 mlpf/{pytorch => pytorch_cms}/gravnet.py | 0 .../train_end2end_cms.py | 0 mlpf/pytorch_delphes/DDP_tutorial.py | 166 ++++++ mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py | 258 +++++++++ mlpf/pytorch_delphes/LRP/LRP_dnn.py | 168 ++++++ mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py | 319 +++++++++++ mlpf/pytorch_delphes/LRP/gravnet_LRP.py | 123 ++++ mlpf/pytorch_delphes/LRP/hooks.py | 106 ++++ mlpf/pytorch_delphes/LRP/main_clf.py | 360 ++++++++++++ mlpf/pytorch_delphes/LRP/main_dnn.py | 227 ++++++++ mlpf/pytorch_delphes/LRP/main_reg.py | 480 ++++++++++++++++ mlpf/pytorch_delphes/LRP/model_LRP_clf.py | 98 ++++ mlpf/pytorch_delphes/LRP/model_LRP_dnn.py | 66 +++ mlpf/pytorch_delphes/LRP/model_LRP_reg.py | 109 ++++ mlpf/pytorch_delphes/LRP/model_io.py | 156 ++++++ mlpf/pytorch_delphes/README.md | 23 + mlpf/pytorch_delphes/args.py | 129 +++++ mlpf/pytorch_delphes/data_preprocessing.py | 94 ++++ mlpf/pytorch_delphes/evaluate.py | 343 ++++++++++++ .../graph_data_delphes.py | 28 +- mlpf/pytorch_delphes/gravnet.py | 122 ++++ mlpf/pytorch_delphes/model.py | 129 +++++ mlpf/pytorch_delphes/model_dnn.py | 126 +++++ mlpf/pytorch_delphes/model_embeddings.py | 177 ++++++ mlpf/pytorch_delphes/plots.py | 528 ++++++++++++++++++ mlpf/pytorch_delphes/training.py | 508 +++++++++++++++++ mlpf/pytorch_delphes/training_dnn.py | 497 +++++++++++++++++ mlpf/pytorch_delphes/training_embeddings.py | 512 +++++++++++++++++ scripts/get_all_data_delphes.sh | 53 ++ scripts/local_test_delphes_pytorch.sh | 48 +- 40 files changed, 5929 insertions(+), 1026 deletions(-) delete mode 100644 mlpf/pytorch/README.md delete mode 100755 mlpf/pytorch/data_preprocessing.py delete mode 100755 mlpf/pytorch/eval_end2end_delphes.py delete mode 100755 mlpf/pytorch/model.py delete mode 100755 mlpf/pytorch/model_general.py delete mode 100755 mlpf/pytorch/train_end2end_delphes.py create mode 100644 mlpf/pytorch_cms/README.md rename mlpf/{pytorch => pytorch_cms}/eval_end2end_cms.py (100%) rename mlpf/{pytorch => pytorch_cms}/graph_data_cms.py (100%) rename mlpf/{pytorch => pytorch_cms}/gravnet.py (100%) rename mlpf/{pytorch => pytorch_cms}/train_end2end_cms.py (100%) mode change 100755 => 100644 create mode 100644 mlpf/pytorch_delphes/DDP_tutorial.py create mode 100644 mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py create mode 100644 mlpf/pytorch_delphes/LRP/LRP_dnn.py create mode 100644 mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py create mode 100644 
mlpf/pytorch_delphes/LRP/gravnet_LRP.py create mode 100644 mlpf/pytorch_delphes/LRP/hooks.py create mode 100644 mlpf/pytorch_delphes/LRP/main_clf.py create mode 100644 mlpf/pytorch_delphes/LRP/main_dnn.py create mode 100644 mlpf/pytorch_delphes/LRP/main_reg.py create mode 100644 mlpf/pytorch_delphes/LRP/model_LRP_clf.py create mode 100644 mlpf/pytorch_delphes/LRP/model_LRP_dnn.py create mode 100644 mlpf/pytorch_delphes/LRP/model_LRP_reg.py create mode 100644 mlpf/pytorch_delphes/LRP/model_io.py create mode 100644 mlpf/pytorch_delphes/README.md create mode 100644 mlpf/pytorch_delphes/args.py create mode 100644 mlpf/pytorch_delphes/data_preprocessing.py create mode 100644 mlpf/pytorch_delphes/evaluate.py rename mlpf/{pytorch => pytorch_delphes}/graph_data_delphes.py (80%) create mode 100644 mlpf/pytorch_delphes/gravnet.py create mode 100644 mlpf/pytorch_delphes/model.py create mode 100644 mlpf/pytorch_delphes/model_dnn.py create mode 100644 mlpf/pytorch_delphes/model_embeddings.py create mode 100644 mlpf/pytorch_delphes/plots.py create mode 100644 mlpf/pytorch_delphes/training.py create mode 100644 mlpf/pytorch_delphes/training_dnn.py create mode 100644 mlpf/pytorch_delphes/training_embeddings.py create mode 100644 scripts/get_all_data_delphes.sh diff --git a/.gitignore b/.gitignore index 96659e5c9..56c75e84c 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,12 @@ mlpf/pytorch/data test_tmp/ test_tmp_delphes/ .DS_Store + +prp +*.pyc +*.pyo + +mlpf/updated/LRP/pid* +mlpf/updated/LRP/class* + +*.ipynb_checkpoints diff --git a/mlpf/pytorch/README.md b/mlpf/pytorch/README.md deleted file mode 100644 index 01cd33211..000000000 --- a/mlpf/pytorch/README.md +++ /dev/null @@ -1,14 +0,0 @@ -Short instructions to train on cms data: -```bash -cd ../.. -./scripts/local_test_cms.sh -``` - -Short instructions to train on delphes data: -```bash -cd ../.. -./scripts/local_test_delphes.sh -``` - -### Delphes dataset -The dataset is available from zenodo: https://doi.org/10.5281/zenodo.4452283. 
diff --git a/mlpf/pytorch/data_preprocessing.py b/mlpf/pytorch/data_preprocessing.py deleted file mode 100755 index 2a3c3d1a8..000000000 --- a/mlpf/pytorch/data_preprocessing.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np -import torch -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch - -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - -# define a function that casts the dataset into a dataloader for efficient NN training -def from_data_to_loader(full_dataset, n_train, n_val, batch_size): - - train_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=0, stop=n_train)) - valid_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=n_train, stop=n_train+n_val)) - - # preprocessing the train_dataset in a good format for passing correct batches of events to the GNN - train_dataset_batched=[] - for i in range(len(train_dataset)): - train_dataset_batched += train_dataset[i] - train_dataset_batched = [[i] for i in train_dataset_batched] - - # preprocessing the valid_dataset in a good format for passing correct batches of events to the GNN - valid_dataset_batched=[] - for i in range(len(valid_dataset)): - valid_dataset_batched += valid_dataset[i] - valid_dataset_batched = [[i] for i in valid_dataset_batched] - - #hack for multi-gpu training - if not multi_gpu: - def collate(items): - l = sum(items, []) - return Batch.from_data_list(l) - else: - def collate(items): - l = sum(items, []) - return l - - train_loader = DataListLoader(train_dataset_batched, batch_size, pin_memory=True, shuffle=True) - train_loader.collate_fn = collate - valid_loader = DataListLoader(valid_dataset_batched, batch_size, pin_memory=True, shuffle=False) - valid_loader.collate_fn = collate - - return train_loader, valid_loader diff --git a/mlpf/pytorch/eval_end2end_delphes.py b/mlpf/pytorch/eval_end2end_delphes.py deleted file mode 100755 index 2d3dc4ecb..000000000 --- a/mlpf/pytorch/eval_end2end_delphes.py +++ /dev/null @@ -1,183 +0,0 @@ -#import setGPU -import torch -import torch_geometric -import sklearn -import numpy as np -import matplotlib.pyplot as plt -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -import pandas -import mplhep -import pickle - -import graph_data_delphes -from graph_data_delphes import PFGraphDataset -from data_preprocessing import from_data_to_loader -import train_end2end_delphes -import time -import math - -import sys -sys.path.insert(1, '../plotting/') -sys.path.insert(1, '../mlpf/plotting/') - -import plots_delphes -from plots_delphes import make_plots - -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - -def collate(items): - l = sum(items, []) - return Batch.from_data_list(l) - -def prepare_test_data(full_dataset, start, stop, batch_size): - - test_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=start, stop=stop)) - - # preprocessing the test_dataset in a good format for passing correct batches of events to the GNN - test_dataset_batched=[] - for i in range(len(test_dataset)): - test_dataset_batched += test_dataset[i] - test_dataset_batched = [[i] for i in test_dataset_batched] - - #hack for multi-gpu training - if not multi_gpu: - def collate(items): - l = sum(items, []) - return Batch.from_data_list(l) - 
else: - def collate(items): - l = sum(items, []) - return l - - test_loader = DataListLoader(test_dataset_batched, batch_size, pin_memory=True, shuffle=True) - test_loader.collate_fn = collate - - return test_loader - -def parse_args(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str, choices=sorted(train_end2end_delphes.model_classes.keys()), help="type of model to use", default="PFNet6") - parser.add_argument("--path", type=str, help="path to model", default="data/PFNet7_TTbar_14TeV_TuneCUETP8M1_cfi_gen__npar_221073__cfg_ee19d91068__user_jovyan__ntrain_400__lr_0.0001__1588215695") - parser.add_argument("--epoch", type=str, default=0, help="Epoch to use") - parser.add_argument("--dataset", type=str, help="Input dataset", required=True) - parser.add_argument("--start", type=int, default=3800, help="first file index to evaluate") - parser.add_argument("--stop", type=int, default=4000, help="last file index to evaluate") - parser.add_argument("--target", type=str, choices=["cand", "gen"], default="cand", help="type of data the model trained on (cand or gen)") - - args = parser.parse_args() - return args - -if __name__ == "__main__": - args = parse_args() - device = torch.device("cpu") - - # # the next part initializes some args values (to run the script not from terminal) - # class objectview(object): - # def __init__(self, d): - # self.__dict__ = d - # - # args = objectview({'model': 'PFNet7', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'epoch' : 1, 'target': 'cand', 'start':1, 'stop':2, - # 'path': '../../test_tmp_delphes/experiments/PFNet7_pythia8_ttbar_gen__npar_41414__cfg_fca529f313__user_fmokhtar__ntrain_1__lr_0.0001__1611654293'}) - - epoch = args.epoch - model = args.model - path = args.path - weights = torch.load("{}/epoch_{}_weights.pth".format(path, epoch), map_location=device) - weights = {k.replace("module.", ""): v for k, v in weights.items()} - - with open('{}/model_kwargs.pkl'.format(path),'rb') as f: - model_kwargs = pickle.load(f) - - model_class = train_end2end_delphes.model_classes[args.model] - model = model_class(**model_kwargs) - model.load_state_dict(weights) - model = model.to(device) - model.eval() - - # prepare some test_data - print('Creating the test data and feeding it to the model..') - full_dataset = PFGraphDataset(root=args.dataset) - loader = prepare_test_data(full_dataset, start=args.start, stop=args.stop, batch_size=10) - - # TODO: here we only evaluate a forward pass of only one batch of the allocated test data - for batch in loader: - pred_id, pred_p4, new_edges_ = model(batch) - break - - print('Making plots for evaluation..') - - if args.target=='cand': - make_plots(batch.ycand_id, batch.ycand, pred_id, pred_p4, out=path +'/') - elif args.target=='gen': - make_plots(batch.ygen_id, batch.ygen, pred_id, pred_p4, out=path +'/') - - -# def prepare_dataframe(model, loader, multi_gpu, device, target_type="cand"): -# model.eval() -# dfs = [] -# dfs_edges = [] -# eval_time = 0 -# -# for i, data in enumerate(loader): -# if not multi_gpu: -# data = data.to(device) -# pred_id_onehot, pred_momentum, new_edges = model(data) -# _, pred_id = torch.max(pred_id_onehot, -1) -# pred_momentum[pred_id==0] = 0 -# data = [data] -# -# x = torch.cat([d.x.to("cpu") for d in data]) -# gen_id = torch.cat([d.ygen_id.to("cpu") for d in data]) -# gen_p4 = torch.cat([d.ygen[:, :].to("cpu") for d in data]) -# cand_id = torch.cat([d.ycand_id.to("cpu") for d in data]) -# cand_p4 = torch.cat([d.ycand[:, :].to("cpu") for d in 
data]) -# -# # reverting the one_hot_embedding -# gen_id_flat = torch.max(gen_id, -1)[1] -# cand_id_flat = torch.max(cand_id, -1)[1] -# -# df = pandas.DataFrame() -# gen_p4.shape -# gen_id.shape -# -# # Recall: -# # [pid] takes from 1 to 6 -# # [charge, pt (GeV), eta, sin phi, cos phi, E (GeV)] -# -# df["elem_type"] = [int(elem_labels[i]) for i in torch.argmax(x[:, :len(elem_labels)], axis=-1).numpy()] -# -# if target_type == "gen": -# df["gen_pid"] = [int(class_labels[i]) for i in gen_id_flat.numpy()] -# df["gen_charge"] = gen_p4[:, 0].numpy() -# df["gen_eta"] = gen_p4[:, 2].numpy() -# df["gen_sphi"] = gen_p4[:, 3].numpy() -# df["gen_cphi"] = gen_p4[:, 4].numpy() -# df["gen_e"] = gen_p4[:, 5].numpy() -# -# elif target_type == "cand": -# df["cand_pid"] = [int(class_labels[i]) for i in cand_id_flat.numpy()] -# df["cand_charge"] = cand_p4[:, 0].numpy() -# df["cand_eta"] = cand_p4[:, 2].numpy() -# df["cand_sphi"] = cand_p4[:, 3].numpy() -# df["cand_cphi"] = cand_p4[:, 4].numpy() -# df["cand_e"] = cand_p4[:, 5].numpy() -# -# df["pred_pid"] = [int(class_labels[i]) for i in pred_id.detach().cpu().numpy()] -# df["pred_charge"] = pred_momentum[:, 0].detach().cpu().numpy() -# df["pred_eta"] = pred_momentum[:, 2].detach().cpu().numpy() -# df["pred_sphi"] = pred_momentum[:, 3].detach().cpu().numpy() -# df["pred_cphi"] = pred_momentum[:, 4].detach().cpu().numpy() -# df["pred_e"] = pred_momentum[:, 5].detach().cpu().numpy() -# -# dfs.append(df) -# -# df = pandas.concat(dfs, ignore_index=True) -# return df diff --git a/mlpf/pytorch/model.py b/mlpf/pytorch/model.py deleted file mode 100755 index 1d4c6b242..000000000 --- a/mlpf/pytorch/model.py +++ /dev/null @@ -1,130 +0,0 @@ -import numpy as np -import mplhep - -import torch -import torch_geometric - -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv -from torch_geometric.nn import TopKPooling, SAGPooling, SGConv -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch_geometric.data import Data, DataListLoader, Batch -from torch.utils.data import random_split - -from gravnet import GravNetConv - -#Model with gravnet clustering -class PFNet7(nn.Module): - def __init__(self, - input_dim=12, hidden_dim=32, encoding_dim=256, - output_dim_id=6, - output_dim_p4=6, - convlayer="gravnet-radius", - convlayer2="none", - space_dim=2, nearest=3, dropout_rate=0.0, activation="leaky_relu", return_edges=False, radius=0.1, input_encoding=0): - - super(PFNet7, self).__init__() - - self.input_dim = input_dim - self.hidden_dim = hidden_dim - self.return_edges = return_edges - self.convlayer = convlayer - self.input_encoding = input_encoding - - self.act = nn.LeakyReLU - self.act_f = torch.nn.functional.leaky_relu - - # (1) GNN layer - if convlayer == "gravnet-knn": - self.conv1 = GravNetConv(input_dim, encoding_dim, space_dim, hidden_dim, nearest, neighbor_algo="knn") - elif convlayer == "gravnet-radius": - self.conv1 = GravNetConv(input_dim, encoding_dim, space_dim, hidden_dim, nearest, neighbor_algo="radius", radius=radius) - else: - raise Exception("Unknown convolution layer: {}".format(convlayer)) - - # (2) another GNN layer if you want - self.convlayer2 = convlayer2 - if convlayer2 == "none": - self.conv2_1 
= None - self.conv2_2 = None - - # (3) dropout layer if you want - self.dropout1 = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() - - # (4) DNN layer: classifying PID - self.nn2 = nn.Sequential( - nn.Linear(encoding_dim, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Linear(hidden_dim, output_dim_id), - ) - - # (5) DNN layer: regressing p4 - self.nn3 = nn.Sequential( - nn.Linear(encoding_dim + output_dim_id, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Linear(hidden_dim, output_dim_p4), - ) - - def forward(self, data): - - #encode the inputs (x is of shape [~5000*batch_size, input_dim]) - x = data.x - - #Run a clustering of the inputs that returns the new_edge_index.. this is the KNN step.. - # new_edge_index is of shape [2, big#] - # x & x1 are of shape [~5000*batch_size, encoding_dim] - new_edge_index, x = self.conv1(x) - x1 = self.act_f(x) # act by nonlinearity - - #Decode convolved graph nodes to PID (after a dropout) - # cand_ids is of shape [~5000*batch_size, 6] - cand_ids = self.nn2(self.dropout1(x1)) - - #Decode convolved graph nodes to p4 - # (1) add the predicted PID along as it may help (why we concatenate) - nn3_input = torch.cat([x1, cand_ids], axis=-1) - # (2) pass them both to the NN - cand_p4 = self.nn3(self.dropout1(nn3_input)) - - return cand_ids, cand_p4, new_edge_index - - -# ------------------------------------------------------------------------------------- -# # test a forward pass -# from graph_data_delphes import PFGraphDataset -# from data_preprocessing import from_data_to_loader -# -# full_dataset = PFGraphDataset('../../test_tmp_delphes/data/delphes_cfi') -# -# train_loader, valid_loader = from_data_to_loader(full_dataset, n_train=2, n_val=1, batch_size=1 ) -# -# print(next(iter(train_loader))) -# -# model = PFNet7() -# -# for batch in train_loader: -# cand_id_onehot, cand_momentum, new_edge_index = model(batch) -# break -# -# batch -# print(cand_id_onehot.shape) -# print(cand_momentum.shape) -# print(new_edge_index.shape) -# print(new_edge_index) diff --git a/mlpf/pytorch/model_general.py b/mlpf/pytorch/model_general.py deleted file mode 100755 index 6f25f25ce..000000000 --- a/mlpf/pytorch/model_general.py +++ /dev/null @@ -1,177 +0,0 @@ -import sys -import os -import math - -from comet_ml import Experiment - -import torch -import torch_geometric - -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv -from torch_geometric.nn import TopKPooling, SAGPooling, SGConv -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from gravnet import GravNetConv -from torch_geometric.data import Data, DataListLoader, Batch -from torch.utils.data import random_split - -import torch_cluster - -from glob import glob -import numpy as np -import os.path as osp -import pickle - 
-import math -import time -import numba -import tqdm -import sklearn -import pandas - -import mplhep - -from sklearn.metrics import accuracy_score - -import graph_data -from graph_data import PFGraphDataset, elem_to_id, class_to_id, class_labels -from sklearn.metrics import confusion_matrix - - -#Model with gravnet clustering -class PFNet7(nn.Module): - def __init__(self, - input_dim=12, hidden_dim=32, encoding_dim=256, - output_dim_id=6, - output_dim_p4=6, - convlayer="gravnet-radius", - convlayer2="none", - space_dim=2, nearest=3, dropout_rate=0.0, activation="leaky_relu", return_edges=False, radius=0.1, input_encoding=0): - - super(PFNet7, self).__init__() - - self.input_dim = input_dim - self.hidden_dim = hidden_dim - self.return_edges = return_edges - self.convlayer = convlayer - self.input_encoding = input_encoding - - if activation == "leaky_relu": - self.act = nn.LeakyReLU - self.act_f = torch.nn.functional.leaky_relu - elif activation == "selu": - self.act = nn.SELU - self.act_f = torch.nn.functional.selu - elif activation == "relu": - self.act = nn.ReLU - self.act_f = torch.nn.functional.relu - - # if you want to add an initial encoding of the input - conv_in_dim = input_dim - if self.input_encoding>0: - self.nn1 = nn.Sequential( - nn.Linear(input_dim, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Linear(hidden_dim, encoding_dim), - ) - conv_in_dim = encoding_dim - - # (1) GNN layer - if convlayer == "gravnet-knn": - self.conv1 = GravNetConv(conv_in_dim, encoding_dim, space_dim, hidden_dim, nearest, neighbor_algo="knn") - elif convlayer == "gravnet-radius": - self.conv1 = GravNetConv(conv_in_dim, encoding_dim, space_dim, hidden_dim, nearest, neighbor_algo="radius", radius=radius) - else: - raise Exception("Unknown convolution layer: {}".format(convlayer)) - - #decoding layer receives the raw inputs and the gravnet output - num_decode_in = input_dim + encoding_dim - - # (2) another GNN layer if you want - self.convlayer2 = convlayer2 - if convlayer2 == "none": - self.conv2_1 = None - self.conv2_2 = None - elif convlayer2 == "sgconv": - self.conv2_1 = SGConv(num_decode_in, hidden_dim, K=1) - self.conv2_2 = SGConv(num_decode_in, hidden_dim, K=1) - num_decode_in += hidden_dim - elif convlayer2 == "graphunet": - self.conv2_1 = GraphUNet(num_decode_in, hidden_dim, hidden_dim, 2, pool_ratios=0.1) - self.conv2_2 = GraphUNet(num_decode_in, hidden_dim, hidden_dim, 2, pool_ratios=0.1) - num_decode_in += hidden_dim - elif convlayer2 == "gatconv": - self.conv2_1 = GATConv(num_decode_in, hidden_dim, 4, concat=False, dropout=dropout_rate) - self.conv2_2 = GATConv(num_decode_in, hidden_dim, 4, concat=False, dropout=dropout_rate) - num_decode_in += hidden_dim - else: - raise Exception("Unknown convolution layer: {}".format(convlayer2)) - - # (3) dropout layer if you want - self.dropout1 = nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity() - - # (4) DNN layer: classifying PID - self.nn2 = nn.Sequential( - nn.Linear(num_decode_in, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Linear(hidden_dim, output_dim_id), - ) - - # (5) DNN layer: regressing p4 - self.nn3 = nn.Sequential( - nn.Linear(num_decode_in + output_dim_id, hidden_dim), - self.act(), - 
nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Dropout(dropout_rate) if dropout_rate > 0 else nn.Identity(), - nn.Linear(hidden_dim, hidden_dim), - self.act(), - nn.Linear(hidden_dim, output_dim_p4), - ) - - def forward(self, data): - - #encode the inputs - x = data.x - - if self.input_encoding: - x = self.nn1(x) - - #Run a clustering of the inputs that returns the new_edge_index.. this is the KNN step.. - new_edge_index, x = self.conv1(x) - x1 = self.act_f(x) - - #run a second convolution - if self.convlayer2 != "none": - conv2_input = torch.cat([data.x, x1], axis=-1) - x2_1 = self.act_f(self.conv2_1(conv2_input, new_edge_index)) - x2_2 = self.act_f(self.conv2_2(conv2_input, new_edge_index)) - nn2_input = torch.cat([data.x, x1, x2_1], axis=-1) - else: - nn2_input = torch.cat([data.x, x1], axis=-1) - - #Decode convolved graph nodes to pdgid and p4 - cand_ids = self.nn2(self.dropout1(nn2_input)) - - if self.convlayer2 != "none": - nn3_input = torch.cat([data.x, x1, x2_2, cand_ids], axis=-1) - else: - nn3_input = torch.cat([data.x, x1, cand_ids], axis=-1) - - cand_p4 = data.x[:, len(elem_to_id):len(elem_to_id)+4] + self.nn3(self.dropout1(nn3_input)) - return cand_ids, cand_p4, new_edge_index diff --git a/mlpf/pytorch/train_end2end_delphes.py b/mlpf/pytorch/train_end2end_delphes.py deleted file mode 100755 index 706373d72..000000000 --- a/mlpf/pytorch/train_end2end_delphes.py +++ /dev/null @@ -1,437 +0,0 @@ -import sys -import os - -from comet_ml import Experiment - -#Check if the GPU configuration has been provided -try: - if not ("CUDA_VISIBLE_DEVICES" in os.environ): - import setGPU -except Exception as e: - print("Could not import setGPU, running CPU-only") - -import torch -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - -import torch_geometric - -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv -from torch_geometric.nn import TopKPooling, SAGPooling, SGConv -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from gravnet import GravNetConv -from torch_geometric.data import Data, DataListLoader, Batch -from torch.utils.data import random_split - -import torch_cluster - -from glob import glob -import numpy as np -import os.path as osp -import pickle -import math -import time -import tqdm -import sklearn -import pandas - -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from sklearn.metrics import accuracy_score -from sklearn.metrics import confusion_matrix - -from model import PFNet7 -from graph_data_delphes import PFGraphDataset, one_hot_embedding -from data_preprocessing import from_data_to_loader - -#Ignore divide by 0 errors -np.seterr(divide='ignore', invalid='ignore') - -#Get a unique directory name for the model -def get_model_fname(dataset, model, n_train, lr, target_type): - model_name = type(model).__name__ - model_params = sum(p.numel() for p in model.parameters()) - import hashlib - model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] - model_user = 
os.environ['USER'] - - model_fname = '{}_{}_{}__npar_{}__cfg_{}__user_{}__ntrain_{}__lr_{}__{}'.format( - model_name, - dataset.split("/")[-1], - target_type, - model_params, - model_cfghash, - model_user, - n_train, - lr, int(time.time())) - return model_fname - -model_classes = { - "PFNet7": PFNet7, -} - -def mse_loss(input, target): - return torch.sum((input - target) ** 2) - -def weighted_mse_loss(input, target, weight): - return torch.sum(weight * (input - target).sum(axis=1) ** 2) - -def compute_weights(target_ids, device): - vs, cs = torch.unique(target_ids, return_counts=True) - weights = torch.zeros(output_dim_id).to(device=device) - for k, v in zip(vs, cs): - weights[k] = 1.0/math.sqrt(float(v)) - return weights - -@torch.no_grad() -def test(model, loader, epoch, l1m, l2m, l3m, target_type): - with torch.no_grad(): - ret = train(model, loader, epoch, None, l1m, l2m, l3m, target_type, None) - return ret - - -def train(model, loader, epoch, optimizer, l1m, l2m, l3m, target_type, scheduler): - - is_train = not (optimizer is None) - - if is_train: - model.train() - else: - model.eval() - - #loss values for each batch: classification, regression - losses = np.zeros((len(loader), 3)) - - #accuracy values for each batch (monitor classification performance) - accuracies_batch = np.zeros(len(loader)) - - #correlation values for each batch (monitor regression performance) - corrs_batch = np.zeros(len(loader)) - - #epoch confusion matrix - conf_matrix = np.zeros((output_dim_id, output_dim_id)) - - #keep track of how many data points were processed - num_samples = 0 - - for i, batch in enumerate(loader): - t0 = time.time() - - if not multi_gpu: - batch = batch.to(device) - - if is_train: - optimizer.zero_grad() - - # forward pass - cand_id_onehot, cand_momentum, new_edge_index = model(batch) - - _dev = cand_id_onehot.device # store the device in dev - _, indices = torch.max(cand_id_onehot, -1) # picks the maximum PID location and stores the index (opposite of one_hot_embedding) - - num_samples += len(cand_id_onehot) - - # concatenate ygen/ycand over the batch to compare with the truth label - # now: ygen/ycand is of shape [~5000*batch_size, 6] corresponding to the output of the forward pass - if args.target == "gen": - target_ids = batch.ygen_id - target_p4 = batch.ygen - elif args.target == "cand": - target_ids = batch.ycand_id - target_p4 = batch.ycand - - #Predictions where both the predicted and true class label was nonzero - #In these cases, the true candidate existed and a candidate was predicted - # target_ids_msk reverts the one_hot_embedding - # msk is a list of booleans of shape [~5000*batch_size] where each boolean correspond to whether a candidate was predicted - _, target_ids_msk = torch.max(target_ids, -1) - msk = ((indices != 0) & (target_ids_msk != 0)).detach().cpu() - msk2 = ((indices != 0) & (indices == target_ids_msk)) - - accuracies_batch[i] = accuracy_score(target_ids_msk[msk].detach().cpu().numpy(), indices[msk].detach().cpu().numpy()) - - # a manual rescaling weight given to each class - weights = compute_weights(torch.max(target_ids,-1)[1], _dev) - - #Loss for output candidate id (multiclass) - l1 = l1m * torch.nn.functional.cross_entropy(target_ids, indices, weight=weights) - - #Loss for candidate p4 properties (regression) - l2 = l2m * torch.nn.functional.mse_loss(target_p4[msk2], cand_momentum[msk2]) - - batch_loss = l1 + l2 - losses[i, 0] = l1.item() - losses[i, 1] = l2.item() - - if is_train: - batch_loss.backward() - - batch_loss_item = batch_loss.item() - t1 = 
time.time() - - print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), batch_loss_item, t1-t0), end='\r', flush=True) - if is_train: - optimizer.step() - if not scheduler is None: - scheduler.step() - - #Compute correlation of predicted and true pt values for monitoring - corr_pt = 0.0 - if msk.sum()>0: - corr_pt = np.corrcoef( - cand_momentum[msk, 0].detach().cpu().numpy(), - target_p4[msk, 0].detach().cpu().numpy())[0,1] - - corrs_batch[i] = corr_pt - - conf_matrix += confusion_matrix(target_ids_msk.detach().cpu().numpy(), - np.argmax(cand_id_onehot.detach().cpu().numpy(),axis=1), labels=range(6)) - - corr = np.mean(corrs_batch) - acc = np.mean(accuracies_batch) - losses = losses.mean(axis=0) - return num_samples, losses, corr, acc, conf_matrix - -def train_loop(): - t0_initial = time.time() - - losses_train = np.zeros((args.n_epochs, 3)) - losses_val = np.zeros((args.n_epochs, 3)) - - corrs = [] - corrs_v = [] - accuracies = [] - accuracies_v = [] - best_val_loss = 99999.9 - stale_epochs = 0 - - print("Training over {} epochs".format(args.n_epochs)) - for epoch in range(args.n_epochs): - t0 = time.time() - - if stale_epochs > patience: - print("breaking due to stale epochs") - break - - with experiment.train(): - model.train() - - num_samples_train, losses, c, acc, conf_matrix = train(model, train_loader, epoch, optimizer, - args.l1, args.l2, args.l3, args.target, scheduler) - - experiment.log_metric('lr', optimizer.param_groups[0]['lr'], step=epoch) - l = sum(losses) - losses_train[epoch] = losses - corrs += [c] - accuracies += [acc] - experiment.log_metric('loss',l, step=epoch) - experiment.log_metric('loss1',losses[0], step=epoch) - experiment.log_metric('loss2',losses[1], step=epoch) - experiment.log_metric('loss3',losses[2], step=epoch) - experiment.log_metric('corrs',c, step=epoch) - experiment.log_metric('accuracy',acc, step=epoch) - experiment.log_confusion_matrix(matrix=conf_matrix, step=epoch, - title='Confusion Matrix Full', - file_name='confusion-matrix-full-train-%03d.json' % epoch, - labels = [str(c) for c in range(output_dim_id)]) - - with experiment.validate(): - model.eval() - num_samples_val, losses_v, c_v, acc_v, conf_matrix_v = test(model, valid_loader, epoch, - args.l1, args.l2, args.l3, args.target) - l_v = sum(losses_v) - losses_val[epoch] = losses_v - corrs_v += [c_v] - accuracies_v += [acc_v] - experiment.log_metric('loss',l_v, step=epoch) - experiment.log_metric('loss1',losses_v[0], step=epoch) - experiment.log_metric('loss2',losses_v[1], step=epoch) - experiment.log_metric('loss3',losses_v[2], step=epoch) - experiment.log_metric('corrs',c_v, step=epoch) - experiment.log_metric('accuracy',acc_v, step=epoch) - experiment.log_confusion_matrix(matrix=conf_matrix_v, step=epoch, - title='Confusion Matrix Full', - file_name='confusion-matrix-full-val-%03d.json' % epoch, - labels = [str(c) for c in range(output_dim_id)]) - - if l_v < best_val_loss: - best_val_loss = l_v - stale_epochs = 0 - else: - stale_epochs += 1 - - t1 = time.time() - epochs_remaining = args.n_epochs - epoch - time_per_epoch = (t1 - t0_initial)/(epoch + 1) - experiment.log_metric('time_per_epoch', time_per_epoch, step=epoch) - eta = epochs_remaining*time_per_epoch/60 - - spd = (num_samples_val+num_samples_train)/time_per_epoch - losses_str = "[" + ",".join(["{:.4f}".format(x) for x in losses_v]) + "]" - - torch.save(model.state_dict(), "{0}/epoch_{1}_weights.pth".format(outpath, epoch)) - - print("epoch={}/{} dt={:.2f}s loss_train={:.5f} loss_valid={:.5f} c={:.2f}/{:.2f} a={:.6f}/{:.6f} 
partial_losses={} stale={} eta={:.1f}m spd={:.2f} samples/s lr={}".format( - epoch+1, args.n_epochs, - t1 - t0, l, l_v, c, c_v, acc, acc_v, - losses_str, stale_epochs, eta, spd, optimizer.param_groups[0]['lr'])) - - print('Done with training.') - -def parse_args(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--n_train", type=int, default=2, help="number of data files to use for training.. each file contains 100 events") - parser.add_argument("--n_val", type=int, default=1, help="number of data files to use for validation.. each file contains 100 events") - parser.add_argument("--n_epochs", type=int, default=100, help="number of training epochs") - parser.add_argument("--patience", type=int, default=100, help="patience before early stopping") - parser.add_argument("--hidden_dim", type=int, default=32, help="hidden dimension") - parser.add_argument("--encoding_dim", type=int, default=256, help="encoded element dimension") - parser.add_argument("--batch_size", type=int, default=1, help="Number of .pt files to load in parallel") - parser.add_argument("--model", type=str, choices=sorted(model_classes.keys()), help="type of model to use", default="PFNet6") - parser.add_argument("--target", type=str, choices=["cand", "gen"], help="Regress to PFCandidates or GenParticles", default="cand") - parser.add_argument("--dataset", type=str, help="Input dataset", required=True) - parser.add_argument("--outpath", type=str, default = 'experiments/', help="Output folder") - parser.add_argument("--activation", type=str, default='leaky_relu', choices=["selu", "leaky_relu", "relu"], help="activation function") - parser.add_argument("--optimizer", type=str, default='adam', choices=["adam", "adamw"], help="optimizer to use") - parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") - parser.add_argument("--l1", type=float, default=1.0, help="Loss multiplier for pdg-id classification") - parser.add_argument("--l2", type=float, default=0.001, help="Loss multiplier for momentum regression") - parser.add_argument("--l3", type=float, default=1.0, help="Loss multiplier for clustering") - parser.add_argument("--dropout", type=float, default=0.5, help="Dropout rate") - parser.add_argument("--radius", type=float, default=0.1, help="Radius-graph radius") - parser.add_argument("--convlayer", type=str, choices=["gravnet-knn", "gravnet-radius", "sgconv", "gatconv"], help="Convolutional layer", default="gravnet") - parser.add_argument("--convlayer2", type=str, choices=["sgconv", "graphunet", "gatconv", "none"], help="Convolutional layer", default="none") - parser.add_argument("--space_dim", type=int, default=2, help="Spatial dimension for clustering in gravnet layer") - parser.add_argument("--nearest", type=int, default=3, help="k nearest neighbors in gravnet layer") - parser.add_argument("--overwrite", action='store_true', help="overwrite if model output exists") - parser.add_argument("--disable_comet", action='store_true', help="disable comet-ml") - parser.add_argument("--input_encoding", type=int, help="use an input encoding layer", default=0) - parser.add_argument("--load", type=str, help="Load the weight file", required=False, default=None) - parser.add_argument("--scheduler", type=str, help="LR scheduler", required=False, default="none", choices=["none", "onecycle"]) - args = parser.parse_args() - return args - -if __name__ == "__main__": - - args = parse_args() - - # # the next part initializes some args values (to run the script not from terminal) - # class 
objectview(object): - # def __init__(self, d): - # self.__dict__ = d - # - # args = objectview({'n_train': 2, 'n_val': 1, 'n_epochs': 3, 'patience': 100, 'hidden_dim':32, 'encoding_dim': 256, - # 'batch_size': 1, 'model': 'PFNet7', 'target': 'cand', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', - # 'outpath': 'experiments/', 'activation': 'leaky_relu', 'optimizer': 'adam', 'lr': 1e-4, 'l1': 1, 'l2': 0.001, 'l3': 1, 'dropout': 0.5, - # 'radius': 0.1, 'convlayer': 'gravnet-radius', 'convlayer2': 'none', 'space_dim': 2, 'nearest': 3, 'overwrite': True, - # 'disable_comet': True, 'input_encoding': 0, 'load': None, 'scheduler': 'none'}) - - # define the dataset - full_dataset = PFGraphDataset(args.dataset) - - # constructs a loader from the data to iterate over batches - train_loader, valid_loader = from_data_to_loader(full_dataset, args.n_train, args.n_val, batch_size=args.batch_size) - - # element parameters - input_dim = 12 - - #one-hot particle ID and momentum - output_dim_id = 6 - output_dim_p4 = 6 - - patience = args.patience - - model_class = model_classes[args.model] - model_kwargs = {'input_dim': input_dim, - 'hidden_dim': args.hidden_dim, - 'encoding_dim': args.encoding_dim, - 'output_dim_id': output_dim_id, - 'output_dim_p4': output_dim_p4, - 'dropout_rate': args.dropout, - 'convlayer': args.convlayer, - 'convlayer2': args.convlayer2, - 'radius': args.radius, - 'space_dim': args.space_dim, - 'activation': args.activation, - 'nearest': args.nearest, - 'input_encoding': args.input_encoding} - - #instantiate the model - model = model_class(**model_kwargs) - if args.load: - s1 = torch.load(args.load, map_location=torch.device('cpu')) - s2 = {k.replace("module.", ""): v for k, v in s1.items()} - model.load_state_dict(s2) - - if multi_gpu: - model = torch_geometric.nn.DataParallel(model) - - model.to(device) - - model_fname = get_model_fname(args.dataset, model, args.n_train, args.lr, args.target) - - # need your api key in a .comet.config file: see https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables - experiment = Experiment(project_name="particleflow", disabled=args.disable_comet) - experiment.set_model_graph(repr(model)) - experiment.log_parameters(dict(model_kwargs, **{'model': args.model, 'lr':args.lr, 'model_fname': model_fname, - 'l1': args.l1, 'l2':args.l2, - 'n_train':args.n_train, 'target':args.target, 'optimizer': args.optimizer})) - outpath = osp.join(args.outpath, model_fname) - if osp.isdir(outpath): - if args.overwrite: - print("model output {} already exists, deleting it".format(outpath)) - import shutil - shutil.rmtree(outpath) - else: - print("model output {} already exists, please delete it".format(outpath)) - sys.exit(0) - try: - os.makedirs(outpath) - except Exception as e: - pass - - with open('{}/model_kwargs.pkl'.format(outpath), 'wb') as f: - pickle.dump(model_kwargs, f, protocol=pickle.HIGHEST_PROTOCOL) - - if args.optimizer == "adam": - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) - elif args.optimizer == "adamw": - optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr) - - scheduler = None - if args.scheduler == "onecycle": - scheduler = torch.optim.lr_scheduler.OneCycleLR( - optimizer, - max_lr=args.lr, - steps_per_epoch=int(len(train_loader)), - epochs=args.n_epochs + 1, - anneal_strategy='linear', - ) - - print(model) - print(model_fname) - model_parameters = filter(lambda p: p.requires_grad, model.parameters()) - params = sum([np.prod(p.size()) for p in model_parameters]) - print("params", params) - - 
model.train() - - train_loop() - # with torch.autograd.profiler.profile(use_cuda=True) as prof: - # train_loop() - - # print(prof.key_averages().table(sort_by="cuda_time_total")) diff --git a/mlpf/pytorch_cms/README.md b/mlpf/pytorch_cms/README.md new file mode 100644 index 000000000..19ab76c6e --- /dev/null +++ b/mlpf/pytorch_cms/README.md @@ -0,0 +1,5 @@ +Short instructions to train on cms data: +```bash +cd ../.. +./scripts/local_test_cms.sh +``` diff --git a/mlpf/pytorch/eval_end2end_cms.py b/mlpf/pytorch_cms/eval_end2end_cms.py similarity index 100% rename from mlpf/pytorch/eval_end2end_cms.py rename to mlpf/pytorch_cms/eval_end2end_cms.py diff --git a/mlpf/pytorch/graph_data_cms.py b/mlpf/pytorch_cms/graph_data_cms.py similarity index 100% rename from mlpf/pytorch/graph_data_cms.py rename to mlpf/pytorch_cms/graph_data_cms.py diff --git a/mlpf/pytorch/gravnet.py b/mlpf/pytorch_cms/gravnet.py similarity index 100% rename from mlpf/pytorch/gravnet.py rename to mlpf/pytorch_cms/gravnet.py diff --git a/mlpf/pytorch/train_end2end_cms.py b/mlpf/pytorch_cms/train_end2end_cms.py old mode 100755 new mode 100644 similarity index 100% rename from mlpf/pytorch/train_end2end_cms.py rename to mlpf/pytorch_cms/train_end2end_cms.py diff --git a/mlpf/pytorch_delphes/DDP_tutorial.py b/mlpf/pytorch_delphes/DDP_tutorial.py new file mode 100644 index 000000000..fe96d296b --- /dev/null +++ b/mlpf/pytorch_delphes/DDP_tutorial.py @@ -0,0 +1,166 @@ +import os +import sys +import tempfile +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.optim as optim +import torch.multiprocessing as mp + +from torch.nn.parallel import DistributedDataParallel as DDP + +# On Windows platform, the torch.distributed package only +# supports Gloo backend, FileStore and TcpStore. +# For FileStore, set init_method parameter in init_process_group +# to a local file. Example as follow: +# init_method="file:///f:/libtmp/some_file" +# dist.init_process_group( +# "gloo", +# rank=rank, +# init_method=init_method, +# world_size=world_size) +# For TcpStore, same way as on Linux. 
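+
+# The helpers below follow the usual DDP recipe: each worker calls setup() to
+# join the process group, wraps its model in DistributedDataParallel so that
+# gradients are synchronized across ranks, and calls cleanup() when it is done.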
+ +def setup(rank, world_size): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12355' + + # initialize the process group + dist.init_process_group("gloo", rank=rank, world_size=world_size) + +def cleanup(): + dist.destroy_process_group() + + +class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(10, 10) + self.relu = nn.ReLU() + self.net2 = nn.Linear(10, 5) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + +def demo_basic(rank, world_size): + print(f"Running basic DDP example on rank {rank}.") + setup(rank, world_size) + + # create model and move it to GPU with id rank + model = ToyModel().to(rank) + ddp_model = DDP(model, device_ids=[rank]) + + loss_fn = nn.MSELoss() + optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) + + optimizer.zero_grad() + outputs = ddp_model(torch.randn(20, 10)) + labels = torch.randn(20, 5).to(rank) + loss_fn(outputs, labels).backward() + optimizer.step() + + cleanup() + + +def run_demo(demo_fn, world_size): + mp.spawn(demo_fn, + args=(world_size,), + nprocs=world_size, + join=True) + + + +def demo_checkpoint(rank, world_size): + print(f"Running DDP checkpoint example on rank {rank}.") + setup(rank, world_size) + + model = ToyModel().to(rank) + ddp_model = DDP(model, device_ids=[rank]) + + loss_fn = nn.MSELoss() + optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) + + CHECKPOINT_PATH = tempfile.gettempdir() + "/model.checkpoint" + if rank == 0: + # All processes should see same parameters as they all start from same + # random parameters and gradients are synchronized in backward passes. + # Therefore, saving it in one process is sufficient. + torch.save(ddp_model.state_dict(), CHECKPOINT_PATH) + + # Use a barrier() to make sure that process 1 loads the model after process + # 0 saves it. + dist.barrier() + # configure map_location properly + map_location = {'cuda:%d' % 0: 'cuda:%d' % rank} + ddp_model.load_state_dict( + torch.load(CHECKPOINT_PATH, map_location=map_location)) + + optimizer.zero_grad() + outputs = ddp_model(torch.randn(20, 10)) + labels = torch.randn(20, 5).to(rank) + loss_fn = nn.MSELoss() + loss_fn(outputs, labels).backward() + optimizer.step() + + # Not necessary to use a dist.barrier() to guard the file deletion below + # as the AllReduce ops in the backward pass of DDP already served as + # a synchronization. 
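+    # Only rank 0 wrote the temporary checkpoint, so only rank 0 removes it.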
+ + if rank == 0: + os.remove(CHECKPOINT_PATH) + + cleanup() + + + +class ToyMpModel(nn.Module): + def __init__(self, dev0, dev1): + super(ToyMpModel, self).__init__() + self.dev0 = dev0 + self.dev1 = dev1 + self.net1 = torch.nn.Linear(10, 10).to(dev0) + self.relu = torch.nn.ReLU() + self.net2 = torch.nn.Linear(10, 5).to(dev1) + + def forward(self, x): + x = x.to(self.dev0) + x = self.relu(self.net1(x)) + x = x.to(self.dev1) + return self.net2(x) + + + +def demo_model_parallel(rank, world_size): + print(f"Running DDP with model parallel example on rank {rank}.") + setup(rank, world_size) + + # setup mp_model and devices for this process + dev0 = rank * 2 + dev1 = rank * 2 + 1 + mp_model = ToyMpModel(dev0, dev1) + ddp_mp_model = DDP(mp_model) + + loss_fn = nn.MSELoss() + optimizer = optim.SGD(ddp_mp_model.parameters(), lr=0.001) + + optimizer.zero_grad() + # outputs will be on dev1 + outputs = ddp_mp_model(torch.randn(20, 10)) + labels = torch.randn(20, 5).to(dev1) + loss_fn(outputs, labels).backward() + optimizer.step() + + cleanup() + + +if __name__ == "__main__": + n_gpus = torch.cuda.device_count() + if n_gpus < 2: + print(f"Requires at least 2 GPUs to run, but got {n_gpus}.") + else: + # run_demo(demo_basic, 2) + # run_demo(demo_checkpoint, 2) + # run_demo(demo_model_parallel, 1) + + demo_basic((2,), 2) diff --git a/mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py b/mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py new file mode 100644 index 000000000..15d38cc13 --- /dev/null +++ b/mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py @@ -0,0 +1,258 @@ +import torch +import torch.nn as nn +from torch.nn import Sequential as Seq,Linear,ReLU,BatchNorm1d +from torch_scatter import scatter_mean +import numpy as np +import json +import model_io +from torch_geometric.utils import to_scipy_sparse_matrix +import scipy +import pickle, math, time +import _pickle as cPickle +from sys import getsizeof +from tqdm import tqdm + +from torch_geometric.data import Data +import networkx as nx +from torch_geometric.utils.convert import to_networkx +from torch_geometric.utils import to_dense_adj + +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') +else: + device = torch.device('cpu') + +class LRP_clf: + EPSILON=1e-9 + + def __init__(self,model:model_io): + self.model=model + + def register_model(model:model_io): + self.model=model + + """ + LRP rules + """ + + # this rule is wrong.. 
it is just here because it is much quicker for experimentation and gives the correct dimensions needed for debugging (if you haven't hit the message passing step) + @staticmethod + def easy_rule(layer,input,R,index,output_layer,activation_layer, print_statement): + EPSILON=1e-9 + # input.retain_grad() + # z = layer.forward(input) + # basically layer.forward does this: output=(torch.matmul(input,torch.transpose(w,0,1))+b) , assuming the following w & b are retrieved + + if activation_layer: + w = torch.eye(input.shape[1]).to(device) + else: + w = layer.weight.detach().to(device) + + if output_layer: # for the output layer + T, W, r = [], [], [] + + for i in range(R.shape[1]): + T.append(R[:,i].reshape(-1,1).to(device)) + W.append(w[i,:].reshape(1,-1).to(device)) + I = torch.ones_like(R[:,i]).reshape(-1,1).to(device) + + Numerator = (input*torch.matmul(T[i],W[i])) + Denominator = (input*torch.matmul(I,W[i])).sum(axis=1) + + Denominator = Denominator.reshape(-1,1).expand(Denominator.size()[0],Numerator.size()[1]) + r.append(torch.abs(Numerator / (Denominator+EPSILON*torch.sign(Denominator)))) + + print('- Finished computing R-scores') + return r + else: + for i in range(len(R)): + I = torch.ones_like(R[i]) + + Numerator = (input*torch.matmul(R[i],w)) + Denominator = (input*torch.matmul(I,w)).sum(axis=1) + + Denominator = Denominator.reshape(-1,1).expand(Denominator.size()[0],Numerator.size()[1]) + R[i]=(torch.abs(Numerator / (Denominator+EPSILON*torch.sign(Denominator)))) + + print('- Finished computing R-scores') + return R + + + @staticmethod + def eps_rule(layer, input, R, index, output_layer, activation_layer, print_statement, adjacency_matrix=None, message_passing=False): + + if activation_layer: + w = torch.eye(input.shape[1]).detach().to(device) + elif message_passing: # message passing hack + w = adjacency_matrix.detach().to(device) + else: + w = layer.weight.detach().to(device) + + wt = torch.transpose(w,0,1) + + if output_layer: + R_list = [None]*R.shape[1] + Wt = [None]*R.shape[1] + for output_neuron in range(R.shape[1]): + R_list[output_neuron] = (R[:,output_neuron].reshape(-1,1).clone()) + Wt[output_neuron] = (wt[:,output_neuron].reshape(-1,1)) + else: + R_list = R + Wt = [wt]*len(R_list) + + R_previous=[None]*len(R_list) + + for output_neuron in range(len(R_list)): + + if message_passing: # message passing hack + R_list[output_neuron] = torch.transpose(R_list[output_neuron],0,1) + + # rep stands for repeated/expanded + a_rep = input.reshape(input.shape[0],input.shape[1],1).expand(-1,-1,R_list[output_neuron].shape[1]).to(device) + wt_rep = Wt[output_neuron].reshape(1,Wt[output_neuron].shape[0],Wt[output_neuron].shape[1]).expand(input.shape[0],-1,-1).to(device) + + H = a_rep*wt_rep + deno = H.sum(axis=1).reshape(H.sum(axis=1).shape[0],1,H.sum(axis=1).shape[1]).expand(-1,input.shape[1],-1) + + G = H/deno + + R_previous[output_neuron] = (torch.matmul(G, R_list[output_neuron].reshape(R_list[output_neuron].shape[0],R_list[output_neuron].shape[1],1).to(device))) + R_previous[output_neuron] = R_previous[output_neuron].reshape(R_previous[output_neuron].shape[0], R_previous[output_neuron].shape[1]).to('cpu') + + if message_passing: # message passing hack + R_previous[output_neuron] = torch.transpose(R_previous[output_neuron],0,1) + + if print_statement: + print('- Finished computing R-scores') + if message_passing: + if (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1))): + print('- R score is conserved up to relative 
tolerance 1e-5') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-4)): + print('- R score is conserved up to relative tolerance 1e-4') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-3)): + print('- R score is conserved up to relative tolerance 1e-3') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-2)): + print('- R score is conserved up to relative tolerance 1e-2') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-1)): + print('- R score is conserved up to relative tolerance 1e-1') + else: + if (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1))): + print('- R score is conserved up to relative tolerance 1e-5') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-4)): + print('- R score is conserved up to relative tolerance 1e-4') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-3)): + print('- R score is conserved up to relative tolerance 1e-3') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-2)): + print('- R score is conserved up to relative tolerance 1e-2') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-1)): + print('- R score is conserved up to relative tolerance 1e-1') + + return R_previous + + @staticmethod + def message_passing_rule(self, layer, input, R, big_list, edge_index, edge_weight, after_message, before_message, index, outpath, load_model): + + # first time you hit message passing: construct and start filling the big tensor from scratch + if len(big_list)==0: + big_list = [[torch.zeros(R[0].shape[0],R[0].shape[1]) for i in range(len(R))] for i in range(R[0].shape[0])] + print('- Finished allocating memory for the big tensor of R-scores for all nodes') + + for node_i in range(len(big_list)): + for output_neuron in range(len(big_list[0])): + big_list[node_i][output_neuron][node_i] = R[output_neuron][node_i] + print('- Finished initializing the big tensor') + + # build the adjacency matrix + A = to_dense_adj(edge_index, edge_attr=edge_weight)[0] # adjacency matrix + + if torch.allclose(torch.matmul(A, before_message), after_message, rtol=1e-3): + print("- Adjacency matrix is correctly computed") + + # # the following saves a version of the R-scores before the message passing + # torch.save(big_list, outpath + '/R_score_layer_before_msg_passing.pt') + + # modify the big tensor based on message passing rule + for node_i in tqdm(range(len(big_list))): + big_list[node_i] = self.eps_rule(layer, torch.transpose(before_message,0,1), big_list[node_i], index, output_layer=False, activation_layer=False, print_statement=True, adjacency_matrix=A, message_passing=True) + print(f'- Finished computing R-score for node {node_i+1}/{len(big_list)} for the message passing..') + print('- Finished computing R-scores for the message passing layer') + return big_list + + + """ + explanation functions + """ + + def explain(self, to_explain): + + start_index = self.model.n_layers ########################## + outpath = 
to_explain["outpath"]+'/'+to_explain["load_model"] + + print('Total number of layers (including activation layers):', start_index) + + # store the R-scores for the output layer (they are basically the model predictions) + torch.save(to_explain["pred_id"].detach(), outpath + f'/R_score_layer{start_index+1}.pt') + + ### loop over each single layer + big_list = [] + output_layer_index = start_index+1 + for index in range(start_index+1, 1,-1): + if index==start_index+1: + R, big_list, output_layer_index = self.explain_single_layer(to_explain["pred_id"].detach(), to_explain, big_list, outpath, output_layer_index, index) + else: + R, big_list, output_layer_index = self.explain_single_layer(R, to_explain, big_list, outpath, output_layer_index, index) + print("Finished explaining all layers.") + return big_list # returns the heatmaps for layer0 (i.e. input features) + + + def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_index, index=None, name=None): + + layer = self.model.get_layer(index=index,name=name) + + if name is None: + name = self.model.index2name(index) + if index is None: + index = self.model.name2index(name) + + input = to_explain['A'][name].detach() + + # skip the last DNN because this is only for classification + if 'nn3' in str(name): + print('Skipping a DNN3 regression layer..') + output_layer_index = output_layer_index-1 + return R, big_list, output_layer_index + + if index==output_layer_index: + output_layer_bool = True + else: + output_layer_bool = False + + # it works out of the box that the conv1.lin_s layer which we don't care about is in the same place of the message passing.. so we can just replace its action + if 'conv1.lin_s' in str(name): + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: Message Passing") + big_list = self.message_passing_rule(self, layer, input, R, big_list, to_explain["edge_index"].detach(), to_explain["edge_weight"].detach(), to_explain["after_message"].detach(), to_explain["before_message"].detach(), index, outpath, to_explain["load_model"]) + return R, big_list, output_layer_index + + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer}") + + if len(big_list)==0: # if you haven't hit the message passing step yet + if 'Linear' in str(layer): + R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) + elif 'LeakyReLU' or 'ELU' in str(layer): + R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=True, print_statement=True) + else: + for node_i in tqdm(range(len(big_list))): + if 'Linear' in str(layer): + big_list[node_i] = self.eps_rule(layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) + elif 'LeakyReLU' or 'ELU' in str(layer): + big_list[node_i] = self.eps_rule(layer, input, big_list[node_i], index, output_layer_bool, activation_layer=True, print_statement=False) + return R, big_list, output_layer_index + +##----------------------------------------------------------------------------- +# # big_list is a list of length 5k +# # each element is another list of length 6 (corresponding to each of the output pid probability prediction) +# # each element of that second list is a tensor of shape (5k,x) where x is the dimension of the latent space diff --git a/mlpf/pytorch_delphes/LRP/LRP_dnn.py b/mlpf/pytorch_delphes/LRP/LRP_dnn.py new file mode 100644 index 000000000..770d957ff --- /dev/null +++ b/mlpf/pytorch_delphes/LRP/LRP_dnn.py @@ 
-0,0 +1,168 @@ +import torch +import torch.nn as nn +from torch.nn import Sequential as Seq,Linear,ReLU,BatchNorm1d +from torch_scatter import scatter_mean +import numpy as np +import json +import model_io +from torch_geometric.utils import to_scipy_sparse_matrix +import scipy +import pickle, math, time +import _pickle as cPickle + +from torch_geometric.data import Data +import networkx as nx +from torch_geometric.utils.convert import to_networkx + +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') +else: + device = torch.device('cpu') + +class LRP: + EPSILON=1e-9 + + def __init__(self,model:model_io): + self.model=model + + def register_model(model:model_io): + self.model=model + + """ + LRP rules + """ + @staticmethod + def eps_rule(layer, input, R, index, output_layer, activation_layer): + + EPSILON=1e-9 + a=copy_tensor(input) + a.retain_grad() + z = layer.forward(a) + # basically layer.forward does this: output=(torch.matmul(a,torch.transpose(w,0,1))+b) , assuming the following w & b are retrieved + + if activation_layer: + w = torch.eye(a.shape[1]) + else: + w = layer.weight + b = layer.bias + + wt = torch.transpose(w,0,1) + + if output_layer: + R_list = [None]*R.shape[1] + Wt = [None]*R.shape[1] + for output_node in range(R.shape[1]): + R_list[output_node]=(R[:,output_node].reshape(-1,1).clone()) + Wt[output_node]=(wt[:,output_node].reshape(-1,1)) + else: + R_list = R + Wt = [wt]*len(R_list) + + R_previous=[None]*len(R_list) + for output_node in range(len(R_list)): + # rep stands for repeated + a_rep = a.reshape(a.shape[0],a.shape[1],1).expand(-1,-1,R_list[output_node].shape[1]) + wt_rep = Wt[output_node].reshape(1,Wt[output_node].shape[0],Wt[output_node].shape[1]).expand(a.shape[0],-1,-1) + + H = a_rep*wt_rep + deno = H.sum(axis=1).reshape(H.sum(axis=1).shape[0],1,H.sum(axis=1).shape[1]).expand(-1,a.shape[1],-1).float() + + G = H/deno + + R_previous[output_node] = (torch.matmul(G, R_list[output_node].reshape(R_list[output_node].shape[0],R_list[output_node].shape[1],1).float())) + R_previous[output_node] = R_previous[output_node].reshape(R_previous[output_node].shape[0], R_previous[output_node].shape[1]) + + print('- Finished computing R-scores for output neuron #: ', output_node+1) + + print(f'- Completed layer: {layer}') + if (torch.allclose(R_previous[output_node].sum(axis=1), R_list[output_node].sum(axis=1))): + print('- R score is conserved up to relative tolerance 1e-5') + elif (torch.allclose(R_previous[output_node].sum(axis=1), R_list[output_node].sum(axis=1), rtol=1e-4)): + print('- R score is conserved up to relative tolerance 1e-4') + elif (torch.allclose(R_previous[output_node].sum(axis=1), R_list[output_node].sum(axis=1), rtol=1e-3)): + print('- R score is conserved up to relative tolerance 1e-3') + elif (torch.allclose(R_previous[output_node].sum(axis=1), R_list[output_node].sum(axis=1), rtol=1e-2)): + print('- R score is conserved up to relative tolerance 1e-2') + elif (torch.allclose(R_previous[output_node].sum(axis=1), R_list[output_node].sum(axis=1), rtol=1e-1)): + print('- R score is conserved up to relative tolerance 1e-1') + + return R_previous + + """ + explanation functions + """ + + def explain(self, + to_explain:dict, + save:bool=True, + save_to:str="./relevance.pt", + sort_nodes_by:int=0, + signal=torch.tensor([1,0,0,0,0,0],dtype=torch.float32).to(device), + return_result:bool=False): + + start_index = self.model.n_layers ########################## + 
print('Total number of layers (including activation layers):', start_index) + + ### loop over each single layer + for index in range(start_index+1, 1, -1): + print(f"Explaining layer {1+start_index+1-index}/{start_index+1-1}") + if index==start_index+1: + R = self.explain_single_layer(to_explain["pred"], to_explain, start_index+1, index) + else: + R = self.explain_single_layer(R, to_explain, start_index+1, index) + + with open(to_explain["outpath"]+'/'+to_explain["load_model"]+f'/R_score_layer{index}.pkl', 'wb') as f: + cPickle.dump(R, f, protocol=4) + + print("Finished explaining all layers.") + + def explain_single_layer(self, R, to_explain, output_layer_index, index=None,name=None): + + # preparing variables required for computing LRP + layer=self.model.get_layer(index=index,name=name) + + if name is None: + name=self.model.index2name(index) + if index is None: + index=self.model.name2index(name) + + input=to_explain['A'][name] + + if index==output_layer_index: + output_layer_bool=True + else: + output_layer_bool=False + + # backward pass with specified LRP rule + if 'Linear' in str(layer): + R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=False) + elif 'LeakyReLU' or 'ELU' in str(layer): + R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=True) + + return R + +def copy_tensor(tensor,dtype=torch.float32): + """ + create a deep copy of the provided tensor, + outputs the copy with specified dtype + """ + + return tensor.clone().detach().requires_grad_(True).to(device) + + +##----------------------------------------------------------------------------- +# +# arep=torch.transpose(a[0].repeat(6, 1),0,1) # repeat it 6 times +# H=arep*wt +# +# G = H/H.sum(axis=0).float() +# +# Num = torch.matmul(G, R[0].float()) +# +# print('Num.sum()', Num.sum()) +# +# print(R[0].sum()) diff --git a/mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py b/mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py new file mode 100644 index 000000000..89765a204 --- /dev/null +++ b/mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py @@ -0,0 +1,319 @@ +import torch +import torch.nn as nn +from torch.nn import Sequential as Seq,Linear,ReLU,BatchNorm1d +from torch_scatter import scatter_mean +import numpy as np +import json +import model_io +from torch_geometric.utils import to_scipy_sparse_matrix +import scipy +import pickle, math, time +import _pickle as cPickle +from sys import getsizeof +from tqdm import tqdm + +from torch_geometric.data import Data +import networkx as nx +from torch_geometric.utils.convert import to_networkx +from torch_geometric.utils import to_dense_adj + +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') +else: + device = torch.device('cpu') + +class LRP_reg: + EPSILON=1e-9 + + def __init__(self,model:model_io): + self.model=model + + def register_model(model:model_io): + self.model=model + + """ + LRP rules + """ + + # this rule is wrong.. 
it is just here because it is much quicker for experimentation and gives the correct dimensions needed for debugging (if you haven't hit the message passing step) + @staticmethod + def easy_rule(layer,input,R,index,output_layer,activation_layer, print_statement, skip_connection=False, adjacency_matrix=False, message_passing=False): + EPSILON=1e-9 + # input.retain_grad() + # z = layer.forward(input) + # basically layer.forward does this: output=(torch.matmul(input,torch.transpose(w,0,1))+b) , assuming the following w & b are retrieved + + if activation_layer: + w = torch.eye(input.shape[1]).to(device) + else: + w = layer.weight.detach().to(device) + + if output_layer: # for the output layer + T, W, r = [], [], [] + + for i in range(R.shape[1]): + T.append(R[:,i].reshape(-1,1).to(device)) + W.append(w[i,:].reshape(1,-1).to(device)) + I = torch.ones_like(R[:,i]).reshape(-1,1).to(device) + + Numerator = (input*torch.matmul(T[i],W[i])) + Denominator = (input*torch.matmul(I,W[i])).sum(axis=1) + + Denominator = Denominator.reshape(-1,1).expand(Denominator.size()[0],Numerator.size()[1]) + r.append(torch.abs(Numerator / (Denominator+EPSILON*torch.sign(Denominator)))) + + if print_statement: + print('- Finished computing R-scores') + return r + else: + for i in range(len(R)): + I = torch.ones_like(R[i]) + + Numerator = (input*torch.matmul(R[i],w)) + Denominator = (input*torch.matmul(I,w)).sum(axis=1) + + Denominator = Denominator.reshape(-1,1).expand(Denominator.size()[0],Numerator.size()[1]) + R[i] = (torch.abs(Numerator / (Denominator+EPSILON*torch.sign(Denominator)))) + + if skip_connection: + input_relevance, pid_relevance, embedding_relevance = [None]*len(R), [None]*len(R), [None]*len(R) + for output_neuron in range(len(R)): + input_relevance[output_neuron] = R[output_neuron][:,:12] + pid_relevance[output_neuron] = R[output_neuron][:,12:18] + embedding_relevance[output_neuron] = R[output_neuron][:,18:] + return input_relevance, pid_relevance, embedding_relevance + + if print_statement: + print('- Finished computing R-scores') + return R + + + @staticmethod + def eps_rule(layer, input, R, index, output_layer, activation_layer, print_statement, skip_connection=False, adjacency_matrix=None, message_passing=False): + + if activation_layer: + w = torch.eye(input.shape[1]).detach().to(device) + elif message_passing: # message passing hack + w = adjacency_matrix.detach().to(device) + else: + w = layer.weight.detach().to(device) + + wt = torch.transpose(w,0,1) + + if output_layer: + R_list = [None]*R.shape[1] + Wt = [None]*R.shape[1] + for output_neuron in range(R.shape[1]): + R_list[output_neuron] = (R[:,output_neuron].reshape(-1,1).clone()) + Wt[output_neuron] = (wt[:,output_neuron].reshape(-1,1)) + else: + R_list = R + Wt = [wt]*len(R_list) + + R_previous=[None]*len(R_list) + + for output_neuron in range(len(R_list)): + + if message_passing: # message passing hack + R_list[output_neuron] = torch.transpose(R_list[output_neuron],0,1) + + # rep stands for repeated/expanded + a_rep = input.reshape(input.shape[0],input.shape[1],1).expand(-1,-1,R_list[output_neuron].shape[1]).to(device) + wt_rep = Wt[output_neuron].reshape(1,Wt[output_neuron].shape[0],Wt[output_neuron].shape[1]).expand(input.shape[0],-1,-1).to(device) + + H = a_rep*wt_rep + deno = H.sum(axis=1).reshape(H.sum(axis=1).shape[0],1,H.sum(axis=1).shape[1]).expand(-1,input.shape[1],-1) + + G = H/deno + + R_previous[output_neuron] = (torch.matmul(G, 
R_list[output_neuron].reshape(R_list[output_neuron].shape[0],R_list[output_neuron].shape[1],1).to(device))) + R_previous[output_neuron] = R_previous[output_neuron].reshape(R_previous[output_neuron].shape[0], R_previous[output_neuron].shape[1]).to('cpu') + + if message_passing: # message passing hack + R_previous[output_neuron] = torch.transpose(R_previous[output_neuron],0,1) + + if print_statement: + print('- Finished computing R-scores') + if message_passing: + if (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1))): + print('- R score is conserved up to relative tolerance 1e-5') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-4)): + print('- R score is conserved up to relative tolerance 1e-4') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-3)): + print('- R score is conserved up to relative tolerance 1e-3') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-2)): + print('- R score is conserved up to relative tolerance 1e-2') + elif (torch.allclose(torch.transpose(R_previous[output_neuron],0,1).sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-1)): + print('- R score is conserved up to relative tolerance 1e-1') + else: + if (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1))): + print('- R score is conserved up to relative tolerance 1e-5') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-4)): + print('- R score is conserved up to relative tolerance 1e-4') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-3)): + print('- R score is conserved up to relative tolerance 1e-3') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-2)): + print('- R score is conserved up to relative tolerance 1e-2') + elif (torch.allclose(R_previous[output_neuron].sum(axis=1), R_list[output_neuron].to('cpu').sum(axis=1), rtol=1e-1)): + print('- R score is conserved up to relative tolerance 1e-1') + + if skip_connection: + input_relevance, pid_relevance, embedding_relevance = [None]*len(R_list), [None]*len(R_list), [None]*len(R_list) + for output_neuron in range(len(R_list)): + input_relevance[output_neuron] = R_previous[output_neuron][:,:12] + pid_relevance[output_neuron] = R_previous[output_neuron][:,12:18] + embedding_relevance[output_neuron] = R_previous[output_neuron][:,18:] + return input_relevance, pid_relevance, embedding_relevance + + return R_previous + + @staticmethod + def message_passing_rule(self, layer, input, R, big_list, edge_index, edge_weight, after_message, before_message, index, outpath, load_model): + + # first time you hit message passing: construct and start filling the big tensor from scratch + if len(big_list)==0: + # big_list = [[torch.zeros(R[0].shape[0],R[0].shape[1])]*len(R)]*R[0].shape[0] # this is wrong but it's faster for debugging (the correct way is the following line) + big_list = [[torch.zeros(R[0].shape[0],R[0].shape[1]) for i in range(len(R))] for i in range(R[0].shape[0])] + print('- Finished allocating memory for the big tensor of R-scores for all nodes') + + for node_i in range(len(big_list)): + 
for output_neuron in range(len(big_list[0])): + big_list[node_i][output_neuron][node_i] = R[output_neuron][node_i] + print('- Finished initializing the big tensor') + + # build the adjacency matrix + A = to_dense_adj(edge_index, edge_attr=edge_weight)[0] # adjacency matrix + + if torch.allclose(torch.matmul(A, before_message), after_message, rtol=1e-3): + print("- Adjacency matrix is correctly computed") + + # # the following saves a version of the R-scores before the message passing + # torch.save(big_list, outpath + '/R_score_layer_before_msg_passing.pt') + + # modify the big tensor based on message passing rule + for node_i in tqdm(range(len(big_list))): + big_list[node_i] = self.eps_rule(layer, torch.transpose(before_message,0,1), big_list[node_i], index, output_layer=False, activation_layer=False, print_statement=True, skip_connection=False, adjacency_matrix=A, message_passing=True) + print(f'- Finished computing R-score for node {node_i+1}/{len(big_list)} for the message passing..') + print('- Finished computing R-scores for the message passing layer') + return big_list + + + """ + explanation functions + """ + + def explain(self, + to_explain:dict, + save:bool=True, + save_to:str="./relevance.pt", + sort_nodes_by:int=0, + signal=torch.tensor([1,0,0,0,0,0],dtype=torch.float32).to(device), + return_result:bool=False): + + start_index = self.model.n_layers ########################## + outpath = to_explain["outpath"]+'/'+to_explain["load_model"] + + print('Total number of layers (including activation layers):', start_index) + + # store the R-scores for the output layer (they are basically the model predictions) + torch.save(to_explain["pred_p4"].detach(), outpath + f'/R_score_layer{start_index+1}.pt') + + ### loop over each single layer + big_list = [] + for index in range(start_index+1, 1,-1): + if index==start_index+1: + R, big_list = self.explain_single_layer(to_explain["pred_p4"].detach(), to_explain, big_list, outpath, start_index+1, index) + else: + R, big_list = self.explain_single_layer(R, to_explain, big_list, outpath, start_index+1, index) + print("Finished explaining all layers.") + return big_list # returns the heatmaps for layer0 (i.e. 
input features) + + def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_index, index=None, name=None): + # preparing variables required for computing LRP + layer = self.model.get_layer(index=index,name=name) + + if name is None: + name = self.model.index2name(index) + if index is None: + index = self.model.name2index(name) + + input = to_explain['A'][name].detach() + + if index==output_layer_index: + output_layer_bool = True + else: + output_layer_bool = False + + #### THERE ARE 4 SPECIAL LAYERS TO BE TREATED UNIQUELY + # (1) for skip connection purposes + if 'nn3.0' in str(name): + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer} - Skip connection") + input_relevance, pid_relevance, embedding_relevance = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True, skip_connection=True) + + torch.save(input_relevance, outpath + f'/input_relevance.pt') + torch.save(embedding_relevance, outpath + f'/embedding_relevance.pt') + + return pid_relevance, big_list + + # (2) for skip connection purposes + if 'nn2.0' in str(name): + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer}") + R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) + + # add the embedding_relevance computed in the nn3.0 skip connection + embedding_relevance = torch.load(outpath + f'/embedding_relevance.pt', map_location=torch.device('cpu')) + + for i in range(len(R)): + R[i] = R[i] + embedding_relevance[i] + + return R, big_list + + # (3) for skip connection purposes + if 'nn1.0' in str(name): + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer}") + + # add the input_relevance computed in the nn3.0 skip connection + input_relevance = torch.load(outpath + f'/input_relevance.pt', map_location=torch.device('cpu')) + + for node_i in tqdm(range(len(big_list))): + big_list[node_i] = self.eps_rule(layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) + for i in range(len(R)): + # for row in range(len(big_list[node_i][i])): + # # check if row is nonzero + # if big_list[node_i][i][row,:].sum()!=0: + # big_list[node_i][i][row,:] = big_list[node_i][i][row,:] + input_relevance[i][row,:] + big_list[node_i][i][node_i,:] = big_list[node_i][i][node_i,:] + input_relevance[i][node_i,:] + + return R, big_list + + # (4) Message Passing: it works out of the box that the conv1.lin_s layer which we don't care about is in the same place of the message passing.. 
so we can just replace its action + if 'conv1.lin_s' in str(name): + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: Message Passing") + big_list = self.message_passing_rule(self, layer, input, R, big_list, to_explain["edge_index"].detach(), to_explain["edge_weight"].detach(), to_explain["after_message"].detach(), to_explain["before_message"].detach(), index, outpath, to_explain["load_model"]) + return R, big_list + + # All the other layers: + print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer}") + + if len(big_list)==0: # if you haven't hit the message passing step yet + if 'Linear' in str(layer): + R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) + elif 'LeakyReLU' or 'ELU' in str(layer): + R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=True, print_statement=True) + else: + # in this way: big_list is a list of length 5k (nodes) that contains a list of length 6 (output_neurons) that contains tensors (5k,x) which are the heatmap of R-scores + for node_i in tqdm(range(len(big_list))): + if 'Linear' in str(layer): + big_list[node_i] = self.eps_rule(layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) + elif 'LeakyReLU' or 'ELU' in str(layer): + big_list[node_i] = self.eps_rule(layer, input, big_list[node_i], index, output_layer_bool, activation_layer=True, print_statement=False) + return R, big_list + +##----------------------------------------------------------------------------- +# # big_list is a list of length 5k +# # each element is another list of length 6 (corresponding to each of the output pid probability prediction) +# # each element of that second list is a tensor of shape (5k,x) where x is the dimension of the latent space diff --git a/mlpf/pytorch_delphes/LRP/gravnet_LRP.py b/mlpf/pytorch_delphes/LRP/gravnet_LRP.py new file mode 100644 index 000000000..5a09981fd --- /dev/null +++ b/mlpf/pytorch_delphes/LRP/gravnet_LRP.py @@ -0,0 +1,123 @@ +from typing import Optional, Union +from torch_geometric.typing import OptTensor, PairTensor, PairOptTensor + +import torch +from torch import Tensor +from torch.nn import Linear +from torch_scatter import scatter +from torch_geometric.nn.conv import MessagePassing + +try: + from torch_cluster import knn +except ImportError: + knn = None + +# copied it from pytorch_geometric source code +# ADDED: retrieve edge_index, retrieve edge_weight +# ADDED: retrieve before and after message MessagePassing +# CHANGED: used reduce='sum' instead of reduce='mean' in the message passing +# REMOVED: skip connection +class GravNetConv(MessagePassing): + r"""The GravNet operator from the `"Learning Representations of Irregular + Particle-detector Geometry with Distance-weighted Graph + Networks" `_ paper, where the graph is + dynamically constructed using nearest neighbors. + The neighbors are constructed in a learnable low-dimensional projection of + the feature space. + A second projection of the input feature space is then propagated from the + neighbors to each vertex using distance weights that are derived by + applying a Gaussian function to the distances. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + space_dimensions (int): The dimensionality of the space used to + construct the neighbors; referred to as :math:`S` in the paper. 
+ propagate_dimensions (int): The number of features to be propagated + between the vertices; referred to as :math:`F_{\textrm{LR}}` in the + paper. + k (int): The number of nearest neighbors. + num_workers (int): Number of workers to use for k-NN computation. + Has no effect in case :obj:`batch` is not :obj:`None`, or the input + lies on the GPU. (default: :obj:`1`) + **kwargs (optional): Additional arguments of + :class:`torch_geometric.nn.conv.MessagePassing`. + """ + def __init__(self, in_channels: int, out_channels: int, + space_dimensions: int, propagate_dimensions: int, k: int, + num_workers: int = 1, **kwargs): + super(GravNetConv, self).__init__(flow='target_to_source', **kwargs) + + if knn is None: + raise ImportError('`GravNetConv` requires `torch-cluster`.') + + self.in_channels = in_channels + self.out_channels = out_channels + self.k = k + self.num_workers = num_workers + + self.lin_s = Linear(in_channels, space_dimensions) + self.lin_h = Linear(in_channels, propagate_dimensions) + self.lin_p = Linear(propagate_dimensions, out_channels) + + self.reset_parameters() + + def reset_parameters(self): + self.lin_s.reset_parameters() + self.lin_h.reset_parameters() + self.lin_p.reset_parameters() + + + def forward( + self, x: Union[Tensor, PairTensor], + batch: Union[OptTensor, Optional[PairTensor]] = None) -> Tensor: + """""" + + is_bipartite: bool = True + if isinstance(x, Tensor): + x: PairTensor = (x, x) + is_bipartite = False + assert x[0].dim() == 2, 'Static graphs not supported in `GravNetConv`.' + + b: PairOptTensor = (None, None) + if isinstance(batch, Tensor): + b = (batch, batch) + elif isinstance(batch, tuple): + assert batch is not None + b = (batch[0], batch[1]) + + h_l: Tensor = self.lin_h(x[0]) + + s_l: Tensor = self.lin_s(x[0]) + s_r: Tensor = self.lin_s(x[1]) if is_bipartite else s_l + + edge_index = knn(s_l, s_r, self.k, b[0], b[1], + num_workers=self.num_workers) + + edge_weight = (s_l[edge_index[1]] - s_r[edge_index[0]]).pow(2).sum(-1) + edge_weight = torch.exp(-10. 
* edge_weight) # 10 gives a better spread + + # propagate_type: (x: OptPairTensor, edge_weight: OptTensor) + out = self.propagate(edge_index, x=(h_l, None), + edge_weight=edge_weight, + size=(s_l.size(0), s_r.size(0))) + + return self.lin_p(out), edge_index, edge_weight, out, h_l + + + def message(self, x_j: Tensor, edge_weight: Tensor) -> Tensor: + return x_j * edge_weight.unsqueeze(1) + + def aggregate(self, inputs: Tensor, index: Tensor, + dim_size: Optional[int] = None) -> Tensor: + out_mean = scatter(inputs, index, dim=self.node_dim, dim_size=dim_size, + reduce='sum') + out_max = scatter(inputs, index, dim=self.node_dim, dim_size=dim_size, + reduce='max') + # return torch.cat([out_mean, out_max], dim=-1) + return out_mean + + def __repr__(self): + return '{}({}, {}, k={})'.format(self.__class__.__name__, + self.in_channels, self.out_channels, + self.k) diff --git a/mlpf/pytorch_delphes/LRP/hooks.py b/mlpf/pytorch_delphes/LRP/hooks.py new file mode 100644 index 000000000..6fc7fa279 --- /dev/null +++ b/mlpf/pytorch_delphes/LRP/hooks.py @@ -0,0 +1,106 @@ +from glob import glob +import sys, os +import os.path as osp +import pickle, math, time, numba, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib, mplhep +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +#Check if the GPU configuration has been provided +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + import setGPU + if multi_gpu: + print('Will use multi_gpu..') + print("Let's use", torch.cuda.device_count(), "GPUs!") + else: + print('Will use single_gpu..') +except Exception as e: + print("Could not import setGPU, running CPU-only") + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') +else: + device = torch.device('cpu') + +import torch_geometric +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch_geometric.nn import GravNetConv +from torch.utils.data import random_split +import torch_cluster + +sys.path.insert(1, '../') +sys.path.insert(1, '../../../plotting/') +sys.path.insert(1, '../../../mlpf/plotting/') +import args +from args import parse_args +from graph_data_delphes import PFGraphDataset, one_hot_embedding +from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd +import evaluate +from evaluate import make_plots, Evaluate +from plot_utils import plot_confusion_matrix +from model_LRP import PFNet7 + +from LRP import LRP +from model_io import model_io +import torch +import torch.nn as nn + +activation={} +def get_activation(name): + def hook(model, input, output): + activation[name] = input[0] + return hook + +class myNet(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(3,10,2, stride = 2) + self.relu = nn.ReLU() + self.flatten = lambda x: x.view(-1) + self.fc1 = nn.Linear(160,5) + + + + def forward(self, x): + x = self.relu(self.conv(x)) + x.register_hook(lambda grad : torch.clamp(grad, min = 0)) #No gradient shall be backpropagated + #conv outside less than 0 + + # print whether there is any negative grad + s=x.register_hook(lambda grad: torch.zeros(grad.shape)) + return 
self.fc1(self.flatten(x)) + + +net = myNet() +print(net) + +for name, param in net.named_parameters(): + # if the param is from a linear and is a bias + if "fc" in name and "bias" in name: + param.register_hook(lambda grad: torch.zeros(grad.shape)) + + +out = net(torch.randn(1,3,8,8)) + +(1 - out).mean().backward() + +print("The biases are", net.fc1.bias.grad) #bias grads are zero + + +print(s) diff --git a/mlpf/pytorch_delphes/LRP/main_clf.py b/mlpf/pytorch_delphes/LRP/main_clf.py new file mode 100644 index 000000000..5644e5c15 --- /dev/null +++ b/mlpf/pytorch_delphes/LRP/main_clf.py @@ -0,0 +1,360 @@ +from glob import glob +import sys, os +import os.path as osp +import pickle as pkl +import _pickle as cPickle +import math, time, numba, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib, mplhep +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +#Check if the GPU configuration has been provided +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + import setGPU + if multi_gpu: + print('Will use multi_gpu..') + print("Let's use", torch.cuda.device_count(), "GPUs!") + else: + print('Will use single_gpu..') +except Exception as e: + print("Could not import setGPU, running CPU-only") + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') + print("GPU model:", torch.cuda.get_device_name(0)) +else: + device = torch.device('cpu') + +import torch_geometric +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch_geometric.nn import GravNetConv +from torch.utils.data import random_split +import torch_cluster +import networkx as nx +from torch_geometric.utils.convert import to_networkx +from torch_geometric.utils import to_dense_adj + +sys.path.insert(1, '../') +sys.path.insert(1, '../../../plotting/') +sys.path.insert(1, '../../../mlpf/plotting/') + +import args +from args import parse_args +from graph_data_delphes import PFGraphDataset, one_hot_embedding +from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd + +from model_LRP_reg import PFNet7 +from LRP_clf_gpu import LRP_clf +from model_io import model_io + +import networkx as nx +from torch_geometric.utils.convert import to_networkx + +# NOTE: this script works by loading an already trained model + +#Get a unique directory name for the model +def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, task, title): + model_name = type(model).__name__ + model_params = sum(p.numel() for p in model.parameters()) + import hashlib + model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] + model_user = os.environ['USER'] + + model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_{}'.format( + model_name, + target_type, + n_train, + n_epochs, + batch_size, + lr, + task, + title) + return model_fname + +def map_classid_to_classname(id): + if id==0: + return 'null' + if id==1: + return 'charged hadron' + if id==2: + return 'neutral hadron' + if id==3: + return 'photon' + if id==4: + return 'electron' + if id==5: + return 'muon' + +if __name__ == "__main__": + + # args = parse_args() + + # the next part 
initializes some args values (to run the script not from terminal) + class objectview(object): + def __init__(self, d): + self.__dict__ = d + + # args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 2, 'n_epochs': 2, 'patience': 100, 'hidden_dim':256, 'input_encoding': 12, 'encoding_dim': 64, + # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../../test_tmp_delphes/data/pythia8_qcd', + # 'outpath': '../../../prp/models/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 1, 'dropout': 0, + # 'space_dim': 4, 'propagate_dimensions': 22,'nearest': 16, 'overwrite': True, + # 'load_epoch': 9, 'load_model': 'LRP_reg_PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + # 'classification_only': True, 'nn1': True, 'conv2': False, 'nn3': False, 'title': '', + # 'explain': True, 'load': False, 'make_heatmaps': False}) + + args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 2, 'n_epochs': 2, 'patience': 100, 'hidden_dim':256, 'input_encoding': 12, 'encoding_dim': 64, + 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../../../test_tmp_delphes/data/pythia8_qcd', + 'outpath': '../../../../test_tmp_delphes/experiments/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 1, 'dropout': 0, + 'space_dim': 4, 'propagate_dimensions': 22,'nearest': 16, 'overwrite': True, + 'load_epoch': 14, 'load_model': 'LRP_clf_PFNet7_gen_ntrain_1_nepochs_15_batch_size_1_lr_0.001_alpha_0.0002_clf_noskip_nn1', + 'classification_only': True, 'nn1': True, 'conv2': False, 'nn3': False, 'title': '', + 'explain': False, 'load': True, 'make_heatmaps': True}) + + # define the dataset (assumes the data exists as .pt files in "processed") + print('Processing the data..') + full_dataset_ttbar = PFGraphDataset(args.dataset) + full_dataset_qcd = PFGraphDataset(args.dataset_qcd) + + # constructs a loader from the data to iterate over batches + print('Constructing data loaders..') + train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) + test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) + + # element parameters + input_dim = 12 + + #one-hot particle ID and momentum + output_dim_id = 6 + output_dim_p4 = 6 + + patience = args.patience + + model_classes = {"PFNet7": PFNet7} + + model_class = model_classes[args.model] + model_kwargs = {'input_dim': input_dim, + 'hidden_dim': args.hidden_dim, + 'input_encoding': args.input_encoding, + 'encoding_dim': args.encoding_dim, + 'output_dim_id': output_dim_id, + 'output_dim_p4': output_dim_p4, + 'space_dim': args.space_dim, + 'propagate_dimensions': args.propagate_dimensions, + 'nearest': args.nearest} + + print('Loading a previously trained model..') + model = model_class(**model_kwargs) + outpath = args.outpath + args.load_model + PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' + + state_dict = torch.load(PATH, map_location=device) + + # if model was trained using DataParallel then we have to load it differently + if "DataParallel" in args.load_model: + state_dict = torch.load(PATH, map_location=device) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove module. 
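+            # nn.DataParallel stores the wrapped network under a ".module" attribute, so every key
+            # in the saved state_dict carries a "module." prefix (e.g. "module.nn1.0.weight" instead
+            # of "nn1.0.weight"; the exact key names depend on the model). k[7:] simply drops those
+            # 7 characters so the keys match the plain, un-wrapped model. An equivalent, more
+            # explicit form of the same line would be:
+            #     name = k.replace("module.", "", 1)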
+ new_state_dict[name] = v + # print('name is:', name) + state_dict=new_state_dict + + model.load_state_dict(state_dict) + model.to(device) + + if args.explain: + model.eval() + print(model) + + # create some hooks to retrieve intermediate activations + activation = {} + hooks={} + + def get_activation(name): + def hook(model, input, output): + activation[name] = input[0] + return hook + + for name, module in model.named_modules(): + if (type(module)==nn.Linear) or (type(module)==nn.LeakyReLU) or (type(module)==nn.ELU): + hooks[name] = module.register_forward_hook(get_activation("." + name)) + + for i, batch in enumerate(train_loader): + + if multi_gpu: + X = batch + else: + X = batch.to(device) + + if i==0: + # code can be written better + # basically i run at least one forward pass to get the activations to use their shape in defining the LRP layers + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model(X) + model = model_io(model,state_dict,dict(),activation) + explainer = LRP_clf(model) + + else: + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model.model(X) + + to_explain = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), + "y": gen_ids_one_hot, "pred": pred_ids_one_hot, + "edge_index": edge_index, "edge_weight": edge_weight, "after_message": after_message, "before_message": before_message, + "outpath": args.outpath, "load_model": args.load_model} + + model.set_dest(to_explain["A"]) + + big_list = explainer.explain(to_explain) + + torch.save(big_list, outpath + f'/big_list.pt') + torch.save(to_explain, outpath + f'/to_explain.pt') + + break # explain only one single event + + elif args.load: + + big_list = torch.load(outpath + f'/big_list.pt', map_location=device) + to_explain = torch.load(outpath + f'/to_explain.pt', map_location=device) + + gen_ids_one_hot = to_explain["y"] + pred_ids_one_hot = to_explain["pred"] + X = to_explain["inputs"] + + if args.make_heatmaps: + # make directories to hold the heatmaps + print('Making heatmaps..') + for i in range(6): + if not osp.isdir(outpath + f'/class{str(i)}'): + os.makedirs(outpath + f'/class{str(i)}') + for j in range(6): + if not osp.isdir(outpath + f'/class{str(i)}'+f'/pid{str(j)}'): + os.makedirs(outpath + f'/class{str(i)}'+f'/pid{str(j)}') + + # make heatmaps + pred_ids = pred_ids_one_hot.argmax(axis=1) + gen_ids = gen_ids_one_hot.argmax(axis=1) + + for output_neuron in range(output_dim_id): + list0, list1, list2, list3, list4, list5 = [], [], [], [], [], [] + dist0, dist1, dist2, dist3, dist4, dist5 = [], [], [], [], [], [] + + for i,id in enumerate(gen_ids): + R_cat_feat_cat_pred = torch.cat([big_list[i][output_neuron].to(device), X['x'].to(device), pred_ids_one_hot.to(device), torch.arange(start=0, end=X['x'].shape[0], step=1).float().reshape(-1,1).to(device)], dim=1) + if id==0: + list0.append(R_cat_feat_cat_pred) + dist0.append(i) + if id==1: + list1.append(R_cat_feat_cat_pred) + dist1.append(i) + if id==2: + list2.append(R_cat_feat_cat_pred) + dist2.append(i) + if id==3: + list3.append(R_cat_feat_cat_pred) + dist3.append(i) + if id==4: + list4.append(R_cat_feat_cat_pred) + dist4.append(i) + if id==5: + list5.append(R_cat_feat_cat_pred) + dist5.append(i) + + list=[list0,list1,list2,list3,list4,list5] + dist=[dist0,dist1,dist2,dist3,dist4,dist5] + + for pid in range(6): + for j in range(len(list[pid])): # iterating over the nodes in a graph + # to 
keep non-zero rows + non_empty_mask = list[pid][j][:,:12].abs().sum(dim=1).bool() + harvest = list[pid][j][non_empty_mask,:] + pos = dist[pid][j] + + def make_list(t): + l = [] + for elem in t: + if elem==1: + l.append('cluster') + if elem==2: + l.append('track') + return l + + node_types = make_list(harvest[:,12]) + + ### TODO: Not the best way to do it.. I am assuming here that only charged hadrons are connected to all tracks + if pid==1: + features = ["type", " pt", "eta", + "sphi", "cphi", "E", "eta_out", "sphi_out", "cphi_out", "charge", "is_gen_mu", "is_gen_el"] + else: + features = ["type", "Et", "eta", "sphi", "cphi", "E", "Eem", "Ehad", "padding", "padding", "padding", "padding"] + + + fig, ax = plt.subplots() + fig.tight_layout() + if pid==0: + ax.set_title('Heatmap for the "'+map_classid_to_classname(output_neuron)+'" prediction of a true null') + if pid==1: + ax.set_title('Heatmap for the "'+map_classid_to_classname(output_neuron)+'" prediction of a true charged hadron') + if pid==2: + ax.set_title('Heatmap for the "'+map_classid_to_classname(output_neuron)+'" prediction of a true neutral hadron') + if pid==3: + ax.set_title('Heatmap for the "'+map_classid_to_classname(output_neuron)+'" prediction of a true photon') + if pid==4: + ax.set_title('Heatmap for the "'+map_classid_to_classname(output_neuron)+'" prediction of a true electron') + if pid==5: + ax.set_title('Heatmap for the "'+map_classid_to_classname(output_neuron)+'" prediction of a true muon') + ax.set_xticks(np.arange(len(features))) + ax.set_yticks(np.arange(len(node_types))) + for col in range(len(features)): + for row in range(len(node_types)): + text = ax.text(col, row, round(harvest[row,12+col].item(),2), + ha="center", va="center", color="w") + # ... and label them with the respective list entries + ax.set_xticklabels(features) + ax.set_yticklabels(node_types) + plt.xlabel("\noutput prediction:{R} \nposition of node is row # {harvest}".format(R=[round(num,2) for num in harvest[j, 24:30].tolist()], harvest=((harvest[:,30] == pos).nonzero(as_tuple=True)[0].item()+1))) + plt.imshow(torch.abs(harvest[:,:12]*10**7).detach().cpu().numpy(), interpolation="nearest", cmap='copper') + plt.colorbar() + fig.set_size_inches(19, 10) + plt.savefig(outpath + f'/class{str(output_neuron)}'+f'/pid{str(pid)}'+f'/sample{str(j)}.jpg') + plt.close(fig) + + if j==2: + break + + +# # ------------------------------------------------------------------------------------------------ +# # if you got all the intermediate R-score heatmaps stored then you can check if these are equal as a check of conservation across all layers: +# print(R16[0].sum(axis=1)[0]) +# print(R15[0].sum(axis=1)[0]) +# print(R14[0].sum(axis=1)[0]) +# print(R13[0].sum(axis=1)[0]) +# print(R13[0].sum(axis=1)[0]) +# print(R12[0].sum(axis=1)[0]) +# print(R11[0].sum(axis=1)[0]) +# print(R10[0].sum(axis=1)[0]) +# print(R9[0].sum(axis=1)[0]) +# print(R8[0].sum(axis=1)[0]) +# print(R_score_layer_before_msg_passing[0][0].sum(axis=0).sum()) +# print(R7[0][0].sum(axis=0).sum()) +# print(R6[0][0].sum(axis=1).sum()) +# print(R5[0][0].sum(axis=1).sum()) +# print(R4[0][0].sum(axis=1).sum()) +# print(R3[0][0].sum(axis=1).sum()) +# print(R2[0][0].sum(axis=1).sum()) +# print(R1[0][0].sum(axis=1).sum()) diff --git a/mlpf/pytorch_delphes/LRP/main_dnn.py b/mlpf/pytorch_delphes/LRP/main_dnn.py new file mode 100644 index 000000000..afabace0f --- /dev/null +++ b/mlpf/pytorch_delphes/LRP/main_dnn.py @@ -0,0 +1,227 @@ +from glob import glob +import sys, os +import os.path as osp +import pickle 
as pkl +import _pickle as cPickle +import math, time, numba, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib, mplhep +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +#Check if the GPU configuration has been provided +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + import setGPU + if multi_gpu: + print('Will use multi_gpu..') + print("Let's use", torch.cuda.device_count(), "GPUs!") + else: + print('Will use single_gpu..') +except Exception as e: + print("Could not import setGPU, running CPU-only") + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') +else: + device = torch.device('cpu') + +import torch_geometric +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch_geometric.nn import GravNetConv +from torch.utils.data import random_split +import torch_cluster + +sys.path.insert(1, '../') +sys.path.insert(1, '../../../plotting/') +sys.path.insert(1, '../../../mlpf/plotting/') +import args +from args import parse_args +from graph_data_delphes import PFGraphDataset, one_hot_embedding +from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd + +from plot_utils import plot_confusion_matrix +from model_LRP_dnn import PFNet7 + +from LRP_dnn import LRP +from model_io import model_io + +import networkx as nx +from torch_geometric.utils.convert import to_networkx +from tabulate import tabulate + +# NOTE: this script works by loading an already trained model + +#Ignore divide by 0 errors +np.seterr(divide='ignore', invalid='ignore') + +#Get a unique directory name for the model +def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, task, title): + model_name = type(model).__name__ + model_params = sum(p.numel() for p in model.parameters()) + import hashlib + model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] + model_user = os.environ['USER'] + + model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_{}'.format( + model_name, + target_type, + n_train, + n_epochs, + batch_size, + lr, + task, + title) + return model_fname + +def map_classid_to_classname(id): + if id==0: + return 'null' + if id==1: + return 'charged hadron' + if id==2: + return 'neutral hadron' + if id==3: + return 'photon' + if id==4: + return 'electron' + if id==5: + return 'muon' + +if __name__ == "__main__": + + # args = parse_args() + + # the next part initializes some args values (to run the script not from terminal) + class objectview(object): + def __init__(self, d): + self.__dict__ = d + + # args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 10, 'patience': 100, 'hidden_dim':256, 'hidden_dim_nn1':64, 'input_encoding': 12, 'encoding_dim': 64, + # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../../test_tmp_delphes/data/pythia8_qcd', + # 'outpath': '../../../prp/models/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, 'dropout': 0.3, + # 'space_dim': 8, 'propagate_dimensions': 22, 'nearest': 40, 'overwrite': True, + # 'load': True, 
'load_epoch': 0, 'load_model': 'LRP_DNN_PFNet7_gen_ntrain_1_nepochs_1_batch_size_1_lr_0.001_alpha_0.0002_both_dnnnoskip_nn1_nn3_nn4', + # 'evaluate': False, 'evaluate_on_cpu': False, 'classification_only': False, 'nn1': True, 'nn3': True, 'nn4': True, 'title': 'dnn', 'explain': True}) + + args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 10, 'patience': 100, 'hidden_dim':256, 'hidden_dim_nn1':64, 'input_encoding': 12, 'encoding_dim': 64, + 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../../../test_tmp_delphes/data/pythia8_qcd', + 'outpath': '../../../../test_tmp_delphes/experiments/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, 'dropout': 0.3, + 'space_dim': 8, 'propagate_dimensions': 22, 'nearest': 40, 'overwrite': True, + 'load': True, 'load_epoch': 0, 'load_model': 'LRP_DNN_PFNet7_gen_ntrain_1_nepochs_1_batch_size_1_lr_0.001_alpha_0.0002_both_dnnnoskip_nn1_nn3_nn4', + 'evaluate': False, 'evaluate_on_cpu': False, 'classification_only': False, 'nn1': True, 'nn3': True, 'nn4': True, 'title': 'dnn', 'explain': True}) + + # define the dataset (assumes the data exists as .pt files in "processed") + print('Processing the data..') + full_dataset_ttbar = PFGraphDataset(args.dataset) + full_dataset_qcd = PFGraphDataset(args.dataset_qcd) + + # constructs a loader from the data to iterate over batches + print('Constructing data loaders..') + train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) + test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) + + # element parameters + input_dim = 12 + + #one-hot particle ID and momentum + output_dim_id = 6 + output_dim_p4 = 6 + + patience = args.patience + + model_classes = {"PFNet7": PFNet7} + + model_class = model_classes[args.model] + model_kwargs = {'input_dim': input_dim, + 'hidden_dim': args.hidden_dim, + 'output_dim_id': output_dim_id, + 'output_dim_p4': output_dim_p4} + + print('Loading a previously trained model..') + model = model_class(**model_kwargs) + outpath = args.outpath + args.load_model + PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' + + state_dict = torch.load(PATH, map_location=device) + + # if model was trained using DataParallel then we have to load it differently + if "DataParallel" in args.load_model: + state_dict = torch.load(PATH, map_location=device) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove module. + new_state_dict[name] = v + # print('name is:', name) + state_dict=new_state_dict + + model.load_state_dict(state_dict) + + if args.explain: + model.eval() + print(model) + + signal =torch.tensor([1,0,0,0,0,0],dtype=torch.float32).to(device) + + # create some hooks to retrieve intermediate activations + activation = {} + hooks={} + + def get_activation(name): + def hook(model, input, output): + activation[name] = input[0] + return hook + + for name, module in model.named_modules(): + if (type(module)==nn.Linear) or (type(module)==nn.LeakyReLU) or (type(module)==nn.ELU): + hooks[name] = module.register_forward_hook(get_activation("." 
+ name)) + print(name) + + for i, batch in enumerate(train_loader): + t0 = time.time() + + if multi_gpu: + X = batch + else: + X = batch.to(device) + + if i==0: + # code can be written better + # basically i run at least one forward pass to get the activations to use their shape in defining the LRP layers + pred_ids_one_hot, pred_p4, target_ids_one_hot, target_p4, cand_ids_one_hot, cand_p4 = model(X) + model=model_io(model,state_dict,dict(),activation) + explainer=LRP(model) + + else: + pred_ids_one_hot, pred_p4, target_ids_one_hot, target_p4, cand_ids_one_hot, cand_p4 = model.model(X) + + to_explain={"A":activation,"inputs":dict(x=X.x, + batch=X.batch),"y":target_ids_one_hot,"R":dict(), "pred":pred_ids_one_hot, + "outpath":args.outpath, "load_model":args.load_model} + + model.set_dest(to_explain["A"]) + + explainer.explain(to_explain,save=False,return_result=True, signal=signal) + + break + +## ----------------------------------------------------------- +# # to retrieve a stored variable in pkl file +# import _pickle as cPickle +# with open('../../../prp/models/LRP/LRP_DNN_PFNet7_gen_ntrain_1_nepochs_1_batch_size_1_lr_0.001_alpha_0.0002_both_dnnnoskip_nn1_nn3_nn4/R_scorez.pkl', 'rb') as f: # Python 3: open(..., 'rb') +# s = cPickle.load(f) diff --git a/mlpf/pytorch_delphes/LRP/main_reg.py b/mlpf/pytorch_delphes/LRP/main_reg.py new file mode 100644 index 000000000..16a17ab63 --- /dev/null +++ b/mlpf/pytorch_delphes/LRP/main_reg.py @@ -0,0 +1,480 @@ +from glob import glob +import sys, os +import os.path as osp +import pickle as pkl +import _pickle as cPickle +import math, time, numba, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib, mplhep +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +#Check if the GPU configuration has been provided +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + import setGPU + if multi_gpu: + print('Will use multi_gpu..') + print("Let's use", torch.cuda.device_count(), "GPUs!") + else: + print('Will use single_gpu..') +except Exception as e: + print("Could not import setGPU, running CPU-only") + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') + print("GPU model:", torch.cuda.get_device_name(0)) +else: + device = torch.device('cpu') + +import torch_geometric +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch_geometric.nn import GravNetConv +from torch.utils.data import random_split +import torch_cluster +import networkx as nx +from torch_geometric.utils.convert import to_networkx +from torch_geometric.utils import to_dense_adj + +sys.path.insert(1, '../') +sys.path.insert(1, '../../../plotting/') +sys.path.insert(1, '../../../mlpf/plotting/') + +import args +from args import parse_args +from graph_data_delphes import PFGraphDataset, one_hot_embedding +from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd + +from model_LRP_reg import PFNet7 +from LRP_clf_gpu import LRP_clf +from LRP_reg_gpu import LRP_reg + +from model_io import model_io + +import networkx as nx +from torch_geometric.utils.convert import to_networkx + +# NOTE: this script works 
by loading an already trained model + +#Get a unique directory name for the model +def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, task, title): + model_name = type(model).__name__ + model_params = sum(p.numel() for p in model.parameters()) + import hashlib + model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] + model_user = os.environ['USER'] + + model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_{}'.format( + model_name, + target_type, + n_train, + n_epochs, + batch_size, + lr, + task, + title) + return model_fname + +def map_index_to_pid(id): + if id==0: + return 'null' + if id==1: + return 'charged hadron' + if id==2: + return 'neutral hadron' + if id==3: + return 'photon' + if id==4: + return 'electron' + if id==5: + return 'muon' + +def map_index_to_p4(index): + if index==0: + return 'charge' + if index==1: + return 'pt' + if index==2: + return 'eta' + if index==3: + return 'sin phi' + if index==4: + return 'cos phi' + if index==5: + return 'energy' + +if __name__ == "__main__": + + args = parse_args() + + # # the next part initializes some args values (to run the script not from terminal) + # class objectview(object): + # def __init__(self, d): + # self.__dict__ = d + # + # args = objectview({'n_train': 1, 'n_valid': 1, 'n_test': 2, 'n_epochs': 2, 'patience': 100, 'hidden_dim':256, 'input_encoding': 12, 'encoding_dim': 64, + # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'LRP_dataset': '../../../test_tmp_delphes/data/pythia8_ttbar', 'LRP_dataset_qcd': '../../../test_tmp_delphes/data/pythia8_qcd', + # 'LRP_outpath': '../../../prp/models/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 1, 'dropout': 0, + # 'space_dim': 4, 'propagate_dimensions': 22,'nearest': 16, 'overwrite': True, + # 'LRP_load_epoch': 9, 'LRP_load_model': 'LRP_reg_PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + # 'explain': True, 'make_heatmaps_clf': True,'make_heatmaps_reg': True, + # 'clf': True, 'reg': True}) + + # define the dataset (assumes the data exists as .pt files in "processed") + print('Processing the data..') + full_dataset_ttbar = PFGraphDataset(args.LRP_dataset) + full_dataset_qcd = PFGraphDataset(args.LRP_dataset_qcd) + + # constructs a loader from the data to iterate over batches + print('Constructing data loaders..') + train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) + test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) + + # element parameters + input_dim = 12 + + #one-hot particle ID and momentum + output_dim_id = 6 + output_dim_p4 = 6 + + patience = args.patience + + model_classes = {"PFNet7": PFNet7} + + model_class = model_classes[args.model] + model_kwargs = {'input_dim': input_dim, + 'hidden_dim': args.hidden_dim, + 'input_encoding': args.input_encoding, + 'encoding_dim': args.encoding_dim, + 'output_dim_id': output_dim_id, + 'output_dim_p4': output_dim_p4, + 'space_dim': args.space_dim, + 'propagate_dimensions': args.propagate_dimensions, + 'nearest': args.nearest} + + print('Loading a previously trained model..') + model = model_class(**model_kwargs) + outpath = args.LRP_outpath + args.LRP_load_model + PATH = outpath + '/epoch_' + str(args.LRP_load_epoch) + '_weights.pth' + + state_dict = torch.load(PATH, map_location=device) + + # if model was trained using DataParallel then we have to load it differently + if "DataParallel" in 
args.LRP_load_model: + state_dict = torch.load(PATH, map_location=device) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove module. + new_state_dict[name] = v + # print('name is:', name) + state_dict=new_state_dict + + model.load_state_dict(state_dict) + model.to(device) + + if args.explain: + model.eval() + print(model) + + # create some hooks to retrieve intermediate activations + activation = {} + hooks={} + + def get_activation(name): + def hook(model, input, output): + activation[name] = input[0] + return hook + + for name, module in model.named_modules(): + if (type(module)==nn.Linear) or (type(module)==nn.LeakyReLU) or (type(module)==nn.ELU): + hooks[name] = module.register_forward_hook(get_activation("." + name)) + + for i, batch in enumerate(train_loader): + + if multi_gpu: + X = batch + else: + X = batch.to(device) + + if i==0: + # code can be written better + # basically i run at least one forward pass to get the activations to use their shape in defining the LRP layers + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model(X) + model = model_io(model,state_dict,dict(),activation) + explainer_reg = LRP_reg(model) + explainer_clf = LRP_clf(model) + + else: + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model.model(X) + + if args.LRP_reg: + print('Explaining the p4 predictions:') + to_explain_reg = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), + "gen_p4": gen_p4.detach(), "gen_id": gen_ids_one_hot.detach(), + "pred_p4": pred_p4.detach(), "pred_id": pred_ids_one_hot.detach(), + "edge_index": edge_index.detach(), "edge_weight": edge_weight.detach(), "after_message": after_message.detach(), "before_message": before_message.detach(), + "outpath": args.LRP_outpath, "load_model": args.LRP_load_model} + + model.set_dest(to_explain_reg["A"]) + + big_list_reg = explainer_reg.explain(to_explain_reg) + torch.save(big_list_reg, outpath + f'/big_list_reg.pt') + torch.save(to_explain_reg, outpath + f'/to_explain_reg.pt') + + if args.LRP_clf: + print('Explaining the pid predictions:') + to_explain_clf = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), + "gen_p4": gen_p4.detach(), "gen_id": gen_ids_one_hot.detach(), + "pred_p4": pred_p4.detach(), "pred_id": pred_ids_one_hot.detach(), + "edge_index": edge_index.detach(), "edge_weight": edge_weight.detach(), "after_message": after_message.detach(), "before_message": before_message.detach(), + "outpath": args.LRP_outpath, "load_model": args.LRP_load_model} + + model.set_dest(to_explain_clf["A"]) + + big_list_clf = explainer_clf.explain(to_explain_clf) + torch.save(big_list_clf, outpath + f'/big_list_clf.pt') + torch.save(to_explain_clf, outpath + f'/to_explain_clf.pt') + + break # explain only one single event + + if args.make_heatmaps_reg: + print('Making heatmaps for regression..') + + # load the necessary R-scores + big_list_reg = torch.load(outpath + f'/big_list_reg.pt', map_location=device) + to_explain_reg = torch.load(outpath + f'/to_explain_reg.pt', map_location=device) + + X = to_explain_reg["inputs"] + gen_ids_one_hot = to_explain_reg["gen_id"] + pred_ids_one_hot = to_explain_reg["pred_id"] + + gen_ids = gen_ids_one_hot.argmax(axis=1) + pred_ids = pred_ids_one_hot.argmax(axis=1) + + # make directories to hold the heatmaps + for i in range(6): + if not osp.isdir(outpath + 
f'/class{str(i)}'): + os.makedirs(outpath + f'/class{str(i)}') + for j in range(6): + if not osp.isdir(outpath + f'/class{str(i)}'+f'/p4_elem{str(j)}'): + os.makedirs(outpath + f'/class{str(i)}'+f'/p4_elem{str(j)}') + + # attempt to break down big_list onto 6 smaller lists, 1 for each pid + list0, list1, list2, list3, list4, list5 = [], [], [], [], [], [] + dist0, dist1, dist2, dist3, dist4, dist5 = [], [], [], [], [], [] + + for node_i in range(len(big_list_reg)): # iterate over the nodes + + if gen_ids[node_i]==0: # if it's a null then add it to the null list + list0.append(big_list_reg[node_i]) + dist0.append(node_i) + if gen_ids[node_i]==1: # if it's a chhadron then add it to the chhadron list + list1.append(big_list_reg[node_i]) + dist1.append(node_i) + if gen_ids[node_i]==2: # if it's a nhadron then add it to the nhadron list + list2.append(big_list_reg[node_i]) + dist2.append(node_i) + if gen_ids[node_i]==3: # if it's a photon then add it to the photon list + list3.append(big_list_reg[node_i]) + dist3.append(node_i) + if gen_ids[node_i]==4: # if it's a electron then add it to the electron list + list4.append(big_list_reg[node_i]) + dist4.append(node_i) + if gen_ids[node_i]==5: # if it's a muon then add it to the muon list + list5.append(big_list_reg[node_i]) + dist5.append(node_i) + + list = [list0,list1,list2,list3,list4,list5] + dist = [dist0,dist1,dist2,dist3,dist4,dist5] + + for pid in range(output_dim_id): + + for node_i in range(len(list[pid])): # iterate over the nodes in each list + + for p4_elem in range(output_dim_p4): + R_cat_feat = torch.cat([list[pid][node_i][p4_elem].to(device), X['x'].to(device), torch.arange(start=0, end=X['x'].shape[0], step=1).float().reshape(-1,1).to(device)], dim=1) + + non_empty_mask = R_cat_feat[:,:12].abs().sum(dim=1).bool() + R_cat_feat_msk = R_cat_feat[non_empty_mask,:] # R_cat_feat masked (non-zero) + pos = dist[pid][node_i] + probability = pred_ids_one_hot[pos] + + def get_type(t): + l = [] + for elem in t: + if elem==1: + l.append('cluster') + if elem==2: + l.append('track') + return l + + node_types = get_type(R_cat_feat[:,12]) + + fig, ax = plt.subplots() + fig.tight_layout() + if (torch.argmax(probability)==pid): + ax.set_title('Heatmap for the "'+map_index_to_p4(p4_elem)+'" prediction of a correctly classified ' + map_index_to_pid(pid)) + else: + ax.set_title('Heatmap for the "'+map_index_to_p4(p4_elem)+'" prediction of an incorrectly classified ' + map_index_to_pid(pid)) + + ### TODO: Not the best way to do it.. I am assuming here that only charged hadrons are connected to all tracks + if pid==1: + features = ["type", " pt", "eta", + "sphi", "cphi", "E", "eta_out", "sphi_out", "cphi_out", "charge", "is_gen_mu", "is_gen_el"] + else: + features = ["type", "Et", "eta", "sphi", "cphi", "E", "Eem", "Ehad", "pad", "pad", "pad", "pad"] + + ax.set_xticks(np.arange(len(features))) + ax.set_yticks(np.arange(len(node_types))) + for col in range(len(features)): + for row in range(len(node_types)): + text = ax.text(col, row, round(R_cat_feat_msk[row,12+col].item(),2), + ha="center", va="center", color="w") + # ... 
and label them with the respective list entries + ax.set_xticklabels(features) + ax.set_yticklabels(node_types) + plt.xlabel("\nposition of node is row # {pos} from the top \n class prediction: {R} \n where prob = [null, chhadron, nhadron, photon, electron, muon]".format(R=[round(num,2) for num in probability.detach().tolist()], pos=((R_cat_feat_msk[:,-1] == pos).nonzero(as_tuple=True)[0].item()+1))) + plt.imshow(torch.abs(R_cat_feat_msk[:,:12]).detach().cpu().numpy(), interpolation="nearest", cmap='copper', aspect='auto') + plt.colorbar() + fig.set_size_inches(10, 10) + plt.savefig(outpath + f'/class{str(pid)}'+f'/p4_elem{str(p4_elem)}'+f'/sample{str(node_i)}.jpg') + plt.close(fig) + + if args.make_heatmaps_clf: + print('Making heatmaps for regression..') + + # load the necessary R-scores + big_list_clf = torch.load(outpath + f'/big_list_clf.pt', map_location=device) + to_explain_clf = torch.load(outpath + f'/to_explain_clf.pt', map_location=device) + + X = to_explain_clf["inputs"] + gen_ids_one_hot = to_explain_clf["gen_id"] + pred_ids_one_hot = to_explain_clf["pred_id"] + + gen_ids = gen_ids_one_hot.argmax(axis=1) + pred_ids = pred_ids_one_hot.argmax(axis=1) + + # make directories to hold the heatmaps + for i in range(6): + if not osp.isdir(outpath + f'/clf_class{str(i)}'): + os.makedirs(outpath + f'/clfclass{str(i)}') + for j in range(6): + if not osp.isdir(outpath + f'/class{str(i)}'+f'/pid{str(j)}'): + os.makedirs(outpath + f'/class{str(i)}'+f'/pid{str(j)}') + + # attempt to break down big_list onto 6 smaller lists, 1 for each pid + list0, list1, list2, list3, list4, list5 = [], [], [], [], [], [] + dist0, dist1, dist2, dist3, dist4, dist5 = [], [], [], [], [], [] + + for node_i in range(len(big_list_clf)): # iterate over the nodes + + if gen_ids[node_i]==0: # if it's a null then add it to the null list + list0.append(big_list_clf[node_i]) + dist0.append(node_i) + if gen_ids[node_i]==1: # if it's a chhadron then add it to the chhadron list + list1.append(big_list_clf[node_i]) + dist1.append(node_i) + if gen_ids[node_i]==2: # if it's a nhadron then add it to the nhadron list + list2.append(big_list_clf[node_i]) + dist2.append(node_i) + if gen_ids[node_i]==3: # if it's a photon then add it to the photon list + list3.append(big_list_clf[node_i]) + dist3.append(node_i) + if gen_ids[node_i]==4: # if it's a electron then add it to the electron list + list4.append(big_list_clf[node_i]) + dist4.append(node_i) + if gen_ids[node_i]==5: # if it's a muon then add it to the muon list + list5.append(big_list_clf[node_i]) + dist5.append(node_i) + + list = [list0,list1,list2,list3,list4,list5] + dist = [dist0,dist1,dist2,dist3,dist4,dist5] + + for pid in range(output_dim_id): + + for node_i in range(len(list[pid])): # iterate over the nodes in each list + + for output_neuron in range(output_dim_id): + R_cat_feat = torch.cat([list[pid][node_i][output_neuron].to(device), X['x'].to(device), torch.arange(start=0, end=X['x'].shape[0], step=1).float().reshape(-1,1).to(device)], dim=1) + + non_empty_mask = R_cat_feat[:,:12].abs().sum(dim=1).bool() + R_cat_feat_msk = R_cat_feat[non_empty_mask,:] # R_cat_feat masked (non-zero) + pos = dist[pid][node_i] + probability = pred_ids_one_hot[pos] + + def get_type(t): + l = [] + for elem in t: + if elem==1: + l.append('cluster') + if elem==2: + l.append('track') + return l + + node_types = get_type(R_cat_feat[:,12]) + + fig, ax = plt.subplots() + fig.tight_layout() + if (torch.argmax(probability)==pid): + ax.set_title('Heatmap for the 
"'+map_index_to_pid(output_neuron)+'" prediction of a correctly classified ' + map_index_to_pid(pid)) + else: + ax.set_title('Heatmap for the "'+map_index_to_pid(output_neuron)+'" prediction of an incorrectly classified ' + map_index_to_pid(pid)) + + ### TODO: Not the best way to do it.. I am assuming here that only charged hadrons are connected to all tracks + if pid==1: + features = ["type", " pt", "eta", + "sphi", "cphi", "E", "eta_out", "sphi_out", "cphi_out", "charge", "is_gen_mu", "is_gen_el"] + else: + features = ["type", "Et", "eta", "sphi", "cphi", "E", "Eem", "Ehad", "pad", "pad", "pad", "pad"] + + ax.set_xticks(np.arange(len(features))) + ax.set_yticks(np.arange(len(node_types))) + for col in range(len(features)): + for row in range(len(node_types)): + text = ax.text(col, row, round(R_cat_feat_msk[row,12+col].item(),2), + ha="center", va="center", color="w") + # ... and label them with the respective list entries + ax.set_xticklabels(features) + ax.set_yticklabels(node_types) + plt.xlabel("\nposition of node is row # {pos} from the top \n class prediction: {R} \n where prob = [null, chhadron, nhadron, photon, electron, muon]".format(R=[round(num,2) for num in probability.detach().tolist()], pos=((R_cat_feat_msk[:,-1] == pos).nonzero(as_tuple=True)[0].item()+1))) + plt.imshow(torch.abs(R_cat_feat_msk[:,:12]).detach().cpu().numpy(), interpolation="nearest", cmap='copper', aspect='auto') + plt.colorbar() + fig.set_size_inches(10, 10) + plt.savefig(outpath + f'/class{str(pid)}'+f'/pid{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') + plt.close(fig) + +# # ------------------------------------------------------------------------------------------------ +# # if you got all the intermediate R-score heatmaps stored then you can check if these are equal as a check of conservation across all layers: +# print(R16[0].sum(axis=1)[0]) +# print(R15[0].sum(axis=1)[0]) +# print(R14[0].sum(axis=1)[0]) +# print(R13[0].sum(axis=1)[0]) +# print(R13[0].sum(axis=1)[0]) +# print(R12[0].sum(axis=1)[0]) +# print(R11[0].sum(axis=1)[0]) +# print(R10[0].sum(axis=1)[0]) +# print(R9[0].sum(axis=1)[0]) +# print(R8[0].sum(axis=1)[0]) +# print(R_score_layer_before_msg_passing[0][0].sum(axis=0).sum()) +# print(R7[0][0].sum(axis=0).sum()) +# print(R6[0][0].sum(axis=1).sum()) +# print(R5[0][0].sum(axis=1).sum()) +# print(R4[0][0].sum(axis=1).sum()) +# print(R3[0][0].sum(axis=1).sum()) +# print(R2[0][0].sum(axis=1).sum()) +# print(R1[0][0].sum(axis=1).sum()) diff --git a/mlpf/pytorch_delphes/LRP/model_LRP_clf.py b/mlpf/pytorch_delphes/LRP/model_LRP_clf.py new file mode 100644 index 000000000..997ba8d65 --- /dev/null +++ b/mlpf/pytorch_delphes/LRP/model_LRP_clf.py @@ -0,0 +1,98 @@ +import numpy as np +import mplhep + +import torch +import torch_geometric + +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv +from torch_geometric.nn import TopKPooling, SAGPooling, SGConv +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch.utils.data import random_split + +#from torch_geometric.nn import GravNetConv # if you want to get it from source code (won't be able to retrieve the adjacency matrix) +from gravnet_LRP import GravNetConv +from torch_geometric.nn 
import GraphConv + +#Model with gravnet clustering +class PFNet7(nn.Module): + def __init__(self, + input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, + output_dim_id=6, + output_dim_p4=6, + space_dim=4, propagate_dimensions=22, nearest=16): + + super(PFNet7, self).__init__() + + self.target = target + self.nn1 = nn1 + + self.act = nn.LeakyReLU + self.act_f = torch.nn.functional.leaky_relu + self.elu = nn.ELU + + # (1) DNN + self.nn1 = nn.Sequential( + nn.Linear(input_dim, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, input_encoding), + ) + + # (2) CNN: Gravnet layer + self.conv1 = GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) + + # (3) DNN layer: classifying PID + self.nn2 = nn.Sequential( + nn.Linear(encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_id), + ) + + def forward(self, data): + + x = data.x + + # Encoder/Decoder step + x = self.nn1(x) + + # Gravnet step + x, edge_index, edge_weight, after_message, before_message = self.conv1(x) + x = self.act_f(x) # act by nonlinearity + + # DNN to predict PID (after a dropout) + pred_ids = self.nn2(x) + + pred_p4 = torch.zeros_like(data.ycand) + + return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand, edge_index, edge_weight, after_message, before_message +# ------------------------------------------------------------------------------------- +# # uncomment to test a forward pass +# from graph_data_delphes import PFGraphDataset +# from data_preprocessing import data_to_loader_ttbar +# from data_preprocessing import data_to_loader_qcd +# +# full_dataset = PFGraphDataset('../../test_tmp_delphes/data/pythia8_ttbar') +# +# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, n_train=2, n_valid=1, batch_size=2) +# +# print('Input to the network:', next(iter(train_loader))) +# +# model = PFNet7() +# +# for batch in train_loader: +# pred_ids, pred_p4, target_ids, target_p4 = model(batch) +# pred_ids +# print('Predicted PID:', pred_ids) +# print('Predicted p4:', pred_p4) +# break diff --git a/mlpf/pytorch_delphes/LRP/model_LRP_dnn.py b/mlpf/pytorch_delphes/LRP/model_LRP_dnn.py new file mode 100644 index 000000000..f95cea3e2 --- /dev/null +++ b/mlpf/pytorch_delphes/LRP/model_LRP_dnn.py @@ -0,0 +1,66 @@ +import numpy as np +import mplhep + +import torch +import torch_geometric + +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv +from torch_geometric.nn import TopKPooling, SAGPooling, SGConv +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch.utils.data import random_split + +#Model with gravnet clustering +class PFNet7(nn.Module): + def __init__(self, + input_dim=12, hidden_dim=256, + output_dim_id=6, + output_dim_p4=6): + + super(PFNet7, self).__init__() + + self.elu = nn.ELU + + # (1) DNN + self.nn1 = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + 
nn.Linear(hidden_dim, output_dim_id) + ) + + def forward(self, data): + + x0 = data.x + + pred_ids = self.nn1(x0) + + pred_p4 = torch.zeros_like(data.ycand) + + return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand + +# ------------------------------------------------------------------------------------- +# # uncomment to test a forward pass +# from graph_data_delphes import PFGraphDataset +# from data_preprocessing import data_to_loader_ttbar +# from data_preprocessing import data_to_loader_qcd +# +# full_dataset = PFGraphDataset('../../test_tmp_delphes/data/pythia8_ttbar') +# +# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, n_train=2, n_valid=1, batch_size=2) +# +# print('Input to the network:', next(iter(train_loader))) +# +# model = PFNet7() +# +# for batch in train_loader: +# pred_ids, pred_p4, target_ids, target_p4, cand_ids, cand_p4 = model(batch) +# break diff --git a/mlpf/pytorch_delphes/LRP/model_LRP_reg.py b/mlpf/pytorch_delphes/LRP/model_LRP_reg.py new file mode 100644 index 000000000..c1140daad --- /dev/null +++ b/mlpf/pytorch_delphes/LRP/model_LRP_reg.py @@ -0,0 +1,109 @@ +import numpy as np +import mplhep + +import torch +import torch_geometric + +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv +from torch_geometric.nn import TopKPooling, SAGPooling, SGConv +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch.utils.data import random_split + +#from torch_geometric.nn import GravNetConv # if you want to get it from source code (won't be able to retrieve the adjacency matrix) +from gravnet_LRP import GravNetConv +from torch_geometric.nn import GraphConv + +#Model with gravnet clustering +class PFNet7(nn.Module): + def __init__(self, + input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, + output_dim_id=6, + output_dim_p4=6, + space_dim=4, propagate_dimensions=22, nearest=16): + + super(PFNet7, self).__init__() + + self.act = nn.LeakyReLU + self.act_f = torch.nn.functional.leaky_relu + self.elu = nn.ELU + + # (1) DNN + self.nn1 = nn.Sequential( + nn.Linear(input_dim, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, input_encoding), + ) + + # (2) CNN: Gravnet layer + self.conv1 = GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) + + # (3) DNN layer: classifying PID + self.nn2 = nn.Sequential( + nn.Linear(encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_id), + ) + + # (4) DNN layer: regressing p4 + self.nn3 = nn.Sequential( + nn.Linear(encoding_dim + output_dim_id + input_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_p4), + ) + + + def forward(self, data): + + x0 = data.x + + # Encoder/Decoder step + x = self.nn1(x0) + + # Gravnet step + x, edge_index, edge_weight, after_message, before_message = self.conv1(x) + x = self.act_f(x) # act by nonlinearity + + # DNN to predict PID + pred_ids = 
self.nn2(x) + + # DNN to predict p4 + nn3_input = torch.cat([x0, pred_ids, x], axis=-1) + pred_p4 = self.nn3(nn3_input) + + return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand, edge_index, edge_weight, after_message, before_message +# ------------------------------------------------------------------------------------- +# # uncomment to test a forward pass +# from graph_data_delphes import PFGraphDataset +# from data_preprocessing import data_to_loader_ttbar +# from data_preprocessing import data_to_loader_qcd +# +# full_dataset = PFGraphDataset('../../test_tmp_delphes/data/pythia8_ttbar') +# +# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, n_train=2, n_valid=1, batch_size=2) +# +# print('Input to the network:', next(iter(train_loader))) +# +# model = PFNet7() +# +# for batch in train_loader: +# pred_ids, pred_p4, target_ids, target_p4 = model(batch) +# pred_ids +# print('Predicted PID:', pred_ids) +# print('Predicted p4:', pred_p4) +# break diff --git a/mlpf/pytorch_delphes/LRP/model_io.py b/mlpf/pytorch_delphes/LRP/model_io.py new file mode 100644 index 000000000..c3b2cf15b --- /dev/null +++ b/mlpf/pytorch_delphes/LRP/model_io.py @@ -0,0 +1,156 @@ +import torch +import torch.nn as nn +from torch.nn import Sequential as Seq,Linear,ReLU,BatchNorm1d +from torch_scatter import scatter_mean +import numpy as np +import json + +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') +else: + device = torch.device('cpu') + +class model_io: + SPECIAL_LAYERS=[ + ".nn2.0", + # ".conv1.lin_h", + # ".conv1.lin_p" + ] + + def __init__(self,model, + model_state_dict, + activation_dest, dic): + + self.model=model + self.model.load_state_dict(model_state_dict) + self.dest=activation_dest + self.dic=dic + + # declare variables + self.L=dict() # layers + self.A=activation_dest # activations + # self.R=dict() # relevance scores + + self._rules=dict() # rules to use for each layer + self._hook_handles=[] # collection of all hook handles + + # # extract layers and register hooks + # self._extract_layers("",model,) + + self.L=dict() + for name, module in model.named_modules(): + # print(name) + if name=='conv1' or name=='conv2': + self.L[name]=module + else: + self.L['.'+name]=module + + for key, value in list(self.L.items()): + if key not in self.dic.keys(): + del self.L[key] + + self.n_layers=len(self.L.keys()) + + # register rules for each layer + self._register_rules() + + """ + rules functions + """ + def _register_rules(self): + for layer_name in self.L.keys(): + layer=self.L[layer_name] + layer_class=layer.__class__.__name__ + if layer_class=="BatchNorm1d": + rule="z" + else: + rule="eps" + self._rules[layer_name]=rule + + def get_rule(self,index=None,layer_name=None): + assert (not index is None) or (not layer_name is None), "at least one of (index,name) must be provided" + if layer_name is None: + layer_name=self.index2name(index) + + if hasattr(self,"_rules"): + return self._rules[layer_name] + else: + self._register_rules() + return self._rules[layer_name] + + """ + layer functions + """ + + def _extract_layers(self,name,model): + l=list(model.named_children()) + + if len(l)==0: + self.L[name]=copy_layer(model) + else: + l=list(model.named_children()) + for i in l: + self._extract_layers(name+"."+i[0],i[1]) + + def get_layer(self,index=None,name=None): + assert (not index is None) or (not name is None), "at least one of (index,name) must be provided" + if 
name is None: + name=self.index2name(index) + return self.L[name] + + """ + general getters + """ + def index2name(self,idx:int)->str: + if not hasattr(self,"_i2n"): + self._i2n=[] + for i,n in enumerate(self.A.keys()): + self._i2n.append(n) + return self._i2n[idx-2] + + def name2index(self,name:str)->int: + if not hasattr(self,"_i2n"): + self._i2n=[] + for i,n in enumerate(self.A.keys()): + self._i2n.append(n) + return self._i2n.index(name) + + """ + reset and setter functions + """ + def _clear_hooks(self): + for hook in self._hook_handles: + hook.remove() + + def reset(self): + """ + reset the prepared model + """ + pass + # self._clear_hooks() + # self.A=dict() + # self.R=dict() + + def set_dest(self,activation_dest): + self.A=activation_dest + +def copy_layer(layer): + """ + create a deep copy of provided layer + """ + layer_cp=eval("nn."+layer.__repr__()) + layer_cp.load_state_dict(layer.state_dict()) + + return layer_cp.to(device) + +def copy_tensor(tensor,dtype=torch.float32): + """ + create a deep copy of the provided tensor, + outputs the copy with specified dtype + """ + + return tensor.clone().detach().requires_grad_(True).to(device) diff --git a/mlpf/pytorch_delphes/README.md b/mlpf/pytorch_delphes/README.md new file mode 100644 index 000000000..3a25d6f4b --- /dev/null +++ b/mlpf/pytorch_delphes/README.md @@ -0,0 +1,23 @@ +Short instructions to do a quick training on delphes data: +```bash +cd ../.. +./scripts/local_test_delphes_pytorch.sh +``` + +### Delphes dataset +The dataset is available from zenodo: https://doi.org/10.5281/zenodo.4452283. + +Instructions to download and process the full Delphes dataset: +```bash +cd ../../scripts/ +./get_all_data_delphes.sh +``` + +This script will download and process the data under a directory called "test_tmp_delphes/" in particleflow. There are will be two subdirectories under test_tmp_delphes/ (1) data/: which contains the data (2) experiments/: which will contain any trained model + + +Instructions to explain using LRP (you must have an already trained model in test_tmp_delphes/experiments): +```bash +cd LRP/ +python -u main_reg.py --LRP_load_model= --LRP_load_epoch= +``` diff --git a/mlpf/pytorch_delphes/args.py b/mlpf/pytorch_delphes/args.py new file mode 100644 index 000000000..ecfadd151 --- /dev/null +++ b/mlpf/pytorch_delphes/args.py @@ -0,0 +1,129 @@ +import argparse +from math import inf + +def parse_args(): + parser = argparse.ArgumentParser() + + # from raw -> processed + parser.add_argument("--dataset", type=str, default='../../../test_tmp_delphes/data/pythia8_ttbar', help="dataset path", required=False) + parser.add_argument("--dataset_qcd", type=str, default='../../../test_tmp_delphes/data/pythia8_qcd', help="dataset path", required=False) + parser.add_argument("--processed_dir", type=str, help="processed", required=False, default=None) + parser.add_argument("--num-files-merge", type=int, default=10, help="number of files to merge") + parser.add_argument("--num-proc", type=int, default=24, help="number of processes") + + # for training + parser.add_argument("--train", action=BoolArg, default=True, help="Trains the model") + parser.add_argument("--n_train", type=int, default=3, help="number of data files to use for training.. each file contains 100 events") + parser.add_argument("--n_valid", type=int, default=1, help="number of data files to use for validation.. each file contains 100 events") + parser.add_argument("--n_test", type=int, default=2, help="number of data files to use for testing.. 
each file contains 100 events")
+    parser.add_argument("--n_epochs", type=int, default=1, help="number of training epochs")
+    parser.add_argument("--patience", type=int, default=100, help="patience before early stopping")
+    parser.add_argument("--hidden_dim", type=int, default=256, help="hidden dimension")
+    parser.add_argument("--hidden_dim_nn1", type=int, default=64, help="hidden dimension")
+    parser.add_argument("--input_encoding", type=int, default=12, help="use an input encoding layer")
+    parser.add_argument("--encoding_dim", type=int, default=64, help="encoded element dimension")
+    parser.add_argument("--embedding_dim", type=int, default=0, help="embedding dimension of the type feature (preferably equal to 3)")
+    parser.add_argument("--encoding_of_clusters", action=BoolArg, default=False, help="Trains an MLP to encode clusters")
+    parser.add_argument("--batch_size", type=int, default=1, help="Number of .pt files to load in parallel")
+    parser.add_argument("--model", type=str, help="type of model to use", default="PFNet7")
+    parser.add_argument("--target", type=str, choices=["cand", "gen"], help="Regress to PFCandidates or GenParticles", default="gen")
+    parser.add_argument("--outpath", type=str, default = '../../../test_tmp_delphes/experiments/', help="Output folder")
+    parser.add_argument("--optimizer", type=str, default='adam', choices=["adam", "adamw"], help="optimizer to use")
+    parser.add_argument("--lr", type=float, default=1e-4, help="learning rate")
+    parser.add_argument("--alpha", type=float, default=2e-4, help="Loss multiplier for pdg-id classification.. recall: loss = clf + alpha*reg")
+    parser.add_argument("--dropout", type=float, default=0.5, help="Dropout rate")
+    parser.add_argument("--space_dim", type=int, default=4, help="Spatial dimension for clustering in gravnet layer")
+    parser.add_argument("--propagate_dimensions", type=int, default=22, help="The number of features to be propagated between the vertices")
+    parser.add_argument("--nearest", type=int, default=16, help="k nearest neighbors in gravnet layer")
+    parser.add_argument("--overwrite", action=BoolArg, default=False, help="Overwrites the model if True")
+    parser.add_argument("--load", action=BoolArg, default=False, help="Load the model (no training)")
+    parser.add_argument("--load_model", type=str, help="Which model to load", default="/PFNet7_cand_ntrain_2")
+    parser.add_argument("--load_epoch", type=int, default=0, help="Which epoch of the model to load for evaluation")
+    parser.add_argument("--classification_only", action=BoolArg, default=False, help="Check to train for classification only (no regression)")
+    parser.add_argument("--regression_only", action=BoolArg, default=False, help="Check to train for regression only (no classification)")
+    parser.add_argument("--nn1", action=BoolArg, default=True, help="Adds an encoder/decoder step before gravnet..")
+    parser.add_argument("--nn3", action=BoolArg, default=True, help="Adds the network to regress p4..")
+    parser.add_argument("--nn4", action=BoolArg, default=True, help="Adds an extra network for the dnn model..")
+    parser.add_argument("--nn0track", action=BoolArg, default=False, help="Adds an initial network that encodes the tracks..")
+    parser.add_argument("--nn0cluster", action=BoolArg, default=False, help="Adds an initial network that encodes the clusters..")
+    parser.add_argument("--title", type=str, default='', help="Appends this title to the model's name")
+
+    # for evaluation: making predictions & making plots
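+    # Hedged usage sketch (illustration only; the entry-point script name is an assumption):
+    #   python training.py --make_predictions_test --make_plots_test=False
+    # The switches in this group use the BoolArg action defined at the bottom of this file,
+    # so each one can be passed either as a bare flag or with an explicit true/false value.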
parser.add_argument("--make_predictions_train", action=BoolArg, default=False, help="make predictions on training data..") + parser.add_argument("--make_predictions_valid", action=BoolArg, default=False, help="make predictions on validation data..") + parser.add_argument("--make_predictions_test", action=BoolArg, default=True, help="make predictions on testing data..") + parser.add_argument("--make_plots_train", action=BoolArg, default=False, help="make plots on training data..") + parser.add_argument("--make_plots_valid", action=BoolArg, default=False, help="make plots on validation data..") + parser.add_argument("--make_plots_test", action=BoolArg, default=True, help="make plots on testing data..") + + # for LRP + parser.add_argument("--explain", action=BoolArg, default=True, help="General setup mode: if True then you want to explain.. if False then you will load an already explained model (already made R-scores)..") + parser.add_argument("--LRP_load_model", type=str, default="/PFNet7_cand_ntrain_2", help="Loads the model to explain", required=False) + parser.add_argument("--LRP_load_epoch", type=int, default=0, help="Loads the epoch after which to explain") + parser.add_argument("--LRP_reg", action=BoolArg, default=True, help="Runs LRP for interpreting the regression part..") + parser.add_argument("--make_heatmaps_reg", action=BoolArg, default=True, help="Constructs heatmaps for the regressed p4 (must run with explain=True or else you load a pre-explained model with explain=False)..") + parser.add_argument("--LRP_clf", action=BoolArg, default=True, help="Runs LRP for interpreting the classification part..") + parser.add_argument("--make_heatmaps_clf", action=BoolArg, default=True, help="Constructs heatmaps for the classified pid (must run with explain=True or else you load a pre-explained model with explain=False)..") + parser.add_argument("--LRP_outpath", type=str, default = '../../../../test_tmp_delphes/experiments/', help="Output folder") + parser.add_argument("--LRP_dataset", type=str, default='../../../../test_tmp_delphes/data/pythia8_ttbar', help="dataset path", required=False) + parser.add_argument("--LRP_dataset_qcd", type=str, default='../../../../test_tmp_delphes/data/pythia8_qcd', help="dataset path", required=False) + + args = parser.parse_args() + + return args + + +class BoolArg(argparse.Action): + """ + Take an argparse argument that is either a boolean or a string and return a boolean. + """ + def __init__(self, default=None, nargs=None, *args, **kwargs): + if nargs is not None: + raise ValueError("nargs not allowed") + + # Set default + if default is None: + raise ValueError("Default must be set!") + + default = _arg_to_bool(default) + + super().__init__(*args, default=default, nargs='?', **kwargs) + + def __call__(self, parser, namespace, argstring, option_string): + + if argstring is not None: + # If called with an argument, convert to bool + argval = _arg_to_bool(argstring) + else: + # BoolArg will invert default option + argval = True + + setattr(namespace, self.dest, argval) + +def _arg_to_bool(arg): + # Convert argument to boolean + + if type(arg) is bool: + # If argument is bool, just return it + return arg + + elif type(arg) is str: + # If string, convert to true/false + arg = arg.lower() + if arg in ['true', 't', '1']: + return True + elif arg in ['false', 'f', '0']: + return False + else: + return ValueError('Could not parse a True/False boolean') + else: + raise ValueError('Input must be boolean or string! 
{}'.format(type(arg))) + + +# From https://stackoverflow.com/questions/12116685/how-can-i-require-my-python-scripts-argument-to-be-a-float-between-0-0-1-0-usin +class Range(object): + def __init__(self, start, end): + self.start = start + self.end = end + def __eq__(self, other): + return self.start <= other <= self.end diff --git a/mlpf/pytorch_delphes/data_preprocessing.py b/mlpf/pytorch_delphes/data_preprocessing.py new file mode 100644 index 000000000..889ae7285 --- /dev/null +++ b/mlpf/pytorch_delphes/data_preprocessing.py @@ -0,0 +1,94 @@ +import numpy as np +import torch +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch + +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') +else: + device = torch.device('cpu') + +# if not multigpu we have to pass batches that are stacked as "batch.type() = Batch" (not list) so that pytorch can access attributes like ygen_id through batch.ygen_id +# if multigpu we have to pass list of "Data" elements.. then behind the scene, pytorch DP will convert the list to appropriate Batches to fit on the gpus available so that batch.ygen_id works out of the box + +# define a function that casts the ttbar dataset into a dataloader for efficient NN training +def data_to_loader_ttbar(full_dataset, n_train, n_valid, batch_size): + + # https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html + train_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=0, stop=n_train)) + valid_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=n_train, stop=n_train+n_valid)) + + # preprocessing the train_dataset in a good format for passing correct batches of events to the GNN + train_data=[] + for i in range(len(train_dataset)): + train_data = train_data + train_dataset[i] + + # preprocessing the valid_dataset in a good format for passing correct batches of events to the GNN + valid_data=[] + for i in range(len(valid_dataset)): + valid_data = valid_data + valid_dataset[i] + + if not multi_gpu: + train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True) + valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True) + else: + #https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/data_parallel.html + train_loader = DataListLoader(train_data, batch_size=batch_size, shuffle=True) + valid_loader = DataListLoader(valid_data, batch_size=batch_size, shuffle=True) + + return train_loader, valid_loader + +def data_to_loader_qcd(full_dataset, n_test, batch_size): + + test_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=0, stop=n_test)) + + # preprocessing the test_dataset in a good format for passing correct batches of events to the GNN + test_data=[] + for i in range(len(test_dataset)): + test_data = test_data + test_dataset[i] + + if not multi_gpu: + test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True) + else: + #https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/data_parallel.html + test_loader = DataListLoader(test_data, batch_size=batch_size, shuffle=True) + + return test_loader + +#---------------------------------------------------------------------------------------- +# from graph_data_delphes import PFGraphDataset, one_hot_embedding +# # the next part initializes some args values (to run the script not from terminal) +# class objectview(object): +# def __init__(self, d): +# self.__dict__ = d +# 
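+# # (note: objectview simply exposes the dict keys as attributes, mimicking the argparse
+# #  Namespace returned by parse_args, so the commented block below can be pasted into a
+# #  notebook to build the loaders interactively; the option values shown are illustrative)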
+# args = objectview({'train': True, 'n_train': 1, 'n_valid': 1, 'n_test': 2, 'n_epochs': 1, 'patience': 100, 'hidden_dim':32, 'encoding_dim': 256, +# 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../test_tmp_delphes/data/pythia8_qcd', +# 'outpath': '../../test_tmp_delphes/experiments/', 'activation': 'leaky_relu', 'optimizer': 'adam', 'lr': 1e-4, 'l1': 1, 'l2': 0.001, 'l3': 1, 'dropout': 0.5, +# 'radius': 0.1, 'convlayer': 'gravnet-knn', 'convlayer2': 'none', 'space_dim': 2, 'nearest': 3, 'overwrite': True, +# 'input_encoding': 0, 'load': False, 'load_epoch': 0, 'load_model': 'PFNet7_cand_ntrain_3_nepochs_1', 'evaluate': True, 'evaluate_on_cpu': True}) +# +# full_dataset = PFGraphDataset(args.dataset) +# full_dataset_qcd = PFGraphDataset(args.dataset_qcd) +# +# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, args.n_train, args.n_valid, batch_size=args.batch_size) +# test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) +# +# for batch in train_loader: +# break +# +# batch +# len(train_loader) +# +# +# # if multigpu: a "Batch" of size 3 is given by: [Data(x=[5k, 12], ycand=[5k, 6], ...) , Data(x=[5k, 12], ...), Data(x=[5k, 12], ...)] +# # then when we pass it to the model, DP takes care of converting it into batches like this (for 2 gpus): +# # Batch(batch=[2*5k], x=[2*5k, 12], ...) +# # Batch(batch=[5k], x=[5k, 12], ...) +# +# # if not multigpu: a "Batch" of size 2 is directly given by: Batch(batch=(2*5k), x=(2*5k,12), ...) +# # Note: batch is a column vector which maps each node to its respective graph in the batch: +# batch.batch diff --git a/mlpf/pytorch_delphes/evaluate.py b/mlpf/pytorch_delphes/evaluate.py new file mode 100644 index 000000000..1b0ba25ef --- /dev/null +++ b/mlpf/pytorch_delphes/evaluate.py @@ -0,0 +1,343 @@ +import args +from args import parse_args +import sklearn +import sklearn.metrics +import numpy as np +import pandas, mplhep +import pickle as pkl +import time, math + +import sys +import os.path as osp +sys.path.insert(1, '../../plotting/') +sys.path.insert(1, '../../mlpf/plotting/') + +import torch +import torch_geometric + +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv +from torch_geometric.nn import TopKPooling, SAGPooling, SGConv +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch.utils.data import random_split +import matplotlib +import matplotlib.pyplot as plt +import mpl_toolkits +import mplhep as hep +plt.style.use(hep.style.ROOT) + +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') +else: + device = torch.device('cpu') + +from plot_utils import plot_confusion_matrix +from plots import plot_regression, plot_distributions_pid, plot_distributions_all, plot_pt_eta, plot_num_particles_pid, draw_efficiency_fakerate, get_eff, get_fake, plot_reso + + +def make_predictions(model, test_loader, outpath, target, device, epoch, which_data): + + print('Making predictions on ' + which_data) + t0=time.time() + + gen_list = {"null":[], "chhadron":[], 
"nhadron":[], "photon":[], "electron":[], "muon":[]} + pred_list = {"null":[], "chhadron":[], "nhadron":[], "photon":[], "electron":[], "muon":[]} + cand_list = {"null":[], "chhadron":[], "nhadron":[], "photon":[], "electron":[], "muon":[]} + + for i, batch in enumerate(test_loader): + if multi_gpu: + X = batch + else: + X = batch.to(device) + + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + + _, gen_ids = torch.max(gen_ids_one_hot.detach(), -1) + _, pred_ids = torch.max(pred_ids_one_hot.detach(), -1) + _, cand_ids = torch.max(cand_ids_one_hot.detach(), -1) + + # to make "num_gen vs num_pred" plots + gen_list["null"].append((gen_ids==0).sum().item()) + gen_list["chhadron"].append((gen_ids==1).sum().item()) + gen_list["nhadron"].append((gen_ids==2).sum().item()) + gen_list["photon"].append((gen_ids==3).sum().item()) + gen_list["electron"].append((gen_ids==4).sum().item()) + gen_list["muon"].append((gen_ids==5).sum().item()) + + pred_list["null"].append((pred_ids==0).sum().item()) + pred_list["chhadron"].append((pred_ids==1).sum().item()) + pred_list["nhadron"].append((pred_ids==2).sum().item()) + pred_list["photon"].append((pred_ids==3).sum().item()) + pred_list["electron"].append((pred_ids==4).sum().item()) + pred_list["muon"].append((pred_ids==5).sum().item()) + + cand_list["null"].append((cand_ids==0).sum().item()) + cand_list["chhadron"].append((cand_ids==1).sum().item()) + cand_list["nhadron"].append((cand_ids==2).sum().item()) + cand_list["photon"].append((cand_ids==3).sum().item()) + cand_list["electron"].append((cand_ids==4).sum().item()) + cand_list["muon"].append((cand_ids==5).sum().item()) + + gen_p4 = gen_p4.detach() + pred_p4 = pred_p4.detach() + cand_p4 = cand_p4.detach() + + if i==0: + gen_ids_all = gen_ids + gen_p4_all = gen_p4 + + pred_ids_all = pred_ids + pred_p4_all = pred_p4 + + cand_ids_all = cand_ids + cand_p4_all = cand_p4 + else: + gen_ids_all = torch.cat([gen_ids_all,gen_ids]) + gen_p4_all = torch.cat([gen_p4_all,gen_p4]) + + pred_ids_all = torch.cat([pred_ids_all,pred_ids]) + pred_p4_all = torch.cat([pred_p4_all,pred_p4]) + + cand_ids_all = torch.cat([cand_ids_all,cand_ids]) + cand_p4_all = torch.cat([cand_p4_all,cand_p4]) + + if len(test_loader)<5000: + print(f'event #: {i+1}/{len(test_loader)}') + else: + print(f'event #: {i+1}/{5000}') + + if i==4999: + break + + t1=time.time() + + print('Time taken to make predictions is:', round(((t1-t0)/60),2), 'min') + + # store the 3 list dictionaries in a list (this is done only to compute the particle multiplicity plots) + list = [pred_list, gen_list, cand_list] + + torch.save(list, outpath + '/list_for_multiplicities.pt') + + torch.save(gen_ids_all, outpath + '/gen_ids.pt') + torch.save(gen_p4_all, outpath + '/gen_p4.pt') + torch.save(pred_ids_all, outpath + '/pred_ids.pt') + torch.save(pred_p4_all, outpath + '/pred_p4.pt') + torch.save(cand_ids_all, outpath + '/cand_ids.pt') + torch.save(cand_p4_all, outpath + '/cand_p4.pt') + + ygen = torch.cat([gen_ids_all.reshape(-1,1).float(),gen_p4_all], axis=1) + ypred = torch.cat([pred_ids_all.reshape(-1,1).float(),pred_p4_all], axis=1) + ycand = torch.cat([cand_ids_all.reshape(-1,1).float(),cand_p4_all], axis=1) + + # store the actual predictions to make all the other plots + predictions = {"ygen":ygen.reshape(1,-1,7).detach().cpu().numpy(), "ycand":ycand.reshape(1,-1,7).detach().cpu().numpy(), "ypred":ypred.detach().reshape(1,-1,7).cpu().numpy()} + + torch.save(predictions, outpath + '/predictions.pt') + + +def make_plots(model, 
test_loader, outpath, target, device, epoch, which_data): + + print('Making plots on ' + which_data) + t0=time.time() + + # load the necessary predictions to make the plots + gen_ids = torch.load(outpath + f'/gen_ids.pt', map_location=device) + gen_p4 = torch.load(outpath + f'/gen_p4.pt', map_location=device) + pred_ids = torch.load(outpath + f'/pred_ids.pt', map_location=device) + pred_p4 = torch.load(outpath + f'/pred_p4.pt', map_location=device) + cand_ids = torch.load(outpath + f'/cand_ids.pt', map_location=device) + cand_p4 = torch.load(outpath + f'/cand_p4.pt', map_location=device) + + list_for_multiplicities = torch.load(outpath + f'/list_for_multiplicities.pt', map_location=device) + + predictions = torch.load(outpath + f'/predictions.pt', map_location=device) + + # reformat a bit + ygen = predictions["ygen"].reshape(-1,7) + ypred = predictions["ypred"].reshape(-1,7) + ycand = predictions["ycand"].reshape(-1,7) + + # make confusion matrix for MLPF + conf_matrix_mlpf = sklearn.metrics.confusion_matrix(gen_ids.cpu(), + pred_ids.cpu(), labels=range(6), normalize="true") + + plot_confusion_matrix(conf_matrix_mlpf, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/conf_matrix_mlpf' + str(epoch), epoch=epoch) + torch.save(conf_matrix_mlpf, outpath + '/conf_matrix_mlpf' + str(epoch) + '.pt') + + # make confusion matrix for rule based PF + conf_matrix_cand = sklearn.metrics.confusion_matrix(gen_ids.cpu(), + cand_ids.cpu(), labels=range(6), normalize="true") + + plot_confusion_matrix(conf_matrix_cand, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/conf_matrix_cand' + str(epoch), epoch=epoch) + torch.save(conf_matrix_cand, outpath + '/conf_matrix_cand' + str(epoch) + '.pt') + + # making all the other plots + sample_title_qcd = "QCD, 14 TeV, PU200" + sample_title_ttbar = "$t\\bar{t}$, 14 TeV, PU200" + + # make distribution plots + plot_distributions_pid(1, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for chhadrons + target, epoch, outpath) + plot_distributions_pid(2, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for nhadrons + target, epoch, outpath) + plot_distributions_pid(3, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for photons + target, epoch, outpath) + plot_distributions_pid(4, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for electrons + target, epoch, outpath) + plot_distributions_pid(5, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for muons + target, epoch, outpath) + + plot_distributions_all(gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for all together + target, epoch, outpath) + + # make pt, eta plots to visualize dataset + ax, _ = plot_pt_eta(ygen) + plt.savefig(outpath+"/gen_pt_eta.png", bbox_inches="tight") + + # plot particle multiplicity plots + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_null = plot_num_particles_pid(list_for_multiplicities, "null", ax) + plt.savefig(outpath+"/multiplicity_plots/num_null.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_chhad = plot_num_particles_pid(list_for_multiplicities, "chhadron", ax) + plt.savefig(outpath+"/multiplicity_plots/num_chhadron.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_nhad = plot_num_particles_pid(list_for_multiplicities, "nhadron", ax) + 
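+    # (each num_<class>.png figure compares, event by event, the multiplicity of one particle
+    #  class in the gen target, the MLPF prediction and the rule-based PF candidates, using
+    #  the three count dictionaries saved by make_predictions)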
plt.savefig(outpath+"/multiplicity_plots/num_nhadron.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_photon = plot_num_particles_pid(list_for_multiplicities, "photon", ax) + plt.savefig(outpath+"/multiplicity_plots/num_photon.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_electron = plot_num_particles_pid(list_for_multiplicities, "electron", ax) + plt.savefig(outpath+"/multiplicity_plots/num_electron.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_muon = plot_num_particles_pid(list_for_multiplicities, "muon", ax) + plt.savefig(outpath+"/multiplicity_plots/num_muon.png", bbox_inches="tight") + plt.close(fig) + + # make efficiency and fake rate plots for charged hadrons + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "pt", np.linspace(0, 3, 61), outpath+"/efficiency_plots/eff_fake_pid1_pt.png", both=True, legend_title=sample_title_qcd+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "eta", np.linspace(-3, 3, 61), outpath+"/efficiency_plots/eff_fake_pid1_eta.png", both=True, legend_title=sample_title_qcd+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "energy", np.linspace(0, 50, 75), outpath+"/efficiency_plots/eff_fake_pid1_energy.png", both=True, legend_title=sample_title_qcd+"\n") + + # make efficiency and fake rate plots for neutral hadrons + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "pt", np.linspace(0, 3, 61), outpath+"/efficiency_plots/eff_fake_pid2_pt.png", both=True, legend_title=sample_title_qcd+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "eta", np.linspace(-3, 3, 61), outpath+"/efficiency_plots/eff_fake_pid2_eta.png", both=True, legend_title=sample_title_qcd+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "energy", np.linspace(0, 50, 75), outpath+"/efficiency_plots/eff_fake_pid2_energy.png", both=True, legend_title=sample_title_qcd+"\n") + + # make resolution plots for chhadrons: pid=1 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_chhad_pt = plot_reso(ygen, ypred, ycand, 1, "pt", 2, ax=ax1, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid1_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_chhad_eta = plot_reso(ygen, ypred, ycand, 1, "eta", 0.2, ax=ax2, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid1_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_chhad_E = plot_reso(ygen, ypred, ycand, 1, "energy", 0.2, ax=ax3, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid1_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + # make resolution plots for nhadrons: pid=2 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_nhad_pt = plot_reso(ygen, ypred, ycand, 2, "pt", 2, ax=ax1, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid2_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_nhad_eta = plot_reso(ygen, ypred, ycand, 2, "eta", 0.2, ax=ax2, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid2_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, 
(ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_nhad_E = plot_reso(ygen, ypred, ycand, 2, "energy", 0.2, ax=ax3, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid2_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + # make resolution plots for photons: pid=3 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_photon_pt = plot_reso(ygen, ypred, ycand, 3, "pt", 2, ax=ax1, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid3_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_photon_eta = plot_reso(ygen, ypred, ycand, 3, "eta", 0.2, ax=ax2, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid3_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_photon_E = plot_reso(ygen, ypred, ycand, 3, "energy", 0.2, ax=ax3, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid3_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + # make resolution plots for electrons: pid=4 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_electron_pt = plot_reso(ygen, ypred, ycand, 4, "pt", 2, ax=ax1, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid4_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_electron_eta = plot_reso(ygen, ypred, ycand, 4, "eta", 0.2, ax=ax2, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid4_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_electron_E = plot_reso(ygen, ypred, ycand, 4, "energy", 0.2, ax=ax3, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid4_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + # make resolution plots for muons: pid=5 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_muon_pt = plot_reso(ygen, ypred, ycand, 5, "pt", 2, ax=ax1, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid5_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_muon_eta = plot_reso(ygen, ypred, ycand, 5, "eta", 0.2, ax=ax2, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid5_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_muon_E = plot_reso(ygen, ypred, ycand, 5, "energy", 0.2, ax=ax3, legend_title=sample_title_qcd+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid5_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + t1=time.time() + print('Time taken to make plots is:', round(((t1-t0)/60),2), 'min') diff --git a/mlpf/pytorch/graph_data_delphes.py b/mlpf/pytorch_delphes/graph_data_delphes.py similarity index 80% rename from mlpf/pytorch/graph_data_delphes.py rename to mlpf/pytorch_delphes/graph_data_delphes.py index e7110038b..68dd5ced5 100644 --- a/mlpf/pytorch/graph_data_delphes.py +++ b/mlpf/pytorch_delphes/graph_data_delphes.py @@ -7,8 +7,8 @@ from torch_geometric.data import Dataset, Data, Batch import itertools from glob import glob +import numba from numpy.lib.recfunctions import append_fields -import bz2 import pickle import scipy 
@@ -16,6 +16,8 @@ import math import multiprocessing +import args +from args import parse_args # assumes pkl files exist in /test_tmp_delphes/data/pythia8_ttbar/raw # they are processed and saved as pt files in /test_tmp_delphes/data/pythia8_ttbar/processed # PFGraphDataset -> returns for 1 event: Data(x=[5139, 12], ycand=[5139, 6], ycand_id=[5139, 6], ygen=[5139, 6], ygen_id=[5139, 6]) @@ -53,8 +55,7 @@ def __init__(self, root, transform=None, pre_transform=None): @property def raw_file_names(self): - raw_list = list(glob(osp.join(self.raw_dir, '*.pkl'))) - raw_list += list(glob(osp.join(self.raw_dir, '*.pkl.bz2'))) + raw_list = glob(osp.join(self.raw_dir, '*.pkl')) print("PFGraphDataset nfiles={}".format(len(raw_list))) return sorted([l.replace(self.raw_dir, '.') for l in raw_list]) @@ -81,13 +82,8 @@ def download(self): pass def process_single_file(self, raw_file_name): - if raw_file_name.endswith(".pkl"): - with open(osp.join(self.raw_dir, raw_file_name), "rb") as fi: - data = pickle.load(fi, encoding='iso-8859-1') - elif raw_file_name.endswith(".pkl.bz2"): - data = pickle.load(bz2.BZ2File(osp.join(self.raw_dir, raw_file_name), "rb"), encoding='iso-8859-1') - else: - raise Exception("Unknown file format") + with open(osp.join(self.raw_dir, raw_file_name), "rb") as fi: + data = pickle.load(fi, encoding='iso-8859-1') x=[] ygen=[] @@ -153,21 +149,13 @@ def __getitem__(self, idx): return self.get(idx) if __name__ == "__main__": - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--dataset", type=str, help="dataset path", required=True) - parser.add_argument("--processed_dir", type=str, help="processed", required=False, default=None) - parser.add_argument("--num-files-merge", type=int, default=10, help="number of files to merge") - parser.add_argument("--num-proc", type=int, default=24, help="number of processes") - args = parser.parse_args() + + args = parse_args() pfgraphdataset = PFGraphDataset(root=args.dataset) if args.processed_dir: pfgraphdataset._processed_dir = args.processed_dir - - if not os.path.isdir(pfgraphdataset._processed_dir): - os.makedirs(pfgraphdataset._processed_dir) pfgraphdataset.process_parallel(args.num_files_merge,args.num_proc) #pfgraphdataset.process(args.num_files_merge) diff --git a/mlpf/pytorch_delphes/gravnet.py b/mlpf/pytorch_delphes/gravnet.py new file mode 100644 index 000000000..5b26d3954 --- /dev/null +++ b/mlpf/pytorch_delphes/gravnet.py @@ -0,0 +1,122 @@ +from typing import Optional, Union +from torch_geometric.typing import OptTensor, PairTensor, PairOptTensor +import time + +import torch +from torch import Tensor +from torch.nn import Linear +from torch_scatter import scatter +from torch_geometric.nn.conv import MessagePassing + +try: + from torch_cluster import knn +except ImportError: + knn = None + +# copied it from pytorch_geometric source code +# ADDED: retrieve edge_index, retrieve edge_weight +# CHANGED: self.lin -> self.lin_p +# CHANGED: used reduce='sum' instead of reduce='mean' in the message passing +# REMOVED: skip connection + +class GravNetConv(MessagePassing): + r"""The GravNet operator from the `"Learning Representations of Irregular + Particle-detector Geometry with Distance-weighted Graph + Networks" `_ paper, where the graph is + dynamically constructed using nearest neighbors. + The neighbors are constructed in a learnable low-dimensional projection of + the feature space. 
+ A second projection of the input feature space is then propagated from the + neighbors to each vertex using distance weights that are derived by + applying a Gaussian function to the distances. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + space_dimensions (int): The dimensionality of the space used to + construct the neighbors; referred to as :math:`S` in the paper. + propagate_dimensions (int): The number of features to be propagated + between the vertices; referred to as :math:`F_{\textrm{LR}}` in the + paper. + k (int): The number of nearest neighbors. + num_workers (int): Number of workers to use for k-NN computation. + Has no effect in case :obj:`batch` is not :obj:`None`, or the input + lies on the GPU. (default: :obj:`1`) + **kwargs (optional): Additional arguments of + :class:`torch_geometric.nn.conv.MessagePassing`. + """ + def __init__(self, in_channels: int, out_channels: int, + space_dimensions: int, propagate_dimensions: int, k: int, + num_workers: int = 1, **kwargs): + super(GravNetConv, self).__init__(flow='target_to_source', **kwargs) + + if knn is None: + raise ImportError('`GravNetConv` requires `torch-cluster`.') + + self.in_channels = in_channels + self.out_channels = out_channels + self.k = k + self.num_workers = num_workers + + self.lin_s = Linear(in_channels, space_dimensions) + self.lin_h = Linear(in_channels, propagate_dimensions) + self.lin_p = Linear(propagate_dimensions, out_channels) + + self.reset_parameters() + + def reset_parameters(self): + self.lin_s.reset_parameters() + self.lin_h.reset_parameters() + self.lin_p.reset_parameters() + + + def forward( + self, x: Union[Tensor, PairTensor], + batch: Union[OptTensor, Optional[PairTensor]] = None) -> Tensor: + """""" + + is_bipartite: bool = True + if isinstance(x, Tensor): + x: PairTensor = (x, x) + is_bipartite = False + assert x[0].dim() == 2, 'Static graphs not supported in `GravNetConv`.' + + b: PairOptTensor = (None, None) + if isinstance(batch, Tensor): + b = (batch, batch) + elif isinstance(batch, tuple): + assert batch is not None + b = (batch[0], batch[1]) + + h_l: Tensor = self.lin_h(x[0]) + + s_l: Tensor = self.lin_s(x[0]) + s_r: Tensor = self.lin_s(x[1]) if is_bipartite else s_l + + edge_index = knn(s_l, s_r, self.k, b[0], b[1], + num_workers=self.num_workers) + + edge_weight = (s_l[edge_index[1]] - s_r[edge_index[0]]).pow(2).sum(-1) + edge_weight = torch.exp(-10. 
* edge_weight) # 10 gives a better spread + + # propagate_type: (x: OptPairTensor, edge_weight: OptTensor) + out = self.propagate(edge_index, x=(h_l, None), + edge_weight=edge_weight, + size=(s_l.size(0), s_r.size(0))) + + return self.lin_p(out), edge_index, edge_weight + + + def message(self, x_j: Tensor, edge_weight: Tensor) -> Tensor: + return x_j * edge_weight.unsqueeze(1) + + def aggregate(self, inputs: Tensor, index: Tensor, + dim_size: Optional[int] = None) -> Tensor: + out_mean = scatter(inputs, index, dim=self.node_dim, dim_size=dim_size, + reduce='sum') + return out_mean + + def __repr__(self): + return '{}({}, {}, k={})'.format(self.__class__.__name__, + self.in_channels, self.out_channels, + self.k) diff --git a/mlpf/pytorch_delphes/model.py b/mlpf/pytorch_delphes/model.py new file mode 100644 index 000000000..ac0cf251a --- /dev/null +++ b/mlpf/pytorch_delphes/model.py @@ -0,0 +1,129 @@ +import numpy as np +import mplhep + +import torch +import torch_geometric + +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv +from torch_geometric.nn import TopKPooling, SAGPooling, SGConv +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch.utils.data import random_split + +#from torch_geometric.nn import GravNetConv # if you want to get it from source code (won't be able to retrieve the adjacency matrix) +from gravnet import GravNetConv +from torch_geometric.nn import GraphConv + +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') +else: + device = torch.device('cpu') + +#Model with gravnet clustering +class PFNet7(nn.Module): + def __init__(self, + input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, + output_dim_id=6, + output_dim_p4=6, + space_dim=4, propagate_dimensions=22, nearest=16, + target="gen", nn1=True, nn3=True): + + super(PFNet7, self).__init__() + + self.target = target + self.nn1 = nn1 + self.nn3 = nn3 + + self.act = nn.LeakyReLU + self.act_f = torch.nn.functional.leaky_relu + self.act_tanh = torch.nn.Tanh + self.elu = nn.ELU + + # (1) DNN: encoding/decoding of all tracks and clusters + if self.nn1: + self.nn1 = nn.Sequential( + nn.Linear(input_dim, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, input_encoding), + ) + + # (2) CNN: Gravnet layer + self.conv1 = GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) + + # (3) DNN layer: classifying PID + self.nn2 = nn.Sequential( + nn.Linear(encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_id), + ) + + # (4) DNN layer: regressing p4 + if self.nn3: + self.nn3 = nn.Sequential( + nn.Linear(input_dim + output_dim_id + encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_p4), + ) + + def forward(self, data): + x0 = data.x + + # Encoder/Decoder step + if self.nn1: + x = 
self.nn1(x0) + else: + x=x0 + + # Gravnet step + x, edge_index, edge_weight = self.conv1(x) + x = self.act_f(x) # act by nonlinearity + + # DNN to predict PID + pred_ids = self.nn2(x) + + # DNN to predict p4 + if self.nn3: + nn3_input = torch.cat([x0, pred_ids, x], axis=-1) + pred_p4 = self.nn3(nn3_input) + else: + pred_p4 = torch.zeros_like(data.ycand) + + return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand + +# # ------------------------------------------------------------------------------------- +# # uncomment to test a forward pass +# from graph_data_delphes import PFGraphDataset +# from data_preprocessing import data_to_loader_ttbar +# from data_preprocessing import data_to_loader_qcd +# +# full_dataset = PFGraphDataset('../../../test_tmp_delphes/data/pythia8_ttbar') +# +# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, n_train=2, n_valid=1, batch_size=2) +# +# model = PFNet7() +# model.to(device) +# +# for batch in train_loader: +# X = batch.to(device) +# pred_ids, pred_p4, gen_ids, gen_p4, cand_ids, cand_p4 = model(X) +# break diff --git a/mlpf/pytorch_delphes/model_dnn.py b/mlpf/pytorch_delphes/model_dnn.py new file mode 100644 index 000000000..338220112 --- /dev/null +++ b/mlpf/pytorch_delphes/model_dnn.py @@ -0,0 +1,126 @@ +import numpy as np +import mplhep + +import torch +import torch_geometric + +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv +from torch_geometric.nn import TopKPooling, SAGPooling, SGConv +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch.utils.data import random_split + +#from torch_geometric.nn import GravNetConv # if you want to get it from source code (won't be able to retrieve the adjacency matrix) +from gravnet import GravNetConv +from torch_geometric.nn import GraphConv + +#Model with gravnet clustering +class PFNet7(nn.Module): + def __init__(self, + input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, + output_dim_id=6, + output_dim_p4=6, + space_dim=8, propagate_dimensions=22, nearest=40, + target="gen", nn1=True, nn3=True, nn4=True): + + super(PFNet7, self).__init__() + + self.target = target + self.nn1 = nn1 + self.nn3 = nn3 + self.nn4 = nn4 + + self.act = nn.LeakyReLU + self.act_f = torch.nn.functional.leaky_relu + self.act_tanh = torch.nn.Tanh + + # (1) DNN + self.nn1 = nn.Sequential( + nn.Linear(input_dim, hidden_dim_nn1), + self.act(0.5), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.act(0.5), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.act(0.5), + ) + + self.nn2 = nn.Sequential( + nn.Linear(hidden_dim_nn1 + input_dim, hidden_dim), + self.act(0.5), + nn.Linear(hidden_dim, hidden_dim), + self.act(0.5), + nn.Linear(hidden_dim, hidden_dim), + self.act(0.5), + nn.Linear(hidden_dim, output_dim_id), + ) + + # (5) DNN layer: regressing p4 + self.nn3 = nn.Sequential( + nn.Linear(encoding_dim + output_dim_id + input_dim, hidden_dim), + self.act(0.5), + nn.Linear(hidden_dim, hidden_dim), + self.act(0.5), + nn.Linear(hidden_dim, hidden_dim), + self.act(0.5), + nn.Linear(hidden_dim, hidden_dim), + self.act(0.5), + ) + + self.nn4 = nn.Sequential( + nn.Linear(hidden_dim + output_dim_id + input_dim, 
hidden_dim), + self.act(0.5), + nn.Linear(hidden_dim, hidden_dim), + self.act(0.5), + nn.Linear(hidden_dim, hidden_dim), + self.act(0.5), + nn.Linear(hidden_dim, output_dim_p4), + ) + + def forward(self, data): + + x0 = data.x + + # Encoder/Decoder step + if self.nn1: + x = self.nn1(x0) + + # DNN to predict PID + pred_ids = self.nn2(torch.cat([x, x0], axis=-1)) + + # DNN to predict p4 + if self.nn3: + nn3_input = torch.cat([x, pred_ids, x0], axis=-1) + pred_p4 = self.nn3(nn3_input) + else: + pred_p4=torch.zeros_like(data.ycand) + + if self.nn4: + nn3_input = torch.cat([pred_p4, pred_ids, x0], axis=-1) + pred_p4 = self.nn4(nn3_input) + else: + pred_p4=torch.zeros_like(data.ycand) + + return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand + +# ------------------------------------------------------------------------------------- +# # uncomment to test a forward pass +# from graph_data_delphes import PFGraphDataset +# from data_preprocessing import data_to_loader_ttbar +# from data_preprocessing import data_to_loader_qcd +# +# full_dataset = PFGraphDataset('../../test_tmp_delphes/data/pythia8_ttbar') +# +# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, n_train=2, n_valid=1, batch_size=2) +# +# print('Input to the network:', next(iter(train_loader))) +# +# model = PFNet7() +# +# for batch in train_loader: +# pred_ids, pred_p4, target_ids, target_p4, cand_ids, cand_p4 = model(batch) +# break diff --git a/mlpf/pytorch_delphes/model_embeddings.py b/mlpf/pytorch_delphes/model_embeddings.py new file mode 100644 index 000000000..89029e3b5 --- /dev/null +++ b/mlpf/pytorch_delphes/model_embeddings.py @@ -0,0 +1,177 @@ +import numpy as np +import mplhep + +import torch +import torch_geometric + +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv +from torch_geometric.nn import TopKPooling, SAGPooling, SGConv +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch.utils.data import random_split + +#from torch_geometric.nn import GravNetConv # if you want to get it from source code (won't be able to retrieve the adjacency matrix) +from gravnet import GravNetConv +from torch_geometric.nn import GraphConv + +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') +else: + device = torch.device('cpu') + +#Model with gravnet clustering +class PFNet7(nn.Module): + def __init__(self, + input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, embedding_dim=3, encoding_of_clusters=True, + output_dim_id=6, + output_dim_p4=6, + space_dim=4, propagate_dimensions=22, nearest=16, + target="gen", nn1=True, nn3=True, nn0track=True, nn0cluster=True): + + super(PFNet7, self).__init__() + + self.target = target + self.nn1 = nn1 + self.nn3 = nn3 + self.nn0track = nn0track + self.nn0cluster = nn0cluster + self.embedding_dim = embedding_dim + self.encoding_of_clusters = encoding_of_clusters + + self.act = nn.LeakyReLU + self.act_f = torch.nn.functional.leaky_relu + self.act_tanh = torch.nn.Tanh + self.elu = nn.ELU + + # (0) DNN: encode the tracks from 12d -> 12d + if self.nn0track: + self.nn0track 
= nn.Sequential( + nn.Linear(12-1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, input_encoding-1), + ) + + # (0) DNN: encode the clusters from 8d -> 12d + if self.nn0cluster: + self.nn0cluster = nn.Sequential( + nn.Linear(8-1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, input_encoding-1), + ) + + # (0) DNN: embedding of "type" + if self.embedding_dim: + self.embedding = nn.Embedding(embedding_dim, 1) + + # (1) DNN: encoding/decoding of all tracks and clusters + if self.nn1: + self.nn1 = nn.Sequential( + nn.Linear(input_dim, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, input_encoding), + ) + + # (2) CNN: Gravnet layer + self.conv1 = GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) + + # (3) DNN layer: classifying PID + self.nn2 = nn.Sequential( + nn.Linear(encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_id), + ) + + # (4) DNN layer: regressing p4 + if self.nn3: + self.nn3 = nn.Sequential( + nn.Linear(input_dim + output_dim_id + encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_p4), + ) + + def forward(self, data): + x0 = data.x + + if self.encoding_of_clusters: + # encode the clusters onto a non-padded 12d features.. encode the tracks as well for equivalence + tracks = x0[:,1:][x0[:,0]==2] + clusters = x0[:,1:8][x0[:,0]==1] + + if self.nn0track: + tracks = self.nn0track(tracks) + + if self.nn0cluster: + clusters = self.nn0cluster(clusters) + + tracks=torch.cat([x0[:,0][x0[:,0]==2].reshape(-1,1),tracks], axis=1) + clusters=torch.cat([x0[:,0][x0[:,0]==1].reshape(-1,1),clusters], axis=1) + + x0 = torch.cat([tracks,clusters]) + + if self.embedding_dim: + # embed the "type" feature + add = self.embedding(x0[:,0].long()).reshape(-1,1) + x0=torch.cat([add,x0[:,1:]], axis=1) + + # Encoder/Decoder step + if self.nn1: + x = self.nn1(x0) + else: + x=x0 + + # Gravnet step + x, edge_index, edge_weight = self.conv1(x) + x = self.act_f(x) # act by nonlinearity + + # DNN to predict PID + pred_ids = self.nn2(x) + + # DNN to predict p4 + if self.nn3: + nn3_input = torch.cat([x0, pred_ids, x], axis=-1) + pred_p4 = self.nn3(nn3_input) + else: + pred_p4 = torch.zeros_like(data.ycand) + + return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand + +# # ------------------------------------------------------------------------------------- +# # uncomment to test a forward pass +# from graph_data_delphes import PFGraphDataset +# from data_preprocessing import data_to_loader_ttbar +# from data_preprocessing import data_to_loader_qcd +# +# full_dataset = PFGraphDataset('../../../test_tmp_delphes/data/pythia8_ttbar') +# +# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, n_train=2, n_valid=1, batch_size=2) +# +# model = PFNet7() +# model.to(device) +# +# for batch in train_loader: +# X = batch.to(device) +# pred_ids, pred_p4, gen_ids, gen_p4, cand_ids, cand_p4 = model(X) +# break diff --git a/mlpf/pytorch_delphes/plots.py b/mlpf/pytorch_delphes/plots.py new file mode 100644 index 000000000..82677234d --- /dev/null +++ b/mlpf/pytorch_delphes/plots.py @@ -0,0 +1,528 @@ +import args 
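+# args / parse_args provide the command-line options shared by the pytorch_delphes scripts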
+from args import parse_args +import sklearn +import sklearn.metrics +import numpy as np +import pandas, mplhep +import pickle as pkl +import time, math + +import sys +import os.path as osp +sys.path.insert(1, '../../plotting/') +sys.path.insert(1, '../../mlpf/plotting/') + +import torch +import torch_geometric + +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv +from torch_geometric.nn import TopKPooling, SAGPooling, SGConv +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch.utils.data import random_split +import matplotlib +import matplotlib.pyplot as plt +import mpl_toolkits +import mplhep as hep + +plt.style.use(hep.style.ROOT) + +elem_labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] +class_labels = [0, 1, 2, 3, 4, 5] + +#map these to ids 0...Nclass +class_to_id = {r: class_labels[r] for r in range(len(class_labels))} +# map these to ids 0...Nclass +elem_to_id = {r: elem_labels[r] for r in range(len(elem_labels))} + +sample_title_qcd = "QCD, 14 TeV, PU200" +sample_title_ttbar = "$t\\bar{t}$, 14 TeV, PU200" + +ranges = { + "pt": np.linspace(0, 10, 61), + "eta": np.linspace(-5, 5, 61), + "sphi": np.linspace(-1, 1, 61), + "cphi": np.linspace(-1, 1, 61), + "energy": np.linspace(0, 100, 61) +} +pid_names = { + 0: "Null", + 1: "Charged hadrons", + 2: "Neutral hadrons", + 3: "Photons", + 4: "Electrons", + 5: "Muons", +} +key_to_pid = { + "null": 0, + "chhadron": 1, + "nhadron": 2, + "photon": 3, + "electron": 4, + "muon": 5, +} +var_names = { + "pt": r"$p_\mathrm{T}$ [GeV]", + "eta": r"$\eta$", + "sphi": r"$\mathrm{sin} \phi$", + "cphi": r"$\mathrm{cos} \phi$", + "energy": r"$E$ [GeV]" +} +var_names_nounit = { + "pt": r"$p_\mathrm{T}$", + "eta": r"$\eta$", + "sphi": r"$\mathrm{sin} \phi$", + "cphi": r"$\mathrm{cos} \phi$", + "energy": r"$E$" +} +var_names_bare = { + "pt": "p_\mathrm{T}", + "eta": "\eta", + "energy": "E", +} +var_indices = { + "pt": 2, + "eta": 3, + "sphi": 4, + "cphi": 5, + "energy": 6 +} + +def deltaphi(phi1, phi2): + return np.fmod(phi1 - phi2 + np.pi, 2*np.pi) - np.pi + +def mse_unreduced(true, pred): + return torch.square(true-pred) + +# computes accuracy of PID predictions given a one_hot_embedding: truth & pred +def accuracy(true_id, pred_id): + # revert one_hot_embedding + _, true_id = torch.max(true_id, -1) + _, pred_id = torch.max(pred_id, -1) + + is_true = (true_id !=0) + is_same = (true_id == pred_id) + + acc = (is_same&is_true).sum() / is_true.sum() + return acc + +# computes the resolution given a one_hot_embedding truth & pred + p4 of truth & pred +def energy_resolution(true_id, true_p4, pred_id, pred_p4): + # revert one_hot_embedding + _,true_id= torch.max(true_id, -1) + _,pred_id = torch.max(pred_id, -1) + + msk = (true_id!=0) + + return mse_unreduced(true_p4[msk], pred_p4[msk]) + +def plot_regression(val_x, val_y, var_name, rng, target, fname): + fig = plt.figure(figsize=(5,5)) + plt.hist2d( + val_x, + val_y, + bins=(rng, rng), + cmap="Blues", + #norm=matplotlib.colors.LogNorm() + ); + + if target=='cand': + plt.xlabel("Cand {}".format(var_name)) + elif target=='gen': + plt.xlabel("Gen {}".format(var_name)) + + plt.ylabel("MLPF {}".format(var_name)) + + plt.savefig(fname + '.png') + 
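+    # note that '.png' is appended to fname here, so the output path is passed without a file extension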
plt.close(fig) + + return fig + +def plot_particles(fname, true_id, true_p4, pred_id, pred_p4, pid=1): + #Ground truth vs model prediction particles + fig = plt.figure(figsize=(10,10)) + + true_p4 = true_p4.detach().cpu().numpy() + pred_p4 = pred_p4.detach().cpu().numpy() + + msk = (true_id == pid) + plt.scatter(true_p4[msk, 2], np.arctan2(true_p4[msk, 3], true_p4[msk, 4]), s=2*true_p4[msk, 2], marker="o", alpha=0.5) + + msk = (pred_id == pid) + plt.scatter(pred_p4[msk, 2], np.arctan2(pred_p4[msk, 3], pred_p4[msk, 4]), s=2*pred_p4[msk, 2], marker="o", alpha=0.5) + + plt.xlabel("eta") + plt.ylabel("phi") + plt.xlim(-5,5) + plt.ylim(-4,4) + + plt.savefig(fname + '.png') + plt.close(fig) + + return fig + +def plot_distribution(val_x, val_y, var_name, rng, target, fname): + plt.style.use(mplhep.style.CMS) + + fig = plt.figure(figsize=(10,10)) + + if target=='cand': + plt.hist(val_x, bins=rng, density=True, histtype="step", lw=2, label="cand"); + elif target=='gen': + plt.hist(val_x, bins=rng, density=True, histtype="step", lw=2, label="gen"); + + plt.hist(val_y, bins=rng, density=True, histtype="step", lw=2, label="MLPF"); + plt.xlabel(var_name) + plt.legend(loc="best", frameon=False) + plt.ylim(0,1.5) + + plt.savefig(fname + '.png') + plt.close(fig) + + return fig + +def plot_distributions_pid(pid, true_id, true_p4, pred_id, pred_p4, pf_id, cand_p4, target, epoch, outpath): + plt.style.use("default") + + ch_true = true_p4[true_id==pid, 0].flatten().detach().cpu().numpy() + ch_pred = pred_p4[pred_id==pid, 0].flatten().detach().cpu().numpy() + + pt_true = true_p4[true_id==pid, 1].flatten().detach().cpu().numpy() + pt_pred = pred_p4[pred_id==pid, 1].flatten().detach().cpu().numpy() + + eta_true = true_p4[true_id==pid, 2].flatten().detach().cpu().numpy() + eta_pred = pred_p4[pred_id==pid, 2].flatten().detach().cpu().numpy() + + sphi_true = true_p4[true_id==pid, 3].flatten().detach().cpu().numpy() + sphi_pred = pred_p4[pred_id==pid, 3].flatten().detach().cpu().numpy() + + cphi_true = true_p4[true_id==pid, 4].flatten().detach().cpu().numpy() + cphi_pred = pred_p4[pred_id==pid, 4].flatten().detach().cpu().numpy() + + e_true = true_p4[true_id==pid, 5].flatten().detach().cpu().numpy() + e_pred = pred_p4[pred_id==pid, 5].flatten().detach().cpu().numpy() + + figure = plot_distribution(ch_true, ch_pred, "charge", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_charge_distribution') + figure = plot_distribution(pt_true, pt_pred, "pt", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_pt_distribution') + figure = plot_distribution(e_true, e_pred, "E", np.linspace(-1, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_energy_distribution') + figure = plot_distribution(eta_true, eta_pred, "eta", np.linspace(-5, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_eta_distribution') + figure = plot_distribution(sphi_true, sphi_pred, "sin phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_sphi_distribution') + figure = plot_distribution(cphi_true, cphi_pred, "cos phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_cphi_distribution') + +def plot_distributions_all(true_id, true_p4, pred_id, pred_p4, pf_id, cand_p4, target, epoch, outpath): + plt.style.use("default") + + msk = (pred_id!=0) & (true_id!=0) + + ch_true = true_p4[msk, 0].flatten().detach().cpu().numpy() + ch_pred = 
pred_p4[msk, 0].flatten().detach().cpu().numpy() + + pt_true = true_p4[msk, 1].flatten().detach().cpu().numpy() + pt_pred = pred_p4[msk, 1].flatten().detach().cpu().numpy() + + eta_true = true_p4[msk, 2].flatten().detach().cpu().numpy() + eta_pred = pred_p4[msk, 2].flatten().detach().cpu().numpy() + + sphi_true = true_p4[msk, 3].flatten().detach().cpu().numpy() + sphi_pred = pred_p4[msk, 3].flatten().detach().cpu().numpy() + + cphi_true = true_p4[msk, 4].flatten().detach().cpu().numpy() + cphi_pred = pred_p4[msk, 4].flatten().detach().cpu().numpy() + + e_true = true_p4[msk, 5].flatten().detach().cpu().numpy() + e_pred = pred_p4[msk, 5].flatten().detach().cpu().numpy() + + figure = plot_distribution(ch_true, ch_pred, "charge", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/all_charge_distribution') + figure = plot_distribution(pt_true, pt_pred, "pt", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/all_pt_distribution') + figure = plot_distribution(e_true, e_pred, "E", np.linspace(-1, 5, 100), target, fname = outpath+'/distribution_plots/all_energy_distribution') + figure = plot_distribution(eta_true, eta_pred, "eta", np.linspace(-5, 5, 100), target, fname = outpath+'/distribution_plots/all_eta_distribution') + figure = plot_distribution(sphi_true, sphi_pred, "sin phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/all_sphi_distribution') + figure = plot_distribution(cphi_true, cphi_pred, "cos phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/all_cphi_distribution') + +def midpoints(x): + return x[:-1] + np.diff(x)/2 + +def mask_empty(hist): + h0 = hist[0].astype(np.float64) + h0[h0<50] = 0 + return (h0, hist[1]) + +def divide_zero(a, b): + a = a.astype(np.float64) + b = b.astype(np.float64) + out = np.zeros_like(a) + np.divide(a, b, where=b>0, out=out) + return out + +def plot_pt_eta(ygen, legend_title=""): + plt.style.use(hep.style.ROOT) + + b = np.linspace(0, 100, 41) + + msk_pid1 = (ygen[:, 0]==1) + msk_pid2 = (ygen[:, 0]==2) + msk_pid3 = (ygen[:, 0]==3) + msk_pid4 = (ygen[:, 0]==4) + msk_pid5 = (ygen[:, 0]==5) + + h1 = np.histogram(ygen[msk_pid1, 2], bins=b) + h2 = np.histogram(ygen[msk_pid2, 2], bins=b) + h3 = np.histogram(ygen[msk_pid3, 2], bins=b) + h4 = np.histogram(ygen[msk_pid4, 2], bins=b) + h5 = np.histogram(ygen[msk_pid5, 2], bins=b) + + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8)) + + xs = midpoints(h1[1]) + width = np.diff(h1[1]) + + hep.histplot([h5[0], h4[0], h3[0], h2[0], h1[0]], bins=h1[1], ax=ax1, stack=True, histtype="fill", + label=["Muons", "Electrons", "Photons", "Neutral hadrons", "Charged hadrons"]) + + ax1.legend(loc="best", frameon=False, title=legend_title) + ax1.set_yscale("log") + ax1.set_ylim(1e1, 1e9) + ax1.set_xlabel(r"Truth particle $p_\mathrm{T}$ [GeV]") + ax1.set_ylabel("Truth particles") + + b = np.linspace(-8, 8, 41) + h1 = np.histogram(ygen[msk_pid1, 3], bins=b) + h2 = np.histogram(ygen[msk_pid2, 3], bins=b) + h3 = np.histogram(ygen[msk_pid3, 3], bins=b) + h4 = np.histogram(ygen[msk_pid4, 3], bins=b) + h5 = np.histogram(ygen[msk_pid5, 3], bins=b) + xs = midpoints(h1[1]) + width = np.diff(h1[1]) + + hep.histplot([h5[0], h4[0], h3[0], h2[0], h1[0]], bins=h1[1], ax=ax2, stack=True, histtype="fill", + label=["Muons", "Electrons", "Photons", "Neutral hadrons", "Charged hadrons"]) + leg = ax2.legend(loc="best", frameon=False, ncol=2, title=legend_title) + leg._legend_box.align = "left" + ax2.set_yscale("log") + ax2.set_ylim(1e1, 1e9) + 
ax2.set_xlabel("Truth particle $\eta$") + ax2.set_ylabel("Truth particles") + return ax1, ax2 + +def plot_num_particles_pid(list, key, ax=None, legend_title=""): + plt.style.use(hep.style.ROOT) + + pid = key_to_pid[key] + if not ax: + plt.figure(figsize=(4,4)) + ax = plt.axes() + + cand_list = list[0] + target_list = list[1] + pf_list = list[2] + + a = np.array(pf_list[key]) + b = np.array(target_list[key]) + + ratio_dpf = (a - b) / b + ratio_dpf[ratio_dpf > 10] = 10 + ratio_dpf[ratio_dpf < -10] = -10 + mu_dpf = np.mean(ratio_dpf) + sigma_dpf = np.std(ratio_dpf) + + ax.scatter( + target_list[key], + cand_list[key], + marker="o", + label="Rule-based PF, $r={0:.3f}$\n$\mu={1:.3f}\\ \sigma={2:.3f}$".format( + np.corrcoef(a, b)[0,1], mu_dpf, sigma_dpf + ), + alpha=0.5 + ) + + c = np.array(cand_list[key]) + b = np.array(target_list[key]) + + ratio_mlpf = (c - b) / b + ratio_mlpf[ratio_mlpf > 10] = 10 + ratio_mlpf[ratio_mlpf < -10] = -10 + mu_mlpf = np.mean(ratio_mlpf) + sigma_mlpf = np.std(ratio_mlpf) + + ax.scatter( + target_list[key], + cand_list[key], + marker="^", + label="MLPF, $r={0:.3f}$\n$\mu={1:.3f}\\ \sigma={2:.3f}$".format( + np.corrcoef(a, b)[0,1], mu_mlpf, sigma_mlpf + ), + alpha=0.5 + ) + + lims = [ + np.min([ax.get_xlim(), ax.get_ylim()]), # min of both axes + np.max([ax.get_xlim(), ax.get_ylim()]), # max of both axes + ] + # now plot both limits against each other + ax.plot(lims, lims, '--', alpha=0.75, zorder=0) + ax.set_aspect('equal') + ax.set_xlim(lims) + ax.set_ylim(lims) + plt.tight_layout() + ax.legend(frameon=False, title=legend_title+pid_names[pid]) + ax.set_xlabel("Truth particles / event") + ax.set_ylabel("Reconstructed particles / event") + plt.title("Particle multiplicity") + +def draw_efficiency_fakerate(ygen, ypred, ycand, pid, var, bins, outpath, both=True, legend_title=""): + var_idx = var_indices[var] + + msk_gen = ygen[:, 0]==pid + msk_pred = ypred[:, 0]==pid + msk_cand = ycand[:, 0]==pid + + hist_gen = np.histogram(ygen[msk_gen, var_idx], bins=bins); + hist_cand = np.histogram(ygen[msk_gen & msk_cand, var_idx], bins=bins); + hist_pred = np.histogram(ygen[msk_gen & msk_pred, var_idx], bins=bins); + + hist_gen = mask_empty(hist_gen) + hist_cand = mask_empty(hist_cand) + hist_pred = mask_empty(hist_pred) + + #efficiency plot + if both: + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8)) + else: + fig, ax1 = plt.subplots(1, 1, figsize=(8, 1*8)) + ax2 = None + + #ax1.set_title("reco efficiency for {}".format(pid_names[pid])) + ax1.errorbar( + midpoints(hist_gen[1]), + divide_zero(hist_cand[0], hist_gen[0]), + divide_zero(np.sqrt(hist_gen[0]), hist_gen[0]) * divide_zero(hist_cand[0], hist_gen[0]), + lw=0, label="Rule-based PF", elinewidth=2, marker=".",markersize=10) + ax1.errorbar( + midpoints(hist_gen[1]), + divide_zero(hist_pred[0], hist_gen[0]), + divide_zero(np.sqrt(hist_gen[0]), hist_gen[0]) * divide_zero(hist_pred[0], hist_gen[0]), + lw=0, label="MLPF", elinewidth=2, marker=".",markersize=10) + ax1.legend(frameon=False, loc=0, title=legend_title+pid_names[pid]) + ax1.set_ylim(0,1.2) + # if var=="energy": + # ax1.set_xlim(0,30) + ax1.set_xlabel(var_names[var]) + ax1.set_ylabel("Efficiency") + + hist_cand2 = np.histogram(ygen[msk_cand & (ygen[:, 0]!=0), var_idx], bins=bins); + hist_pred2 = np.histogram(ygen[msk_pred & (ygen[:, 0]!=0), var_idx], bins=bins); + hist_cand_gen2 = np.histogram(ygen[msk_cand & ~msk_gen & (ygen[:, 0]!=0), var_idx], bins=bins); + hist_pred_gen2 = np.histogram(ygen[msk_pred & ~msk_gen & (ygen[:, 0]!=0), var_idx], bins=bins); + + 
+    hist_cand2 = mask_empty(hist_cand2)
+    hist_cand_gen2 = mask_empty(hist_cand_gen2)
+    hist_pred2 = mask_empty(hist_pred2)
+    hist_pred_gen2 = mask_empty(hist_pred_gen2)
+
+    if both:
+        #fake rate plot
+        #ax2.set_title("reco fake rate for {}".format(pid_names[pid]))
+        ax2.errorbar(
+            midpoints(hist_cand2[1]),
+            divide_zero(hist_cand_gen2[0], hist_cand2[0]),
+            divide_zero(np.sqrt(hist_cand_gen2[0]), hist_cand2[0]),
+            lw=0, label="Rule-based PF", elinewidth=2, marker=".",markersize=10)
+        ax2.errorbar(
+            midpoints(hist_pred2[1]),
+            divide_zero(hist_pred_gen2[0], hist_pred2[0]),
+            divide_zero(np.sqrt(hist_pred_gen2[0]), hist_pred2[0]),
+            lw=0, label="MLPF", elinewidth=2, marker=".",markersize=10)
+        ax2.legend(frameon=False, loc=0, title=legend_title+pid_names[pid])
+        ax2.set_ylim(0, 1.0)
+        #plt.yscale("log")
+        ax2.set_xlabel(var_names[var])
+        ax2.set_ylabel("Fake rate")
+
+    plt.savefig(outpath, bbox_inches="tight")
+    plt.close(fig)
+
+    return ax1, ax2
+
+def get_eff(ygen, ypred, ycand, pid, var, bins):
+    var_idx = var_indices[var]
+
+    msk_gen = (ygen[:, 0]==pid) & (ygen[:, var_indices["pt"]]>5.0)
+    msk_pred = ypred[:, 0]==pid
+    msk_cand = ycand[:, 0]==pid
+
+    hist_gen = np.histogram(ygen[msk_gen, var_idx], bins=bins);
+    hist_cand = np.histogram(ygen[msk_gen & msk_cand, var_idx], bins=bins);
+    hist_pred = np.histogram(ygen[msk_gen & msk_pred, var_idx], bins=bins);
+
+    hist_gen = mask_empty(hist_gen)
+    hist_cand = mask_empty(hist_cand)
+    hist_pred = mask_empty(hist_pred)
+
+    return {
+        "x": midpoints(hist_gen[1]),
+        "y": divide_zero(hist_pred[0], hist_gen[0]),
+        "yerr": divide_zero(np.sqrt(hist_gen[0]), hist_gen[0]) * divide_zero(hist_pred[0], hist_gen[0])
+    }
+
+def get_fake(ygen, ypred, ycand, pid, var, bins):
+    var_idx = var_indices[var]
+
+    msk_gen = ygen[:, 0]==pid
+    msk_pred = ypred[:, 0]==pid
+    msk_cand = ycand[:, 0]==pid
+
+    hist_cand2 = np.histogram(ygen[msk_cand & (ygen[:, 0]!=0), var_idx], bins=bins);
+    hist_pred2 = np.histogram(ygen[msk_pred & (ygen[:, 0]!=0), var_idx], bins=bins);
+    hist_cand_gen2 = np.histogram(ygen[msk_cand & ~msk_gen & (ygen[:, 0]!=0), var_idx], bins=bins);
+    hist_pred_gen2 = np.histogram(ygen[msk_pred & ~msk_gen & (ygen[:, 0]!=0), var_idx], bins=bins);
+
+    hist_cand2 = mask_empty(hist_cand2)
+    hist_cand_gen2 = mask_empty(hist_cand_gen2)
+    hist_pred2 = mask_empty(hist_pred2)
+    hist_pred_gen2 = mask_empty(hist_pred_gen2)
+
+    return {
+        "x": midpoints(hist_pred2[1]),
+        "y": divide_zero(hist_pred_gen2[0], hist_pred2[0]),
+        "yerr": divide_zero(np.sqrt(hist_pred_gen2[0]), hist_pred2[0])
+    }
+
+def plot_reso(ygen, ypred, ycand, pid, var, rng, ax=None, legend_title=""):
+    plt.style.use(hep.style.ROOT)
+
+    var_idx = var_indices[var]
+    msk = (ygen[:, 0]==pid) & (ycand[:, 0]==pid)
+    bins = np.linspace(-rng, rng, 100)
+    yg = ygen[msk, var_idx]
+    yp = ypred[msk, var_idx]
+
+    yc = ycand[msk, var_idx]
+    ratio_mlpf = (yp - yg) / yg
+    ratio_dpf = (yc - yg) / yg
+
+    #remove outliers for std value computation
+    outlier = 10
+    ratio_mlpf[ratio_mlpf<-outlier] = -outlier
+    ratio_mlpf[ratio_mlpf>outlier] = outlier
+    ratio_dpf[ratio_dpf<-outlier] = -outlier
+    ratio_dpf[ratio_dpf>outlier] = outlier
+
+    res_dpf = np.mean(ratio_dpf), np.std(ratio_dpf)
+    res_mlpf = np.mean(ratio_mlpf), np.std(ratio_mlpf)
+
+    if ax is None:
+        plt.figure(figsize=(4, 4))
+        ax = plt.axes()
+
+    #plt.title("{} resolution for {}".format(var_names_nounit[var], pid_names[pid]))
+    ax.hist(ratio_dpf, bins=bins, histtype="step", lw=2, label="Rule-based PF\n$\mu={:.2f},\\ \sigma={:.2f}$".format(*res_dpf));
+    ax.hist(ratio_mlpf, bins=bins, histtype="step", lw=2, label="MLPF\n$\mu={:.2f},\\ 
\sigma={:.2f}$".format(*res_mlpf)); + ax.legend(frameon=False, title=legend_title+pid_names[pid]) + ax.set_xlabel("{nounit} resolution, $({bare}^\prime - {bare})/{bare}$".format(nounit=var_names_nounit[var],bare=var_names_bare[var])) + ax.set_ylabel("Particles") + #plt.ylim(0, ax.get_ylim()[1]*2) + ax.set_ylim(1, 1e10) + ax.set_yscale("log") + + return {"dpf": res_dpf, "mlpf": res_mlpf} diff --git a/mlpf/pytorch_delphes/training.py b/mlpf/pytorch_delphes/training.py new file mode 100644 index 000000000..3ebee8f30 --- /dev/null +++ b/mlpf/pytorch_delphes/training.py @@ -0,0 +1,508 @@ +from glob import glob +import sys, os +sys.path.insert(1, '../../plotting/') +sys.path.insert(1, '../../mlpf/plotting/') + +import os.path as osp +import pickle as pkl +import math, time, numba, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import mplhep as hep + +#Check if the GPU configuration has been provided +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + import setGPU + if multi_gpu: + print('Will use multi_gpu..') + print("Let's use", torch.cuda.device_count(), "GPUs!") + else: + print('Will use single_gpu..') +except Exception as e: + print("Could not import setGPU, running CPU-only") + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') + print("GPU model:", torch.cuda.get_device_name(0)) +else: + device = torch.device('cpu') + +import torch_geometric +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch_geometric.nn import GravNetConv +from torch.utils.data import random_split +import torch_cluster + +import args +from args import parse_args +from graph_data_delphes import PFGraphDataset, one_hot_embedding +from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd +from plot_utils import plot_confusion_matrix + +import evaluate +from evaluate import make_plots, make_predictions +from model import PFNet7 + +#Ignore divide by 0 errors +np.seterr(divide='ignore', invalid='ignore') + +#Get a unique directory name for the model +def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, alpha, task, title): + model_name = type(model).__name__ + model_params = sum(p.numel() for p in model.parameters()) + import hashlib + model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] + model_user = os.environ['USER'] + + model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_alpha_{}_{}_{}'.format( + model_name, + target_type, + n_train, + n_epochs, + batch_size, + lr, + alpha, + task, + title) + return model_fname + +def compute_weights(gen_ids_one_hot, device): + vs, cs = torch.unique(gen_ids_one_hot, return_counts=True) + weights = torch.zeros(output_dim_id).to(device=device) + for k, v in zip(vs, cs): + weights[k] = 1.0/math.sqrt(float(v)) + return weights + +def make_plot_from_list(l, label, xlabel, ylabel, outpath, save_as): + plt.style.use(hep.style.ROOT) + + if not os.path.exists(outpath + '/training_plots/'): + os.makedirs(outpath + '/training_plots/') + + fig, ax = plt.subplots() + 
ax.plot(range(len(l)), l, label=label) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + ax.legend(loc='best') + plt.savefig(outpath + '/training_plots/' + save_as + '.png') + plt.close(fig) + + with open(outpath + '/training_plots/' + save_as + '.pkl', 'wb') as f: + pkl.dump(l, f) + +@torch.no_grad() +def test(model, loader, epoch, alpha, target_type, device): + with torch.no_grad(): + ret = train(model, loader, epoch, None, alpha, target_type, device) + return ret + +def train(model, loader, epoch, optimizer, alpha, target_type, device): + + is_train = not (optimizer is None) + + if is_train: + model.train() + else: + model.eval() + + #loss values for each batch: classification, regression, total + losses_1, losses_2, losses_tot = [], [], [] + + #accuracy values for each batch (monitor classification performance) + accuracies_batch, accuracies_batch_msk = [], [] + + #setup confusion matrix + conf_matrix = np.zeros((output_dim_id, output_dim_id)) + + # to compute average inference time + t=[] + + for i, batch in enumerate(loader): + t0 = time.time() + + if multi_gpu: + X = batch + else: + X = batch.to(device) + + ## make like tensorflow model, 0-padding events to 6k elements + # if X.x.shape[0]<6000: + # new_X = torch.cat([X.x,torch.zeros_like(X.x)[:6000-X.x.shape[0],:]]) + # new_ygen_id = torch.cat([X.ygen_id,torch.zeros_like(X.ygen_id)[:6000-X.x.shape[0],:]]) + # new_ygen_id[X.x.shape[0]:,0]=new_ygen_id[X.x.shape[0]:,0]+1 + # + # X.x = new_X + # X.ygen_id=new_ygen_id + + # Forwardprop + if i<10: + ti = time.time() + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + tf = time.time() + t.append(round((tf-ti),2)) + else: + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + + _, gen_ids = torch.max(gen_ids_one_hot, -1) + _, pred_ids = torch.max(pred_ids_one_hot, -1) + _, cand_ids = torch.max(cand_ids_one_hot, -1) # rule-based result + + # masking + msk = ((pred_ids != 0) & (gen_ids != 0)) + msk2 = ((pred_ids != 0) & (pred_ids == gen_ids)) + + # computing loss + weights = compute_weights(torch.max(gen_ids_one_hot,-1)[1], device) + l1 = torch.nn.functional.cross_entropy(pred_ids_one_hot, gen_ids, weight=weights) # for classifying PID + l2 = alpha * torch.nn.functional.mse_loss(pred_p4[msk2], gen_p4[msk2]) # for regressing p4 + + if args.classification_only: + loss = l1 + else: + loss = l1+l2 + + if is_train: + # BACKPROP + #print(list(model.parameters())[1].grad) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + losses_1.append(l1.detach().cpu().item()) + losses_2.append(l2.detach().cpu().item()) + losses_tot.append(loss.detach().cpu().item()) + + t1 = time.time() + + accuracies_batch.append(accuracy_score(gen_ids.detach().cpu().numpy(), pred_ids.detach().cpu().numpy())) + accuracies_batch_msk.append(accuracy_score(gen_ids[msk].detach().cpu().numpy(), pred_ids[msk].detach().cpu().numpy())) + + conf_matrix += sklearn.metrics.confusion_matrix(gen_ids.detach().cpu().numpy(), + np.argmax(pred_ids_one_hot.detach().cpu().numpy(),axis=1), labels=range(6)) + + print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), loss.detach().cpu().item(), t1-t0), end='\r', flush=True) + + print("Average Inference time is: ", round((sum(t) / len(t)),2), 'min') + + losses_1 = np.mean(losses_1) + losses_2 = np.mean(losses_2) + losses_tot = np.mean(losses_tot) + + acc = np.mean(accuracies_batch) + acc_msk = np.mean(accuracies_batch_msk) + + conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis] + 
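+    # each row (true class) of conf_matrix_norm now sums to 1, so the diagonal gives the per-class recall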
+ return losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm + + +def train_loop(): + t0_initial = time.time() + + losses_1_train, losses_2_train, losses_tot_train = [], [], [] + losses_1_valid, losses_2_valid, losses_tot_valid = [], [], [] + + accuracies_train, accuracies_msk_train = [], [] + accuracies_valid, accuracies_msk_valid = [], [] + + best_val_loss = 99999.9 + stale_epochs = 0 + + print("Training over {} epochs".format(args.n_epochs)) + for epoch in range(args.n_epochs): + t0 = time.time() + + if stale_epochs > patience: + print("breaking due to stale epochs") + break + + # training epoch + model.train() + losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm = train(model, train_loader, epoch, optimizer, args.alpha, args.target, device) + + losses_tot_train.append(losses_tot) + losses_1_train.append(losses_1) + losses_2_train.append(losses_2) + + accuracies_train.append(acc) + accuracies_msk_train.append(acc_msk) + + # validation step + model.eval() + losses_tot_v, losses_1_v, losses_2_v, acc_v, acc_msk_v, conf_matrix_v, conf_matrix_norm_v = test(model, valid_loader, epoch, args.alpha, args.target, device) + + losses_tot_valid.append(losses_tot_v) + losses_1_valid.append(losses_1_v) + losses_2_valid.append(losses_2_v) + + accuracies_valid.append(acc_v) + accuracies_msk_valid.append(acc_msk_v) + + # early-stopping + if losses_tot_v < best_val_loss: + best_val_loss = losses_tot_v + stale_epochs = 0 + else: + stale_epochs += 1 + + t1 = time.time() + + epochs_remaining = args.n_epochs - (epoch+1) + time_per_epoch = (t1 - t0_initial)/(epoch + 1) + eta = epochs_remaining*time_per_epoch/60 + + print("epoch={}/{} dt={:.2f}min train_loss={:.5f} valid_loss={:.5f} train_acc={:.5f} valid_acc={:.5f} train_acc_msk={:.5f} valid_acc_msk={:.5f} stale={} eta={:.1f}m".format( + epoch+1, args.n_epochs, + (t1-t0)/60, losses_tot_train[epoch], losses_tot_valid[epoch], accuracies_train[epoch], accuracies_valid[epoch], + accuracies_msk_train[epoch], accuracies_msk_valid[epoch], stale_epochs, eta)) + + torch.save(model.state_dict(), "{0}/epoch_{1}_weights.pth".format(outpath, epoch)) + + plot_confusion_matrix(conf_matrix_norm, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch), epoch=epoch) + plot_confusion_matrix(conf_matrix_norm_v, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch), epoch=epoch) + + torch.save(conf_matrix_norm, outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch) + '.pt') + torch.save(conf_matrix_norm_v, outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch) + '.pkl') + + make_plot_from_list(losses_tot_train, 'train loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_train') + make_plot_from_list(losses_1_train, 'train loss_1', 'Epochs', 'Loss', outpath, 'losses_1_train') + make_plot_from_list(losses_2_train, 'train loss_2', 'Epochs', 'Loss', outpath, 'losses_2_train') + + make_plot_from_list(losses_tot_valid, 'valid loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_valid') + make_plot_from_list(losses_1_valid, 'valid loss_1', 'Epochs', 'Loss', outpath, 'losses_1_valid') + make_plot_from_list(losses_2_valid, 'valid loss_2', 'Epochs', 'Loss', outpath, 'losses_2_valid') + + make_plot_from_list(accuracies_train, 'train accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_train') + make_plot_from_list(accuracies_msk_train, 'train accuracy_msk', 'Epochs', 'Accuracy', outpath, 
'accuracies_msk_train') + + make_plot_from_list(accuracies_valid, 'valid accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_valid') + make_plot_from_list(accuracies_msk_valid, 'valid accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_valid') + + print('Done with training.') + + return + +if __name__ == "__main__": + + args = parse_args() + + # # the next part initializes some args values (to run the script not from terminal) + # class objectview(object): + # def __init__(self, d): + # self.__dict__ = d + # + # args = objectview({'train': True, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 15, 'patience': 100, 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, + # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../test_tmp_delphes/data/pythia8_qcd', + # 'outpath': '../../prp/models/yee/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, + # 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, 'overwrite': True, + # 'load': False, 'load_epoch': 9, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + # 'classification_only': False, 'nn1': True, 'nn3': True, 'encoding_of_clusters': False, 'embedding_dim': 0, 'nn0track': False, 'nn0cluster': False, 'title': 'noembeddings', + # 'make_predictions_train': False, 'make_plots_train': False, 'make_predictions_valid': False, 'make_plots_valid': False, 'make_predictions_test': True, 'make_plots_test': True}) + + # define the dataset (assumes the data exists as .pt files in "processed") + print('Processing the data..') + full_dataset_ttbar = PFGraphDataset(args.dataset) + full_dataset_qcd = PFGraphDataset(args.dataset_qcd) + + # constructs a loader from the data to iterate over batches + print('Constructing data loaders..') + train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) + test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) + + # element parameters + input_dim = 12 + + #one-hot particle ID and momentum + output_dim_id = 6 + output_dim_p4 = 6 + + patience = args.patience + + model_classes = {"PFNet7": PFNet7} + + model_class = model_classes[args.model] + model_kwargs = {'input_dim': input_dim, + 'hidden_dim': args.hidden_dim, + 'hidden_dim_nn1': args.hidden_dim_nn1, + 'input_encoding': args.input_encoding, + 'encoding_dim': args.encoding_dim, + 'output_dim_id': output_dim_id, + 'output_dim_p4': output_dim_p4, + 'space_dim': args.space_dim, + 'propagate_dimensions': args.propagate_dimensions, + 'nearest': args.nearest, + 'target': args.target, + 'nn1': args.nn1, + 'nn3': args.nn3} + + if args.load: + print('Loading a previously trained model..') + model = model_class(**model_kwargs) + outpath = args.outpath + args.load_model + PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' + + state_dict = torch.load(PATH, map_location=device) + + if "DataParallel" in args.load_model: # if the model was trained using DataParallel then we do this + state_dict = torch.load(PATH, map_location=device) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove module. 
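+                # 'module.' (7 characters) is the prefix that torch.nn.DataParallel adds to every parameter name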
+ new_state_dict[name] = v + # print('name is:', name) + state_dict=new_state_dict + + model.load_state_dict(state_dict) + + if multi_gpu: + model = torch_geometric.nn.DataParallel(model) + #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP + + model.to(device) + + if args.train: + print("Training a previously trained model..") + + elif args.train: + #instantiate the model + print('Instantiating a model..') + model = model_class(**model_kwargs) + + if multi_gpu: + print("Parallelizing the training..") + model = torch_geometric.nn.DataParallel(model) + #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP + + model.to(device) + + if args.train: + args.title=args.title+'noskip' + if args.nn1: + args.title=args.title+'_nn1' + if args.nn3: + args.title=args.title+'_nn3' + + if args.classification_only: + model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "clf", args.title) + else: + model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "both", args.title) + + outpath = osp.join(args.outpath, model_fname) + if osp.isdir(outpath): + if args.overwrite: + print("model output {} already exists, deleting it".format(outpath)) + import shutil + shutil.rmtree(outpath) + else: + print("model output {} already exists, please delete it".format(outpath)) + sys.exit(0) + try: + os.makedirs(outpath) + except Exception as e: + pass + + with open('{}/model_kwargs.pkl'.format(outpath), 'wb') as f: + pkl.dump(model_kwargs, f, protocol=pkl.HIGHEST_PROTOCOL) + + if not os.path.exists(outpath + '/confusion_matrix_plots/'): + os.makedirs(outpath + '/confusion_matrix_plots/') + + if args.optimizer == "adam": + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + elif args.optimizer == "adamw": + optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr) + + print(model) + print(model_fname) + + model.train() + train_loop() + + model.eval() + + # evaluate on training data.. + if not osp.isdir(outpath+'/train_loader'): + os.makedirs(outpath+'/train_loader') + if not osp.isdir(outpath+'/train_loader/resolution_plots'): + os.makedirs(outpath+'/train_loader/resolution_plots') + if not osp.isdir(outpath+'/train_loader/distribution_plots'): + os.makedirs(outpath+'/train_loader/distribution_plots') + if not osp.isdir(outpath+'/train_loader/multiplicity_plots'): + os.makedirs(outpath+'/train_loader/multiplicity_plots') + if not osp.isdir(outpath+'/train_loader/efficiency_plots'): + os.makedirs(outpath+'/train_loader/efficiency_plots') + + if args.make_predictions_train: + make_predictions(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + if args.make_plots_train: + make_plots(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + + # evaluate on validation data.. 
+ if not osp.isdir(outpath+'/valid_loader'): + os.makedirs(outpath+'/valid_loader') + if not osp.isdir(outpath+'/valid_loader/resolution_plots'): + os.makedirs(outpath+'/valid_loader/resolution_plots') + if not osp.isdir(outpath+'/valid_loader/distribution_plots'): + os.makedirs(outpath+'/valid_loader/distribution_plots') + if not osp.isdir(outpath+'/valid_loader/multiplicity_plots'): + os.makedirs(outpath+'/valid_loader/multiplicity_plots') + if not osp.isdir(outpath+'/valid_loader/efficiency_plots'): + os.makedirs(outpath+'/valid_loader/efficiency_plots') + + if args.make_predictions_valid: + make_predictions(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + if args.make_plots_valid: + make_plots(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + + # evaluate on testing data.. + if not osp.isdir(outpath+'/test_loader'): + os.makedirs(outpath+'/test_loader') + if not osp.isdir(outpath+'/test_loader/resolution_plots'): + os.makedirs(outpath+'/test_loader/resolution_plots') + if not osp.isdir(outpath+'/test_loader/distribution_plots'): + os.makedirs(outpath+'/test_loader/distribution_plots') + if not osp.isdir(outpath+'/test_loader/multiplicity_plots'): + os.makedirs(outpath+'/test_loader/multiplicity_plots') + if not osp.isdir(outpath+'/test_loader/efficiency_plots'): + os.makedirs(outpath+'/test_loader/efficiency_plots') + + if args.make_predictions_test: + if args.load: + make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + else: + make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + if args.make_plots_test: + if args.load: + make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + else: + make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + + +## ----------------------------------------------------------- +# to retrieve a stored variable in pkl file +# import pickle as pkl +# with open('../../test_tmp_delphes/experiments/PFNet7_gen_ntrain_2_nepochs_3_batch_size_3_lr_0.0001/confusion_matrix_plots/cmT_normed_epoch_0.pkl', 'rb') as f: # Python 3: open(..., 'rb') +# a = pkl.load(f) +# +# with open('../../data/pythia8_qcd/raw/tev14_pythia8_qcd_10_0.pkl', 'rb') as pickle_file: +# data = pkl.load(pickle_file) +# +# data.keys() diff --git a/mlpf/pytorch_delphes/training_dnn.py b/mlpf/pytorch_delphes/training_dnn.py new file mode 100644 index 000000000..16315eb75 --- /dev/null +++ b/mlpf/pytorch_delphes/training_dnn.py @@ -0,0 +1,497 @@ +from glob import glob +import sys, os +sys.path.insert(1, '../../plotting/') +sys.path.insert(1, '../../mlpf/plotting/') + +import os.path as osp +import pickle as pkl +import math, time, numba, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import mplhep as hep + +#Check if the GPU configuration has been provided +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + import setGPU + if multi_gpu: + print('Will use multi_gpu..') + print("Let's use", torch.cuda.device_count(), "GPUs!") + else: + print('Will use 
single_gpu..') +except Exception as e: + print("Could not import setGPU, running CPU-only") + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') +else: + device = torch.device('cpu') + +import torch_geometric +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch_geometric.nn import GravNetConv +from torch.utils.data import random_split +import torch_cluster + +import args +from args import parse_args +from graph_data_delphes import PFGraphDataset, one_hot_embedding +from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd +import evaluate +from evaluate import make_plots, make_predictions +from plot_utils import plot_confusion_matrix +from model_dnn import PFNet7 + +#Ignore divide by 0 errors +np.seterr(divide='ignore', invalid='ignore') + +#Get a unique directory name for the model +def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, alpha, task, title): + model_name = type(model).__name__ + model_params = sum(p.numel() for p in model.parameters()) + import hashlib + model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] + model_user = os.environ['USER'] + + model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_alpha_{}_{}_{}'.format( + model_name, + target_type, + n_train, + n_epochs, + batch_size, + lr, + alpha, + task, + title) + return model_fname + +def compute_weights(target_ids_one_hot, device): + vs, cs = torch.unique(target_ids_one_hot, return_counts=True) + weights = torch.zeros(output_dim_id).to(device=device) + for k, v in zip(vs, cs): + weights[k] = 1.0/math.sqrt(float(v)) + return weights + +def make_plot_from_list(l, label, xlabel, ylabel, outpath, save_as): + if not os.path.exists(outpath + '/training_plots/'): + os.makedirs(outpath + '/training_plots/') + + fig, ax = plt.subplots() + ax.plot(range(len(l)), l, label=label) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + ax.legend(loc='best') + plt.savefig(outpath + '/training_plots/' + save_as + '.png') + plt.close(fig) + + with open(outpath + '/training_plots/' + save_as + '.pkl', 'wb') as f: + pkl.dump(l, f) + +@torch.no_grad() +def test(model, loader, epoch, alpha, target_type, device): + with torch.no_grad(): + ret = train(model, loader, epoch, None, alpha, target_type, device) + return ret + +def train(model, loader, epoch, optimizer, alpha, target_type, device): + + is_train = not (optimizer is None) + + if is_train: + model.train() + else: + model.eval() + + #loss values for each batch: classification, regression, total + losses_1, losses_2, losses_tot = [], [], [] + + #accuracy values for each batch (monitor classification performance) + accuracies_batch, accuracies_batch_msk = [], [] + + #setup confusion matrix + conf_matrix = np.zeros((output_dim_id, output_dim_id)) + + # to compute average inference time + t=[] + + for i, batch in enumerate(loader): + t0 = time.time() + + if multi_gpu: + X = batch + else: + X = batch.to(device) + + ## make like tensorflow model, 0-padding events to 6k elements + # if X.x.shape[0]<6000: + # new_X = torch.cat([X.x,torch.zeros_like(X.x)[:6000-X.x.shape[0],:]]) + # new_ygen_id = torch.cat([X.ygen_id,torch.zeros_like(X.ygen_id)[:6000-X.x.shape[0],:]]) + # new_ygen_id[X.x.shape[0]:,0]=new_ygen_id[X.x.shape[0]:,0]+1 + 
# + # X.x = new_X + # X.ygen_id=new_ygen_id + + # Forwardprop + if i<10: + ti = time.time() + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + tf = time.time() + t.append(round((tf-ti),2)) + else: + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + + _, gen_ids = torch.max(gen_ids_one_hot, -1) + _, pred_ids = torch.max(pred_ids_one_hot, -1) + _, cand_ids = torch.max(cand_ids_one_hot, -1) # rule-based result + + # masking + msk = ((pred_ids != 0) & (gen_ids != 0)) + msk2 = ((pred_ids != 0) & (pred_ids == gen_ids)) + + # computing loss + weights = compute_weights(torch.max(gen_ids_one_hot,-1)[1], device) + l1 = torch.nn.functional.cross_entropy(pred_ids_one_hot, gen_ids, weight=weights) # for classifying PID + l2 = alpha * torch.nn.functional.mse_loss(pred_p4[msk2], gen_p4[msk2]) # for regressing p4 + + if args.classification_only: + loss = l1 + else: + loss = l1+l2 + + losses_1.append(l1.item()) + losses_2.append(l2.item()) + losses_tot.append(loss.item()) + + if is_train: + # BACKPROP + #print(list(model.parameters())[1].grad) + a = list(model.parameters())[1].clone() + optimizer.zero_grad() + loss.backward() + optimizer.step() + b = list(model.parameters())[1].clone() + if torch.equal(a.data, b.data): + print('Model is not learning.. weights are not updating..') + + t1 = time.time() + + accuracies_batch.append(accuracy_score(gen_ids.detach().cpu().numpy(), pred_ids.detach().cpu().numpy())) + accuracies_batch_msk.append(accuracy_score(gen_ids[msk].detach().cpu().numpy(), pred_ids[msk].detach().cpu().numpy())) + + conf_matrix += sklearn.metrics.confusion_matrix(gen_ids.detach().cpu().numpy(), + np.argmax(pred_ids_one_hot.detach().cpu().numpy(),axis=1), labels=range(6)) + + print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), loss.item(), t1-t0), end='\r', flush=True) + + print("Average Inference time is: ", round((sum(t) / len(t)),2), 'min') + + losses_1 = np.mean(losses_1) + losses_2 = np.mean(losses_2) + losses_tot = np.mean(losses_tot) + + acc = np.mean(accuracies_batch) + acc_msk = np.mean(accuracies_batch_msk) + + conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis] + + return losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm + + +def train_loop(): + t0_initial = time.time() + + losses_1_train, losses_2_train, losses_tot_train = [], [], [] + losses_1_valid, losses_2_valid, losses_tot_valid = [], [], [] + + accuracies_train, accuracies_msk_train = [], [] + accuracies_valid, accuracies_msk_valid = [], [] + + best_val_loss = 99999.9 + stale_epochs = 0 + + print("Training over {} epochs".format(args.n_epochs)) + for epoch in range(args.n_epochs): + t0 = time.time() + + if stale_epochs > patience: + print("breaking due to stale epochs") + break + + # training epoch + model.train() + losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm = train(model, train_loader, epoch, optimizer, args.alpha, args.target, device) + + losses_tot_train.append(losses_tot) + losses_1_train.append(losses_1) + losses_2_train.append(losses_2) + + accuracies_train.append(acc) + accuracies_msk_train.append(acc_msk) + + # validation step + model.eval() + losses_tot_v, losses_1_v, losses_2_v, acc_v, acc_msk_v, conf_matrix_v, conf_matrix_norm_v = test(model, valid_loader, epoch, args.alpha, args.target, device) + + losses_tot_valid.append(losses_tot_v) + losses_1_valid.append(losses_1_v) + losses_2_valid.append(losses_2_v) + + accuracies_valid.append(acc_v) 
+ accuracies_msk_valid.append(acc_msk_v) + + # early-stopping + if losses_tot_v < best_val_loss: + best_val_loss = losses_tot_v + stale_epochs = 0 + else: + stale_epochs += 1 + + t1 = time.time() + + epochs_remaining = args.n_epochs - (epoch+1) + time_per_epoch = (t1 - t0_initial)/(epoch + 1) + eta = epochs_remaining*time_per_epoch/60 + + print("epoch={}/{} dt={:.2f}min train_loss={:.5f} valid_loss={:.5f} train_acc={:.5f} valid_acc={:.5f} train_acc_msk={:.5f} valid_acc_msk={:.5f} stale={} eta={:.1f}m".format( + epoch+1, args.n_epochs, + (t1-t0)/60, losses_tot_train[epoch], losses_tot_valid[epoch], accuracies_train[epoch], accuracies_valid[epoch], + accuracies_msk_train[epoch], accuracies_msk_valid[epoch], stale_epochs, eta)) + + torch.save(model.state_dict(), "{0}/epoch_{1}_weights.pth".format(outpath, epoch)) + + plot_confusion_matrix(conf_matrix_norm, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch), epoch=epoch) + plot_confusion_matrix(conf_matrix_norm_v, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch), epoch=epoch) + + with open(outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch) + '.pkl', 'wb') as f: + pkl.dump(conf_matrix_norm, f) + + with open(outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch) + '.pkl', 'wb') as f: + pkl.dump(conf_matrix_norm_v, f) + + make_plot_from_list(losses_tot_train, 'train loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_train') + make_plot_from_list(losses_1_train, 'train loss_1', 'Epochs', 'Loss', outpath, 'losses_1_train') + make_plot_from_list(losses_2_train, 'train loss_2', 'Epochs', 'Loss', outpath, 'losses_2_train') + + make_plot_from_list(losses_tot_valid, 'valid loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_valid') + make_plot_from_list(losses_1_valid, 'valid loss_1', 'Epochs', 'Loss', outpath, 'losses_1_valid') + make_plot_from_list(losses_2_valid, 'valid loss_2', 'Epochs', 'Loss', outpath, 'losses_2_valid') + + make_plot_from_list(accuracies_train, 'train accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_train') + make_plot_from_list(accuracies_msk_train, 'train accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_train') + + make_plot_from_list(accuracies_valid, 'valid accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_valid') + make_plot_from_list(accuracies_msk_valid, 'valid accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_valid') + + print('Done with training.') + + return + +if __name__ == "__main__": + + args = parse_args() + + # # the next part initializes some args values (to run the script not from terminal) + # class objectview(object): + # def __init__(self, d): + # self.__dict__ = d + # + # args = objectview({'train': True, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 1, 'patience': 100, 'hidden_dim':256, 'hidden_dim_nn1':64, 'input_encoding': 12, 'encoding_dim': 64, + # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../test_tmp_delphes/data/pythia8_qcd', + # 'outpath': '../../prp/models/yee/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, 'overwrite': True, + # 'load': False, 'load_epoch': 0, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_1_batch_size_1_lr_0.001_alpha_0.0002_both_dnnnoskip_nn1_nn3_nn4', + # 'classification_only': False, 'nn1': True, 'nn3': True, 'nn4': True, 'title': 'dnn', + # 'make_predictions_train': True, 
'make_plots_train': True, 'make_predictions_valid': True, 'make_plots_valid': True, 'make_predictions_test': True, 'make_plots_test': True}) + + # define the dataset (assumes the data exists as .pt files in "processed") + print('Processing the data..') + full_dataset_ttbar = PFGraphDataset(args.dataset) + full_dataset_qcd = PFGraphDataset(args.dataset_qcd) + + # constructs a loader from the data to iterate over batches + print('Constructing data loaders..') + train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) + test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) + + # element parameters + input_dim = 12 + + #one-hot particle ID and momentum + output_dim_id = 6 + output_dim_p4 = 6 + + patience = args.patience + + model_classes = {"PFNet7": PFNet7} + + model_class = model_classes[args.model] + model_kwargs = {'input_dim': input_dim, + 'hidden_dim': args.hidden_dim, + 'hidden_dim_nn1': args.hidden_dim_nn1, + 'input_encoding': args.input_encoding, + 'encoding_dim': args.encoding_dim, + 'output_dim_id': output_dim_id, + 'output_dim_p4': output_dim_p4, + 'target': args.target, + 'nn1': args.nn1, + 'nn3': args.nn3, + 'nn4': args.nn4} + + if args.load: + print('Loading a previously trained model..') + model = model_class(**model_kwargs) + outpath = args.outpath + args.load_model + PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' + + state_dict = torch.load(PATH, map_location=device) + + if "DataParallel" in args.load_model: # if the model was trained using DataParallel then we do this + state_dict = torch.load(PATH, map_location=device) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove module. 
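+ # nn.DataParallel (and the torch_geometric DataParallel used here, which subclasses it)
+ # keeps the wrapped network under a .module attribute, so every key in the saved
+ # state_dict carries a "module." prefix (7 characters); stripping it makes the keys
+ # line up with the plain, un-wrapped model constructed above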
+ new_state_dict[name] = v + # print('name is:', name) + state_dict=new_state_dict + + model.load_state_dict(state_dict) + + if args.train: + print("Training a previously trained model..") + + elif args.train: + #instantiate the model + print('Instantiating a model..') + model = model_class(**model_kwargs) + + if multi_gpu: + print("Parallelizing the training..") + model = torch_geometric.nn.DataParallel(model) + #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP + + model.to(device) + + if args.train: + args.title=args.title+'noskip' + if args.nn1: + args.title=args.title+'_nn1' + if args.nn3: + args.title=args.title+'_nn3' + if args.nn4: + args.title=args.title+'_nn4' + + if args.classification_only: + model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "clf", args.title) + else: + model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "both", args.title) + + outpath = osp.join(args.outpath, model_fname) + if osp.isdir(outpath): + if args.overwrite: + print("model output {} already exists, deleting it".format(outpath)) + import shutil + shutil.rmtree(outpath) + else: + print("model output {} already exists, please delete it".format(outpath)) + sys.exit(0) + try: + os.makedirs(outpath) + except Exception as e: + pass + + with open('{}/model_kwargs.pkl'.format(outpath), 'wb') as f: + pkl.dump(model_kwargs, f, protocol=pkl.HIGHEST_PROTOCOL) + + if not os.path.exists(outpath + '/confusion_matrix_plots/'): + os.makedirs(outpath + '/confusion_matrix_plots/') + + if args.optimizer == "adam": + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + elif args.optimizer == "adamw": + optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr) + + print(model) + print(model_fname) + + model.train() + train_loop() + model.eval() + + # evaluate on training data.. + if not osp.isdir(outpath+'/train_loader'): + os.makedirs(outpath+'/train_loader') + if not osp.isdir(outpath+'/train_loader/resolution_plots'): + os.makedirs(outpath+'/train_loader/resolution_plots') + if not osp.isdir(outpath+'/train_loader/distribution_plots'): + os.makedirs(outpath+'/train_loader/distribution_plots') + if not osp.isdir(outpath+'/train_loader/multiplicity_plots'): + os.makedirs(outpath+'/train_loader/multiplicity_plots') + if not osp.isdir(outpath+'/train_loader/efficiency_plots'): + os.makedirs(outpath+'/train_loader/efficiency_plots') + + if args.make_predictions_train: + make_predictions(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + if args.make_plots_train: + make_plots(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + + # evaluate on validation data.. 
+ if not osp.isdir(outpath+'/valid_loader'): + os.makedirs(outpath+'/valid_loader') + if not osp.isdir(outpath+'/valid_loader/resolution_plots'): + os.makedirs(outpath+'/valid_loader/resolution_plots') + if not osp.isdir(outpath+'/valid_loader/distribution_plots'): + os.makedirs(outpath+'/valid_loader/distribution_plots') + if not osp.isdir(outpath+'/valid_loader/multiplicity_plots'): + os.makedirs(outpath+'/valid_loader/multiplicity_plots') + if not osp.isdir(outpath+'/valid_loader/efficiency_plots'): + os.makedirs(outpath+'/valid_loader/efficiency_plots') + + if args.make_predictions_valid: + make_predictions(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + if args.make_plots_valid: + make_plots(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + + # evaluate on testing data.. + if not osp.isdir(outpath+'/test_loader'): + os.makedirs(outpath+'/test_loader') + if not osp.isdir(outpath+'/test_loader/resolution_plots'): + os.makedirs(outpath+'/test_loader/resolution_plots') + if not osp.isdir(outpath+'/test_loader/distribution_plots'): + os.makedirs(outpath+'/test_loader/distribution_plots') + if not osp.isdir(outpath+'/test_loader/multiplicity_plots'): + os.makedirs(outpath+'/test_loader/multiplicity_plots') + if not osp.isdir(outpath+'/test_loader/efficiency_plots'): + os.makedirs(outpath+'/test_loader/efficiency_plots') + + if args.make_predictions_test: + if args.load: + make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + else: + make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + if args.make_plots_test: + if args.load: + make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + else: + make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + +## ----------------------------------------------------------- +# # to retrieve a stored variable in pkl file +# import pkl +# with open('../../test_tmp_delphes/experiments/PFNet7_gen_ntrain_2_nepochs_3_batch_size_3_lr_0.0001/confusion_matrix_plots/cmT_normed_epoch_0.pkl', 'rb') as f: # Python 3: open(..., 'rb') +# a = pkl.load(f) diff --git a/mlpf/pytorch_delphes/training_embeddings.py b/mlpf/pytorch_delphes/training_embeddings.py new file mode 100644 index 000000000..7d95863b8 --- /dev/null +++ b/mlpf/pytorch_delphes/training_embeddings.py @@ -0,0 +1,512 @@ +from glob import glob +import sys, os +sys.path.insert(1, '../../plotting/') +sys.path.insert(1, '../../mlpf/plotting/') + +import os.path as osp +import pickle as pkl +import math, time, numba, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import mplhep as hep + +#Check if the GPU configuration has been provided +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + import setGPU + if multi_gpu: + print('Will use multi_gpu..') + print("Let's use", torch.cuda.device_count(), "GPUs!") + else: + print('Will use single_gpu..') +except Exception as e: + print("Could not import setGPU, running CPU-only") + +#define the global base device +if use_gpu: + 
device = torch.device('cuda:0') + print("GPU model:", torch.cuda.get_device_name(0)) +else: + device = torch.device('cpu') + +import torch_geometric +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch_geometric.nn import GravNetConv +from torch.utils.data import random_split +import torch_cluster + +import args +from args import parse_args +from graph_data_delphes import PFGraphDataset, one_hot_embedding +from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd +from plot_utils import plot_confusion_matrix + +import evaluate +from evaluate import make_plots, make_predictions +from model_embeddings import PFNet7 + +#Ignore divide by 0 errors +np.seterr(divide='ignore', invalid='ignore') + +#Get a unique directory name for the model +def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, alpha, task, title): + model_name = type(model).__name__ + model_params = sum(p.numel() for p in model.parameters()) + import hashlib + model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] + model_user = os.environ['USER'] + + model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_alpha_{}_{}_{}'.format( + model_name, + target_type, + n_train, + n_epochs, + batch_size, + lr, + alpha, + task, + title) + return model_fname + +def compute_weights(gen_ids_one_hot, device): + vs, cs = torch.unique(gen_ids_one_hot, return_counts=True) + weights = torch.zeros(output_dim_id).to(device=device) + for k, v in zip(vs, cs): + weights[k] = 1.0/math.sqrt(float(v)) + return weights + +def make_plot_from_list(l, label, xlabel, ylabel, outpath, save_as): + plt.style.use(hep.style.ROOT) + + if not os.path.exists(outpath + '/training_plots/'): + os.makedirs(outpath + '/training_plots/') + + fig, ax = plt.subplots() + ax.plot(range(len(l)), l, label=label) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + ax.legend(loc='best') + plt.savefig(outpath + '/training_plots/' + save_as + '.png') + plt.close(fig) + + with open(outpath + '/training_plots/' + save_as + '.pkl', 'wb') as f: + pkl.dump(l, f) + +@torch.no_grad() +def test(model, loader, epoch, alpha, target_type, device): + with torch.no_grad(): + ret = train(model, loader, epoch, None, alpha, target_type, device) + return ret + +def train(model, loader, epoch, optimizer, alpha, target_type, device): + + is_train = not (optimizer is None) + + if is_train: + model.train() + else: + model.eval() + + #loss values for each batch: classification, regression, total + losses_1, losses_2, losses_tot = [], [], [] + + #accuracy values for each batch (monitor classification performance) + accuracies_batch, accuracies_batch_msk = [], [] + + #setup confusion matrix + conf_matrix = np.zeros((output_dim_id, output_dim_id)) + + # to compute average inference time + t=[] + + for i, batch in enumerate(loader): + t0 = time.time() + + if multi_gpu: + X = batch + else: + X = batch.to(device) + + ## make like tensorflow model, 0-padding events to 6k elements + # if X.x.shape[0]<6000: + # new_X = torch.cat([X.x,torch.zeros_like(X.x)[:6000-X.x.shape[0],:]]) + # new_ygen_id = torch.cat([X.ygen_id,torch.zeros_like(X.ygen_id)[:6000-X.x.shape[0],:]]) + # new_ygen_id[X.x.shape[0]:,0]=new_ygen_id[X.x.shape[0]:,0]+1 + # + # X.x = new_X + # X.ygen_id=new_ygen_id + + # 
Forwardprop + if i<10: + ti = time.time() + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + tf = time.time() + t.append(round((tf-ti),2)) + else: + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + + _, gen_ids = torch.max(gen_ids_one_hot, -1) + _, pred_ids = torch.max(pred_ids_one_hot, -1) + _, cand_ids = torch.max(cand_ids_one_hot, -1) # rule-based result + + # masking + msk = ((pred_ids != 0) & (gen_ids != 0)) + msk2 = ((pred_ids != 0) & (pred_ids == gen_ids)) + + # computing loss + weights = compute_weights(torch.max(gen_ids_one_hot,-1)[1], device) + l1 = torch.nn.functional.cross_entropy(pred_ids_one_hot, gen_ids, weight=weights) # for classifying PID + l2 = alpha * torch.nn.functional.mse_loss(pred_p4[msk2], gen_p4[msk2]) # for regressing p4 + + if args.classification_only: + loss = l1 + else: + loss = l1+l2 + + if is_train: + # BACKPROP + #print(list(model.parameters())[1].grad) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + losses_1.append(l1.detach().cpu().item()) + losses_2.append(l2.detach().cpu().item()) + losses_tot.append(loss.detach().cpu().item()) + + t1 = time.time() + + accuracies_batch.append(accuracy_score(gen_ids.detach().cpu().numpy(), pred_ids.detach().cpu().numpy())) + accuracies_batch_msk.append(accuracy_score(gen_ids[msk].detach().cpu().numpy(), pred_ids[msk].detach().cpu().numpy())) + + conf_matrix += sklearn.metrics.confusion_matrix(gen_ids.detach().cpu().numpy(), + np.argmax(pred_ids_one_hot.detach().cpu().numpy(),axis=1), labels=range(6)) + + print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), loss.detach().cpu().item(), t1-t0), end='\r', flush=True) + + print("Average Inference time is: ", round((sum(t) / len(t)),2), 'min') + + losses_1 = np.mean(losses_1) + losses_2 = np.mean(losses_2) + losses_tot = np.mean(losses_tot) + + acc = np.mean(accuracies_batch) + acc_msk = np.mean(accuracies_batch_msk) + + conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis] + + return losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm + + +def train_loop(): + t0_initial = time.time() + + losses_1_train, losses_2_train, losses_tot_train = [], [], [] + losses_1_valid, losses_2_valid, losses_tot_valid = [], [], [] + + accuracies_train, accuracies_msk_train = [], [] + accuracies_valid, accuracies_msk_valid = [], [] + + best_val_loss = 99999.9 + stale_epochs = 0 + + print("Training over {} epochs".format(args.n_epochs)) + for epoch in range(args.n_epochs): + t0 = time.time() + + if stale_epochs > patience: + print("breaking due to stale epochs") + break + + # training epoch + model.train() + losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm = train(model, train_loader, epoch, optimizer, args.alpha, args.target, device) + + losses_tot_train.append(losses_tot) + losses_1_train.append(losses_1) + losses_2_train.append(losses_2) + + accuracies_train.append(acc) + accuracies_msk_train.append(acc_msk) + + # validation step + model.eval() + losses_tot_v, losses_1_v, losses_2_v, acc_v, acc_msk_v, conf_matrix_v, conf_matrix_norm_v = test(model, valid_loader, epoch, args.alpha, args.target, device) + + losses_tot_valid.append(losses_tot_v) + losses_1_valid.append(losses_1_v) + losses_2_valid.append(losses_2_v) + + accuracies_valid.append(acc_v) + accuracies_msk_valid.append(acc_msk_v) + + # early-stopping + if losses_tot_v < best_val_loss: + best_val_loss = losses_tot_v + stale_epochs = 0 + else: + stale_epochs 
+= 1 + + t1 = time.time() + + epochs_remaining = args.n_epochs - (epoch+1) + time_per_epoch = (t1 - t0_initial)/(epoch + 1) + eta = epochs_remaining*time_per_epoch/60 + + print("epoch={}/{} dt={:.2f}min train_loss={:.5f} valid_loss={:.5f} train_acc={:.5f} valid_acc={:.5f} train_acc_msk={:.5f} valid_acc_msk={:.5f} stale={} eta={:.1f}m".format( + epoch+1, args.n_epochs, + (t1-t0)/60, losses_tot_train[epoch], losses_tot_valid[epoch], accuracies_train[epoch], accuracies_valid[epoch], + accuracies_msk_train[epoch], accuracies_msk_valid[epoch], stale_epochs, eta)) + + torch.save(model.state_dict(), "{0}/epoch_{1}_weights.pth".format(outpath, epoch)) + + plot_confusion_matrix(conf_matrix_norm, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch), epoch=epoch) + plot_confusion_matrix(conf_matrix_norm_v, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch), epoch=epoch) + + torch.save(conf_matrix_norm, outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch) + '.pt') + torch.save(conf_matrix_norm_v, outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch) + '.pkl') + + make_plot_from_list(losses_tot_train, 'train loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_train') + make_plot_from_list(losses_1_train, 'train loss_1', 'Epochs', 'Loss', outpath, 'losses_1_train') + make_plot_from_list(losses_2_train, 'train loss_2', 'Epochs', 'Loss', outpath, 'losses_2_train') + + make_plot_from_list(losses_tot_valid, 'valid loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_valid') + make_plot_from_list(losses_1_valid, 'valid loss_1', 'Epochs', 'Loss', outpath, 'losses_1_valid') + make_plot_from_list(losses_2_valid, 'valid loss_2', 'Epochs', 'Loss', outpath, 'losses_2_valid') + + make_plot_from_list(accuracies_train, 'train accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_train') + make_plot_from_list(accuracies_msk_train, 'train accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_train') + + make_plot_from_list(accuracies_valid, 'valid accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_valid') + make_plot_from_list(accuracies_msk_valid, 'valid accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_valid') + + print('Done with training.') + + return + +if __name__ == "__main__": + + args = parse_args() + + # # the next part initializes some args values (to run the script not from terminal) + # class objectview(object): + # def __init__(self, d): + # self.__dict__ = d + # + # args = objectview({'train': True, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 15, 'patience': 100, 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, + # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../test_tmp_delphes/data/pythia8_qcd', + # 'outpath': '../../prp/models/yee/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, + # 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, 'overwrite': True, + # 'load': False, 'load_epoch': 9, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + # 'classification_only': False, 'nn1': True, 'nn3': True, 'encoding_of_clusters': False, 'embedding_dim': 0, 'nn0track': False, 'nn0cluster': False, 'title': 'noembeddings', + # 'make_predictions_train': True, 'make_plots_train': True, 'make_predictions_valid': True, 'make_plots_valid': 
True, 'make_predictions_test': True, 'make_plots_test': True}) + + # define the dataset (assumes the data exists as .pt files in "processed") + print('Processing the data..') + full_dataset_ttbar = PFGraphDataset(args.dataset) + full_dataset_qcd = PFGraphDataset(args.dataset_qcd) + + # constructs a loader from the data to iterate over batches + print('Constructing data loaders..') + train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) + test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) + + # element parameters + input_dim = 12 + + #one-hot particle ID and momentum + output_dim_id = 6 + output_dim_p4 = 6 + + patience = args.patience + + model_classes = {"PFNet7": PFNet7} + + model_class = model_classes[args.model] + model_kwargs = {'input_dim': input_dim, + 'hidden_dim': args.hidden_dim, + 'hidden_dim_nn1': args.hidden_dim_nn1, + 'input_encoding': args.input_encoding, + 'encoding_dim': args.encoding_dim, + 'embedding_dim': args.embedding_dim, + 'encoding_of_clusters': args.encoding_of_clusters, + 'output_dim_id': output_dim_id, + 'output_dim_p4': output_dim_p4, + 'space_dim': args.space_dim, + 'propagate_dimensions': args.propagate_dimensions, + 'nearest': args.nearest, + 'target': args.target, + 'nn1': args.nn1, + 'nn3': args.nn3, + 'nn0track': args.nn0track, + 'nn0cluster': args.nn0cluster} + + if args.load: + print('Loading a previously trained model..') + model = model_class(**model_kwargs) + outpath = args.outpath + args.load_model + PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' + + state_dict = torch.load(PATH, map_location=device) + + if "DataParallel" in args.load_model: # if the model was trained using DataParallel then we do this + state_dict = torch.load(PATH, map_location=device) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove module. 
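+ # (the renamed keys have to match the freshly constructed model exactly, since the
+ # load_state_dict call below uses the default strict=True matching)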
+ new_state_dict[name] = v + # print('name is:', name) + state_dict=new_state_dict + + model.load_state_dict(state_dict) + + if multi_gpu: + model = torch_geometric.nn.DataParallel(model) + #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP + + model.to(device) + + if args.train: + print("Training a previously trained model..") + + elif args.train: + #instantiate the model + print('Instantiating a model..') + model = model_class(**model_kwargs) + + if multi_gpu: + print("Parallelizing the training..") + model = torch_geometric.nn.DataParallel(model) + #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP + + model.to(device) + + if args.train: + args.title=args.title+'noskip' + if args.nn1: + args.title=args.title+'_nn1' + if args.nn3: + args.title=args.title+'_nn3' + + if args.classification_only: + model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "clf", args.title) + else: + model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "both", args.title) + + outpath = osp.join(args.outpath, model_fname) + if osp.isdir(outpath): + if args.overwrite: + print("model output {} already exists, deleting it".format(outpath)) + import shutil + shutil.rmtree(outpath) + else: + print("model output {} already exists, please delete it".format(outpath)) + sys.exit(0) + try: + os.makedirs(outpath) + except Exception as e: + pass + + with open('{}/model_kwargs.pkl'.format(outpath), 'wb') as f: + pkl.dump(model_kwargs, f, protocol=pkl.HIGHEST_PROTOCOL) + + if not os.path.exists(outpath + '/confusion_matrix_plots/'): + os.makedirs(outpath + '/confusion_matrix_plots/') + + if args.optimizer == "adam": + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + elif args.optimizer == "adamw": + optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr) + + print(model) + print(model_fname) + + model.train() + train_loop() + + model.eval() + + # evaluate on training data.. + if not osp.isdir(outpath+'/train_loader'): + os.makedirs(outpath+'/train_loader') + if not osp.isdir(outpath+'/train_loader/resolution_plots'): + os.makedirs(outpath+'/train_loader/resolution_plots') + if not osp.isdir(outpath+'/train_loader/distribution_plots'): + os.makedirs(outpath+'/train_loader/distribution_plots') + if not osp.isdir(outpath+'/train_loader/multiplicity_plots'): + os.makedirs(outpath+'/train_loader/multiplicity_plots') + if not osp.isdir(outpath+'/train_loader/efficiency_plots'): + os.makedirs(outpath+'/train_loader/efficiency_plots') + + if args.make_predictions_train: + make_predictions(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + if args.make_plots_train: + make_plots(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + + # evaluate on validation data.. 
+ if not osp.isdir(outpath+'/valid_loader'): + os.makedirs(outpath+'/valid_loader') + if not osp.isdir(outpath+'/valid_loader/resolution_plots'): + os.makedirs(outpath+'/valid_loader/resolution_plots') + if not osp.isdir(outpath+'/valid_loader/distribution_plots'): + os.makedirs(outpath+'/valid_loader/distribution_plots') + if not osp.isdir(outpath+'/valid_loader/multiplicity_plots'): + os.makedirs(outpath+'/valid_loader/multiplicity_plots') + if not osp.isdir(outpath+'/valid_loader/efficiency_plots'): + os.makedirs(outpath+'/valid_loader/efficiency_plots') + + if args.make_predictions_valid: + make_predictions(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + if args.make_plots_valid: + make_plots(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + + # evaluate on testing data.. + if not osp.isdir(outpath+'/test_loader'): + os.makedirs(outpath+'/test_loader') + if not osp.isdir(outpath+'/test_loader/resolution_plots'): + os.makedirs(outpath+'/test_loader/resolution_plots') + if not osp.isdir(outpath+'/test_loader/distribution_plots'): + os.makedirs(outpath+'/test_loader/distribution_plots') + if not osp.isdir(outpath+'/test_loader/multiplicity_plots'): + os.makedirs(outpath+'/test_loader/multiplicity_plots') + if not osp.isdir(outpath+'/test_loader/efficiency_plots'): + os.makedirs(outpath+'/test_loader/efficiency_plots') + + if args.make_predictions_test: + if args.load: + make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + else: + make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + if args.make_plots_test: + if args.load: + make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + else: + make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + + +## ----------------------------------------------------------- +# to retrieve a stored variable in pkl file +# import pickle as pkl +# with open('../../test_tmp_delphes/experiments/PFNet7_gen_ntrain_2_nepochs_3_batch_size_3_lr_0.0001/confusion_matrix_plots/cmT_normed_epoch_0.pkl', 'rb') as f: # Python 3: open(..., 'rb') +# a = pkl.load(f) +# +# with open('../../data/pythia8_qcd/raw/tev14_pythia8_qcd_10_0.pkl', 'rb') as pickle_file: +# data = pkl.load(pickle_file) +# +# data.keys() diff --git a/scripts/get_all_data_delphes.sh b/scripts/get_all_data_delphes.sh new file mode 100644 index 000000000..a5c57d547 --- /dev/null +++ b/scripts/get_all_data_delphes.sh @@ -0,0 +1,53 @@ +# this script assumes you git cloned the repo and are inside the particleflow/scripts directory +# you can run the script using ./get_all_data_delphes.sh + +#!/bin/bash +set -e + +rm -Rf test_tmp_delphes +mkdir test_tmp_delphes +cd test_tmp_delphes + +mkdir -p experiments + +mkdir -p data/pythia8_ttbar +mkdir -p data/pythia8_ttbar/raw +mkdir -p data/pythia8_ttbar/processed + +mkdir -p data/pythia8_qcd +mkdir -p data/pythia8_qcd/raw +mkdir -p data/pythia8_qcd/processed + +# now get the ttbar data for training/testing +cd data/pythia8_ttbar/raw/ + +for j in {0..9} +do + for i in {0..49} + do + wget --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_"$j"_"$i".pkl.bz2 + done +done + +bzip2 -d * + +# now get the qcd data for extra validation +cd 
../../pythia8_qcd/raw/ + +for i in {0..49} +do + wget --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_qcd_10_"$i".pkl.bz2 +done + +bzip2 -d * + +# be in test_tmp_delphes when you process the files.. so the next cd tries to ensure that.. +cd ../../../ + +#generate pytorch data files from pkl files +python3 ../particleflow/mlpf/pytorch/graph_data_delphes.py --dataset data/pythia8_ttbar \ + --processed_dir data/pythia8_ttbar/processed --num-files-merge 1 --num-proc 1 + +#generate pytorch data files from pkl files +python3 ../particleflow/mlpf/pytorch/graph_data_delphes.py --dataset data/pythia8_qcd \ + --processed_dir data/pythia8_qcd/processed --num-files-merge 1 --num-proc 1 diff --git a/scripts/local_test_delphes_pytorch.sh b/scripts/local_test_delphes_pytorch.sh index f357aead0..0233ad793 100755 --- a/scripts/local_test_delphes_pytorch.sh +++ b/scripts/local_test_delphes_pytorch.sh @@ -1,4 +1,4 @@ -#!/bin/bash +# !/bin/bash set -e rm -Rf test_tmp_delphes @@ -10,32 +10,44 @@ mkdir -p data/pythia8_ttbar mkdir -p data/pythia8_ttbar/raw mkdir -p data/pythia8_ttbar/processed -cd data/pythia8_ttbar/raw +mkdir -p data/pythia8_qcd +mkdir -p data/pythia8_qcd/raw +mkdir -p data/pythia8_qcd/processed -#download some pickle data files (for this test we download 3 pkl files and allocate 2 for train and 1 for valid) +# download 2 files for training/validation +cd data/pythia8_ttbar/raw +echo Downloading the training/validation data files.. wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_0.pkl.bz2 wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_1.pkl.bz2 -wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_2.pkl.bz2 - +bzip2 -d * cd ../../.. -# # if you have the data in place and want to avoid downloading it you can comment all of the above and uncomment the next line -# cd test_tmp_delphes +# download 1 file for testing +cd data/pythia8_qcd/raw +echo Downloading the testing data files.. +wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_qcd_10_0.pkl.bz2 +bzip2 -d * +cd ../../.. #generate pytorch data files from pkl files -python3 ../mlpf/pytorch/graph_data_delphes.py --dataset data/pythia8_ttbar \ +echo Processing the training/validation data files.. +python3 ../mlpf/pytorch_delphes/graph_data_delphes.py --dataset data/pythia8_ttbar \ --processed_dir data/pythia8_ttbar/processed --num-files-merge 1 --num-proc 1 +#generate pytorch data files from pkl files +echo Processing the testing data files.. 
+python3 ../mlpf/pytorch_delphes/graph_data_delphes.py --dataset data/pythia8_qcd/ \ + --processed_dir data/pythia8_qcd/processed --num-files-merge 1 --num-proc 1 + # before training a model, first get rid of any previous models stored -rm -Rf experiments/PFNet* +rm -Rf experiments/* + +cd ../mlpf/pytorch_delphes/ #run the pytorch training -COMET_API_KEY="bla" python3 ../mlpf/pytorch/train_end2end_delphes.py \ - --dataset data/pythia8_ttbar --space_dim 2 --n_train 1 \ - --n_val 1 --model PFNet7 --convlayer gravnet-radius --convlayer2 "none" \ - --lr 0.0001 --hidden_dim 32 --n_epochs 3 --l1 1.0 --l2 0.001 --target gen \ - --batch_size 1 --dropout 0.2 --disable_comet - -# predict on some test data and make plots -python3 ../mlpf/pytorch/eval_end2end_delphes.py --dataset data/pythia8_ttbar \ - --path experiments/PFNet* --model PFNet7 --start 1 --stop 2 --epoch 1 --target gen +echo Begining the training.. +python3 training.py \ + --n_epochs=10 --n_train=1 --n_valid=1 --n_test=1 --batch_size=4 \ + --dataset='../../test_tmp_delphes/data/pythia8_ttbar' \ + --dataset_qcd='../../test_tmp_delphes/data/pythia8_qcd' \ + --outpath='../../test_tmp_delphes/experiments' From c5419404434dbab2984548a68b10871ebc60585b Mon Sep 17 00:00:00 2001 From: Farouk Date: Thu, 22 Jul 2021 18:09:11 -0700 Subject: [PATCH 004/157] fixed confusion matrix --- mlpf/plotting/plot_utils.py | 38 +++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/mlpf/plotting/plot_utils.py b/mlpf/plotting/plot_utils.py index 6a1bb20ee..6e6400ac8 100644 --- a/mlpf/plotting/plot_utils.py +++ b/mlpf/plotting/plot_utils.py @@ -1,6 +1,6 @@ import matplotlib.pyplot as plt import numpy as np -import mplhep +import mplhep as hep import os.path as osp pid_to_text = { @@ -147,8 +147,8 @@ def sample_label(ax, y=0.98): def particle_label(ax, pid): plt.text(0.03, 0.92, pid_to_text[pid], va="top", ha="left", size=10, transform=ax.transAxes) -def plot_confusion_matrix(cm, - target_names, +def plot_confusion_matrix(cm, target_names, + fname, epoch, title='Confusion matrix', cmap=None, normalize=True): @@ -187,9 +187,11 @@ def plot_confusion_matrix(cm, import matplotlib.pyplot as plt import numpy as np import itertools + plt.style.use('default') - accuracy = np.trace(cm) / float(np.sum(cm)) - misclass = 1 - accuracy + # # only true if it weren't normalized: + # accuracy = np.trace(cm) / float(np.sum(cm)) + # misclass = 1 - accuracy if cmap is None: cmap = plt.get_cmap('Blues') @@ -201,7 +203,7 @@ def plot_confusion_matrix(cm, fig = plt.figure(figsize=(5, 4)) ax = plt.axes() plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) + plt.title(title + ' at epoch ' + str(epoch)) plt.colorbar() if target_names is not None: @@ -220,12 +222,16 @@ def plot_confusion_matrix(cm, horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") - plt.ylabel('True label') plt.xlim(-1, len(target_names)) plt.ylim(-1, len(target_names)) - plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass)) + plt.xlabel('Predicted label') + # plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass)) plt.tight_layout() + + plt.savefig(fname + '.png') + plt.close(fig) + return fig, ax @@ -244,7 +250,7 @@ def plot_E_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='tar [bins["E_val"][0], bins["E_val"][-1]], color="black", ls="--", lw=0.5) plt.savefig(osp.join(outpath,"energy_2d_pid{}.pdf".format(pid)), bbox_inches="tight") - + 
plt.figure(figsize=(4,4)) ax = plt.axes() plt.hist(v0[msk_true, 0], bins=bins["E_val"], density=1.0, histtype="step", lw=2, label=bins["true_val"]); @@ -257,7 +263,7 @@ def plot_E_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='tar particle_label(ax, pid) ax.set_ylim(ax.get_ylim()[0], 1.5*ax.get_ylim()[1]) plt.savefig(osp.join(outpath,"energy_hist_pid{}.pdf".format(pid)), bbox_inches="tight") - + ax.set_ylim(ax.get_ylim()[0], 1.2*ax.get_ylim()[1]) res = (v0[msk_both, 1] - v0[msk_both, 0])/v0[msk_both, 0] @@ -273,7 +279,7 @@ def plot_E_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='tar sample_label(ax) particle_label(ax, pid) plt.savefig(osp.join(outpath,"energy_ratio_pid{}.pdf".format(pid)), bbox_inches="tight") - + #efficiency vs fake rate plt.figure(figsize=(4,4)) ax = plt.axes() @@ -328,7 +334,7 @@ def plot_eta_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='t [bins["eta_val"][0], bins["eta_val"][-1]], color="black", ls="--", lw=0.5) plt.savefig(osp.join(outpath,"eta_2d_pid{}.pdf".format(pid)), bbox_inches="tight") - + plt.figure(figsize=(4,4)) ax = plt.axes() plt.hist(v0[msk_true, 0], bins=bins["eta_val"], density=1.0, histtype="step", lw=2, label=bins["true_val"]); @@ -341,7 +347,7 @@ def plot_eta_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='t particle_label(ax, pid) ax.set_ylim(ax.get_ylim()[0], 1.5*ax.get_ylim()[1]) plt.savefig(osp.join(outpath,"eta_hist_pid{}.pdf".format(pid)), bbox_inches="tight") - + ax.set_ylim(ax.get_ylim()[0], 1.2*ax.get_ylim()[1]) res = (v0[msk_both, 1] - v0[msk_both, 0]) @@ -357,7 +363,7 @@ def plot_eta_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='t sample_label(ax) particle_label(ax, pid) plt.savefig(osp.join(outpath,"eta_ratio_pid{}.pdf".format(pid)), bbox_inches="tight") - + #efficiency vs fake rate plt.figure(figsize=(4,4)) ax = plt.axes() @@ -412,7 +418,7 @@ def plot_phi_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='t [bins["phi_val"][0], bins["phi_val"][-1]], color="black", ls="--", lw=0.5) plt.savefig(osp.join(outpath,"phi_2d_pid{}.pdf".format(pid)), bbox_inches="tight") - + plt.figure(figsize=(4,4)) ax = plt.axes() plt.hist(v0[msk_true, 0], bins=bins["phi_val"], density=1.0, histtype="step", lw=2, label=bins["true_val"]); @@ -439,7 +445,7 @@ def plot_phi_reso(big_df, pid, v0, msk_true, msk_pred, msk_both, bins, target='t sample_label(ax) particle_label(ax, pid) plt.savefig(osp.join(outpath,"phi_ratio_pid{}.pdf".format(pid)), bbox_inches="tight") - + #efficiency vs fake rate plt.figure(figsize=(4,4)) ax = plt.axes() From c5284efc82ae1023b9c8dbee06d0f2626e91be23 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Fri, 23 Jul 2021 12:09:49 +0200 Subject: [PATCH 005/157] feat: keras-tuner chief and tuner scripts Run the chief script on the chief node and run the tuner script once on each worker node in order to perform a distributed hyperparameter search with keras-tuner. You need to pass appropriate IPs and ports to each script. 
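
As a rough illustration of the intended invocation (the oracle IP, port and output
directory below are placeholders; the Slurm jobs added in the following commits pass
the real values to these wrappers via srun):

    # on the chief node
    ./mlpf/hypertune_scripts/run_chief.sh chief <oracle-ip> 6379 <output-dir>
    # on each worker node, with a unique tuner ID
    ./mlpf/hypertune_scripts/run_tuner.sh tuner1 <oracle-ip> 6379 <output-dir>
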
--- mlpf/hypertune_scripts/run_chief.sh | 17 +++++++++++++++++ mlpf/hypertune_scripts/run_tuner.sh | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100755 mlpf/hypertune_scripts/run_chief.sh create mode 100755 mlpf/hypertune_scripts/run_tuner.sh diff --git a/mlpf/hypertune_scripts/run_chief.sh b/mlpf/hypertune_scripts/run_chief.sh new file mode 100755 index 000000000..cbc040cb7 --- /dev/null +++ b/mlpf/hypertune_scripts/run_chief.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +export KERASTUNER_TUNER_ID=$1 +export KERASTUNER_ORACLE_IP=$2 +export KERASTUNER_ORACLE_PORT=$3 +echo "KERASTUNER_TUNER_ID:" +echo $KERASTUNER_TUNER_ID +echo "KERASTUNER_ORACLE_IP:" +echo $KERASTUNER_ORACLE_IP +echo "KERASTUNER_ORACLE_PORT:" +echo $KERASTUNER_ORACLE_PORT + + +nvidia-smi +echo 'Starting chief.' +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py hypertune -c parameters/cms-gnn-dense-short.yaml -o $4 +echo 'Chief done.' \ No newline at end of file diff --git a/mlpf/hypertune_scripts/run_tuner.sh b/mlpf/hypertune_scripts/run_tuner.sh new file mode 100755 index 000000000..663e2902c --- /dev/null +++ b/mlpf/hypertune_scripts/run_tuner.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +export KERASTUNER_TUNER_ID=$1 +export KERASTUNER_ORACLE_IP=$2 +export KERASTUNER_ORACLE_PORT=$3 +echo "KERASTUNER_TUNER_ID:" +echo $KERASTUNER_TUNER_ID +echo "KERASTUNER_ORACLE_IP:" +echo $KERASTUNER_ORACLE_IP +echo "KERASTUNER_ORACLE_PORT:" +echo $KERASTUNER_ORACLE_PORT + + +nvidia-smi +echo 'Starting tuner.' +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py hypertune -c parameters/cms-gnn-dense-short.yaml -o $4 +echo 'Tuner done.' From 2806067343b8e2e2541d4c2ab947b4c6429d7f26 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Fri, 23 Jul 2021 12:14:39 +0200 Subject: [PATCH 006/157] feat: Distributed training on Flatiron Institute HPC site Slurm scripts to perform distributed training, hyperparameter optimization and more on Flatiron Institute's HPC system. --- mlpf/flatiron/find_lr_4GPUs.slurm | 45 +++++++++++++ mlpf/flatiron/hypertune.slurm | 73 ++++++++++++++++++++++ mlpf/flatiron/pipeline_evaluate_1GPU.slurm | 36 +++++++++++ mlpf/flatiron/pipeline_train_4GPUs.slurm | 51 +++++++++++++++ mlpf/flatiron/train_4GPUs.slurm | 50 +++++++++++++++ mlpf/flatiron/validate_4GPUs.slurm | 50 +++++++++++++++ 6 files changed, 305 insertions(+) create mode 100644 mlpf/flatiron/find_lr_4GPUs.slurm create mode 100644 mlpf/flatiron/hypertune.slurm create mode 100644 mlpf/flatiron/pipeline_evaluate_1GPU.slurm create mode 100644 mlpf/flatiron/pipeline_train_4GPUs.slurm create mode 100644 mlpf/flatiron/train_4GPUs.slurm create mode 100644 mlpf/flatiron/validate_4GPUs.slurm diff --git a/mlpf/flatiron/find_lr_4GPUs.slurm b/mlpf/flatiron/find_lr_4GPUs.slurm new file mode 100644 index 000000000..ebf1bb569 --- /dev/null +++ b/mlpf/flatiron/find_lr_4GPUs.slurm @@ -0,0 +1,45 @@ +#!/bin/sh + +#SBATCH -t 1:00:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --gpus 4 +#SBATCH --constraint=a100 + +# Job name +#SBATCH -J find_lr + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. 
#########################" + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 +nvidia-smi + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + +mkdir $TMPDIR/particleflow +rsync -ar --exclude={".git","experiments"} . $TMPDIR/particleflow +cd $TMPDIR/particleflow +if [ $? -eq 0 ] +then + echo "Successfully changed directory" +else + echo "Could not change directory" >&2 + exit 1 +fi + +python3 tf_list_gpus.py + +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py find-lr -c $1 + +cp lr_finder.jpg $SLURM_SUBMIT_DIR/ diff --git a/mlpf/flatiron/hypertune.slurm b/mlpf/flatiron/hypertune.slurm new file mode 100644 index 000000000..9ff411e7c --- /dev/null +++ b/mlpf/flatiron/hypertune.slurm @@ -0,0 +1,73 @@ +#!/bin/sh + +#SBATCH -t 168:00:00 +#SBATCH -N 4 +#SBATCH --exclusive +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --constraint=a100,sxm4 +#SBATCH --gpus-per-task=4 + +# Job name +#SBATCH -J hypertune + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + + +# Getting the node hostnames +nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +nodes_array=($nodes) + +echo $nodes + +head_node=${nodes_array[0]} +head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) + +# if we detect a space character in the head node IP, we'll +# convert it to an ipv4 address. This step is optional. +if [[ "$head_node_ip" == *" "* ]]; then +IFS=' ' read -ra ADDR <<<"$head_node_ip" +if [[ ${#ADDR[0]} -gt 16 ]]; then + head_node_ip=${ADDR[1]} +else + head_node_ip=${ADDR[0]} +fi +echo "IPV6 address detected. We split the IPV4 address as $head_node_ip" +fi + +port=6379 +ip_head=$head_node_ip:$port +export ip_head +echo "IP Head: $ip_head" + +echo "Starting HEAD at $head_node" +srun --nodes=1 --ntasks=1 -w "$head_node" mlpf/hypertune_scripts/run_chief.sh "chief" $head_node_ip $port "/mnt/ceph/users/ewulff/hypertune_out_${SLURM_JOB_ID}" &> logs_slurm/chief_${SLURM_JOB_ID} & +sleep 5 + +# number of nodes other than the head node +worker_num=$((SLURM_JOB_NUM_NODES - 1)) + +for ((i = 1; i <= worker_num; i++)); do + node_i=${nodes_array[$i]} + echo "Starting WORKER $i at $node_i" + tunerID="tuner$i" + srun --nodes=1 --ntasks=1 -w "$node_i" \ + mlpf/hypertune_scripts/run_tuner.sh $tunerID $head_node_ip $port "/mnt/ceph/users/ewulff/hypertune_out_${SLURM_JOB_ID}" &> logs_slurm/tuner_${SLURM_JOB_ID}_${i} & + sleep 1 +done +wait # keep the wait statement, it is important +echo "Done." diff --git a/mlpf/flatiron/pipeline_evaluate_1GPU.slurm b/mlpf/flatiron/pipeline_evaluate_1GPU.slurm new file mode 100644 index 000000000..d3b2e6978 --- /dev/null +++ b/mlpf/flatiron/pipeline_evaluate_1GPU.slurm @@ -0,0 +1,36 @@ +#!/bin/sh + +# Walltime limit +#SBATCH -t 03:00:00 +#SBATCH -N 1 +#SBATCH --exclusive +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --gpus 1 +#SBATCH --constraint=a100 + +# Job name +#SBATCH -J pipeeval + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. 
#############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 +nvidia-smi + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + +python3 tf_list_gpus.py + +echo 'Starting evaluation.' +CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate -c $1 -t $2 +echo 'Evaluation done.' diff --git a/mlpf/flatiron/pipeline_train_4GPUs.slurm b/mlpf/flatiron/pipeline_train_4GPUs.slurm new file mode 100644 index 000000000..cbdb21308 --- /dev/null +++ b/mlpf/flatiron/pipeline_train_4GPUs.slurm @@ -0,0 +1,51 @@ +#!/bin/sh + +# Walltime limit +#SBATCH -t 168:00:00 +#SBATCH -N 1 +#SBATCH --exclusive +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --gpus-per-task=4 +#SBATCH --constraint=a100,sxm4 + +# Job name +#SBATCH -J pipetrain + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 +nvidia-smi + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + +mkdir $TMPDIR/particleflow +rsync -ar --exclude={".git","experiments"} . $TMPDIR/particleflow +cd $TMPDIR/particleflow +if [ $? -eq 0 ] +then + echo "Successfully changed directory" +else + echo "Could not change directory" >&2 + exit 1 +fi +mkdir experiments + +python3 tf_list_gpus.py + +echo 'Starting training.' +# Run the training of the base GNN model using e.g. 4 GPUs in a data-parallel mode +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py train -c $1 -p $2 +echo 'Training done.' + +rsync -a experiments/ /mnt/ceph/users/ewulff/experiments/ diff --git a/mlpf/flatiron/train_4GPUs.slurm b/mlpf/flatiron/train_4GPUs.slurm new file mode 100644 index 000000000..6b9cfc3e3 --- /dev/null +++ b/mlpf/flatiron/train_4GPUs.slurm @@ -0,0 +1,50 @@ +#!/bin/sh + +# Walltime limit +#SBATCH -t 72:00:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --gpus 4 +#SBATCH --constraint=a100 + +# Job name +#SBATCH -J train + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 +nvidia-smi + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + +mkdir $TMPDIR/particleflow +rsync -ar --exclude={".git","experiments"} . $TMPDIR/particleflow +cd $TMPDIR/particleflow +if [ $? -eq 0 ] +then + echo "Successfully changed directory" +else + echo "Could not change directory" >&2 + exit 1 +fi + +python3 tf_list_gpus.py + +echo 'Starting training.' +# Run the training of the base GNN model using e.g. 4 GPUs in a data-parallel mode +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/launcher.py --action train --model-spec $1 +echo 'Training done.' 
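+
+# the job ran from a scratch copy of the repository in $TMPDIR, so list the results and
+# sync the new experiment directories back to permanent storage before the job ends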
+
+ls -l experiments
+
+rsync -a experiments/ /mnt/ceph/users/ewulff/experiments/
diff --git a/mlpf/flatiron/validate_4GPUs.slurm b/mlpf/flatiron/validate_4GPUs.slurm
new file mode 100644
index 000000000..566a2d8a9
--- /dev/null
+++ b/mlpf/flatiron/validate_4GPUs.slurm
@@ -0,0 +1,50 @@
+#!/bin/sh
+
+# Walltime limit
+#SBATCH -t 8:00:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH -p gpu
+#SBATCH --gpus 4
+#SBATCH --constraint=a100
+
+
+# Job name
+#SBATCH -J eval
+
+# Output and error logs
+#SBATCH -o logs_slurm/log_%x_%j.out
+#SBATCH -e logs_slurm/log_%x_%j.err
+
+# Add jobscript to job output
+echo "#################### Job submission script. #############################"
+cat $0
+echo "################# End of job submission script. #########################"
+
+module purge
+module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
+nvidia-smi
+
+source ~/miniconda3/bin/activate tf2
+which python3
+python3 --version
+
+mkdir $TMPDIR/particleflow
+rsync -ar --exclude=".git" . $TMPDIR/particleflow
+cd $TMPDIR/particleflow
+if [ $? -eq 0 ]
+then
+    echo "Successfully changed directory"
+else
+    echo "Could not change directory" >&2
+    exit 1
+fi
+
+python3 tf_list_gpus.py
+
+echo 'Starting validation.'
+#Run the validation to produce the predictions file
+CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/launcher.py --action eval --model-spec parameters/delphes-gnn-skipconn.yaml --weights $1
+echo 'Validation done.'
+
+rsync -a experiments/ /mnt/ceph/users/ewulff/experiments/

From de055229741fb6d1ae5a7cd9cb0af50d17fed84b Mon Sep 17 00:00:00 2001
From: Eric Wulff
Date: Fri, 23 Jul 2021 17:16:56 +0200
Subject: [PATCH 007/157] feat: Distributed training on JUWELS Booster
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Slurm scripts to perform distributed training, hyperparameter
optimization and more on the JUWELS Booster at the Jülich
Supercomputing Centre, Forschungszentrum Jülich.
---
 mlpf/juwels/hypertune.slurm         | 79 +++++++++++++++++++++++++++++
 mlpf/juwels/pipeline_evaluate.slurm | 40 +++++++++++++++
 mlpf/juwels/pipeline_train.slurm    | 52 +++++++++++++++++++
 mlpf/juwels/train_mlpf.slurm        | 52 +++++++++++++++++++
 4 files changed, 223 insertions(+)
 create mode 100644 mlpf/juwels/hypertune.slurm
 create mode 100644 mlpf/juwels/pipeline_evaluate.slurm
 create mode 100644 mlpf/juwels/pipeline_train.slurm
 create mode 100644 mlpf/juwels/train_mlpf.slurm

diff --git a/mlpf/juwels/hypertune.slurm b/mlpf/juwels/hypertune.slurm
new file mode 100644
index 000000000..723bbd953
--- /dev/null
+++ b/mlpf/juwels/hypertune.slurm
@@ -0,0 +1,79 @@
+#!/bin/sh
+
+#SBATCH --account=prcoe12
+#SBATCH --partition=booster
+#SBATCH --time 0:59:59
+#SBATCH --nodes 4
+#SBATCH --tasks-per-node=1
+#SBATCH --gres=gpu:4
+
+
+# Job name
+#SBATCH -J hypertune
+
+# Output and error logs
+#SBATCH -o logs_slurm/log_%x_%j.out
+#SBATCH -e logs_slurm/log_%x_%j.err
+
+# Add jobscript to job output
+echo "#################### Job submission script. #############################"
+cat $0
+echo "################# End of job submission script. 
#########################" + +module purge +module load GCC/10.3.0 CUDA/11.0 cuDNN/8.0.2.39-CUDA-11.0 + +export CUDA_VISIBLE_DEVICES=0,1,2,3 +jutil env activate -p prcoe12 +nvidia-smi + +source /p/project/prcoe12/wulff1/miniconda3/bin/activate tf2 +echo "Python used:" +which python3 +python3 --version + + +# Getting the node hostnames +nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +nodes_array=($nodes) + +echo "Using nodes:" +echo $nodes + +head_node=${nodes_array[0]} +head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) + +# if we detect a space character in the head node IP, we'll +# convert it to an ipv4 address. This step is optional. +if [[ "$head_node_ip" == *" "* ]]; then +IFS=' ' read -ra ADDR <<<"$head_node_ip" +if [[ ${#ADDR[0]} -gt 16 ]]; then + head_node_ip=${ADDR[1]} +else + head_node_ip=${ADDR[0]} +fi +echo "IPV6 address detected. We split the IPV4 address as $head_node_ip" +fi + +port=6379 +ip_head=$head_node_ip:$port +export ip_head +echo "IP Head: $ip_head" + +echo "Starting HEAD at $head_node" +srun --nodes=1 --ntasks=1 -w "$head_node" mlpf/hypertune_scripts/run_chief.sh "chief" $head_node_ip $port "/p/project/prcoe12/wulff1/hypertune_out_${SLURM_JOB_ID}" &> logs_slurm/chief_${SLURM_JOB_ID} & +sleep 5 + +# number of nodes other than the head node +worker_num=$((SLURM_JOB_NUM_NODES - 1)) + +for ((i = 1; i <= worker_num; i++)); do + node_i=${nodes_array[$i]} + echo "Starting WORKER $i at $node_i" + tunerID="tuner$i" + srun --nodes=1 --ntasks=1 -w "$node_i" \ + mlpf/hypertune_scripts/run_tuner.sh $tunerID $head_node_ip $port "/p/project/prcoe12/wulff1/hypertune_out_${SLURM_JOB_ID}" &> logs_slurm/tuner_${SLURM_JOB_ID}_${i} & + sleep 1 +done +wait # keep the wait statement, it is important! +echo "Done." diff --git a/mlpf/juwels/pipeline_evaluate.slurm b/mlpf/juwels/pipeline_evaluate.slurm new file mode 100644 index 000000000..fd2beb45b --- /dev/null +++ b/mlpf/juwels/pipeline_evaluate.slurm @@ -0,0 +1,40 @@ +#!/bin/sh + +#SBATCH --account=prcoe12 +#SBATCH --partition=booster +#SBATCH --time 01:59:59 +#SBATCH --nodes 1 +#SBATCH --tasks-per-node=1 +#SBATCH --gres=gpu:4 + +# Job name +#SBATCH -J pipeeval + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load GCC/10.3.0 CUDA/11.0 cuDNN/8.0.2.39-CUDA-11.0 + +export CUDA_VISIBLE_DEVICES=0 + +jutil env activate -p prcoe12 +nvidia-smi + +source /p/project/prcoe12/wulff1/miniconda3/bin/activate tf2 +echo "Python used:" +which python3 +python3 --version + +python3 tf_list_gpus.py +echo 'Starting evaluation.' +CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate -c $1 -t $2 +echo 'Evaluation done.' 
+ +rsync -a experiments/ $SLURM_SUBMIT_DIR/experiments/ diff --git a/mlpf/juwels/pipeline_train.slurm b/mlpf/juwels/pipeline_train.slurm new file mode 100644 index 000000000..7482264ca --- /dev/null +++ b/mlpf/juwels/pipeline_train.slurm @@ -0,0 +1,52 @@ +#!/bin/sh + +#SBATCH --account=prcoe12 +#SBATCH --partition=booster +#SBATCH --time 23:59:59 +#SBATCH --nodes 1 +#SBATCH --tasks-per-node=1 +#SBATCH --gres=gpu:4 + +# Job name +#SBATCH -J pipetrain + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load GCC/10.3.0 CUDA/11.0 cuDNN/8.0.2.39-CUDA-11.0 + +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +jutil env activate -p prcoe12 + +nvidia-smi + +source /p/project/prcoe12/wulff1/miniconda3/bin/activate tf2 +echo "Python used:" +which python3 +python3 --version + +mkdir $SCRATCH/particleflow +rsync -ar --exclude={".git","experiments"} . $SCRATCH/particleflow/ +cd $SCRATCH/particleflow +if [ $? -eq 0 ] +then + echo "Successfully changed directory" +else + echo "Could not change directory" >&2 + exit 1 +fi + +python3 tf_list_gpus.py +echo 'Starting training.' +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py train -c $1 -p $2 +echo 'Training done.' + +rsync -a experiments/ $SLURM_SUBMIT_DIR/experiments/ diff --git a/mlpf/juwels/train_mlpf.slurm b/mlpf/juwels/train_mlpf.slurm new file mode 100644 index 000000000..bdd06723c --- /dev/null +++ b/mlpf/juwels/train_mlpf.slurm @@ -0,0 +1,52 @@ +#!/bin/sh + +#SBATCH --account=prcoe12 +#SBATCH --partition=booster +#SBATCH --time 23:59:59 +#SBATCH --nodes 1 +#SBATCH --tasks-per-node=1 +#SBATCH --gres=gpu:4 + +# Job name +#SBATCH -J mlpf_tr + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + +module purge +module load GCC/10.3.0 CUDA/11.0 cuDNN/8.0.2.39-CUDA-11.0 + +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +jutil env activate -p prcoe12 + +nvidia-smi + +source /p/project/prcoe12/wulff1/miniconda3/bin/activate tf2 +echo "Python used:" +which python3 +python3 --version + +mkdir $SCRATCH/particleflow +rsync -ar --exclude=".git" . $SCRATCH/particleflow/ +cd $SCRATCH/particleflow +if [ $? -eq 0 ] +then + echo "Successfully changed directory" +else + echo "Could not change directory" >&2 + exit 1 +fi + +python3 tf_list_gpus.py +echo 'Starting training.' +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/launcher.py --action train --model-spec $1 +echo 'Training done.' 
+ +rsync -a experiments/ $SLURM_SUBMIT_DIR/experiments/ From f7b6bae780da6d583ca31062bd49e668b2e42f11 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 23 Jul 2021 10:08:00 -0700 Subject: [PATCH 008/157] small bug --- mlpf/pytorch_delphes/LRP/main_reg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlpf/pytorch_delphes/LRP/main_reg.py b/mlpf/pytorch_delphes/LRP/main_reg.py index 16a17ab63..50ecbbb64 100644 --- a/mlpf/pytorch_delphes/LRP/main_reg.py +++ b/mlpf/pytorch_delphes/LRP/main_reg.py @@ -131,7 +131,7 @@ def map_index_to_p4(index): # 'LRP_outpath': '../../../prp/models/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 1, 'dropout': 0, # 'space_dim': 4, 'propagate_dimensions': 22,'nearest': 16, 'overwrite': True, # 'LRP_load_epoch': 9, 'LRP_load_model': 'LRP_reg_PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', - # 'explain': True, 'make_heatmaps_clf': True,'make_heatmaps_reg': True, + # 'explain': False, 'make_heatmaps_clf': True,'make_heatmaps_reg': True, # 'clf': True, 'reg': True}) # define the dataset (assumes the data exists as .pt files in "processed") @@ -323,7 +323,7 @@ def get_type(t): l.append('track') return l - node_types = get_type(R_cat_feat[:,12]) + node_types = get_type(R_cat_feat_msk[:,12]) fig, ax = plt.subplots() fig.tight_layout() @@ -426,7 +426,7 @@ def get_type(t): l.append('track') return l - node_types = get_type(R_cat_feat[:,12]) + node_types = get_type(R_cat_feat_msk[:,12]) fig, ax = plt.subplots() fig.tight_layout() From 8ec5c363047752f9bc13f3b6d2c3d9a40588e4fd Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 23 Jul 2021 10:11:08 -0700 Subject: [PATCH 009/157] fix pic scales --- mlpf/pytorch_delphes/LRP/main_reg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlpf/pytorch_delphes/LRP/main_reg.py b/mlpf/pytorch_delphes/LRP/main_reg.py index 50ecbbb64..0e26d4f8e 100644 --- a/mlpf/pytorch_delphes/LRP/main_reg.py +++ b/mlpf/pytorch_delphes/LRP/main_reg.py @@ -351,7 +351,7 @@ def get_type(t): plt.xlabel("\nposition of node is row # {pos} from the top \n class prediction: {R} \n where prob = [null, chhadron, nhadron, photon, electron, muon]".format(R=[round(num,2) for num in probability.detach().tolist()], pos=((R_cat_feat_msk[:,-1] == pos).nonzero(as_tuple=True)[0].item()+1))) plt.imshow(torch.abs(R_cat_feat_msk[:,:12]).detach().cpu().numpy(), interpolation="nearest", cmap='copper', aspect='auto') plt.colorbar() - fig.set_size_inches(10, 10) + fig.set_size_inches(12, 12) plt.savefig(outpath + f'/class{str(pid)}'+f'/p4_elem{str(p4_elem)}'+f'/sample{str(node_i)}.jpg') plt.close(fig) @@ -454,7 +454,7 @@ def get_type(t): plt.xlabel("\nposition of node is row # {pos} from the top \n class prediction: {R} \n where prob = [null, chhadron, nhadron, photon, electron, muon]".format(R=[round(num,2) for num in probability.detach().tolist()], pos=((R_cat_feat_msk[:,-1] == pos).nonzero(as_tuple=True)[0].item()+1))) plt.imshow(torch.abs(R_cat_feat_msk[:,:12]).detach().cpu().numpy(), interpolation="nearest", cmap='copper', aspect='auto') plt.colorbar() - fig.set_size_inches(10, 10) + fig.set_size_inches(12, 12) plt.savefig(outpath + f'/class{str(pid)}'+f'/pid{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') plt.close(fig) From 3eabb82ac09adcba654e121092460216c41657fd Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 23 Jul 2021 11:01:30 -0700 Subject: [PATCH 010/157] plotting cosmetics --- mlpf/pytorch_delphes/evaluate.py | 60 +++++++++++++++++--------------- 
mlpf/pytorch_delphes/plots.py | 38 +++++++++++--------- mlpf/pytorch_delphes/training.py | 6 ++-- 3 files changed, 55 insertions(+), 49 deletions(-) diff --git a/mlpf/pytorch_delphes/evaluate.py b/mlpf/pytorch_delphes/evaluate.py index 1b0ba25ef..ba59c0335 100644 --- a/mlpf/pytorch_delphes/evaluate.py +++ b/mlpf/pytorch_delphes/evaluate.py @@ -181,23 +181,25 @@ def make_plots(model, test_loader, outpath, target, device, epoch, which_data): torch.save(conf_matrix_cand, outpath + '/conf_matrix_cand' + str(epoch) + '.pt') # making all the other plots - sample_title_qcd = "QCD, 14 TeV, PU200" - sample_title_ttbar = "$t\\bar{t}$, 14 TeV, PU200" + if 'test' in which_data: + sample = "QCD, 14 TeV, PU200" + else: + sample = "$t\\bar{t}$, 14 TeV, PU200" # make distribution plots plot_distributions_pid(1, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for chhadrons - target, epoch, outpath) + target, epoch, outpath, legend_title=sample+"\n") plot_distributions_pid(2, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for nhadrons - target, epoch, outpath) + target, epoch, outpath, legend_title=sample+"\n") plot_distributions_pid(3, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for photons - target, epoch, outpath) + target, epoch, outpath, legend_title=sample+"\n") plot_distributions_pid(4, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for electrons - target, epoch, outpath) + target, epoch, outpath, legend_title=sample+"\n") plot_distributions_pid(5, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for muons - target, epoch, outpath) + target, epoch, outpath, legend_title=sample+"\n") plot_distributions_all(gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for all together - target, epoch, outpath) + target, epoch, outpath, legend_title=sample+"\n") # make pt, eta plots to visualize dataset ax, _ = plot_pt_eta(ygen) @@ -235,106 +237,106 @@ def make_plots(model, test_loader, outpath, target, device, epoch, which_data): plt.close(fig) # make efficiency and fake rate plots for charged hadrons - ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "pt", np.linspace(0, 3, 61), outpath+"/efficiency_plots/eff_fake_pid1_pt.png", both=True, legend_title=sample_title_qcd+"\n") - ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "eta", np.linspace(-3, 3, 61), outpath+"/efficiency_plots/eff_fake_pid1_eta.png", both=True, legend_title=sample_title_qcd+"\n") - ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "energy", np.linspace(0, 50, 75), outpath+"/efficiency_plots/eff_fake_pid1_energy.png", both=True, legend_title=sample_title_qcd+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "pt", np.linspace(0, 3, 61), outpath+"/efficiency_plots/eff_fake_pid1_pt.png", both=True, legend_title=sample+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "eta", np.linspace(-3, 3, 61), outpath+"/efficiency_plots/eff_fake_pid1_eta.png", both=True, legend_title=sample+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "energy", np.linspace(0, 50, 75), outpath+"/efficiency_plots/eff_fake_pid1_energy.png", both=True, legend_title=sample+"\n") # make efficiency and fake rate plots for neutral hadrons - ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "pt", np.linspace(0, 3, 61), outpath+"/efficiency_plots/eff_fake_pid2_pt.png", both=True, legend_title=sample_title_qcd+"\n") - ax, _ = 
draw_efficiency_fakerate(ygen, ypred, ycand, 2, "eta", np.linspace(-3, 3, 61), outpath+"/efficiency_plots/eff_fake_pid2_eta.png", both=True, legend_title=sample_title_qcd+"\n") - ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "energy", np.linspace(0, 50, 75), outpath+"/efficiency_plots/eff_fake_pid2_energy.png", both=True, legend_title=sample_title_qcd+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "pt", np.linspace(0, 3, 61), outpath+"/efficiency_plots/eff_fake_pid2_pt.png", both=True, legend_title=sample+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "eta", np.linspace(-3, 3, 61), outpath+"/efficiency_plots/eff_fake_pid2_eta.png", both=True, legend_title=sample+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "energy", np.linspace(0, 50, 75), outpath+"/efficiency_plots/eff_fake_pid2_energy.png", both=True, legend_title=sample+"\n") # make resolution plots for chhadrons: pid=1 fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) - res_chhad_pt = plot_reso(ygen, ypred, ycand, 1, "pt", 2, ax=ax1, legend_title=sample_title_qcd+"\n") + res_chhad_pt = plot_reso(ygen, ypred, ycand, 1, "pt", 2, ax=ax1, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid1_pt.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) - res_chhad_eta = plot_reso(ygen, ypred, ycand, 1, "eta", 0.2, ax=ax2, legend_title=sample_title_qcd+"\n") + res_chhad_eta = plot_reso(ygen, ypred, ycand, 1, "eta", 0.2, ax=ax2, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid1_eta.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) - res_chhad_E = plot_reso(ygen, ypred, ycand, 1, "energy", 0.2, ax=ax3, legend_title=sample_title_qcd+"\n") + res_chhad_E = plot_reso(ygen, ypred, ycand, 1, "energy", 0.2, ax=ax3, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid1_energy.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) # make resolution plots for nhadrons: pid=2 fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) - res_nhad_pt = plot_reso(ygen, ypred, ycand, 2, "pt", 2, ax=ax1, legend_title=sample_title_qcd+"\n") + res_nhad_pt = plot_reso(ygen, ypred, ycand, 2, "pt", 2, ax=ax1, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid2_pt.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) - res_nhad_eta = plot_reso(ygen, ypred, ycand, 2, "eta", 0.2, ax=ax2, legend_title=sample_title_qcd+"\n") + res_nhad_eta = plot_reso(ygen, ypred, ycand, 2, "eta", 0.2, ax=ax2, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid2_eta.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) - res_nhad_E = plot_reso(ygen, ypred, ycand, 2, "energy", 0.2, ax=ax3, legend_title=sample_title_qcd+"\n") + res_nhad_E = plot_reso(ygen, ypred, ycand, 2, "energy", 0.2, ax=ax3, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid2_energy.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) # make resolution plots for photons: pid=3 fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) - res_photon_pt = plot_reso(ygen, ypred, ycand, 3, "pt", 2, ax=ax1, legend_title=sample_title_qcd+"\n") + res_photon_pt = plot_reso(ygen, ypred, ycand, 3, "pt", 2, ax=ax1, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid3_pt.png", bbox_inches="tight") plt.tight_layout() 
plt.close(fig) fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) - res_photon_eta = plot_reso(ygen, ypred, ycand, 3, "eta", 0.2, ax=ax2, legend_title=sample_title_qcd+"\n") + res_photon_eta = plot_reso(ygen, ypred, ycand, 3, "eta", 0.2, ax=ax2, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid3_eta.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) - res_photon_E = plot_reso(ygen, ypred, ycand, 3, "energy", 0.2, ax=ax3, legend_title=sample_title_qcd+"\n") + res_photon_E = plot_reso(ygen, ypred, ycand, 3, "energy", 0.2, ax=ax3, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid3_energy.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) # make resolution plots for electrons: pid=4 fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) - res_electron_pt = plot_reso(ygen, ypred, ycand, 4, "pt", 2, ax=ax1, legend_title=sample_title_qcd+"\n") + res_electron_pt = plot_reso(ygen, ypred, ycand, 4, "pt", 2, ax=ax1, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid4_pt.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) - res_electron_eta = plot_reso(ygen, ypred, ycand, 4, "eta", 0.2, ax=ax2, legend_title=sample_title_qcd+"\n") + res_electron_eta = plot_reso(ygen, ypred, ycand, 4, "eta", 0.2, ax=ax2, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid4_eta.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) - res_electron_E = plot_reso(ygen, ypred, ycand, 4, "energy", 0.2, ax=ax3, legend_title=sample_title_qcd+"\n") + res_electron_E = plot_reso(ygen, ypred, ycand, 4, "energy", 0.2, ax=ax3, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid4_energy.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) # make resolution plots for muons: pid=5 fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) - res_muon_pt = plot_reso(ygen, ypred, ycand, 5, "pt", 2, ax=ax1, legend_title=sample_title_qcd+"\n") + res_muon_pt = plot_reso(ygen, ypred, ycand, 5, "pt", 2, ax=ax1, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid5_pt.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) - res_muon_eta = plot_reso(ygen, ypred, ycand, 5, "eta", 0.2, ax=ax2, legend_title=sample_title_qcd+"\n") + res_muon_eta = plot_reso(ygen, ypred, ycand, 5, "eta", 0.2, ax=ax2, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid5_eta.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) - res_muon_E = plot_reso(ygen, ypred, ycand, 5, "energy", 0.2, ax=ax3, legend_title=sample_title_qcd+"\n") + res_muon_E = plot_reso(ygen, ypred, ycand, 5, "energy", 0.2, ax=ax3, legend_title=sample+"\n") plt.savefig(outpath+"/resolution_plots/res_pid5_energy.png", bbox_inches="tight") plt.tight_layout() plt.close(fig) diff --git a/mlpf/pytorch_delphes/plots.py b/mlpf/pytorch_delphes/plots.py index 82677234d..56e4c55e3 100644 --- a/mlpf/pytorch_delphes/plots.py +++ b/mlpf/pytorch_delphes/plots.py @@ -166,7 +166,7 @@ def plot_particles(fname, true_id, true_p4, pred_id, pred_p4, pid=1): return fig -def plot_distribution(val_x, val_y, var_name, rng, target, fname): +def plot_distribution(pid, val_x, val_y, var_name, rng, target, fname, legend_title=""): plt.style.use(mplhep.style.CMS) fig = plt.figure(figsize=(10,10)) @@ -178,15 
+178,19 @@ def plot_distribution(val_x, val_y, var_name, rng, target, fname): plt.hist(val_y, bins=rng, density=True, histtype="step", lw=2, label="MLPF"); plt.xlabel(var_name) - plt.legend(loc="best", frameon=False) - plt.ylim(0,1.5) + if pid!=-1: + plt.legend(frameon=False, title=legend_title+pid_names[pid]) + else: + plt.legend(frameon=False, title=legend_title) + + plt.ylim(0,1.5) plt.savefig(fname + '.png') plt.close(fig) return fig -def plot_distributions_pid(pid, true_id, true_p4, pred_id, pred_p4, pf_id, cand_p4, target, epoch, outpath): +def plot_distributions_pid(pid, true_id, true_p4, pred_id, pred_p4, pf_id, cand_p4, target, epoch, outpath, legend_title=""): plt.style.use("default") ch_true = true_p4[true_id==pid, 0].flatten().detach().cpu().numpy() @@ -207,14 +211,14 @@ def plot_distributions_pid(pid, true_id, true_p4, pred_id, pred_p4, pf_id, cand_ e_true = true_p4[true_id==pid, 5].flatten().detach().cpu().numpy() e_pred = pred_p4[pred_id==pid, 5].flatten().detach().cpu().numpy() - figure = plot_distribution(ch_true, ch_pred, "charge", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_charge_distribution') - figure = plot_distribution(pt_true, pt_pred, "pt", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_pt_distribution') - figure = plot_distribution(e_true, e_pred, "E", np.linspace(-1, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_energy_distribution') - figure = plot_distribution(eta_true, eta_pred, "eta", np.linspace(-5, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_eta_distribution') - figure = plot_distribution(sphi_true, sphi_pred, "sin phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_sphi_distribution') - figure = plot_distribution(cphi_true, cphi_pred, "cos phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_cphi_distribution') + figure = plot_distribution(pid, ch_true, ch_pred, "charge", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_charge_distribution', legend_title=legend_title) + figure = plot_distribution(pid, pt_true, pt_pred, "pt", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_pt_distribution', legend_title=legend_title) + figure = plot_distribution(pid, e_true, e_pred, "E", np.linspace(-1, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_energy_distribution', legend_title=legend_title) + figure = plot_distribution(pid, eta_true, eta_pred, "eta", np.linspace(-5, 5, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_eta_distribution', legend_title=legend_title) + figure = plot_distribution(pid, sphi_true, sphi_pred, "sin phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_sphi_distribution', legend_title=legend_title) + figure = plot_distribution(pid, cphi_true, cphi_pred, "cos phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/' + pid_names[pid] + '_cphi_distribution', legend_title=legend_title) -def plot_distributions_all(true_id, true_p4, pred_id, pred_p4, pf_id, cand_p4, target, epoch, outpath): +def plot_distributions_all(true_id, true_p4, pred_id, pred_p4, pf_id, cand_p4, target, epoch, outpath, legend_title=""): plt.style.use("default") msk = (pred_id!=0) & (true_id!=0) @@ -237,12 +241,12 @@ def 
plot_distributions_all(true_id, true_p4, pred_id, pred_p4, pf_id, cand_p4, t e_true = true_p4[msk, 5].flatten().detach().cpu().numpy() e_pred = pred_p4[msk, 5].flatten().detach().cpu().numpy() - figure = plot_distribution(ch_true, ch_pred, "charge", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/all_charge_distribution') - figure = plot_distribution(pt_true, pt_pred, "pt", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/all_pt_distribution') - figure = plot_distribution(e_true, e_pred, "E", np.linspace(-1, 5, 100), target, fname = outpath+'/distribution_plots/all_energy_distribution') - figure = plot_distribution(eta_true, eta_pred, "eta", np.linspace(-5, 5, 100), target, fname = outpath+'/distribution_plots/all_eta_distribution') - figure = plot_distribution(sphi_true, sphi_pred, "sin phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/all_sphi_distribution') - figure = plot_distribution(cphi_true, cphi_pred, "cos phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/all_cphi_distribution') + figure = plot_distribution(-1, ch_true, ch_pred, "charge", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/all_charge_distribution', legend_title=legend_title) + figure = plot_distribution(-1, pt_true, pt_pred, "pt", np.linspace(0, 5, 100), target, fname = outpath+'/distribution_plots/all_pt_distribution', legend_title=legend_title) + figure = plot_distribution(-1, e_true, e_pred, "E", np.linspace(-1, 5, 100), target, fname = outpath+'/distribution_plots/all_energy_distribution', legend_title=legend_title) + figure = plot_distribution(-1, eta_true, eta_pred, "eta", np.linspace(-5, 5, 100), target, fname = outpath+'/distribution_plots/all_eta_distribution', legend_title=legend_title) + figure = plot_distribution(-1, sphi_true, sphi_pred, "sin phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/all_sphi_distribution', legend_title=legend_title) + figure = plot_distribution(-1, cphi_true, cphi_pred, "cos phi", np.linspace(-2, 2, 100), target, fname = outpath+'/distribution_plots/all_cphi_distribution', legend_title=legend_title) def midpoints(x): return x[:-1] + np.diff(x)/2 diff --git a/mlpf/pytorch_delphes/training.py b/mlpf/pytorch_delphes/training.py index 3ebee8f30..b0eb9dcc2 100644 --- a/mlpf/pytorch_delphes/training.py +++ b/mlpf/pytorch_delphes/training.py @@ -308,13 +308,13 @@ def train_loop(): # def __init__(self, d): # self.__dict__ = d # - # args = objectview({'train': True, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 15, 'patience': 100, 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, + # args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 15, 'patience': 100, 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../test_tmp_delphes/data/pythia8_qcd', # 'outpath': '../../prp/models/yee/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, # 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, 'overwrite': True, - # 'load': False, 'load_epoch': 9, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + # 'load': True, 'load_epoch': 9, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', # 
'classification_only': False, 'nn1': True, 'nn3': True, 'encoding_of_clusters': False, 'embedding_dim': 0, 'nn0track': False, 'nn0cluster': False, 'title': 'noembeddings', - # 'make_predictions_train': False, 'make_plots_train': False, 'make_predictions_valid': False, 'make_plots_valid': False, 'make_predictions_test': True, 'make_plots_test': True}) + # 'make_predictions_train': True, 'make_plots_train': True, 'make_predictions_valid': False, 'make_plots_valid': False, 'make_predictions_test': True, 'make_plots_test': True}) # define the dataset (assumes the data exists as .pt files in "processed") print('Processing the data..') From 84212c92bfdff9e6b96617601d8f312055246fe3 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 23 Jul 2021 11:09:26 -0700 Subject: [PATCH 011/157] typos --- mlpf/pytorch_delphes/LRP/main_reg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlpf/pytorch_delphes/LRP/main_reg.py b/mlpf/pytorch_delphes/LRP/main_reg.py index 0e26d4f8e..e6e72c79e 100644 --- a/mlpf/pytorch_delphes/LRP/main_reg.py +++ b/mlpf/pytorch_delphes/LRP/main_reg.py @@ -371,8 +371,8 @@ def get_type(t): # make directories to hold the heatmaps for i in range(6): - if not osp.isdir(outpath + f'/clf_class{str(i)}'): - os.makedirs(outpath + f'/clfclass{str(i)}') + if not osp.isdir(outpath + f'/class{str(i)}'): + os.makedirs(outpath + f'/class{str(i)}') for j in range(6): if not osp.isdir(outpath + f'/class{str(i)}'+f'/pid{str(j)}'): os.makedirs(outpath + f'/class{str(i)}'+f'/pid{str(j)}') From 1a6a909966ae75da2ce6016e10006e0485a94173 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 23 Jul 2021 11:31:14 -0700 Subject: [PATCH 012/157] one function to plot all heatmaps --- mlpf/pytorch_delphes/LRP/main_reg.py | 315 ++++++++++----------------- 1 file changed, 121 insertions(+), 194 deletions(-) diff --git a/mlpf/pytorch_delphes/LRP/main_reg.py b/mlpf/pytorch_delphes/LRP/main_reg.py index e6e72c79e..8af48e3dc 100644 --- a/mlpf/pytorch_delphes/LRP/main_reg.py +++ b/mlpf/pytorch_delphes/LRP/main_reg.py @@ -117,6 +117,125 @@ def map_index_to_p4(index): if index==5: return 'energy' +def make_heatmaps(big_list, to_explain, task): + + print(f'Making heatmaps for {task}..') + + X = to_explain["inputs"] + gen_ids_one_hot = to_explain["gen_id"] + pred_ids_one_hot = to_explain["pred_id"] + + gen_ids = gen_ids_one_hot.argmax(axis=1) + pred_ids = pred_ids_one_hot.argmax(axis=1) + + # make directories to hold the heatmaps + for i in range(6): + if not osp.isdir(outpath + f'/class{str(i)}'): + os.makedirs(outpath + f'/class{str(i)}') + for j in range(6): + if task=='regression': + if not osp.isdir(outpath + f'/class{str(i)}'+f'/p4_elem{str(j)}'): + os.makedirs(outpath + f'/class{str(i)}'+f'/p4_elem{str(j)}') + elif task=='classification': + if not osp.isdir(outpath + f'/class{str(i)}'+f'/pid{str(j)}'): + os.makedirs(outpath + f'/class{str(i)}'+f'/pid{str(j)}') + + # attempt to break down big_list onto 6 smaller lists, 1 for each pid + list0, list1, list2, list3, list4, list5 = [], [], [], [], [], [] + dist0, dist1, dist2, dist3, dist4, dist5 = [], [], [], [], [], [] + + for node_i in range(len(big_list)): # iterate over the nodes + + if gen_ids[node_i]==0: # if it's a null then add it to the null list + list0.append(big_list[node_i]) + dist0.append(node_i) + if gen_ids[node_i]==1: # if it's a chhadron then add it to the chhadron list + list1.append(big_list[node_i]) + dist1.append(node_i) + if gen_ids[node_i]==2: # if it's a nhadron then add it to the nhadron list + list2.append(big_list[node_i]) + 
dist2.append(node_i) + if gen_ids[node_i]==3: # if it's a photon then add it to the photon list + list3.append(big_list[node_i]) + dist3.append(node_i) + if gen_ids[node_i]==4: # if it's a electron then add it to the electron list + list4.append(big_list[node_i]) + dist4.append(node_i) + if gen_ids[node_i]==5: # if it's a muon then add it to the muon list + list5.append(big_list[node_i]) + dist5.append(node_i) + + list = [list0,list1,list2,list3,list4,list5] + dist = [dist0,dist1,dist2,dist3,dist4,dist5] + + if task=='regression': + output_dim = output_dim_p4 + elif task=='classification': + output_dim = output_dim_id + + for pid in range(output_dim_id): + for node_i in range(len(list[pid])): # iterate over the nodes in each list + print('- making heatmap for', map_index_to_pid(pid), 'node #:', node_i+1, '/', len(list[pid])) + for output_neuron in range(output_dim): + R_cat_feat = torch.cat([list[pid][node_i][output_neuron].to(device), X['x'].to(device), torch.arange(start=0, end=X['x'].shape[0], step=1).float().reshape(-1,1).to(device)], dim=1) + + non_empty_mask = R_cat_feat[:,:12].abs().sum(dim=1).bool() + R_cat_feat_msk = R_cat_feat[non_empty_mask,:] # R_cat_feat masked (non-zero) + pos = dist[pid][node_i] + probability = pred_ids_one_hot[pos] + + def get_type(t): + l = [] + for elem in t: + if elem==1: + l.append('cluster') + if elem==2: + l.append('track') + return l + + node_types = get_type(R_cat_feat_msk[:,12]) + + fig, ax = plt.subplots() + fig.tight_layout() + + if task=='regression': + if (torch.argmax(probability)==pid): + ax.set_title('Heatmap for the "'+map_index_to_p4(output_neuron)+'" prediction of a correctly classified ' + map_index_to_pid(pid)) + else: + ax.set_title('Heatmap for the "'+map_index_to_p4(output_neuron)+'" prediction of an incorrectly classified ' + map_index_to_pid(pid)) + + elif task=='classification': + if (torch.argmax(probability)==pid): + ax.set_title('Heatmap for the "'+map_index_to_pid(output_neuron)+'" prediction of a correctly classified ' + map_index_to_pid(pid)) + else: + ax.set_title('Heatmap for the "'+map_index_to_pid(output_neuron)+'" prediction of an incorrectly classified ' + map_index_to_pid(pid)) + + ### TODO: Not the best way to do it.. I am assuming here that only charged hadrons are connected to all tracks + if pid==1: + features = ["type", " pt", "eta", + "sphi", "cphi", "E", "eta_out", "sphi_out", "cphi_out", "charge", "is_gen_mu", "is_gen_el"] + else: + features = ["type", "Et", "eta", "sphi", "cphi", "E", "Eem", "Ehad", "pad", "pad", "pad", "pad"] + + ax.set_xticks(np.arange(len(features))) + ax.set_yticks(np.arange(len(node_types))) + for col in range(len(features)): + for row in range(len(node_types)): + text = ax.text(col, row, round(R_cat_feat_msk[row,12+col].item(),2), + ha="center", va="center", color="w") + # ... 
and label them with the respective list entries + ax.set_xticklabels(features) + ax.set_yticklabels(node_types) + plt.xlabel("\nposition of node is row # {pos} from the top \n class prediction: {R} \n where prob = [null, chhadron, nhadron, photon, electron, muon]".format(R=[round(num,2) for num in probability.detach().tolist()], pos=((R_cat_feat_msk[:,-1] == pos).nonzero(as_tuple=True)[0].item()+1))) + plt.imshow(torch.abs(R_cat_feat_msk[:,:12]).detach().cpu().numpy(), interpolation="nearest", cmap='copper', aspect='auto') + plt.colorbar() + fig.set_size_inches(12, 12) + if task=='regression': + plt.savefig(outpath + f'/class{str(pid)}'+f'/p4_elem{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') + elif task=='classification': + plt.savefig(outpath + f'/class{str(pid)}'+f'/pid{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') + plt.close(fig) + if __name__ == "__main__": args = parse_args() @@ -253,210 +372,18 @@ def hook(model, input, output): break # explain only one single event if args.make_heatmaps_reg: - print('Making heatmaps for regression..') - # load the necessary R-scores big_list_reg = torch.load(outpath + f'/big_list_reg.pt', map_location=device) to_explain_reg = torch.load(outpath + f'/to_explain_reg.pt', map_location=device) - X = to_explain_reg["inputs"] - gen_ids_one_hot = to_explain_reg["gen_id"] - pred_ids_one_hot = to_explain_reg["pred_id"] - - gen_ids = gen_ids_one_hot.argmax(axis=1) - pred_ids = pred_ids_one_hot.argmax(axis=1) - - # make directories to hold the heatmaps - for i in range(6): - if not osp.isdir(outpath + f'/class{str(i)}'): - os.makedirs(outpath + f'/class{str(i)}') - for j in range(6): - if not osp.isdir(outpath + f'/class{str(i)}'+f'/p4_elem{str(j)}'): - os.makedirs(outpath + f'/class{str(i)}'+f'/p4_elem{str(j)}') - - # attempt to break down big_list onto 6 smaller lists, 1 for each pid - list0, list1, list2, list3, list4, list5 = [], [], [], [], [], [] - dist0, dist1, dist2, dist3, dist4, dist5 = [], [], [], [], [], [] - - for node_i in range(len(big_list_reg)): # iterate over the nodes - - if gen_ids[node_i]==0: # if it's a null then add it to the null list - list0.append(big_list_reg[node_i]) - dist0.append(node_i) - if gen_ids[node_i]==1: # if it's a chhadron then add it to the chhadron list - list1.append(big_list_reg[node_i]) - dist1.append(node_i) - if gen_ids[node_i]==2: # if it's a nhadron then add it to the nhadron list - list2.append(big_list_reg[node_i]) - dist2.append(node_i) - if gen_ids[node_i]==3: # if it's a photon then add it to the photon list - list3.append(big_list_reg[node_i]) - dist3.append(node_i) - if gen_ids[node_i]==4: # if it's a electron then add it to the electron list - list4.append(big_list_reg[node_i]) - dist4.append(node_i) - if gen_ids[node_i]==5: # if it's a muon then add it to the muon list - list5.append(big_list_reg[node_i]) - dist5.append(node_i) - - list = [list0,list1,list2,list3,list4,list5] - dist = [dist0,dist1,dist2,dist3,dist4,dist5] - - for pid in range(output_dim_id): - - for node_i in range(len(list[pid])): # iterate over the nodes in each list - - for p4_elem in range(output_dim_p4): - R_cat_feat = torch.cat([list[pid][node_i][p4_elem].to(device), X['x'].to(device), torch.arange(start=0, end=X['x'].shape[0], step=1).float().reshape(-1,1).to(device)], dim=1) - - non_empty_mask = R_cat_feat[:,:12].abs().sum(dim=1).bool() - R_cat_feat_msk = R_cat_feat[non_empty_mask,:] # R_cat_feat masked (non-zero) - pos = dist[pid][node_i] - probability = pred_ids_one_hot[pos] - - def get_type(t): - l = [] - for elem 
in t: - if elem==1: - l.append('cluster') - if elem==2: - l.append('track') - return l - - node_types = get_type(R_cat_feat_msk[:,12]) - - fig, ax = plt.subplots() - fig.tight_layout() - if (torch.argmax(probability)==pid): - ax.set_title('Heatmap for the "'+map_index_to_p4(p4_elem)+'" prediction of a correctly classified ' + map_index_to_pid(pid)) - else: - ax.set_title('Heatmap for the "'+map_index_to_p4(p4_elem)+'" prediction of an incorrectly classified ' + map_index_to_pid(pid)) - - ### TODO: Not the best way to do it.. I am assuming here that only charged hadrons are connected to all tracks - if pid==1: - features = ["type", " pt", "eta", - "sphi", "cphi", "E", "eta_out", "sphi_out", "cphi_out", "charge", "is_gen_mu", "is_gen_el"] - else: - features = ["type", "Et", "eta", "sphi", "cphi", "E", "Eem", "Ehad", "pad", "pad", "pad", "pad"] - - ax.set_xticks(np.arange(len(features))) - ax.set_yticks(np.arange(len(node_types))) - for col in range(len(features)): - for row in range(len(node_types)): - text = ax.text(col, row, round(R_cat_feat_msk[row,12+col].item(),2), - ha="center", va="center", color="w") - # ... and label them with the respective list entries - ax.set_xticklabels(features) - ax.set_yticklabels(node_types) - plt.xlabel("\nposition of node is row # {pos} from the top \n class prediction: {R} \n where prob = [null, chhadron, nhadron, photon, electron, muon]".format(R=[round(num,2) for num in probability.detach().tolist()], pos=((R_cat_feat_msk[:,-1] == pos).nonzero(as_tuple=True)[0].item()+1))) - plt.imshow(torch.abs(R_cat_feat_msk[:,:12]).detach().cpu().numpy(), interpolation="nearest", cmap='copper', aspect='auto') - plt.colorbar() - fig.set_size_inches(12, 12) - plt.savefig(outpath + f'/class{str(pid)}'+f'/p4_elem{str(p4_elem)}'+f'/sample{str(node_i)}.jpg') - plt.close(fig) + make_heatmaps(big_list_reg, to_explain_reg, 'regression') if args.make_heatmaps_clf: - print('Making heatmaps for regression..') - # load the necessary R-scores big_list_clf = torch.load(outpath + f'/big_list_clf.pt', map_location=device) to_explain_clf = torch.load(outpath + f'/to_explain_clf.pt', map_location=device) - X = to_explain_clf["inputs"] - gen_ids_one_hot = to_explain_clf["gen_id"] - pred_ids_one_hot = to_explain_clf["pred_id"] - - gen_ids = gen_ids_one_hot.argmax(axis=1) - pred_ids = pred_ids_one_hot.argmax(axis=1) - - # make directories to hold the heatmaps - for i in range(6): - if not osp.isdir(outpath + f'/class{str(i)}'): - os.makedirs(outpath + f'/class{str(i)}') - for j in range(6): - if not osp.isdir(outpath + f'/class{str(i)}'+f'/pid{str(j)}'): - os.makedirs(outpath + f'/class{str(i)}'+f'/pid{str(j)}') - - # attempt to break down big_list onto 6 smaller lists, 1 for each pid - list0, list1, list2, list3, list4, list5 = [], [], [], [], [], [] - dist0, dist1, dist2, dist3, dist4, dist5 = [], [], [], [], [], [] - - for node_i in range(len(big_list_clf)): # iterate over the nodes - - if gen_ids[node_i]==0: # if it's a null then add it to the null list - list0.append(big_list_clf[node_i]) - dist0.append(node_i) - if gen_ids[node_i]==1: # if it's a chhadron then add it to the chhadron list - list1.append(big_list_clf[node_i]) - dist1.append(node_i) - if gen_ids[node_i]==2: # if it's a nhadron then add it to the nhadron list - list2.append(big_list_clf[node_i]) - dist2.append(node_i) - if gen_ids[node_i]==3: # if it's a photon then add it to the photon list - list3.append(big_list_clf[node_i]) - dist3.append(node_i) - if gen_ids[node_i]==4: # if it's a electron then add it to the 
electron list - list4.append(big_list_clf[node_i]) - dist4.append(node_i) - if gen_ids[node_i]==5: # if it's a muon then add it to the muon list - list5.append(big_list_clf[node_i]) - dist5.append(node_i) - - list = [list0,list1,list2,list3,list4,list5] - dist = [dist0,dist1,dist2,dist3,dist4,dist5] - - for pid in range(output_dim_id): - - for node_i in range(len(list[pid])): # iterate over the nodes in each list - - for output_neuron in range(output_dim_id): - R_cat_feat = torch.cat([list[pid][node_i][output_neuron].to(device), X['x'].to(device), torch.arange(start=0, end=X['x'].shape[0], step=1).float().reshape(-1,1).to(device)], dim=1) - - non_empty_mask = R_cat_feat[:,:12].abs().sum(dim=1).bool() - R_cat_feat_msk = R_cat_feat[non_empty_mask,:] # R_cat_feat masked (non-zero) - pos = dist[pid][node_i] - probability = pred_ids_one_hot[pos] - - def get_type(t): - l = [] - for elem in t: - if elem==1: - l.append('cluster') - if elem==2: - l.append('track') - return l - - node_types = get_type(R_cat_feat_msk[:,12]) - - fig, ax = plt.subplots() - fig.tight_layout() - if (torch.argmax(probability)==pid): - ax.set_title('Heatmap for the "'+map_index_to_pid(output_neuron)+'" prediction of a correctly classified ' + map_index_to_pid(pid)) - else: - ax.set_title('Heatmap for the "'+map_index_to_pid(output_neuron)+'" prediction of an incorrectly classified ' + map_index_to_pid(pid)) - - ### TODO: Not the best way to do it.. I am assuming here that only charged hadrons are connected to all tracks - if pid==1: - features = ["type", " pt", "eta", - "sphi", "cphi", "E", "eta_out", "sphi_out", "cphi_out", "charge", "is_gen_mu", "is_gen_el"] - else: - features = ["type", "Et", "eta", "sphi", "cphi", "E", "Eem", "Ehad", "pad", "pad", "pad", "pad"] - - ax.set_xticks(np.arange(len(features))) - ax.set_yticks(np.arange(len(node_types))) - for col in range(len(features)): - for row in range(len(node_types)): - text = ax.text(col, row, round(R_cat_feat_msk[row,12+col].item(),2), - ha="center", va="center", color="w") - # ... 
and label them with the respective list entries - ax.set_xticklabels(features) - ax.set_yticklabels(node_types) - plt.xlabel("\nposition of node is row # {pos} from the top \n class prediction: {R} \n where prob = [null, chhadron, nhadron, photon, electron, muon]".format(R=[round(num,2) for num in probability.detach().tolist()], pos=((R_cat_feat_msk[:,-1] == pos).nonzero(as_tuple=True)[0].item()+1))) - plt.imshow(torch.abs(R_cat_feat_msk[:,:12]).detach().cpu().numpy(), interpolation="nearest", cmap='copper', aspect='auto') - plt.colorbar() - fig.set_size_inches(12, 12) - plt.savefig(outpath + f'/class{str(pid)}'+f'/pid{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') - plt.close(fig) + make_heatmaps(big_list_clf, to_explain_clf, 'classification') # # ------------------------------------------------------------------------------------------------ # # if you got all the intermediate R-score heatmaps stored then you can check if these are equal as a check of conservation across all layers: From 8e9f80d6f4e630224c85b2eb089fcaf8ebda7ef9 Mon Sep 17 00:00:00 2001 From: Farouk Date: Fri, 23 Jul 2021 11:39:51 -0700 Subject: [PATCH 013/157] better path location --- mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py | 4 +-- mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py | 12 ++++---- mlpf/pytorch_delphes/LRP/main_reg.py | 38 ++++++++++++++----------- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py b/mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py index 15d38cc13..68e63bd5e 100644 --- a/mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py +++ b/mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py @@ -173,7 +173,7 @@ def message_passing_rule(self, layer, input, R, big_list, edge_index, edge_weigh print("- Adjacency matrix is correctly computed") # # the following saves a version of the R-scores before the message passing - # torch.save(big_list, outpath + '/R_score_layer_before_msg_passing.pt') + # torch.save(big_list, outpath + '/LRP/R_score_layer_before_msg_passing.pt') # modify the big tensor based on message passing rule for node_i in tqdm(range(len(big_list))): @@ -195,7 +195,7 @@ def explain(self, to_explain): print('Total number of layers (including activation layers):', start_index) # store the R-scores for the output layer (they are basically the model predictions) - torch.save(to_explain["pred_id"].detach(), outpath + f'/R_score_layer{start_index+1}.pt') + torch.save(to_explain["pred_id"].detach(), outpath + f'/LRP/R_score_layer{start_index+1}.pt') ### loop over each single layer big_list = [] diff --git a/mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py b/mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py index 89765a204..56f997298 100644 --- a/mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py +++ b/mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py @@ -192,7 +192,7 @@ def message_passing_rule(self, layer, input, R, big_list, edge_index, edge_weigh print("- Adjacency matrix is correctly computed") # # the following saves a version of the R-scores before the message passing - # torch.save(big_list, outpath + '/R_score_layer_before_msg_passing.pt') + # torch.save(big_list, outpath + '/LRP/R_score_layer_before_msg_passing.pt') # modify the big tensor based on message passing rule for node_i in tqdm(range(len(big_list))): @@ -220,7 +220,7 @@ def explain(self, print('Total number of layers (including activation layers):', start_index) # store the R-scores for the output layer (they are basically the model predictions) - torch.save(to_explain["pred_p4"].detach(), outpath + f'/R_score_layer{start_index+1}.pt') + 
torch.save(to_explain["pred_p4"].detach(), outpath + f'/LRP/R_score_layer{start_index+1}.pt') ### loop over each single layer big_list = [] @@ -254,8 +254,8 @@ def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_in print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer} - Skip connection") input_relevance, pid_relevance, embedding_relevance = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True, skip_connection=True) - torch.save(input_relevance, outpath + f'/input_relevance.pt') - torch.save(embedding_relevance, outpath + f'/embedding_relevance.pt') + torch.save(input_relevance, outpath + f'/LRP/input_relevance.pt') + torch.save(embedding_relevance, outpath + f'/LRP/embedding_relevance.pt') return pid_relevance, big_list @@ -265,7 +265,7 @@ def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_in R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) # add the embedding_relevance computed in the nn3.0 skip connection - embedding_relevance = torch.load(outpath + f'/embedding_relevance.pt', map_location=torch.device('cpu')) + embedding_relevance = torch.load(outpath + f'/LRP/embedding_relevance.pt', map_location=torch.device('cpu')) for i in range(len(R)): R[i] = R[i] + embedding_relevance[i] @@ -277,7 +277,7 @@ def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_in print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer}") # add the input_relevance computed in the nn3.0 skip connection - input_relevance = torch.load(outpath + f'/input_relevance.pt', map_location=torch.device('cpu')) + input_relevance = torch.load(outpath + f'/LRP/input_relevance.pt', map_location=torch.device('cpu')) for node_i in tqdm(range(len(big_list))): big_list[node_i] = self.eps_rule(layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) diff --git a/mlpf/pytorch_delphes/LRP/main_reg.py b/mlpf/pytorch_delphes/LRP/main_reg.py index 8af48e3dc..d37aed3a7 100644 --- a/mlpf/pytorch_delphes/LRP/main_reg.py +++ b/mlpf/pytorch_delphes/LRP/main_reg.py @@ -130,15 +130,17 @@ def make_heatmaps(big_list, to_explain, task): # make directories to hold the heatmaps for i in range(6): - if not osp.isdir(outpath + f'/class{str(i)}'): - os.makedirs(outpath + f'/class{str(i)}') + if not osp.isdir(outpath + '/LRP'): + os.makedirs(outpath + '/LRP') + if not osp.isdir(outpath + f'/LRP/class{str(i)}'): + os.makedirs(outpath + f'/LRP/class{str(i)}') for j in range(6): if task=='regression': - if not osp.isdir(outpath + f'/class{str(i)}'+f'/p4_elem{str(j)}'): - os.makedirs(outpath + f'/class{str(i)}'+f'/p4_elem{str(j)}') + if not osp.isdir(outpath + f'/LRP/class{str(i)}'+f'/p4_elem{str(j)}'): + os.makedirs(outpath + f'/LRP/class{str(i)}'+f'/p4_elem{str(j)}') elif task=='classification': - if not osp.isdir(outpath + f'/class{str(i)}'+f'/pid{str(j)}'): - os.makedirs(outpath + f'/class{str(i)}'+f'/pid{str(j)}') + if not osp.isdir(outpath + f'/LRP/class{str(i)}'+f'/pid{str(j)}'): + os.makedirs(outpath + f'/LRP/class{str(i)}'+f'/pid{str(j)}') # attempt to break down big_list onto 6 smaller lists, 1 for each pid list0, list1, list2, list3, list4, list5 = [], [], [], [], [], [] @@ -231,9 +233,9 @@ def get_type(t): plt.colorbar() fig.set_size_inches(12, 12) if task=='regression': - plt.savefig(outpath + 
f'/class{str(pid)}'+f'/p4_elem{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') + plt.savefig(outpath + f'/LRP/class{str(pid)}'+f'/p4_elem{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') elif task=='classification': - plt.savefig(outpath + f'/class{str(pid)}'+f'/pid{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') + plt.savefig(outpath + f'/LRP/class{str(pid)}'+f'/pid{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') plt.close(fig) if __name__ == "__main__": @@ -341,6 +343,9 @@ def hook(model, input, output): else: pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model.model(X) + if not osp.isdir(outpath + '/LRP'): + os.makedirs(outpath + '/LRP') + if args.LRP_reg: print('Explaining the p4 predictions:') to_explain_reg = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), @@ -352,8 +357,8 @@ def hook(model, input, output): model.set_dest(to_explain_reg["A"]) big_list_reg = explainer_reg.explain(to_explain_reg) - torch.save(big_list_reg, outpath + f'/big_list_reg.pt') - torch.save(to_explain_reg, outpath + f'/to_explain_reg.pt') + torch.save(big_list_reg, outpath + '/LRP/big_list_reg.pt') + torch.save(to_explain_reg, outpath + '/LRP/to_explain_reg.pt') if args.LRP_clf: print('Explaining the pid predictions:') @@ -366,22 +371,23 @@ def hook(model, input, output): model.set_dest(to_explain_clf["A"]) big_list_clf = explainer_clf.explain(to_explain_clf) - torch.save(big_list_clf, outpath + f'/big_list_clf.pt') - torch.save(to_explain_clf, outpath + f'/to_explain_clf.pt') + + torch.save(big_list_clf, outpath + '/LRP/big_list_clf.pt') + torch.save(to_explain_clf, outpath + '/LRP/to_explain_clf.pt') break # explain only one single event if args.make_heatmaps_reg: # load the necessary R-scores - big_list_reg = torch.load(outpath + f'/big_list_reg.pt', map_location=device) - to_explain_reg = torch.load(outpath + f'/to_explain_reg.pt', map_location=device) + big_list_reg = torch.load(outpath + '/LRP/big_list_reg.pt', map_location=device) + to_explain_reg = torch.load(outpath + '/LRP/to_explain_reg.pt', map_location=device) make_heatmaps(big_list_reg, to_explain_reg, 'regression') if args.make_heatmaps_clf: # load the necessary R-scores - big_list_clf = torch.load(outpath + f'/big_list_clf.pt', map_location=device) - to_explain_clf = torch.load(outpath + f'/to_explain_clf.pt', map_location=device) + big_list_clf = torch.load(outpath + '/LRP/big_list_clf.pt', map_location=device) + to_explain_clf = torch.load(outpath + '/LRP/to_explain_clf.pt', map_location=device) make_heatmaps(big_list_clf, to_explain_clf, 'classification') From 59375a6f828242314f24784b6ca1675198350c5c Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Mon, 26 Jul 2021 11:22:59 +0200 Subject: [PATCH 014/157] feat: Choose optimizer in config file Choose between Adam, AdamW and SGD. Also specify optimizer hyperparameters in the config file. 
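For example, to run with SGD instead of Adam, the training config could
contain something like the following (a minimal sketch; the keys mirror
the blocks added to the parameters/*.yaml files in the diff below):

    setup:
      optimizer: sgd

    optimizer:
      sgd:
        momentum: 0.9
        nesterov: no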
--- mlpf/pipeline.py | 7 +++---- mlpf/tfmodel/hypertuning.py | 13 +++--------- mlpf/tfmodel/model_setup.py | 7 +++---- mlpf/tfmodel/utils.py | 21 ++++++++++++++++++- parameters/cms-gnn-dense-big.yaml | 11 ++++++++++ parameters/cms-gnn-dense-focal.yaml | 11 ++++++++++ parameters/cms-gnn-dense-onecycle.yaml | 11 ++++++++++ parameters/cms-gnn-dense-transfer.yaml | 11 ++++++++++ parameters/cms-gnn-dense.yaml | 11 ++++++++++ parameters/cms-gnn-skipconn-v2.yaml | 11 ++++++++++ parameters/cms-gnn-skipconn.yaml | 11 ++++++++++ parameters/cms-transformer-skipconn-gun.yaml | 11 ++++++++++ parameters/cms-transformer-skipconn.yaml | 11 ++++++++++ parameters/delphes-gnn-skipconn-onecycle.yaml | 11 ++++++++++ parameters/delphes-gnn-skipconn.yaml | 11 ++++++++++ parameters/delphes-transformer-skipconn.yaml | 11 ++++++++++ parameters/test-cms-v2.yaml | 13 +++++++++++- parameters/test-cms.yaml | 13 +++++++++++- parameters/test-delphes.yaml | 13 +++++++++++- 19 files changed, 197 insertions(+), 22 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index e3bb389be..e967a7186 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -30,6 +30,7 @@ from tfmodel.utils import ( get_lr_schedule, + get_optimizer, create_experiment_dir, get_strategy, make_weight_function, @@ -47,7 +48,6 @@ delete_all_but_best_checkpoint, ) -from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler from tfmodel.lr_finder import LRFinder from tfmodel.callbacks import CustomTensorBoard from tfmodel import hypertuning @@ -90,11 +90,10 @@ def train(config, weights, ntrain, ntest, recreate, prefix): if maybe_global_batch_size is not None: global_batch_size = maybe_global_batch_size total_steps = n_epochs * n_train // global_batch_size - lr = float(config["setup"]["lr"]) with strategy.scope(): - lr_schedule, optim_callbacks = get_lr_schedule(config, lr=lr, steps=total_steps) - opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + lr_schedule, optim_callbacks = get_lr_schedule(config, steps=total_steps) + opt = get_optimizer(config, lr_schedule) if config["setup"]["dtype"] == "float16": model_dtype = tf.dtypes.float16 diff --git a/mlpf/tfmodel/hypertuning.py b/mlpf/tfmodel/hypertuning.py index 6221ad749..1a13f0844 100644 --- a/mlpf/tfmodel/hypertuning.py +++ b/mlpf/tfmodel/hypertuning.py @@ -17,16 +17,6 @@ def get_model_builder(config): def model_builder(hp): - # config["parameters"]["hidden_dim"] = hp.Choice("hidden_dim", values=[128, 256]) - # config["parameters"]["distance_dim"] = hp.Choice("distance_dim", values=[64, 128, 256]) - # config["parameters"]["num_conv"] = hp.Choice("num_conv", [2, 3, 4]) - # config["parameters"]["num_gsl"] = hp.Choice("num_gsl", [2, 3, 4, 5]) - # config["parameters"]["dropout"] = hp.Choice("dropout", values=[0.0, 0.1, 0.2, 0.3]) - # config["parameters"]["bin_size"] = hp.Choice("bin_size", values=[200, 640, 800]) - - # config["setup"]["lr"] = hp.Choice("lr", values=[5e-4, 1e-4, 5e-5, 1e-5]) - - config["parameters"]["hidden_dim"] = hp.Choice("hidden_dim", values=[256]) config["parameters"]["distance_dim"] = hp.Choice("distance_dim", values=[128]) config["parameters"]["num_conv"] = hp.Choice("num_conv", [2, 3]) @@ -35,6 +25,9 @@ def model_builder(hp): config["parameters"]["bin_size"] = hp.Choice("bin_size", values=[640]) config["setup"]["lr"] = hp.Choice("lr", values=[1e-4]) + config["setup"]["batch_size"] = hp.Choice("batch_size", values=[32]) + config["setup"]["optimizer"] = hp.Choice("optimizer", values=["adam"]) + model = make_model(config, dtype="float32") 
model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index e2a7a4af4..374f1c804 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -25,7 +25,7 @@ from pathlib import Path from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler from tfmodel.callbacks import CustomTensorBoard -from tfmodel.utils import get_lr_schedule, make_weight_function, targets_multi_output +from tfmodel.utils import get_lr_schedule, get_optimizer, make_weight_function, targets_multi_output from tensorflow.keras.metrics import Recall, CategoricalAccuracy @@ -620,11 +620,10 @@ def main(args, yaml_path, config): ygen_val = np.concatenate(ygens) ycand_val = np.concatenate(ycands) - lr = float(config['setup']['lr']) with strategy.scope(): total_steps = n_epochs * n_train // global_batch_size - lr_schedule, optim_callbacks = get_lr_schedule(config, lr, steps=total_steps) - opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + lr_schedule, optim_callbacks = get_lr_schedule(config, steps=total_steps) + opt = get_optimizer(config, lr_schedule) if config['setup']['dtype'] == 'float16': model_dtype = tf.dtypes.float16 from tensorflow.keras import mixed_precision diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 909e8c2f3..2686ab051 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -102,7 +102,8 @@ def get_strategy(global_batch_size): return strategy, global_batch_size -def get_lr_schedule(config, lr, steps): +def get_lr_schedule(config, steps): + lr = float(config["setup"]["lr"]) callbacks = [] schedule = config["setup"]["lr_schedule"] if schedule == "onecycle": @@ -136,6 +137,24 @@ def get_lr_schedule(config, lr, steps): return lr_schedule, callbacks +def get_optimizer(config, lr_schedule=None): + if lr_schedule is None: + lr = float(config["setup"]["lr"]) + else: + lr = lr_schedule + if config["setup"]["optimizer"] == "adam": + cfg_adam = config["optimizer"]["adam"] + return tf.keras.optimizers.Adam(learning_rate=lr, amsgrad=cfg_adam["amsgrad"]) + if config["setup"]["optimizer"] == "adamw": + cfg_adamw = config["optimizer"]["adamw"] + return tfa.optimizers.AdamW(learning_rate=lr, weight_decay=cfg_adamw["weight_decay"], amsgrad=cfg_adamw["amsgrad"]) + elif config["setup"]["optimizer"] == "sgd": + cfg_sgd = config["optimizer"]["sgd"] + return tf.keras.optimizers.SGD(learning_rate=lr, momentum=cfg_sgd["momentum"], nesterov=cfg_sgd["nesterov"]) + else: + raise ValueError("Only 'adam' and 'sgd' are supported optimizers, got {}".format(config["setup"]["optimizer"])) + + def compute_weights_invsqrt(X, y, w): wn = tf.cast(tf.shape(w)[-1], tf.float32) / tf.sqrt(w) wn *= tf.cast(X[:, 0] != 0, tf.float32) diff --git a/parameters/cms-gnn-dense-big.yaml b/parameters/cms-gnn-dense-big.yaml index aa3de4a6f..cd2e4d61a 100644 --- a/parameters/cms-gnn-dense-big.yaml +++ b/parameters/cms-gnn-dense-big.yaml @@ -52,6 +52,17 @@ setup: trainable: all classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 parameters: model: gnn_dense diff --git a/parameters/cms-gnn-dense-focal.yaml b/parameters/cms-gnn-dense-focal.yaml index 5db4d2177..26eb833b2 100644 --- a/parameters/cms-gnn-dense-focal.yaml +++ b/parameters/cms-gnn-dense-focal.yaml 
@@ -55,6 +55,17 @@ setup: focal_loss_gamma: 3.0 focal_loss_from_logits: False lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 sample_weights: cls: none diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index ce6fcc2fb..432e2e368 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -54,6 +54,17 @@ setup: trainable: all classification_loss_type: categorical_cross_entropy # categorical_cross_entropy, sigmoid_focal_crossentropy lr_schedule: onecycle # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 sample_weights: cls: inverse_sqrt diff --git a/parameters/cms-gnn-dense-transfer.yaml b/parameters/cms-gnn-dense-transfer.yaml index 8b735f859..8baaa47e3 100644 --- a/parameters/cms-gnn-dense-transfer.yaml +++ b/parameters/cms-gnn-dense-transfer.yaml @@ -52,6 +52,17 @@ setup: trainable: transfer classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 parameters: model: gnn_dense diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index d74c0d530..aa2ecbadf 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -52,6 +52,17 @@ setup: trainable: classification classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 sample_weights: cls: inverse_sqrt diff --git a/parameters/cms-gnn-skipconn-v2.yaml b/parameters/cms-gnn-skipconn-v2.yaml index e69919342..0e1341c35 100644 --- a/parameters/cms-gnn-skipconn-v2.yaml +++ b/parameters/cms-gnn-skipconn-v2.yaml @@ -52,6 +52,17 @@ setup: trainable: all classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 parameters: model: gnn diff --git a/parameters/cms-gnn-skipconn.yaml b/parameters/cms-gnn-skipconn.yaml index b1d2e50f0..721de3356 100644 --- a/parameters/cms-gnn-skipconn.yaml +++ b/parameters/cms-gnn-skipconn.yaml @@ -52,6 +52,17 @@ setup: trainable: all classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 parameters: model: gnn diff --git a/parameters/cms-transformer-skipconn-gun.yaml b/parameters/cms-transformer-skipconn-gun.yaml index f1fdd39e9..4c796ea46 100644 --- a/parameters/cms-transformer-skipconn-gun.yaml +++ b/parameters/cms-transformer-skipconn-gun.yaml @@ -53,6 +53,17 @@ setup: trainable: all multi_output: yes lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + 
weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 parameters: model: transformer diff --git a/parameters/cms-transformer-skipconn.yaml b/parameters/cms-transformer-skipconn.yaml index 0cb6eeb31..a7af6b7c0 100644 --- a/parameters/cms-transformer-skipconn.yaml +++ b/parameters/cms-transformer-skipconn.yaml @@ -51,6 +51,17 @@ setup: trainable: cls multi_output: yes lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 parameters: model: transformer diff --git a/parameters/delphes-gnn-skipconn-onecycle.yaml b/parameters/delphes-gnn-skipconn-onecycle.yaml index 16259d6b6..ab4df41d9 100644 --- a/parameters/delphes-gnn-skipconn-onecycle.yaml +++ b/parameters/delphes-gnn-skipconn-onecycle.yaml @@ -47,6 +47,17 @@ setup: multi_output: yes classification_loss_type: categorical_cross_entropy lr_schedule: onecycle # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 sample_weights: cls: none diff --git a/parameters/delphes-gnn-skipconn.yaml b/parameters/delphes-gnn-skipconn.yaml index 0f83160a2..83c83c415 100644 --- a/parameters/delphes-gnn-skipconn.yaml +++ b/parameters/delphes-gnn-skipconn.yaml @@ -41,6 +41,17 @@ setup: multi_output: no classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 sample_weights: cls: none diff --git a/parameters/delphes-transformer-skipconn.yaml b/parameters/delphes-transformer-skipconn.yaml index 9874e5289..e59f801a1 100644 --- a/parameters/delphes-transformer-skipconn.yaml +++ b/parameters/delphes-transformer-skipconn.yaml @@ -39,6 +39,17 @@ setup: trainable: all multi_output: no lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 sample_weights: cls: none diff --git a/parameters/test-cms-v2.yaml b/parameters/test-cms-v2.yaml index 3b14e661a..10ad0e44e 100644 --- a/parameters/test-cms-v2.yaml +++ b/parameters/test-cms-v2.yaml @@ -38,7 +38,18 @@ setup: dtype: float32 trainable: all classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle + lr_schedule: exponentialdecay + optimizer: adam + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 sample_weights: cls: none diff --git a/parameters/test-cms.yaml b/parameters/test-cms.yaml index a6e4f1967..4de5275e7 100644 --- a/parameters/test-cms.yaml +++ b/parameters/test-cms.yaml @@ -38,7 +38,18 @@ setup: dtype: float32 trainable: all classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle + lr_schedule: exponentialdecay + optimizer: adam + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 sample_weights: cls: none diff --git a/parameters/test-delphes.yaml b/parameters/test-delphes.yaml index 87c7208fe..caccb02fc 100644 --- a/parameters/test-delphes.yaml +++ b/parameters/test-delphes.yaml @@ -37,7 +37,18 @@ setup: 
dtype: float32 trainable: all classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle + lr_schedule: exponentialdecay + optimizer: adam + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 sample_weights: cls: none From 532228d167d15692e9184db84311ae13bbe6d7bd Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Mon, 26 Jul 2021 17:14:23 +0200 Subject: [PATCH 015/157] feat: Hypertune parameters in config, LR scheduling during hypertuning --- mlpf/pipeline.py | 20 +++++++++++++------- mlpf/tfmodel/hypertuning.py | 14 ++++++++------ parameters/cms-gnn-dense-onecycle.yaml | 10 +++++++++- parameters/cms-gnn-dense.yaml | 14 +++++++++++--- 4 files changed, 41 insertions(+), 17 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index e967a7186..b7b9ddc97 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -320,13 +320,18 @@ def delete_all_but_best_ckpt(train_dir, dry_run): def hypertune(config, outdir, ntrain, ntest, recreate): config, _, global_batch_size, n_train, n_test, n_epochs, _ = parse_config(config, ntrain, ntest) + # Override number of epochs with value from Hyperband config + cfg_hb = config["hypertune"]["hyperband"] + n_epochs = cfg_hb["max_epochs"] + ds_train_r, ds_test_r, _ = get_train_val_datasets(config, global_batch_size, n_train, n_test) strategy, maybe_global_batch_size = get_strategy(global_batch_size) if maybe_global_batch_size is not None: global_batch_size = maybe_global_batch_size + total_steps = n_epochs * n_train // global_batch_size - model_builder = hypertuning.get_model_builder(config) + model_builder, optim_callbacks = hypertuning.get_model_builder(config, total_steps) tb = CustomTensorBoard( log_dir=outdir + "/tensorboard_logs", histogram_freq=1, write_graph=False, write_images=False, @@ -337,24 +342,25 @@ def hypertune(config, outdir, ntrain, ntest, recreate): tuner = kt.Hyperband( model_builder, - objective="val_loss", - max_epochs=n_epochs, - factor=3, - hyperband_iterations=3, + objective=cfg_hb["objective"], + max_epochs=cfg_hb["max_epochs"], + factor=cfg_hb["factor"], + hyperband_iterations=cfg_hb["iterations"], directory=outdir + "/tb", project_name="mlpf", overwrite=recreate, - executions_per_trial=1, + executions_per_trial=cfg_hb["executions_per_trial"], distribution_strategy=strategy, ) tuner.search( ds_train_r, + epochs=n_epochs, validation_data=ds_test_r, steps_per_epoch=n_train // global_batch_size, validation_steps=n_test // global_batch_size, #callbacks=[tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss')] - callbacks=[tb], + callbacks=[tb] + optim_callbacks, ) tuner.results_summary() diff --git a/mlpf/tfmodel/hypertuning.py b/mlpf/tfmodel/hypertuning.py index 1a13f0844..e57512fc3 100644 --- a/mlpf/tfmodel/hypertuning.py +++ b/mlpf/tfmodel/hypertuning.py @@ -7,6 +7,7 @@ from tfmodel.utils import ( get_lr_schedule, + get_optimizer, load_config, set_config_loss, get_loss_dict, @@ -14,8 +15,8 @@ ) -def get_model_builder(config): - +def get_model_builder(config, total_steps): + _, optim_callbacks = get_lr_schedule(config, steps=total_steps) def model_builder(hp): config["parameters"]["hidden_dim"] = hp.Choice("hidden_dim", values=[256]) config["parameters"]["distance_dim"] = hp.Choice("distance_dim", values=[128]) @@ -24,15 +25,16 @@ def model_builder(hp): config["parameters"]["dropout"] = hp.Choice("dropout", values=[0.2]) config["parameters"]["bin_size"] = hp.Choice("bin_size", values=[640]) - 
config["setup"]["lr"] = hp.Choice("lr", values=[1e-4]) - config["setup"]["batch_size"] = hp.Choice("batch_size", values=[32]) + config["setup"]["lr"] = hp.Choice("lr", values=[1e-4, 3e-4]) + config["setup"]["lr_schedule"] = hp.Choice("lr_schedule", values=["exponentialdecay"]) config["setup"]["optimizer"] = hp.Choice("optimizer", values=["adam"]) model = make_model(config, dtype="float32") model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) - opt = tf.keras.optimizers.Adam(learning_rate=config["setup"]["lr"]) + lr_schedule, _ = get_lr_schedule(config, steps=total_steps) + opt = get_optimizer(config, lr_schedule) loss_dict, loss_weights = get_loss_dict(config) model.compile( @@ -49,4 +51,4 @@ def model_builder(hp): ) return model - return model_builder + return model_builder, optim_callbacks diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index 432e2e368..6b2493296 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -105,4 +105,12 @@ onecycle: mom_max: 0.95 warmup_ratio: 0.3 div_factor: 25.0 - final_div: 100000.0 \ No newline at end of file + final_div: 100000.0 + +hypertune: + hyperband: + objective: "val_loss" + max_epochs: 500 + factor: 2 + iterations: 1 + executions_per_trial: 1 diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index aa2ecbadf..3093dd87b 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -41,15 +41,15 @@ tensorflow: setup: train: yes weights: - weights_config: classification - lr: 2e-5 + weights_config: + lr: 1e-4 batch_size: 5 num_events_train: 80000 num_events_test: 10000 num_epochs: 500 num_val_files: 100 dtype: float32 - trainable: classification + trainable: all classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle optimizer: adam # adam, adamw, sgd @@ -97,3 +97,11 @@ exponentialdecay: decay_steps: 10000 decay_rate: 0.99 staircase: yes + +hypertune: + hyperband: + objective: "val_loss" + max_epochs: 500 + factor: 2 + iterations: 1 + executions_per_trial: 1 From ee44badcdc024554911becb3c0e36173e8cf173f Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Mon, 26 Jul 2021 20:48:35 +0200 Subject: [PATCH 016/157] feat: Add Random Search and Bayesian Optimization to hypertune --- mlpf/pipeline.py | 21 ++++++--------------- mlpf/tfmodel/utils.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index b7b9ddc97..956758b52 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -46,6 +46,7 @@ parse_config, get_best_checkpoint, delete_all_but_best_checkpoint, + get_tuner, ) from tfmodel.lr_finder import LRFinder @@ -320,9 +321,9 @@ def delete_all_but_best_ckpt(train_dir, dry_run): def hypertune(config, outdir, ntrain, ntest, recreate): config, _, global_batch_size, n_train, n_test, n_epochs, _ = parse_config(config, ntrain, ntest) - # Override number of epochs with value from Hyperband config - cfg_hb = config["hypertune"]["hyperband"] - n_epochs = cfg_hb["max_epochs"] + # Override number of epochs with max_epochs from Hyperband config if specified + if config["hypertune"]["algorithm"] == "hyperband": + n_epochs = config["hypertune"]["hyperband"]["max_epochs"] ds_train_r, ds_test_r, _ = get_train_val_datasets(config, global_batch_size, n_train, n_test) @@ -340,18 +341,7 @@ def hypertune(config, outdir, ntrain, ntest, recreate): # 
Change the class name of CustomTensorBoard TensorBoard to make keras_tuner recognise it tb.__class__.__name__ = "TensorBoard" - tuner = kt.Hyperband( - model_builder, - objective=cfg_hb["objective"], - max_epochs=cfg_hb["max_epochs"], - factor=cfg_hb["factor"], - hyperband_iterations=cfg_hb["iterations"], - directory=outdir + "/tb", - project_name="mlpf", - overwrite=recreate, - executions_per_trial=cfg_hb["executions_per_trial"], - distribution_strategy=strategy, - ) + tuner = get_tuner(config["hypertune"], model_builder, outdir, recreate, strategy) tuner.search( ds_train_r, @@ -362,6 +352,7 @@ def hypertune(config, outdir, ntrain, ntest, recreate): #callbacks=[tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss')] callbacks=[tb] + optim_callbacks, ) + print("Hyperparamter search complete.") tuner.results_summary() for trial in tuner.oracle.get_best_trials(num_trials=10): diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 2686ab051..a9a3161d9 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -11,6 +11,7 @@ import tensorflow as tf import tensorflow_addons as tfa +import keras_tuner as kt from tfmodel.data import Dataset from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler @@ -154,6 +155,44 @@ def get_optimizer(config, lr_schedule=None): else: raise ValueError("Only 'adam' and 'sgd' are supported optimizers, got {}".format(config["setup"]["optimizer"])) +def get_tuner(cfg_hypertune, model_builder, outdir, recreate, strategy): + if cfg_hypertune["algorithm"] == "random": + print("Keras Tuner: Using RandomSearch") + cfg_rand = cfg_hypertune["random"] + return kt.RandomSearch( + model_builder, + objective=cfg_rand["objective"], + max_trials=cfg_rand["max_trials"], + project_name="mlpf", + overwrite=recreate, + ) + elif cfg_hypertune["algorithm"] == "bayesian": + print("Keras Tuner: Using BayesianOptimization") + cfg_bayes = cfg_hypertune["bayesian"] + return kt.BayesianOptimization( + model_builder, + objective=cfg_bayes["objective"], + max_trials=cfg_bayes["max_trials"], + num_initial_points=cfg_bayes["num_initial_points"], + project_name="mlpf", + overwrite=recreate, + ) + elif cfg_hypertune["algorithm"] == "hyperband": + print("Keras Tuner: Using Hyperband") + cfg_hb = cfg_hypertune["hyperband"] + return kt.Hyperband( + model_builder, + objective=cfg_hb["objective"], + max_epochs=cfg_hb["max_epochs"], + factor=cfg_hb["factor"], + hyperband_iterations=cfg_hb["iterations"], + directory=outdir + "/tb", + project_name="mlpf", + overwrite=recreate, + executions_per_trial=cfg_hb["executions_per_trial"], + distribution_strategy=strategy, + ) + def compute_weights_invsqrt(X, y, w): wn = tf.cast(tf.shape(w)[-1], tf.float32) / tf.sqrt(w) From 81c888da38a102f4065e44c6f86f9701d3f332e4 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Mon, 26 Jul 2021 21:17:27 +0200 Subject: [PATCH 017/157] chore: add hypertune settings to config files --- parameters/cms-gnn-dense-onecycle.yaml | 10 +++++++++- parameters/cms-gnn-dense.yaml | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index 6b2493296..3313d875c 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -108,8 +108,16 @@ onecycle: final_div: 100000.0 hypertune: + algorithm: hyperband # random, bayesian, hyperband + random: + objective: val_loss + max_trials: 1000 + bayesian: + objective: val_loss + max_trials: 1000 + 
num_initial_points: 2 hyperband: - objective: "val_loss" + objective: val_loss max_epochs: 500 factor: 2 iterations: 1 diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index 3093dd87b..ae5f6768f 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -99,8 +99,16 @@ exponentialdecay: staircase: yes hypertune: + algorithm: hyperband # random, bayesian, hyperband + random: + objective: val_loss + max_trials: 1000 + bayesian: + objective: val_loss + max_trials: 1000 + num_initial_points: 2 hyperband: - objective: "val_loss" + objective: val_loss max_epochs: 500 factor: 2 iterations: 1 From 18845f418963f57759b9835804c38d5b5a6b8099 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 27 Jul 2021 12:24:38 +0200 Subject: [PATCH 018/157] removed numba and removed hooks.py test file --- mlpf/pytorch_delphes/LRP/hooks.py | 106 -------------------- mlpf/pytorch_delphes/LRP/main_clf.py | 2 +- mlpf/pytorch_delphes/LRP/main_dnn.py | 2 +- mlpf/pytorch_delphes/LRP/main_reg.py | 2 +- mlpf/pytorch_delphes/graph_data_delphes.py | 3 - mlpf/pytorch_delphes/training.py | 2 +- mlpf/pytorch_delphes/training_dnn.py | 2 +- mlpf/pytorch_delphes/training_embeddings.py | 2 +- 8 files changed, 6 insertions(+), 115 deletions(-) delete mode 100644 mlpf/pytorch_delphes/LRP/hooks.py diff --git a/mlpf/pytorch_delphes/LRP/hooks.py b/mlpf/pytorch_delphes/LRP/hooks.py deleted file mode 100644 index 6fc7fa279..000000000 --- a/mlpf/pytorch_delphes/LRP/hooks.py +++ /dev/null @@ -1,106 +0,0 @@ -from glob import glob -import sys, os -import os.path as osp -import pickle, math, time, numba, tqdm -import numpy as np -import pandas as pd -import sklearn -from sklearn.metrics import accuracy_score, confusion_matrix -import matplotlib, mplhep -matplotlib.use("Agg") -import matplotlib.pyplot as plt - -#Check if the GPU configuration has been provided -import torch -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -try: - if not ("CUDA_VISIBLE_DEVICES" in os.environ): - import setGPU - if multi_gpu: - print('Will use multi_gpu..') - print("Let's use", torch.cuda.device_count(), "GPUs!") - else: - print('Will use single_gpu..') -except Exception as e: - print("Could not import setGPU, running CPU-only") - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - -import torch_geometric -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch_geometric.nn import GravNetConv -from torch.utils.data import random_split -import torch_cluster - -sys.path.insert(1, '../') -sys.path.insert(1, '../../../plotting/') -sys.path.insert(1, '../../../mlpf/plotting/') -import args -from args import parse_args -from graph_data_delphes import PFGraphDataset, one_hot_embedding -from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd -import evaluate -from evaluate import make_plots, Evaluate -from plot_utils import plot_confusion_matrix -from model_LRP import PFNet7 - -from LRP import LRP -from model_io import model_io -import torch -import torch.nn as nn - -activation={} -def get_activation(name): - def hook(model, input, output): - activation[name] = input[0] - return hook - -class myNet(nn.Module): - def __init__(self): - 
super().__init__() - self.conv = nn.Conv2d(3,10,2, stride = 2) - self.relu = nn.ReLU() - self.flatten = lambda x: x.view(-1) - self.fc1 = nn.Linear(160,5) - - - - def forward(self, x): - x = self.relu(self.conv(x)) - x.register_hook(lambda grad : torch.clamp(grad, min = 0)) #No gradient shall be backpropagated - #conv outside less than 0 - - # print whether there is any negative grad - s=x.register_hook(lambda grad: torch.zeros(grad.shape)) - return self.fc1(self.flatten(x)) - - -net = myNet() -print(net) - -for name, param in net.named_parameters(): - # if the param is from a linear and is a bias - if "fc" in name and "bias" in name: - param.register_hook(lambda grad: torch.zeros(grad.shape)) - - -out = net(torch.randn(1,3,8,8)) - -(1 - out).mean().backward() - -print("The biases are", net.fc1.bias.grad) #bias grads are zero - - -print(s) diff --git a/mlpf/pytorch_delphes/LRP/main_clf.py b/mlpf/pytorch_delphes/LRP/main_clf.py index 5644e5c15..e22d316cd 100644 --- a/mlpf/pytorch_delphes/LRP/main_clf.py +++ b/mlpf/pytorch_delphes/LRP/main_clf.py @@ -3,7 +3,7 @@ import os.path as osp import pickle as pkl import _pickle as cPickle -import math, time, numba, tqdm +import math, time, tqdm import numpy as np import pandas as pd import sklearn diff --git a/mlpf/pytorch_delphes/LRP/main_dnn.py b/mlpf/pytorch_delphes/LRP/main_dnn.py index afabace0f..fee51e022 100644 --- a/mlpf/pytorch_delphes/LRP/main_dnn.py +++ b/mlpf/pytorch_delphes/LRP/main_dnn.py @@ -3,7 +3,7 @@ import os.path as osp import pickle as pkl import _pickle as cPickle -import math, time, numba, tqdm +import math, time, tqdm import numpy as np import pandas as pd import sklearn diff --git a/mlpf/pytorch_delphes/LRP/main_reg.py b/mlpf/pytorch_delphes/LRP/main_reg.py index d37aed3a7..b1816ca39 100644 --- a/mlpf/pytorch_delphes/LRP/main_reg.py +++ b/mlpf/pytorch_delphes/LRP/main_reg.py @@ -3,7 +3,7 @@ import os.path as osp import pickle as pkl import _pickle as cPickle -import math, time, numba, tqdm +import math, time, tqdm import numpy as np import pandas as pd import sklearn diff --git a/mlpf/pytorch_delphes/graph_data_delphes.py b/mlpf/pytorch_delphes/graph_data_delphes.py index 68dd5ced5..f47d54dbc 100644 --- a/mlpf/pytorch_delphes/graph_data_delphes.py +++ b/mlpf/pytorch_delphes/graph_data_delphes.py @@ -5,10 +5,7 @@ import torch_geometric import torch_geometric.utils from torch_geometric.data import Dataset, Data, Batch -import itertools from glob import glob -import numba -from numpy.lib.recfunctions import append_fields import pickle import scipy diff --git a/mlpf/pytorch_delphes/training.py b/mlpf/pytorch_delphes/training.py index b0eb9dcc2..e7d81b78b 100644 --- a/mlpf/pytorch_delphes/training.py +++ b/mlpf/pytorch_delphes/training.py @@ -5,7 +5,7 @@ import os.path as osp import pickle as pkl -import math, time, numba, tqdm +import math, time, tqdm import numpy as np import pandas as pd import sklearn diff --git a/mlpf/pytorch_delphes/training_dnn.py b/mlpf/pytorch_delphes/training_dnn.py index 16315eb75..7d5316bb3 100644 --- a/mlpf/pytorch_delphes/training_dnn.py +++ b/mlpf/pytorch_delphes/training_dnn.py @@ -5,7 +5,7 @@ import os.path as osp import pickle as pkl -import math, time, numba, tqdm +import math, time, tqdm import numpy as np import pandas as pd import sklearn diff --git a/mlpf/pytorch_delphes/training_embeddings.py b/mlpf/pytorch_delphes/training_embeddings.py index 7d95863b8..353473786 100644 --- a/mlpf/pytorch_delphes/training_embeddings.py +++ b/mlpf/pytorch_delphes/training_embeddings.py @@ -5,7 +5,7 @@ 
import os.path as osp import pickle as pkl -import math, time, numba, tqdm +import math, time, tqdm import numpy as np import pandas as pd import sklearn From 7cef92f89f1ffd8a87cd80f71604407370c3c6c1 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 27 Jul 2021 12:36:07 +0200 Subject: [PATCH 019/157] better device definitions --- mlpf/pytorch_delphes/DDP_tutorial.py | 166 --------------------- mlpf/pytorch_delphes/data_preprocessing.py | 42 ------ mlpf/pytorch_delphes/evaluate.py | 6 - mlpf/pytorch_delphes/model.py | 26 ---- mlpf/pytorch_delphes/model_embeddings.py | 9 -- 5 files changed, 249 deletions(-) delete mode 100644 mlpf/pytorch_delphes/DDP_tutorial.py diff --git a/mlpf/pytorch_delphes/DDP_tutorial.py b/mlpf/pytorch_delphes/DDP_tutorial.py deleted file mode 100644 index fe96d296b..000000000 --- a/mlpf/pytorch_delphes/DDP_tutorial.py +++ /dev/null @@ -1,166 +0,0 @@ -import os -import sys -import tempfile -import torch -import torch.distributed as dist -import torch.nn as nn -import torch.optim as optim -import torch.multiprocessing as mp - -from torch.nn.parallel import DistributedDataParallel as DDP - -# On Windows platform, the torch.distributed package only -# supports Gloo backend, FileStore and TcpStore. -# For FileStore, set init_method parameter in init_process_group -# to a local file. Example as follow: -# init_method="file:///f:/libtmp/some_file" -# dist.init_process_group( -# "gloo", -# rank=rank, -# init_method=init_method, -# world_size=world_size) -# For TcpStore, same way as on Linux. - -def setup(rank, world_size): - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '12355' - - # initialize the process group - dist.init_process_group("gloo", rank=rank, world_size=world_size) - -def cleanup(): - dist.destroy_process_group() - - -class ToyModel(nn.Module): - def __init__(self): - super(ToyModel, self).__init__() - self.net1 = nn.Linear(10, 10) - self.relu = nn.ReLU() - self.net2 = nn.Linear(10, 5) - - def forward(self, x): - return self.net2(self.relu(self.net1(x))) - - -def demo_basic(rank, world_size): - print(f"Running basic DDP example on rank {rank}.") - setup(rank, world_size) - - # create model and move it to GPU with id rank - model = ToyModel().to(rank) - ddp_model = DDP(model, device_ids=[rank]) - - loss_fn = nn.MSELoss() - optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) - - optimizer.zero_grad() - outputs = ddp_model(torch.randn(20, 10)) - labels = torch.randn(20, 5).to(rank) - loss_fn(outputs, labels).backward() - optimizer.step() - - cleanup() - - -def run_demo(demo_fn, world_size): - mp.spawn(demo_fn, - args=(world_size,), - nprocs=world_size, - join=True) - - - -def demo_checkpoint(rank, world_size): - print(f"Running DDP checkpoint example on rank {rank}.") - setup(rank, world_size) - - model = ToyModel().to(rank) - ddp_model = DDP(model, device_ids=[rank]) - - loss_fn = nn.MSELoss() - optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) - - CHECKPOINT_PATH = tempfile.gettempdir() + "/model.checkpoint" - if rank == 0: - # All processes should see same parameters as they all start from same - # random parameters and gradients are synchronized in backward passes. - # Therefore, saving it in one process is sufficient. - torch.save(ddp_model.state_dict(), CHECKPOINT_PATH) - - # Use a barrier() to make sure that process 1 loads the model after process - # 0 saves it. 
- dist.barrier() - # configure map_location properly - map_location = {'cuda:%d' % 0: 'cuda:%d' % rank} - ddp_model.load_state_dict( - torch.load(CHECKPOINT_PATH, map_location=map_location)) - - optimizer.zero_grad() - outputs = ddp_model(torch.randn(20, 10)) - labels = torch.randn(20, 5).to(rank) - loss_fn = nn.MSELoss() - loss_fn(outputs, labels).backward() - optimizer.step() - - # Not necessary to use a dist.barrier() to guard the file deletion below - # as the AllReduce ops in the backward pass of DDP already served as - # a synchronization. - - if rank == 0: - os.remove(CHECKPOINT_PATH) - - cleanup() - - - -class ToyMpModel(nn.Module): - def __init__(self, dev0, dev1): - super(ToyMpModel, self).__init__() - self.dev0 = dev0 - self.dev1 = dev1 - self.net1 = torch.nn.Linear(10, 10).to(dev0) - self.relu = torch.nn.ReLU() - self.net2 = torch.nn.Linear(10, 5).to(dev1) - - def forward(self, x): - x = x.to(self.dev0) - x = self.relu(self.net1(x)) - x = x.to(self.dev1) - return self.net2(x) - - - -def demo_model_parallel(rank, world_size): - print(f"Running DDP with model parallel example on rank {rank}.") - setup(rank, world_size) - - # setup mp_model and devices for this process - dev0 = rank * 2 - dev1 = rank * 2 + 1 - mp_model = ToyMpModel(dev0, dev1) - ddp_mp_model = DDP(mp_model) - - loss_fn = nn.MSELoss() - optimizer = optim.SGD(ddp_mp_model.parameters(), lr=0.001) - - optimizer.zero_grad() - # outputs will be on dev1 - outputs = ddp_mp_model(torch.randn(20, 10)) - labels = torch.randn(20, 5).to(dev1) - loss_fn(outputs, labels).backward() - optimizer.step() - - cleanup() - - -if __name__ == "__main__": - n_gpus = torch.cuda.device_count() - if n_gpus < 2: - print(f"Requires at least 2 GPUs to run, but got {n_gpus}.") - else: - # run_demo(demo_basic, 2) - # run_demo(demo_checkpoint, 2) - # run_demo(demo_model_parallel, 1) - - demo_basic((2,), 2) diff --git a/mlpf/pytorch_delphes/data_preprocessing.py b/mlpf/pytorch_delphes/data_preprocessing.py index 889ae7285..f504620d3 100644 --- a/mlpf/pytorch_delphes/data_preprocessing.py +++ b/mlpf/pytorch_delphes/data_preprocessing.py @@ -2,15 +2,6 @@ import torch from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - # if not multigpu we have to pass batches that are stacked as "batch.type() = Batch" (not list) so that pytorch can access attributes like ygen_id through batch.ygen_id # if multigpu we have to pass list of "Data" elements.. 
then behind the scene, pytorch DP will convert the list to appropriate Batches to fit on the gpus available so that batch.ygen_id works out of the box @@ -59,36 +50,3 @@ def data_to_loader_qcd(full_dataset, n_test, batch_size): return test_loader #---------------------------------------------------------------------------------------- -# from graph_data_delphes import PFGraphDataset, one_hot_embedding -# # the next part initializes some args values (to run the script not from terminal) -# class objectview(object): -# def __init__(self, d): -# self.__dict__ = d -# -# args = objectview({'train': True, 'n_train': 1, 'n_valid': 1, 'n_test': 2, 'n_epochs': 1, 'patience': 100, 'hidden_dim':32, 'encoding_dim': 256, -# 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../test_tmp_delphes/data/pythia8_qcd', -# 'outpath': '../../test_tmp_delphes/experiments/', 'activation': 'leaky_relu', 'optimizer': 'adam', 'lr': 1e-4, 'l1': 1, 'l2': 0.001, 'l3': 1, 'dropout': 0.5, -# 'radius': 0.1, 'convlayer': 'gravnet-knn', 'convlayer2': 'none', 'space_dim': 2, 'nearest': 3, 'overwrite': True, -# 'input_encoding': 0, 'load': False, 'load_epoch': 0, 'load_model': 'PFNet7_cand_ntrain_3_nepochs_1', 'evaluate': True, 'evaluate_on_cpu': True}) -# -# full_dataset = PFGraphDataset(args.dataset) -# full_dataset_qcd = PFGraphDataset(args.dataset_qcd) -# -# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, args.n_train, args.n_valid, batch_size=args.batch_size) -# test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) -# -# for batch in train_loader: -# break -# -# batch -# len(train_loader) -# -# -# # if multigpu: a "Batch" of size 3 is given by: [Data(x=[5k, 12], ycand=[5k, 6], ...) , Data(x=[5k, 12], ...), Data(x=[5k, 12], ...)] -# # then when we pass it to the model, DP takes care of converting it into batches like this (for 2 gpus): -# # Batch(batch=[2*5k], x=[2*5k, 12], ...) -# # Batch(batch=[5k], x=[5k, 12], ...) -# -# # if not multigpu: a "Batch" of size 2 is directly given by: Batch(batch=(2*5k), x=(2*5k,12), ...) 
-# # Note: batch is a column vector which maps each node to its respective graph in the batch: -# batch.batch diff --git a/mlpf/pytorch_delphes/evaluate.py b/mlpf/pytorch_delphes/evaluate.py index ba59c0335..2c6ced57c 100644 --- a/mlpf/pytorch_delphes/evaluate.py +++ b/mlpf/pytorch_delphes/evaluate.py @@ -34,12 +34,6 @@ use_gpu = torch.cuda.device_count()>0 multi_gpu = torch.cuda.device_count()>1 -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - from plot_utils import plot_confusion_matrix from plots import plot_regression, plot_distributions_pid, plot_distributions_all, plot_pt_eta, plot_num_particles_pid, draw_efficiency_fakerate, get_eff, get_fake, plot_reso diff --git a/mlpf/pytorch_delphes/model.py b/mlpf/pytorch_delphes/model.py index ac0cf251a..1bfb7aba8 100644 --- a/mlpf/pytorch_delphes/model.py +++ b/mlpf/pytorch_delphes/model.py @@ -19,16 +19,6 @@ from gravnet import GravNetConv from torch_geometric.nn import GraphConv -import torch -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - #Model with gravnet clustering class PFNet7(nn.Module): def __init__(self, @@ -111,19 +101,3 @@ def forward(self, data): return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand # # ------------------------------------------------------------------------------------- -# # uncomment to test a forward pass -# from graph_data_delphes import PFGraphDataset -# from data_preprocessing import data_to_loader_ttbar -# from data_preprocessing import data_to_loader_qcd -# -# full_dataset = PFGraphDataset('../../../test_tmp_delphes/data/pythia8_ttbar') -# -# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, n_train=2, n_valid=1, batch_size=2) -# -# model = PFNet7() -# model.to(device) -# -# for batch in train_loader: -# X = batch.to(device) -# pred_ids, pred_p4, gen_ids, gen_p4, cand_ids, cand_p4 = model(X) -# break diff --git a/mlpf/pytorch_delphes/model_embeddings.py b/mlpf/pytorch_delphes/model_embeddings.py index 89029e3b5..042016083 100644 --- a/mlpf/pytorch_delphes/model_embeddings.py +++ b/mlpf/pytorch_delphes/model_embeddings.py @@ -19,15 +19,6 @@ from gravnet import GravNetConv from torch_geometric.nn import GraphConv -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - #Model with gravnet clustering class PFNet7(nn.Module): def __init__(self, From 2bc4506329a50122bf8ffe16ba7c8783525a32a0 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 27 Jul 2021 12:36:38 +0200 Subject: [PATCH 020/157] make it clear the difference between gravnet.py and gravnet_LRP.py --- mlpf/pytorch_delphes/LRP/gravnet_LRP.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlpf/pytorch_delphes/LRP/gravnet_LRP.py b/mlpf/pytorch_delphes/LRP/gravnet_LRP.py index 5a09981fd..67da14e1b 100644 --- a/mlpf/pytorch_delphes/LRP/gravnet_LRP.py +++ b/mlpf/pytorch_delphes/LRP/gravnet_LRP.py @@ -14,9 +14,11 @@ # copied it from pytorch_geometric source code # ADDED: retrieve edge_index, retrieve edge_weight -# ADDED: retrieve before and after message MessagePassing # CHANGED: used reduce='sum' instead of reduce='mean' in the message passing # REMOVED: skip connection + +# ADDED: retrieve before and after message MessagePassing + class 
GravNetConv(MessagePassing): r"""The GravNet operator from the `"Learning Representations of Irregular Particle-detector Geometry with Distance-weighted Graph From c510ba330f87532c53b74f37ff575427f3378cbb Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 27 Jul 2021 12:37:08 +0200 Subject: [PATCH 021/157] fixed typo --- scripts/local_test_delphes_pytorch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/local_test_delphes_pytorch.sh b/scripts/local_test_delphes_pytorch.sh index 0233ad793..c0b7cd410 100755 --- a/scripts/local_test_delphes_pytorch.sh +++ b/scripts/local_test_delphes_pytorch.sh @@ -1,4 +1,4 @@ -# !/bin/bash +#!/bin/bash set -e rm -Rf test_tmp_delphes From 8ad7ee80a67954ffc7a923e9bc49a9b61efe5d46 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 27 Jul 2021 13:03:19 +0200 Subject: [PATCH 022/157] rename training, removed unnecessary files --- mlpf/pytorch_delphes/LRP/LRP_dnn.py | 168 ------ mlpf/pytorch_delphes/LRP/main_clf.py | 360 ------------ mlpf/pytorch_delphes/LRP/main_dnn.py | 227 -------- mlpf/pytorch_delphes/LRP/model_LRP_clf.py | 98 ---- mlpf/pytorch_delphes/LRP/model_LRP_dnn.py | 66 --- mlpf/pytorch_delphes/LRP/model_LRP_reg.py | 22 +- mlpf/pytorch_delphes/model.py | 2 - mlpf/pytorch_delphes/model_dnn.py | 126 ----- mlpf/pytorch_delphes/model_embeddings.py | 168 ------ .../{training.py => pipeline.py} | 0 mlpf/pytorch_delphes/training_dnn.py | 497 ----------------- mlpf/pytorch_delphes/training_embeddings.py | 512 ------------------ scripts/local_test_delphes_pytorch.sh | 4 +- 13 files changed, 3 insertions(+), 2247 deletions(-) delete mode 100644 mlpf/pytorch_delphes/LRP/LRP_dnn.py delete mode 100644 mlpf/pytorch_delphes/LRP/main_clf.py delete mode 100644 mlpf/pytorch_delphes/LRP/main_dnn.py delete mode 100644 mlpf/pytorch_delphes/LRP/model_LRP_clf.py delete mode 100644 mlpf/pytorch_delphes/LRP/model_LRP_dnn.py delete mode 100644 mlpf/pytorch_delphes/model_dnn.py delete mode 100644 mlpf/pytorch_delphes/model_embeddings.py rename mlpf/pytorch_delphes/{training.py => pipeline.py} (100%) delete mode 100644 mlpf/pytorch_delphes/training_dnn.py delete mode 100644 mlpf/pytorch_delphes/training_embeddings.py diff --git a/mlpf/pytorch_delphes/LRP/LRP_dnn.py b/mlpf/pytorch_delphes/LRP/LRP_dnn.py deleted file mode 100644 index 770d957ff..000000000 --- a/mlpf/pytorch_delphes/LRP/LRP_dnn.py +++ /dev/null @@ -1,168 +0,0 @@ -import torch -import torch.nn as nn -from torch.nn import Sequential as Seq,Linear,ReLU,BatchNorm1d -from torch_scatter import scatter_mean -import numpy as np -import json -import model_io -from torch_geometric.utils import to_scipy_sparse_matrix -import scipy -import pickle, math, time -import _pickle as cPickle - -from torch_geometric.data import Data -import networkx as nx -from torch_geometric.utils.convert import to_networkx - -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - -class LRP: - EPSILON=1e-9 - - def __init__(self,model:model_io): - self.model=model - - def register_model(model:model_io): - self.model=model - - """ - LRP rules - """ - @staticmethod - def eps_rule(layer, input, R, index, output_layer, activation_layer): - - EPSILON=1e-9 - a=copy_tensor(input) - a.retain_grad() - z = layer.forward(a) - # basically layer.forward does this: output=(torch.matmul(a,torch.transpose(w,0,1))+b) , assuming the following w & b are retrieved - - if activation_layer: - w = 
torch.eye(a.shape[1]) - else: - w = layer.weight - b = layer.bias - - wt = torch.transpose(w,0,1) - - if output_layer: - R_list = [None]*R.shape[1] - Wt = [None]*R.shape[1] - for output_node in range(R.shape[1]): - R_list[output_node]=(R[:,output_node].reshape(-1,1).clone()) - Wt[output_node]=(wt[:,output_node].reshape(-1,1)) - else: - R_list = R - Wt = [wt]*len(R_list) - - R_previous=[None]*len(R_list) - for output_node in range(len(R_list)): - # rep stands for repeated - a_rep = a.reshape(a.shape[0],a.shape[1],1).expand(-1,-1,R_list[output_node].shape[1]) - wt_rep = Wt[output_node].reshape(1,Wt[output_node].shape[0],Wt[output_node].shape[1]).expand(a.shape[0],-1,-1) - - H = a_rep*wt_rep - deno = H.sum(axis=1).reshape(H.sum(axis=1).shape[0],1,H.sum(axis=1).shape[1]).expand(-1,a.shape[1],-1).float() - - G = H/deno - - R_previous[output_node] = (torch.matmul(G, R_list[output_node].reshape(R_list[output_node].shape[0],R_list[output_node].shape[1],1).float())) - R_previous[output_node] = R_previous[output_node].reshape(R_previous[output_node].shape[0], R_previous[output_node].shape[1]) - - print('- Finished computing R-scores for output neuron #: ', output_node+1) - - print(f'- Completed layer: {layer}') - if (torch.allclose(R_previous[output_node].sum(axis=1), R_list[output_node].sum(axis=1))): - print('- R score is conserved up to relative tolerance 1e-5') - elif (torch.allclose(R_previous[output_node].sum(axis=1), R_list[output_node].sum(axis=1), rtol=1e-4)): - print('- R score is conserved up to relative tolerance 1e-4') - elif (torch.allclose(R_previous[output_node].sum(axis=1), R_list[output_node].sum(axis=1), rtol=1e-3)): - print('- R score is conserved up to relative tolerance 1e-3') - elif (torch.allclose(R_previous[output_node].sum(axis=1), R_list[output_node].sum(axis=1), rtol=1e-2)): - print('- R score is conserved up to relative tolerance 1e-2') - elif (torch.allclose(R_previous[output_node].sum(axis=1), R_list[output_node].sum(axis=1), rtol=1e-1)): - print('- R score is conserved up to relative tolerance 1e-1') - - return R_previous - - """ - explanation functions - """ - - def explain(self, - to_explain:dict, - save:bool=True, - save_to:str="./relevance.pt", - sort_nodes_by:int=0, - signal=torch.tensor([1,0,0,0,0,0],dtype=torch.float32).to(device), - return_result:bool=False): - - start_index = self.model.n_layers ########################## - print('Total number of layers (including activation layers):', start_index) - - ### loop over each single layer - for index in range(start_index+1, 1, -1): - print(f"Explaining layer {1+start_index+1-index}/{start_index+1-1}") - if index==start_index+1: - R = self.explain_single_layer(to_explain["pred"], to_explain, start_index+1, index) - else: - R = self.explain_single_layer(R, to_explain, start_index+1, index) - - with open(to_explain["outpath"]+'/'+to_explain["load_model"]+f'/R_score_layer{index}.pkl', 'wb') as f: - cPickle.dump(R, f, protocol=4) - - print("Finished explaining all layers.") - - def explain_single_layer(self, R, to_explain, output_layer_index, index=None,name=None): - - # preparing variables required for computing LRP - layer=self.model.get_layer(index=index,name=name) - - if name is None: - name=self.model.index2name(index) - if index is None: - index=self.model.name2index(name) - - input=to_explain['A'][name] - - if index==output_layer_index: - output_layer_bool=True - else: - output_layer_bool=False - - # backward pass with specified LRP rule - if 'Linear' in str(layer): - R = self.eps_rule(layer, input, R, index, 
output_layer_bool, activation_layer=False) - elif 'LeakyReLU' or 'ELU' in str(layer): - R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=True) - - return R - -def copy_tensor(tensor,dtype=torch.float32): - """ - create a deep copy of the provided tensor, - outputs the copy with specified dtype - """ - - return tensor.clone().detach().requires_grad_(True).to(device) - - -##----------------------------------------------------------------------------- -# -# arep=torch.transpose(a[0].repeat(6, 1),0,1) # repeat it 6 times -# H=arep*wt -# -# G = H/H.sum(axis=0).float() -# -# Num = torch.matmul(G, R[0].float()) -# -# print('Num.sum()', Num.sum()) -# -# print(R[0].sum()) diff --git a/mlpf/pytorch_delphes/LRP/main_clf.py b/mlpf/pytorch_delphes/LRP/main_clf.py deleted file mode 100644 index e22d316cd..000000000 --- a/mlpf/pytorch_delphes/LRP/main_clf.py +++ /dev/null @@ -1,360 +0,0 @@ -from glob import glob -import sys, os -import os.path as osp -import pickle as pkl -import _pickle as cPickle -import math, time, tqdm -import numpy as np -import pandas as pd -import sklearn -from sklearn.metrics import accuracy_score, confusion_matrix -import matplotlib, mplhep -matplotlib.use("Agg") -import matplotlib.pyplot as plt - -#Check if the GPU configuration has been provided -import torch -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -try: - if not ("CUDA_VISIBLE_DEVICES" in os.environ): - import setGPU - if multi_gpu: - print('Will use multi_gpu..') - print("Let's use", torch.cuda.device_count(), "GPUs!") - else: - print('Will use single_gpu..') -except Exception as e: - print("Could not import setGPU, running CPU-only") - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') - print("GPU model:", torch.cuda.get_device_name(0)) -else: - device = torch.device('cpu') - -import torch_geometric -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch_geometric.nn import GravNetConv -from torch.utils.data import random_split -import torch_cluster -import networkx as nx -from torch_geometric.utils.convert import to_networkx -from torch_geometric.utils import to_dense_adj - -sys.path.insert(1, '../') -sys.path.insert(1, '../../../plotting/') -sys.path.insert(1, '../../../mlpf/plotting/') - -import args -from args import parse_args -from graph_data_delphes import PFGraphDataset, one_hot_embedding -from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd - -from model_LRP_reg import PFNet7 -from LRP_clf_gpu import LRP_clf -from model_io import model_io - -import networkx as nx -from torch_geometric.utils.convert import to_networkx - -# NOTE: this script works by loading an already trained model - -#Get a unique directory name for the model -def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, task, title): - model_name = type(model).__name__ - model_params = sum(p.numel() for p in model.parameters()) - import hashlib - model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] - model_user = os.environ['USER'] - - model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_{}'.format( - model_name, - target_type, - n_train, - n_epochs, - batch_size, - lr, - task, - title) - return model_fname - -def 
map_classid_to_classname(id): - if id==0: - return 'null' - if id==1: - return 'charged hadron' - if id==2: - return 'neutral hadron' - if id==3: - return 'photon' - if id==4: - return 'electron' - if id==5: - return 'muon' - -if __name__ == "__main__": - - # args = parse_args() - - # the next part initializes some args values (to run the script not from terminal) - class objectview(object): - def __init__(self, d): - self.__dict__ = d - - # args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 2, 'n_epochs': 2, 'patience': 100, 'hidden_dim':256, 'input_encoding': 12, 'encoding_dim': 64, - # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../../test_tmp_delphes/data/pythia8_qcd', - # 'outpath': '../../../prp/models/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 1, 'dropout': 0, - # 'space_dim': 4, 'propagate_dimensions': 22,'nearest': 16, 'overwrite': True, - # 'load_epoch': 9, 'load_model': 'LRP_reg_PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', - # 'classification_only': True, 'nn1': True, 'conv2': False, 'nn3': False, 'title': '', - # 'explain': True, 'load': False, 'make_heatmaps': False}) - - args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 2, 'n_epochs': 2, 'patience': 100, 'hidden_dim':256, 'input_encoding': 12, 'encoding_dim': 64, - 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../../../test_tmp_delphes/data/pythia8_qcd', - 'outpath': '../../../../test_tmp_delphes/experiments/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 1, 'dropout': 0, - 'space_dim': 4, 'propagate_dimensions': 22,'nearest': 16, 'overwrite': True, - 'load_epoch': 14, 'load_model': 'LRP_clf_PFNet7_gen_ntrain_1_nepochs_15_batch_size_1_lr_0.001_alpha_0.0002_clf_noskip_nn1', - 'classification_only': True, 'nn1': True, 'conv2': False, 'nn3': False, 'title': '', - 'explain': False, 'load': True, 'make_heatmaps': True}) - - # define the dataset (assumes the data exists as .pt files in "processed") - print('Processing the data..') - full_dataset_ttbar = PFGraphDataset(args.dataset) - full_dataset_qcd = PFGraphDataset(args.dataset_qcd) - - # constructs a loader from the data to iterate over batches - print('Constructing data loaders..') - train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) - test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) - - # element parameters - input_dim = 12 - - #one-hot particle ID and momentum - output_dim_id = 6 - output_dim_p4 = 6 - - patience = args.patience - - model_classes = {"PFNet7": PFNet7} - - model_class = model_classes[args.model] - model_kwargs = {'input_dim': input_dim, - 'hidden_dim': args.hidden_dim, - 'input_encoding': args.input_encoding, - 'encoding_dim': args.encoding_dim, - 'output_dim_id': output_dim_id, - 'output_dim_p4': output_dim_p4, - 'space_dim': args.space_dim, - 'propagate_dimensions': args.propagate_dimensions, - 'nearest': args.nearest} - - print('Loading a previously trained model..') - model = model_class(**model_kwargs) - outpath = args.outpath + args.load_model - PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' - - state_dict = torch.load(PATH, map_location=device) - - # if model was trained using DataParallel then we have to load it differently - if 
"DataParallel" in args.load_model: - state_dict = torch.load(PATH, map_location=device) - from collections import OrderedDict - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - name = k[7:] # remove module. - new_state_dict[name] = v - # print('name is:', name) - state_dict=new_state_dict - - model.load_state_dict(state_dict) - model.to(device) - - if args.explain: - model.eval() - print(model) - - # create some hooks to retrieve intermediate activations - activation = {} - hooks={} - - def get_activation(name): - def hook(model, input, output): - activation[name] = input[0] - return hook - - for name, module in model.named_modules(): - if (type(module)==nn.Linear) or (type(module)==nn.LeakyReLU) or (type(module)==nn.ELU): - hooks[name] = module.register_forward_hook(get_activation("." + name)) - - for i, batch in enumerate(train_loader): - - if multi_gpu: - X = batch - else: - X = batch.to(device) - - if i==0: - # code can be written better - # basically i run at least one forward pass to get the activations to use their shape in defining the LRP layers - pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model(X) - model = model_io(model,state_dict,dict(),activation) - explainer = LRP_clf(model) - - else: - pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model.model(X) - - to_explain = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), - "y": gen_ids_one_hot, "pred": pred_ids_one_hot, - "edge_index": edge_index, "edge_weight": edge_weight, "after_message": after_message, "before_message": before_message, - "outpath": args.outpath, "load_model": args.load_model} - - model.set_dest(to_explain["A"]) - - big_list = explainer.explain(to_explain) - - torch.save(big_list, outpath + f'/big_list.pt') - torch.save(to_explain, outpath + f'/to_explain.pt') - - break # explain only one single event - - elif args.load: - - big_list = torch.load(outpath + f'/big_list.pt', map_location=device) - to_explain = torch.load(outpath + f'/to_explain.pt', map_location=device) - - gen_ids_one_hot = to_explain["y"] - pred_ids_one_hot = to_explain["pred"] - X = to_explain["inputs"] - - if args.make_heatmaps: - # make directories to hold the heatmaps - print('Making heatmaps..') - for i in range(6): - if not osp.isdir(outpath + f'/class{str(i)}'): - os.makedirs(outpath + f'/class{str(i)}') - for j in range(6): - if not osp.isdir(outpath + f'/class{str(i)}'+f'/pid{str(j)}'): - os.makedirs(outpath + f'/class{str(i)}'+f'/pid{str(j)}') - - # make heatmaps - pred_ids = pred_ids_one_hot.argmax(axis=1) - gen_ids = gen_ids_one_hot.argmax(axis=1) - - for output_neuron in range(output_dim_id): - list0, list1, list2, list3, list4, list5 = [], [], [], [], [], [] - dist0, dist1, dist2, dist3, dist4, dist5 = [], [], [], [], [], [] - - for i,id in enumerate(gen_ids): - R_cat_feat_cat_pred = torch.cat([big_list[i][output_neuron].to(device), X['x'].to(device), pred_ids_one_hot.to(device), torch.arange(start=0, end=X['x'].shape[0], step=1).float().reshape(-1,1).to(device)], dim=1) - if id==0: - list0.append(R_cat_feat_cat_pred) - dist0.append(i) - if id==1: - list1.append(R_cat_feat_cat_pred) - dist1.append(i) - if id==2: - list2.append(R_cat_feat_cat_pred) - dist2.append(i) - if id==3: - list3.append(R_cat_feat_cat_pred) - dist3.append(i) - if id==4: - list4.append(R_cat_feat_cat_pred) - dist4.append(i) - if id==5: - 
list5.append(R_cat_feat_cat_pred) - dist5.append(i) - - list=[list0,list1,list2,list3,list4,list5] - dist=[dist0,dist1,dist2,dist3,dist4,dist5] - - for pid in range(6): - for j in range(len(list[pid])): # iterating over the nodes in a graph - # to keep non-zero rows - non_empty_mask = list[pid][j][:,:12].abs().sum(dim=1).bool() - harvest = list[pid][j][non_empty_mask,:] - pos = dist[pid][j] - - def make_list(t): - l = [] - for elem in t: - if elem==1: - l.append('cluster') - if elem==2: - l.append('track') - return l - - node_types = make_list(harvest[:,12]) - - ### TODO: Not the best way to do it.. I am assuming here that only charged hadrons are connected to all tracks - if pid==1: - features = ["type", " pt", "eta", - "sphi", "cphi", "E", "eta_out", "sphi_out", "cphi_out", "charge", "is_gen_mu", "is_gen_el"] - else: - features = ["type", "Et", "eta", "sphi", "cphi", "E", "Eem", "Ehad", "padding", "padding", "padding", "padding"] - - - fig, ax = plt.subplots() - fig.tight_layout() - if pid==0: - ax.set_title('Heatmap for the "'+map_classid_to_classname(output_neuron)+'" prediction of a true null') - if pid==1: - ax.set_title('Heatmap for the "'+map_classid_to_classname(output_neuron)+'" prediction of a true charged hadron') - if pid==2: - ax.set_title('Heatmap for the "'+map_classid_to_classname(output_neuron)+'" prediction of a true neutral hadron') - if pid==3: - ax.set_title('Heatmap for the "'+map_classid_to_classname(output_neuron)+'" prediction of a true photon') - if pid==4: - ax.set_title('Heatmap for the "'+map_classid_to_classname(output_neuron)+'" prediction of a true electron') - if pid==5: - ax.set_title('Heatmap for the "'+map_classid_to_classname(output_neuron)+'" prediction of a true muon') - ax.set_xticks(np.arange(len(features))) - ax.set_yticks(np.arange(len(node_types))) - for col in range(len(features)): - for row in range(len(node_types)): - text = ax.text(col, row, round(harvest[row,12+col].item(),2), - ha="center", va="center", color="w") - # ... 
and label them with the respective list entries - ax.set_xticklabels(features) - ax.set_yticklabels(node_types) - plt.xlabel("\noutput prediction:{R} \nposition of node is row # {harvest}".format(R=[round(num,2) for num in harvest[j, 24:30].tolist()], harvest=((harvest[:,30] == pos).nonzero(as_tuple=True)[0].item()+1))) - plt.imshow(torch.abs(harvest[:,:12]*10**7).detach().cpu().numpy(), interpolation="nearest", cmap='copper') - plt.colorbar() - fig.set_size_inches(19, 10) - plt.savefig(outpath + f'/class{str(output_neuron)}'+f'/pid{str(pid)}'+f'/sample{str(j)}.jpg') - plt.close(fig) - - if j==2: - break - - -# # ------------------------------------------------------------------------------------------------ -# # if you got all the intermediate R-score heatmaps stored then you can check if these are equal as a check of conservation across all layers: -# print(R16[0].sum(axis=1)[0]) -# print(R15[0].sum(axis=1)[0]) -# print(R14[0].sum(axis=1)[0]) -# print(R13[0].sum(axis=1)[0]) -# print(R13[0].sum(axis=1)[0]) -# print(R12[0].sum(axis=1)[0]) -# print(R11[0].sum(axis=1)[0]) -# print(R10[0].sum(axis=1)[0]) -# print(R9[0].sum(axis=1)[0]) -# print(R8[0].sum(axis=1)[0]) -# print(R_score_layer_before_msg_passing[0][0].sum(axis=0).sum()) -# print(R7[0][0].sum(axis=0).sum()) -# print(R6[0][0].sum(axis=1).sum()) -# print(R5[0][0].sum(axis=1).sum()) -# print(R4[0][0].sum(axis=1).sum()) -# print(R3[0][0].sum(axis=1).sum()) -# print(R2[0][0].sum(axis=1).sum()) -# print(R1[0][0].sum(axis=1).sum()) diff --git a/mlpf/pytorch_delphes/LRP/main_dnn.py b/mlpf/pytorch_delphes/LRP/main_dnn.py deleted file mode 100644 index fee51e022..000000000 --- a/mlpf/pytorch_delphes/LRP/main_dnn.py +++ /dev/null @@ -1,227 +0,0 @@ -from glob import glob -import sys, os -import os.path as osp -import pickle as pkl -import _pickle as cPickle -import math, time, tqdm -import numpy as np -import pandas as pd -import sklearn -from sklearn.metrics import accuracy_score, confusion_matrix -import matplotlib, mplhep -matplotlib.use("Agg") -import matplotlib.pyplot as plt - -#Check if the GPU configuration has been provided -import torch -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -try: - if not ("CUDA_VISIBLE_DEVICES" in os.environ): - import setGPU - if multi_gpu: - print('Will use multi_gpu..') - print("Let's use", torch.cuda.device_count(), "GPUs!") - else: - print('Will use single_gpu..') -except Exception as e: - print("Could not import setGPU, running CPU-only") - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - -import torch_geometric -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch_geometric.nn import GravNetConv -from torch.utils.data import random_split -import torch_cluster - -sys.path.insert(1, '../') -sys.path.insert(1, '../../../plotting/') -sys.path.insert(1, '../../../mlpf/plotting/') -import args -from args import parse_args -from graph_data_delphes import PFGraphDataset, one_hot_embedding -from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd - -from plot_utils import plot_confusion_matrix -from model_LRP_dnn import PFNet7 - -from LRP_dnn import LRP -from model_io import model_io - -import networkx as nx -from 
torch_geometric.utils.convert import to_networkx -from tabulate import tabulate - -# NOTE: this script works by loading an already trained model - -#Ignore divide by 0 errors -np.seterr(divide='ignore', invalid='ignore') - -#Get a unique directory name for the model -def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, task, title): - model_name = type(model).__name__ - model_params = sum(p.numel() for p in model.parameters()) - import hashlib - model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] - model_user = os.environ['USER'] - - model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_{}'.format( - model_name, - target_type, - n_train, - n_epochs, - batch_size, - lr, - task, - title) - return model_fname - -def map_classid_to_classname(id): - if id==0: - return 'null' - if id==1: - return 'charged hadron' - if id==2: - return 'neutral hadron' - if id==3: - return 'photon' - if id==4: - return 'electron' - if id==5: - return 'muon' - -if __name__ == "__main__": - - # args = parse_args() - - # the next part initializes some args values (to run the script not from terminal) - class objectview(object): - def __init__(self, d): - self.__dict__ = d - - # args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 10, 'patience': 100, 'hidden_dim':256, 'hidden_dim_nn1':64, 'input_encoding': 12, 'encoding_dim': 64, - # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../../test_tmp_delphes/data/pythia8_qcd', - # 'outpath': '../../../prp/models/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, 'dropout': 0.3, - # 'space_dim': 8, 'propagate_dimensions': 22, 'nearest': 40, 'overwrite': True, - # 'load': True, 'load_epoch': 0, 'load_model': 'LRP_DNN_PFNet7_gen_ntrain_1_nepochs_1_batch_size_1_lr_0.001_alpha_0.0002_both_dnnnoskip_nn1_nn3_nn4', - # 'evaluate': False, 'evaluate_on_cpu': False, 'classification_only': False, 'nn1': True, 'nn3': True, 'nn4': True, 'title': 'dnn', 'explain': True}) - - args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 10, 'patience': 100, 'hidden_dim':256, 'hidden_dim_nn1':64, 'input_encoding': 12, 'encoding_dim': 64, - 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../../../test_tmp_delphes/data/pythia8_qcd', - 'outpath': '../../../../test_tmp_delphes/experiments/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, 'dropout': 0.3, - 'space_dim': 8, 'propagate_dimensions': 22, 'nearest': 40, 'overwrite': True, - 'load': True, 'load_epoch': 0, 'load_model': 'LRP_DNN_PFNet7_gen_ntrain_1_nepochs_1_batch_size_1_lr_0.001_alpha_0.0002_both_dnnnoskip_nn1_nn3_nn4', - 'evaluate': False, 'evaluate_on_cpu': False, 'classification_only': False, 'nn1': True, 'nn3': True, 'nn4': True, 'title': 'dnn', 'explain': True}) - - # define the dataset (assumes the data exists as .pt files in "processed") - print('Processing the data..') - full_dataset_ttbar = PFGraphDataset(args.dataset) - full_dataset_qcd = PFGraphDataset(args.dataset_qcd) - - # constructs a loader from the data to iterate over batches - print('Constructing data loaders..') - train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) - test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) - - # element parameters - input_dim = 12 
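# (the 12 features per element are the ones named in the heatmap code earlier in this patch:
#  "type", "pt"/"Et", "eta", "sphi", "cphi", "E", followed by the track-only fields
#  "eta_out", "sphi_out", "cphi_out", "charge", "is_gen_mu", "is_gen_el", or by the
#  cluster-only fields "Eem", "Ehad" plus padding)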
- - #one-hot particle ID and momentum - output_dim_id = 6 - output_dim_p4 = 6 - - patience = args.patience - - model_classes = {"PFNet7": PFNet7} - - model_class = model_classes[args.model] - model_kwargs = {'input_dim': input_dim, - 'hidden_dim': args.hidden_dim, - 'output_dim_id': output_dim_id, - 'output_dim_p4': output_dim_p4} - - print('Loading a previously trained model..') - model = model_class(**model_kwargs) - outpath = args.outpath + args.load_model - PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' - - state_dict = torch.load(PATH, map_location=device) - - # if model was trained using DataParallel then we have to load it differently - if "DataParallel" in args.load_model: - state_dict = torch.load(PATH, map_location=device) - from collections import OrderedDict - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - name = k[7:] # remove module. - new_state_dict[name] = v - # print('name is:', name) - state_dict=new_state_dict - - model.load_state_dict(state_dict) - - if args.explain: - model.eval() - print(model) - - signal =torch.tensor([1,0,0,0,0,0],dtype=torch.float32).to(device) - - # create some hooks to retrieve intermediate activations - activation = {} - hooks={} - - def get_activation(name): - def hook(model, input, output): - activation[name] = input[0] - return hook - - for name, module in model.named_modules(): - if (type(module)==nn.Linear) or (type(module)==nn.LeakyReLU) or (type(module)==nn.ELU): - hooks[name] = module.register_forward_hook(get_activation("." + name)) - print(name) - - for i, batch in enumerate(train_loader): - t0 = time.time() - - if multi_gpu: - X = batch - else: - X = batch.to(device) - - if i==0: - # code can be written better - # basically i run at least one forward pass to get the activations to use their shape in defining the LRP layers - pred_ids_one_hot, pred_p4, target_ids_one_hot, target_p4, cand_ids_one_hot, cand_p4 = model(X) - model=model_io(model,state_dict,dict(),activation) - explainer=LRP(model) - - else: - pred_ids_one_hot, pred_p4, target_ids_one_hot, target_p4, cand_ids_one_hot, cand_p4 = model.model(X) - - to_explain={"A":activation,"inputs":dict(x=X.x, - batch=X.batch),"y":target_ids_one_hot,"R":dict(), "pred":pred_ids_one_hot, - "outpath":args.outpath, "load_model":args.load_model} - - model.set_dest(to_explain["A"]) - - explainer.explain(to_explain,save=False,return_result=True, signal=signal) - - break - -## ----------------------------------------------------------- -# # to retrieve a stored variable in pkl file -# import _pickle as cPickle -# with open('../../../prp/models/LRP/LRP_DNN_PFNet7_gen_ntrain_1_nepochs_1_batch_size_1_lr_0.001_alpha_0.0002_both_dnnnoskip_nn1_nn3_nn4/R_scorez.pkl', 'rb') as f: # Python 3: open(..., 'rb') -# s = cPickle.load(f) diff --git a/mlpf/pytorch_delphes/LRP/model_LRP_clf.py b/mlpf/pytorch_delphes/LRP/model_LRP_clf.py deleted file mode 100644 index 997ba8d65..000000000 --- a/mlpf/pytorch_delphes/LRP/model_LRP_clf.py +++ /dev/null @@ -1,98 +0,0 @@ -import numpy as np -import mplhep - -import torch -import torch_geometric - -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv -from torch_geometric.nn import TopKPooling, SAGPooling, SGConv -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits 
import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch.utils.data import random_split - -#from torch_geometric.nn import GravNetConv # if you want to get it from source code (won't be able to retrieve the adjacency matrix) -from gravnet_LRP import GravNetConv -from torch_geometric.nn import GraphConv - -#Model with gravnet clustering -class PFNet7(nn.Module): - def __init__(self, - input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, - output_dim_id=6, - output_dim_p4=6, - space_dim=4, propagate_dimensions=22, nearest=16): - - super(PFNet7, self).__init__() - - self.target = target - self.nn1 = nn1 - - self.act = nn.LeakyReLU - self.act_f = torch.nn.functional.leaky_relu - self.elu = nn.ELU - - # (1) DNN - self.nn1 = nn.Sequential( - nn.Linear(input_dim, hidden_dim_nn1), - self.elu(), - nn.Linear(hidden_dim_nn1, hidden_dim_nn1), - self.elu(), - nn.Linear(hidden_dim_nn1, input_encoding), - ) - - # (2) CNN: Gravnet layer - self.conv1 = GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) - - # (3) DNN layer: classifying PID - self.nn2 = nn.Sequential( - nn.Linear(encoding_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, output_dim_id), - ) - - def forward(self, data): - - x = data.x - - # Encoder/Decoder step - x = self.nn1(x) - - # Gravnet step - x, edge_index, edge_weight, after_message, before_message = self.conv1(x) - x = self.act_f(x) # act by nonlinearity - - # DNN to predict PID (after a dropout) - pred_ids = self.nn2(x) - - pred_p4 = torch.zeros_like(data.ycand) - - return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand, edge_index, edge_weight, after_message, before_message -# ------------------------------------------------------------------------------------- -# # uncomment to test a forward pass -# from graph_data_delphes import PFGraphDataset -# from data_preprocessing import data_to_loader_ttbar -# from data_preprocessing import data_to_loader_qcd -# -# full_dataset = PFGraphDataset('../../test_tmp_delphes/data/pythia8_ttbar') -# -# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, n_train=2, n_valid=1, batch_size=2) -# -# print('Input to the network:', next(iter(train_loader))) -# -# model = PFNet7() -# -# for batch in train_loader: -# pred_ids, pred_p4, target_ids, target_p4 = model(batch) -# pred_ids -# print('Predicted PID:', pred_ids) -# print('Predicted p4:', pred_p4) -# break diff --git a/mlpf/pytorch_delphes/LRP/model_LRP_dnn.py b/mlpf/pytorch_delphes/LRP/model_LRP_dnn.py deleted file mode 100644 index f95cea3e2..000000000 --- a/mlpf/pytorch_delphes/LRP/model_LRP_dnn.py +++ /dev/null @@ -1,66 +0,0 @@ -import numpy as np -import mplhep - -import torch -import torch_geometric - -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv -from torch_geometric.nn import TopKPooling, SAGPooling, SGConv -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch.utils.data import random_split - -#Model with gravnet clustering -class PFNet7(nn.Module): - def __init__(self, - 
input_dim=12, hidden_dim=256, - output_dim_id=6, - output_dim_p4=6): - - super(PFNet7, self).__init__() - - self.elu = nn.ELU - - # (1) DNN - self.nn1 = nn.Sequential( - nn.Linear(input_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, output_dim_id) - ) - - def forward(self, data): - - x0 = data.x - - pred_ids = self.nn1(x0) - - pred_p4 = torch.zeros_like(data.ycand) - - return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand - -# ------------------------------------------------------------------------------------- -# # uncomment to test a forward pass -# from graph_data_delphes import PFGraphDataset -# from data_preprocessing import data_to_loader_ttbar -# from data_preprocessing import data_to_loader_qcd -# -# full_dataset = PFGraphDataset('../../test_tmp_delphes/data/pythia8_ttbar') -# -# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, n_train=2, n_valid=1, batch_size=2) -# -# print('Input to the network:', next(iter(train_loader))) -# -# model = PFNet7() -# -# for batch in train_loader: -# pred_ids, pred_p4, target_ids, target_p4, cand_ids, cand_p4 = model(batch) -# break diff --git a/mlpf/pytorch_delphes/LRP/model_LRP_reg.py b/mlpf/pytorch_delphes/LRP/model_LRP_reg.py index c1140daad..b44b3255f 100644 --- a/mlpf/pytorch_delphes/LRP/model_LRP_reg.py +++ b/mlpf/pytorch_delphes/LRP/model_LRP_reg.py @@ -86,24 +86,4 @@ def forward(self, data): nn3_input = torch.cat([x0, pred_ids, x], axis=-1) pred_p4 = self.nn3(nn3_input) - return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand, edge_index, edge_weight, after_message, before_message -# ------------------------------------------------------------------------------------- -# # uncomment to test a forward pass -# from graph_data_delphes import PFGraphDataset -# from data_preprocessing import data_to_loader_ttbar -# from data_preprocessing import data_to_loader_qcd -# -# full_dataset = PFGraphDataset('../../test_tmp_delphes/data/pythia8_ttbar') -# -# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, n_train=2, n_valid=1, batch_size=2) -# -# print('Input to the network:', next(iter(train_loader))) -# -# model = PFNet7() -# -# for batch in train_loader: -# pred_ids, pred_p4, target_ids, target_p4 = model(batch) -# pred_ids -# print('Predicted PID:', pred_ids) -# print('Predicted p4:', pred_p4) -# break + return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand, edge_index, edge_weight, after_message, before_message diff --git a/mlpf/pytorch_delphes/model.py b/mlpf/pytorch_delphes/model.py index 1bfb7aba8..44814f32b 100644 --- a/mlpf/pytorch_delphes/model.py +++ b/mlpf/pytorch_delphes/model.py @@ -99,5 +99,3 @@ def forward(self, data): pred_p4 = torch.zeros_like(data.ycand) return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand - -# # ------------------------------------------------------------------------------------- diff --git a/mlpf/pytorch_delphes/model_dnn.py b/mlpf/pytorch_delphes/model_dnn.py deleted file mode 100644 index 338220112..000000000 --- a/mlpf/pytorch_delphes/model_dnn.py +++ /dev/null @@ -1,126 +0,0 @@ -import numpy as np -import mplhep - -import torch -import torch_geometric - -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, 
DenseGCNConv -from torch_geometric.nn import TopKPooling, SAGPooling, SGConv -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch.utils.data import random_split - -#from torch_geometric.nn import GravNetConv # if you want to get it from source code (won't be able to retrieve the adjacency matrix) -from gravnet import GravNetConv -from torch_geometric.nn import GraphConv - -#Model with gravnet clustering -class PFNet7(nn.Module): - def __init__(self, - input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, - output_dim_id=6, - output_dim_p4=6, - space_dim=8, propagate_dimensions=22, nearest=40, - target="gen", nn1=True, nn3=True, nn4=True): - - super(PFNet7, self).__init__() - - self.target = target - self.nn1 = nn1 - self.nn3 = nn3 - self.nn4 = nn4 - - self.act = nn.LeakyReLU - self.act_f = torch.nn.functional.leaky_relu - self.act_tanh = torch.nn.Tanh - - # (1) DNN - self.nn1 = nn.Sequential( - nn.Linear(input_dim, hidden_dim_nn1), - self.act(0.5), - nn.Linear(hidden_dim_nn1, hidden_dim_nn1), - self.act(0.5), - nn.Linear(hidden_dim_nn1, hidden_dim_nn1), - self.act(0.5), - ) - - self.nn2 = nn.Sequential( - nn.Linear(hidden_dim_nn1 + input_dim, hidden_dim), - self.act(0.5), - nn.Linear(hidden_dim, hidden_dim), - self.act(0.5), - nn.Linear(hidden_dim, hidden_dim), - self.act(0.5), - nn.Linear(hidden_dim, output_dim_id), - ) - - # (5) DNN layer: regressing p4 - self.nn3 = nn.Sequential( - nn.Linear(encoding_dim + output_dim_id + input_dim, hidden_dim), - self.act(0.5), - nn.Linear(hidden_dim, hidden_dim), - self.act(0.5), - nn.Linear(hidden_dim, hidden_dim), - self.act(0.5), - nn.Linear(hidden_dim, hidden_dim), - self.act(0.5), - ) - - self.nn4 = nn.Sequential( - nn.Linear(hidden_dim + output_dim_id + input_dim, hidden_dim), - self.act(0.5), - nn.Linear(hidden_dim, hidden_dim), - self.act(0.5), - nn.Linear(hidden_dim, hidden_dim), - self.act(0.5), - nn.Linear(hidden_dim, output_dim_p4), - ) - - def forward(self, data): - - x0 = data.x - - # Encoder/Decoder step - if self.nn1: - x = self.nn1(x0) - - # DNN to predict PID - pred_ids = self.nn2(torch.cat([x, x0], axis=-1)) - - # DNN to predict p4 - if self.nn3: - nn3_input = torch.cat([x, pred_ids, x0], axis=-1) - pred_p4 = self.nn3(nn3_input) - else: - pred_p4=torch.zeros_like(data.ycand) - - if self.nn4: - nn3_input = torch.cat([pred_p4, pred_ids, x0], axis=-1) - pred_p4 = self.nn4(nn3_input) - else: - pred_p4=torch.zeros_like(data.ycand) - - return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand - -# ------------------------------------------------------------------------------------- -# # uncomment to test a forward pass -# from graph_data_delphes import PFGraphDataset -# from data_preprocessing import data_to_loader_ttbar -# from data_preprocessing import data_to_loader_qcd -# -# full_dataset = PFGraphDataset('../../test_tmp_delphes/data/pythia8_ttbar') -# -# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, n_train=2, n_valid=1, batch_size=2) -# -# print('Input to the network:', next(iter(train_loader))) -# -# model = PFNet7() -# -# for batch in train_loader: -# pred_ids, pred_p4, target_ids, target_p4, cand_ids, cand_p4 = model(batch) -# break diff --git a/mlpf/pytorch_delphes/model_embeddings.py b/mlpf/pytorch_delphes/model_embeddings.py deleted file mode 100644 index 042016083..000000000 --- 
a/mlpf/pytorch_delphes/model_embeddings.py +++ /dev/null @@ -1,168 +0,0 @@ -import numpy as np -import mplhep - -import torch -import torch_geometric - -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv -from torch_geometric.nn import TopKPooling, SAGPooling, SGConv -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch.utils.data import random_split - -#from torch_geometric.nn import GravNetConv # if you want to get it from source code (won't be able to retrieve the adjacency matrix) -from gravnet import GravNetConv -from torch_geometric.nn import GraphConv - -#Model with gravnet clustering -class PFNet7(nn.Module): - def __init__(self, - input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, embedding_dim=3, encoding_of_clusters=True, - output_dim_id=6, - output_dim_p4=6, - space_dim=4, propagate_dimensions=22, nearest=16, - target="gen", nn1=True, nn3=True, nn0track=True, nn0cluster=True): - - super(PFNet7, self).__init__() - - self.target = target - self.nn1 = nn1 - self.nn3 = nn3 - self.nn0track = nn0track - self.nn0cluster = nn0cluster - self.embedding_dim = embedding_dim - self.encoding_of_clusters = encoding_of_clusters - - self.act = nn.LeakyReLU - self.act_f = torch.nn.functional.leaky_relu - self.act_tanh = torch.nn.Tanh - self.elu = nn.ELU - - # (0) DNN: encode the tracks from 12d -> 12d - if self.nn0track: - self.nn0track = nn.Sequential( - nn.Linear(12-1, hidden_dim_nn1), - self.elu(), - nn.Linear(hidden_dim_nn1, hidden_dim_nn1), - self.elu(), - nn.Linear(hidden_dim_nn1, input_encoding-1), - ) - - # (0) DNN: encode the clusters from 8d -> 12d - if self.nn0cluster: - self.nn0cluster = nn.Sequential( - nn.Linear(8-1, hidden_dim_nn1), - self.elu(), - nn.Linear(hidden_dim_nn1, hidden_dim_nn1), - self.elu(), - nn.Linear(hidden_dim_nn1, input_encoding-1), - ) - - # (0) DNN: embedding of "type" - if self.embedding_dim: - self.embedding = nn.Embedding(embedding_dim, 1) - - # (1) DNN: encoding/decoding of all tracks and clusters - if self.nn1: - self.nn1 = nn.Sequential( - nn.Linear(input_dim, hidden_dim_nn1), - self.elu(), - nn.Linear(hidden_dim_nn1, hidden_dim_nn1), - self.elu(), - nn.Linear(hidden_dim_nn1, input_encoding), - ) - - # (2) CNN: Gravnet layer - self.conv1 = GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) - - # (3) DNN layer: classifying PID - self.nn2 = nn.Sequential( - nn.Linear(encoding_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, output_dim_id), - ) - - # (4) DNN layer: regressing p4 - if self.nn3: - self.nn3 = nn.Sequential( - nn.Linear(input_dim + output_dim_id + encoding_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, output_dim_p4), - ) - - def forward(self, data): - x0 = data.x - - if self.encoding_of_clusters: - # encode the clusters onto a non-padded 12d features.. 
encode the tracks as well for equivalence - tracks = x0[:,1:][x0[:,0]==2] - clusters = x0[:,1:8][x0[:,0]==1] - - if self.nn0track: - tracks = self.nn0track(tracks) - - if self.nn0cluster: - clusters = self.nn0cluster(clusters) - - tracks=torch.cat([x0[:,0][x0[:,0]==2].reshape(-1,1),tracks], axis=1) - clusters=torch.cat([x0[:,0][x0[:,0]==1].reshape(-1,1),clusters], axis=1) - - x0 = torch.cat([tracks,clusters]) - - if self.embedding_dim: - # embed the "type" feature - add = self.embedding(x0[:,0].long()).reshape(-1,1) - x0=torch.cat([add,x0[:,1:]], axis=1) - - # Encoder/Decoder step - if self.nn1: - x = self.nn1(x0) - else: - x=x0 - - # Gravnet step - x, edge_index, edge_weight = self.conv1(x) - x = self.act_f(x) # act by nonlinearity - - # DNN to predict PID - pred_ids = self.nn2(x) - - # DNN to predict p4 - if self.nn3: - nn3_input = torch.cat([x0, pred_ids, x], axis=-1) - pred_p4 = self.nn3(nn3_input) - else: - pred_p4 = torch.zeros_like(data.ycand) - - return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand - -# # ------------------------------------------------------------------------------------- -# # uncomment to test a forward pass -# from graph_data_delphes import PFGraphDataset -# from data_preprocessing import data_to_loader_ttbar -# from data_preprocessing import data_to_loader_qcd -# -# full_dataset = PFGraphDataset('../../../test_tmp_delphes/data/pythia8_ttbar') -# -# train_loader, valid_loader = data_to_loader_ttbar(full_dataset, n_train=2, n_valid=1, batch_size=2) -# -# model = PFNet7() -# model.to(device) -# -# for batch in train_loader: -# X = batch.to(device) -# pred_ids, pred_p4, gen_ids, gen_p4, cand_ids, cand_p4 = model(X) -# break diff --git a/mlpf/pytorch_delphes/training.py b/mlpf/pytorch_delphes/pipeline.py similarity index 100% rename from mlpf/pytorch_delphes/training.py rename to mlpf/pytorch_delphes/pipeline.py diff --git a/mlpf/pytorch_delphes/training_dnn.py b/mlpf/pytorch_delphes/training_dnn.py deleted file mode 100644 index 7d5316bb3..000000000 --- a/mlpf/pytorch_delphes/training_dnn.py +++ /dev/null @@ -1,497 +0,0 @@ -from glob import glob -import sys, os -sys.path.insert(1, '../../plotting/') -sys.path.insert(1, '../../mlpf/plotting/') - -import os.path as osp -import pickle as pkl -import math, time, tqdm -import numpy as np -import pandas as pd -import sklearn -from sklearn.metrics import accuracy_score, confusion_matrix -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -import mplhep as hep - -#Check if the GPU configuration has been provided -import torch -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -try: - if not ("CUDA_VISIBLE_DEVICES" in os.environ): - import setGPU - if multi_gpu: - print('Will use multi_gpu..') - print("Let's use", torch.cuda.device_count(), "GPUs!") - else: - print('Will use single_gpu..') -except Exception as e: - print("Could not import setGPU, running CPU-only") - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - -import torch_geometric -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch_geometric.nn import GravNetConv -from torch.utils.data import random_split -import torch_cluster - -import args -from 
args import parse_args -from graph_data_delphes import PFGraphDataset, one_hot_embedding -from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd -import evaluate -from evaluate import make_plots, make_predictions -from plot_utils import plot_confusion_matrix -from model_dnn import PFNet7 - -#Ignore divide by 0 errors -np.seterr(divide='ignore', invalid='ignore') - -#Get a unique directory name for the model -def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, alpha, task, title): - model_name = type(model).__name__ - model_params = sum(p.numel() for p in model.parameters()) - import hashlib - model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] - model_user = os.environ['USER'] - - model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_alpha_{}_{}_{}'.format( - model_name, - target_type, - n_train, - n_epochs, - batch_size, - lr, - alpha, - task, - title) - return model_fname - -def compute_weights(target_ids_one_hot, device): - vs, cs = torch.unique(target_ids_one_hot, return_counts=True) - weights = torch.zeros(output_dim_id).to(device=device) - for k, v in zip(vs, cs): - weights[k] = 1.0/math.sqrt(float(v)) - return weights - -def make_plot_from_list(l, label, xlabel, ylabel, outpath, save_as): - if not os.path.exists(outpath + '/training_plots/'): - os.makedirs(outpath + '/training_plots/') - - fig, ax = plt.subplots() - ax.plot(range(len(l)), l, label=label) - ax.set_xlabel(xlabel) - ax.set_ylabel(ylabel) - ax.legend(loc='best') - plt.savefig(outpath + '/training_plots/' + save_as + '.png') - plt.close(fig) - - with open(outpath + '/training_plots/' + save_as + '.pkl', 'wb') as f: - pkl.dump(l, f) - -@torch.no_grad() -def test(model, loader, epoch, alpha, target_type, device): - with torch.no_grad(): - ret = train(model, loader, epoch, None, alpha, target_type, device) - return ret - -def train(model, loader, epoch, optimizer, alpha, target_type, device): - - is_train = not (optimizer is None) - - if is_train: - model.train() - else: - model.eval() - - #loss values for each batch: classification, regression, total - losses_1, losses_2, losses_tot = [], [], [] - - #accuracy values for each batch (monitor classification performance) - accuracies_batch, accuracies_batch_msk = [], [] - - #setup confusion matrix - conf_matrix = np.zeros((output_dim_id, output_dim_id)) - - # to compute average inference time - t=[] - - for i, batch in enumerate(loader): - t0 = time.time() - - if multi_gpu: - X = batch - else: - X = batch.to(device) - - ## make like tensorflow model, 0-padding events to 6k elements - # if X.x.shape[0]<6000: - # new_X = torch.cat([X.x,torch.zeros_like(X.x)[:6000-X.x.shape[0],:]]) - # new_ygen_id = torch.cat([X.ygen_id,torch.zeros_like(X.ygen_id)[:6000-X.x.shape[0],:]]) - # new_ygen_id[X.x.shape[0]:,0]=new_ygen_id[X.x.shape[0]:,0]+1 - # - # X.x = new_X - # X.ygen_id=new_ygen_id - - # Forwardprop - if i<10: - ti = time.time() - pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) - tf = time.time() - t.append(round((tf-ti),2)) - else: - pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) - - _, gen_ids = torch.max(gen_ids_one_hot, -1) - _, pred_ids = torch.max(pred_ids_one_hot, -1) - _, cand_ids = torch.max(cand_ids_one_hot, -1) # rule-based result - - # masking - msk = ((pred_ids != 0) & (gen_ids != 0)) - msk2 = ((pred_ids != 0) & (pred_ids == gen_ids)) - - # computing loss - weights = 
compute_weights(torch.max(gen_ids_one_hot,-1)[1], device) - l1 = torch.nn.functional.cross_entropy(pred_ids_one_hot, gen_ids, weight=weights) # for classifying PID - l2 = alpha * torch.nn.functional.mse_loss(pred_p4[msk2], gen_p4[msk2]) # for regressing p4 - - if args.classification_only: - loss = l1 - else: - loss = l1+l2 - - losses_1.append(l1.item()) - losses_2.append(l2.item()) - losses_tot.append(loss.item()) - - if is_train: - # BACKPROP - #print(list(model.parameters())[1].grad) - a = list(model.parameters())[1].clone() - optimizer.zero_grad() - loss.backward() - optimizer.step() - b = list(model.parameters())[1].clone() - if torch.equal(a.data, b.data): - print('Model is not learning.. weights are not updating..') - - t1 = time.time() - - accuracies_batch.append(accuracy_score(gen_ids.detach().cpu().numpy(), pred_ids.detach().cpu().numpy())) - accuracies_batch_msk.append(accuracy_score(gen_ids[msk].detach().cpu().numpy(), pred_ids[msk].detach().cpu().numpy())) - - conf_matrix += sklearn.metrics.confusion_matrix(gen_ids.detach().cpu().numpy(), - np.argmax(pred_ids_one_hot.detach().cpu().numpy(),axis=1), labels=range(6)) - - print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), loss.item(), t1-t0), end='\r', flush=True) - - print("Average Inference time is: ", round((sum(t) / len(t)),2), 'min') - - losses_1 = np.mean(losses_1) - losses_2 = np.mean(losses_2) - losses_tot = np.mean(losses_tot) - - acc = np.mean(accuracies_batch) - acc_msk = np.mean(accuracies_batch_msk) - - conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis] - - return losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm - - -def train_loop(): - t0_initial = time.time() - - losses_1_train, losses_2_train, losses_tot_train = [], [], [] - losses_1_valid, losses_2_valid, losses_tot_valid = [], [], [] - - accuracies_train, accuracies_msk_train = [], [] - accuracies_valid, accuracies_msk_valid = [], [] - - best_val_loss = 99999.9 - stale_epochs = 0 - - print("Training over {} epochs".format(args.n_epochs)) - for epoch in range(args.n_epochs): - t0 = time.time() - - if stale_epochs > patience: - print("breaking due to stale epochs") - break - - # training epoch - model.train() - losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm = train(model, train_loader, epoch, optimizer, args.alpha, args.target, device) - - losses_tot_train.append(losses_tot) - losses_1_train.append(losses_1) - losses_2_train.append(losses_2) - - accuracies_train.append(acc) - accuracies_msk_train.append(acc_msk) - - # validation step - model.eval() - losses_tot_v, losses_1_v, losses_2_v, acc_v, acc_msk_v, conf_matrix_v, conf_matrix_norm_v = test(model, valid_loader, epoch, args.alpha, args.target, device) - - losses_tot_valid.append(losses_tot_v) - losses_1_valid.append(losses_1_v) - losses_2_valid.append(losses_2_v) - - accuracies_valid.append(acc_v) - accuracies_msk_valid.append(acc_msk_v) - - # early-stopping - if losses_tot_v < best_val_loss: - best_val_loss = losses_tot_v - stale_epochs = 0 - else: - stale_epochs += 1 - - t1 = time.time() - - epochs_remaining = args.n_epochs - (epoch+1) - time_per_epoch = (t1 - t0_initial)/(epoch + 1) - eta = epochs_remaining*time_per_epoch/60 - - print("epoch={}/{} dt={:.2f}min train_loss={:.5f} valid_loss={:.5f} train_acc={:.5f} valid_acc={:.5f} train_acc_msk={:.5f} valid_acc_msk={:.5f} stale={} eta={:.1f}m".format( - epoch+1, args.n_epochs, - (t1-t0)/60, losses_tot_train[epoch], losses_tot_valid[epoch], accuracies_train[epoch], 
accuracies_valid[epoch], - accuracies_msk_train[epoch], accuracies_msk_valid[epoch], stale_epochs, eta)) - - torch.save(model.state_dict(), "{0}/epoch_{1}_weights.pth".format(outpath, epoch)) - - plot_confusion_matrix(conf_matrix_norm, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch), epoch=epoch) - plot_confusion_matrix(conf_matrix_norm_v, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch), epoch=epoch) - - with open(outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch) + '.pkl', 'wb') as f: - pkl.dump(conf_matrix_norm, f) - - with open(outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch) + '.pkl', 'wb') as f: - pkl.dump(conf_matrix_norm_v, f) - - make_plot_from_list(losses_tot_train, 'train loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_train') - make_plot_from_list(losses_1_train, 'train loss_1', 'Epochs', 'Loss', outpath, 'losses_1_train') - make_plot_from_list(losses_2_train, 'train loss_2', 'Epochs', 'Loss', outpath, 'losses_2_train') - - make_plot_from_list(losses_tot_valid, 'valid loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_valid') - make_plot_from_list(losses_1_valid, 'valid loss_1', 'Epochs', 'Loss', outpath, 'losses_1_valid') - make_plot_from_list(losses_2_valid, 'valid loss_2', 'Epochs', 'Loss', outpath, 'losses_2_valid') - - make_plot_from_list(accuracies_train, 'train accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_train') - make_plot_from_list(accuracies_msk_train, 'train accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_train') - - make_plot_from_list(accuracies_valid, 'valid accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_valid') - make_plot_from_list(accuracies_msk_valid, 'valid accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_valid') - - print('Done with training.') - - return - -if __name__ == "__main__": - - args = parse_args() - - # # the next part initializes some args values (to run the script not from terminal) - # class objectview(object): - # def __init__(self, d): - # self.__dict__ = d - # - # args = objectview({'train': True, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 1, 'patience': 100, 'hidden_dim':256, 'hidden_dim_nn1':64, 'input_encoding': 12, 'encoding_dim': 64, - # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../test_tmp_delphes/data/pythia8_qcd', - # 'outpath': '../../prp/models/yee/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, 'overwrite': True, - # 'load': False, 'load_epoch': 0, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_1_batch_size_1_lr_0.001_alpha_0.0002_both_dnnnoskip_nn1_nn3_nn4', - # 'classification_only': False, 'nn1': True, 'nn3': True, 'nn4': True, 'title': 'dnn', - # 'make_predictions_train': True, 'make_plots_train': True, 'make_predictions_valid': True, 'make_plots_valid': True, 'make_predictions_test': True, 'make_plots_test': True}) - - # define the dataset (assumes the data exists as .pt files in "processed") - print('Processing the data..') - full_dataset_ttbar = PFGraphDataset(args.dataset) - full_dataset_qcd = PFGraphDataset(args.dataset_qcd) - - # constructs a loader from the data to iterate over batches - print('Constructing data loaders..') - train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) - test_loader = data_to_loader_qcd(full_dataset_qcd, 
args.n_test, batch_size=args.batch_size) - - # element parameters - input_dim = 12 - - #one-hot particle ID and momentum - output_dim_id = 6 - output_dim_p4 = 6 - - patience = args.patience - - model_classes = {"PFNet7": PFNet7} - - model_class = model_classes[args.model] - model_kwargs = {'input_dim': input_dim, - 'hidden_dim': args.hidden_dim, - 'hidden_dim_nn1': args.hidden_dim_nn1, - 'input_encoding': args.input_encoding, - 'encoding_dim': args.encoding_dim, - 'output_dim_id': output_dim_id, - 'output_dim_p4': output_dim_p4, - 'target': args.target, - 'nn1': args.nn1, - 'nn3': args.nn3, - 'nn4': args.nn4} - - if args.load: - print('Loading a previously trained model..') - model = model_class(**model_kwargs) - outpath = args.outpath + args.load_model - PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' - - state_dict = torch.load(PATH, map_location=device) - - if "DataParallel" in args.load_model: # if the model was trained using DataParallel then we do this - state_dict = torch.load(PATH, map_location=device) - from collections import OrderedDict - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - name = k[7:] # remove module. - new_state_dict[name] = v - # print('name is:', name) - state_dict=new_state_dict - - model.load_state_dict(state_dict) - - if args.train: - print("Training a previously trained model..") - - elif args.train: - #instantiate the model - print('Instantiating a model..') - model = model_class(**model_kwargs) - - if multi_gpu: - print("Parallelizing the training..") - model = torch_geometric.nn.DataParallel(model) - #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP - - model.to(device) - - if args.train: - args.title=args.title+'noskip' - if args.nn1: - args.title=args.title+'_nn1' - if args.nn3: - args.title=args.title+'_nn3' - if args.nn4: - args.title=args.title+'_nn4' - - if args.classification_only: - model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "clf", args.title) - else: - model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "both", args.title) - - outpath = osp.join(args.outpath, model_fname) - if osp.isdir(outpath): - if args.overwrite: - print("model output {} already exists, deleting it".format(outpath)) - import shutil - shutil.rmtree(outpath) - else: - print("model output {} already exists, please delete it".format(outpath)) - sys.exit(0) - try: - os.makedirs(outpath) - except Exception as e: - pass - - with open('{}/model_kwargs.pkl'.format(outpath), 'wb') as f: - pkl.dump(model_kwargs, f, protocol=pkl.HIGHEST_PROTOCOL) - - if not os.path.exists(outpath + '/confusion_matrix_plots/'): - os.makedirs(outpath + '/confusion_matrix_plots/') - - if args.optimizer == "adam": - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) - elif args.optimizer == "adamw": - optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr) - - print(model) - print(model_fname) - - model.train() - train_loop() - model.eval() - - # evaluate on training data.. 
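# note: the same evaluation block is repeated below for the training, validation and testing
# loaders; for each one a subdirectory with resolution, distribution, multiplicity and
# efficiency plot folders is created before make_predictions and make_plots are run on it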
-    if not osp.isdir(outpath+'/train_loader'):
-        os.makedirs(outpath+'/train_loader')
-    if not osp.isdir(outpath+'/train_loader/resolution_plots'):
-        os.makedirs(outpath+'/train_loader/resolution_plots')
-    if not osp.isdir(outpath+'/train_loader/distribution_plots'):
-        os.makedirs(outpath+'/train_loader/distribution_plots')
-    if not osp.isdir(outpath+'/train_loader/multiplicity_plots'):
-        os.makedirs(outpath+'/train_loader/multiplicity_plots')
-    if not osp.isdir(outpath+'/train_loader/efficiency_plots'):
-        os.makedirs(outpath+'/train_loader/efficiency_plots')
-
-    if args.make_predictions_train:
-        make_predictions(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data")
-    if args.make_plots_train:
-        make_plots(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data")
-
-    # evaluate on validation data..
-    if not osp.isdir(outpath+'/valid_loader'):
-        os.makedirs(outpath+'/valid_loader')
-    if not osp.isdir(outpath+'/valid_loader/resolution_plots'):
-        os.makedirs(outpath+'/valid_loader/resolution_plots')
-    if not osp.isdir(outpath+'/valid_loader/distribution_plots'):
-        os.makedirs(outpath+'/valid_loader/distribution_plots')
-    if not osp.isdir(outpath+'/valid_loader/multiplicity_plots'):
-        os.makedirs(outpath+'/valid_loader/multiplicity_plots')
-    if not osp.isdir(outpath+'/valid_loader/efficiency_plots'):
-        os.makedirs(outpath+'/valid_loader/efficiency_plots')
-
-    if args.make_predictions_valid:
-        make_predictions(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data")
-    if args.make_plots_valid:
-        make_plots(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data")
-
-    # evaluate on testing data..
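# note: training and validation above run on the ttbar sample, while the testing evaluation
# below uses the separate qcd sample held in test_loader; if a pretrained model was loaded
# (args.load), make_predictions and make_plots are called with args.load_epoch instead of
# args.n_epochs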
- if not osp.isdir(outpath+'/test_loader'): - os.makedirs(outpath+'/test_loader') - if not osp.isdir(outpath+'/test_loader/resolution_plots'): - os.makedirs(outpath+'/test_loader/resolution_plots') - if not osp.isdir(outpath+'/test_loader/distribution_plots'): - os.makedirs(outpath+'/test_loader/distribution_plots') - if not osp.isdir(outpath+'/test_loader/multiplicity_plots'): - os.makedirs(outpath+'/test_loader/multiplicity_plots') - if not osp.isdir(outpath+'/test_loader/efficiency_plots'): - os.makedirs(outpath+'/test_loader/efficiency_plots') - - if args.make_predictions_test: - if args.load: - make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") - else: - make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") - if args.make_plots_test: - if args.load: - make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") - else: - make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") - -## ----------------------------------------------------------- -# # to retrieve a stored variable in pkl file -# import pkl -# with open('../../test_tmp_delphes/experiments/PFNet7_gen_ntrain_2_nepochs_3_batch_size_3_lr_0.0001/confusion_matrix_plots/cmT_normed_epoch_0.pkl', 'rb') as f: # Python 3: open(..., 'rb') -# a = pkl.load(f) diff --git a/mlpf/pytorch_delphes/training_embeddings.py b/mlpf/pytorch_delphes/training_embeddings.py deleted file mode 100644 index 353473786..000000000 --- a/mlpf/pytorch_delphes/training_embeddings.py +++ /dev/null @@ -1,512 +0,0 @@ -from glob import glob -import sys, os -sys.path.insert(1, '../../plotting/') -sys.path.insert(1, '../../mlpf/plotting/') - -import os.path as osp -import pickle as pkl -import math, time, tqdm -import numpy as np -import pandas as pd -import sklearn -from sklearn.metrics import accuracy_score, confusion_matrix -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -import mplhep as hep - -#Check if the GPU configuration has been provided -import torch -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -try: - if not ("CUDA_VISIBLE_DEVICES" in os.environ): - import setGPU - if multi_gpu: - print('Will use multi_gpu..') - print("Let's use", torch.cuda.device_count(), "GPUs!") - else: - print('Will use single_gpu..') -except Exception as e: - print("Could not import setGPU, running CPU-only") - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') - print("GPU model:", torch.cuda.get_device_name(0)) -else: - device = torch.device('cpu') - -import torch_geometric -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch_geometric.nn import GravNetConv -from torch.utils.data import random_split -import torch_cluster - -import args -from args import parse_args -from graph_data_delphes import PFGraphDataset, one_hot_embedding -from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd -from plot_utils import plot_confusion_matrix - -import evaluate -from evaluate import make_plots, make_predictions -from model_embeddings import PFNet7 - -#Ignore 
divide by 0 errors -np.seterr(divide='ignore', invalid='ignore') - -#Get a unique directory name for the model -def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, alpha, task, title): - model_name = type(model).__name__ - model_params = sum(p.numel() for p in model.parameters()) - import hashlib - model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] - model_user = os.environ['USER'] - - model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_alpha_{}_{}_{}'.format( - model_name, - target_type, - n_train, - n_epochs, - batch_size, - lr, - alpha, - task, - title) - return model_fname - -def compute_weights(gen_ids_one_hot, device): - vs, cs = torch.unique(gen_ids_one_hot, return_counts=True) - weights = torch.zeros(output_dim_id).to(device=device) - for k, v in zip(vs, cs): - weights[k] = 1.0/math.sqrt(float(v)) - return weights - -def make_plot_from_list(l, label, xlabel, ylabel, outpath, save_as): - plt.style.use(hep.style.ROOT) - - if not os.path.exists(outpath + '/training_plots/'): - os.makedirs(outpath + '/training_plots/') - - fig, ax = plt.subplots() - ax.plot(range(len(l)), l, label=label) - ax.set_xlabel(xlabel) - ax.set_ylabel(ylabel) - ax.legend(loc='best') - plt.savefig(outpath + '/training_plots/' + save_as + '.png') - plt.close(fig) - - with open(outpath + '/training_plots/' + save_as + '.pkl', 'wb') as f: - pkl.dump(l, f) - -@torch.no_grad() -def test(model, loader, epoch, alpha, target_type, device): - with torch.no_grad(): - ret = train(model, loader, epoch, None, alpha, target_type, device) - return ret - -def train(model, loader, epoch, optimizer, alpha, target_type, device): - - is_train = not (optimizer is None) - - if is_train: - model.train() - else: - model.eval() - - #loss values for each batch: classification, regression, total - losses_1, losses_2, losses_tot = [], [], [] - - #accuracy values for each batch (monitor classification performance) - accuracies_batch, accuracies_batch_msk = [], [] - - #setup confusion matrix - conf_matrix = np.zeros((output_dim_id, output_dim_id)) - - # to compute average inference time - t=[] - - for i, batch in enumerate(loader): - t0 = time.time() - - if multi_gpu: - X = batch - else: - X = batch.to(device) - - ## make like tensorflow model, 0-padding events to 6k elements - # if X.x.shape[0]<6000: - # new_X = torch.cat([X.x,torch.zeros_like(X.x)[:6000-X.x.shape[0],:]]) - # new_ygen_id = torch.cat([X.ygen_id,torch.zeros_like(X.ygen_id)[:6000-X.x.shape[0],:]]) - # new_ygen_id[X.x.shape[0]:,0]=new_ygen_id[X.x.shape[0]:,0]+1 - # - # X.x = new_X - # X.ygen_id=new_ygen_id - - # Forwardprop - if i<10: - ti = time.time() - pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) - tf = time.time() - t.append(round((tf-ti),2)) - else: - pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) - - _, gen_ids = torch.max(gen_ids_one_hot, -1) - _, pred_ids = torch.max(pred_ids_one_hot, -1) - _, cand_ids = torch.max(cand_ids_one_hot, -1) # rule-based result - - # masking - msk = ((pred_ids != 0) & (gen_ids != 0)) - msk2 = ((pred_ids != 0) & (pred_ids == gen_ids)) - - # computing loss - weights = compute_weights(torch.max(gen_ids_one_hot,-1)[1], device) - l1 = torch.nn.functional.cross_entropy(pred_ids_one_hot, gen_ids, weight=weights) # for classifying PID - l2 = alpha * torch.nn.functional.mse_loss(pred_p4[msk2], gen_p4[msk2]) # for regressing p4 - - if args.classification_only: - loss = l1 - else: - loss = l1+l2 - - if 
is_train: - # BACKPROP - #print(list(model.parameters())[1].grad) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - losses_1.append(l1.detach().cpu().item()) - losses_2.append(l2.detach().cpu().item()) - losses_tot.append(loss.detach().cpu().item()) - - t1 = time.time() - - accuracies_batch.append(accuracy_score(gen_ids.detach().cpu().numpy(), pred_ids.detach().cpu().numpy())) - accuracies_batch_msk.append(accuracy_score(gen_ids[msk].detach().cpu().numpy(), pred_ids[msk].detach().cpu().numpy())) - - conf_matrix += sklearn.metrics.confusion_matrix(gen_ids.detach().cpu().numpy(), - np.argmax(pred_ids_one_hot.detach().cpu().numpy(),axis=1), labels=range(6)) - - print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), loss.detach().cpu().item(), t1-t0), end='\r', flush=True) - - print("Average Inference time is: ", round((sum(t) / len(t)),2), 'min') - - losses_1 = np.mean(losses_1) - losses_2 = np.mean(losses_2) - losses_tot = np.mean(losses_tot) - - acc = np.mean(accuracies_batch) - acc_msk = np.mean(accuracies_batch_msk) - - conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis] - - return losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm - - -def train_loop(): - t0_initial = time.time() - - losses_1_train, losses_2_train, losses_tot_train = [], [], [] - losses_1_valid, losses_2_valid, losses_tot_valid = [], [], [] - - accuracies_train, accuracies_msk_train = [], [] - accuracies_valid, accuracies_msk_valid = [], [] - - best_val_loss = 99999.9 - stale_epochs = 0 - - print("Training over {} epochs".format(args.n_epochs)) - for epoch in range(args.n_epochs): - t0 = time.time() - - if stale_epochs > patience: - print("breaking due to stale epochs") - break - - # training epoch - model.train() - losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm = train(model, train_loader, epoch, optimizer, args.alpha, args.target, device) - - losses_tot_train.append(losses_tot) - losses_1_train.append(losses_1) - losses_2_train.append(losses_2) - - accuracies_train.append(acc) - accuracies_msk_train.append(acc_msk) - - # validation step - model.eval() - losses_tot_v, losses_1_v, losses_2_v, acc_v, acc_msk_v, conf_matrix_v, conf_matrix_norm_v = test(model, valid_loader, epoch, args.alpha, args.target, device) - - losses_tot_valid.append(losses_tot_v) - losses_1_valid.append(losses_1_v) - losses_2_valid.append(losses_2_v) - - accuracies_valid.append(acc_v) - accuracies_msk_valid.append(acc_msk_v) - - # early-stopping - if losses_tot_v < best_val_loss: - best_val_loss = losses_tot_v - stale_epochs = 0 - else: - stale_epochs += 1 - - t1 = time.time() - - epochs_remaining = args.n_epochs - (epoch+1) - time_per_epoch = (t1 - t0_initial)/(epoch + 1) - eta = epochs_remaining*time_per_epoch/60 - - print("epoch={}/{} dt={:.2f}min train_loss={:.5f} valid_loss={:.5f} train_acc={:.5f} valid_acc={:.5f} train_acc_msk={:.5f} valid_acc_msk={:.5f} stale={} eta={:.1f}m".format( - epoch+1, args.n_epochs, - (t1-t0)/60, losses_tot_train[epoch], losses_tot_valid[epoch], accuracies_train[epoch], accuracies_valid[epoch], - accuracies_msk_train[epoch], accuracies_msk_valid[epoch], stale_epochs, eta)) - - torch.save(model.state_dict(), "{0}/epoch_{1}_weights.pth".format(outpath, epoch)) - - plot_confusion_matrix(conf_matrix_norm, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch), epoch=epoch) - plot_confusion_matrix(conf_matrix_norm_v, ["none", "ch.had", "n.had", "g", "el", 
"mu"], fname = outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch), epoch=epoch) - - torch.save(conf_matrix_norm, outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch) + '.pt') - torch.save(conf_matrix_norm_v, outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch) + '.pkl') - - make_plot_from_list(losses_tot_train, 'train loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_train') - make_plot_from_list(losses_1_train, 'train loss_1', 'Epochs', 'Loss', outpath, 'losses_1_train') - make_plot_from_list(losses_2_train, 'train loss_2', 'Epochs', 'Loss', outpath, 'losses_2_train') - - make_plot_from_list(losses_tot_valid, 'valid loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_valid') - make_plot_from_list(losses_1_valid, 'valid loss_1', 'Epochs', 'Loss', outpath, 'losses_1_valid') - make_plot_from_list(losses_2_valid, 'valid loss_2', 'Epochs', 'Loss', outpath, 'losses_2_valid') - - make_plot_from_list(accuracies_train, 'train accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_train') - make_plot_from_list(accuracies_msk_train, 'train accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_train') - - make_plot_from_list(accuracies_valid, 'valid accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_valid') - make_plot_from_list(accuracies_msk_valid, 'valid accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_valid') - - print('Done with training.') - - return - -if __name__ == "__main__": - - args = parse_args() - - # # the next part initializes some args values (to run the script not from terminal) - # class objectview(object): - # def __init__(self, d): - # self.__dict__ = d - # - # args = objectview({'train': True, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 15, 'patience': 100, 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, - # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../test_tmp_delphes/data/pythia8_qcd', - # 'outpath': '../../prp/models/yee/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, - # 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, 'overwrite': True, - # 'load': False, 'load_epoch': 9, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', - # 'classification_only': False, 'nn1': True, 'nn3': True, 'encoding_of_clusters': False, 'embedding_dim': 0, 'nn0track': False, 'nn0cluster': False, 'title': 'noembeddings', - # 'make_predictions_train': True, 'make_plots_train': True, 'make_predictions_valid': True, 'make_plots_valid': True, 'make_predictions_test': True, 'make_plots_test': True}) - - # define the dataset (assumes the data exists as .pt files in "processed") - print('Processing the data..') - full_dataset_ttbar = PFGraphDataset(args.dataset) - full_dataset_qcd = PFGraphDataset(args.dataset_qcd) - - # constructs a loader from the data to iterate over batches - print('Constructing data loaders..') - train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) - test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) - - # element parameters - input_dim = 12 - - #one-hot particle ID and momentum - output_dim_id = 6 - output_dim_p4 = 6 - - patience = args.patience - - model_classes = {"PFNet7": PFNet7} - - model_class = model_classes[args.model] - model_kwargs = {'input_dim': input_dim, - 'hidden_dim': args.hidden_dim, - 
'hidden_dim_nn1': args.hidden_dim_nn1, - 'input_encoding': args.input_encoding, - 'encoding_dim': args.encoding_dim, - 'embedding_dim': args.embedding_dim, - 'encoding_of_clusters': args.encoding_of_clusters, - 'output_dim_id': output_dim_id, - 'output_dim_p4': output_dim_p4, - 'space_dim': args.space_dim, - 'propagate_dimensions': args.propagate_dimensions, - 'nearest': args.nearest, - 'target': args.target, - 'nn1': args.nn1, - 'nn3': args.nn3, - 'nn0track': args.nn0track, - 'nn0cluster': args.nn0cluster} - - if args.load: - print('Loading a previously trained model..') - model = model_class(**model_kwargs) - outpath = args.outpath + args.load_model - PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' - - state_dict = torch.load(PATH, map_location=device) - - if "DataParallel" in args.load_model: # if the model was trained using DataParallel then we do this - state_dict = torch.load(PATH, map_location=device) - from collections import OrderedDict - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - name = k[7:] # remove module. - new_state_dict[name] = v - # print('name is:', name) - state_dict=new_state_dict - - model.load_state_dict(state_dict) - - if multi_gpu: - model = torch_geometric.nn.DataParallel(model) - #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP - - model.to(device) - - if args.train: - print("Training a previously trained model..") - - elif args.train: - #instantiate the model - print('Instantiating a model..') - model = model_class(**model_kwargs) - - if multi_gpu: - print("Parallelizing the training..") - model = torch_geometric.nn.DataParallel(model) - #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP - - model.to(device) - - if args.train: - args.title=args.title+'noskip' - if args.nn1: - args.title=args.title+'_nn1' - if args.nn3: - args.title=args.title+'_nn3' - - if args.classification_only: - model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "clf", args.title) - else: - model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "both", args.title) - - outpath = osp.join(args.outpath, model_fname) - if osp.isdir(outpath): - if args.overwrite: - print("model output {} already exists, deleting it".format(outpath)) - import shutil - shutil.rmtree(outpath) - else: - print("model output {} already exists, please delete it".format(outpath)) - sys.exit(0) - try: - os.makedirs(outpath) - except Exception as e: - pass - - with open('{}/model_kwargs.pkl'.format(outpath), 'wb') as f: - pkl.dump(model_kwargs, f, protocol=pkl.HIGHEST_PROTOCOL) - - if not os.path.exists(outpath + '/confusion_matrix_plots/'): - os.makedirs(outpath + '/confusion_matrix_plots/') - - if args.optimizer == "adam": - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) - elif args.optimizer == "adamw": - optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr) - - print(model) - print(model_fname) - - model.train() - train_loop() - - model.eval() - - # evaluate on training data.. 
- if not osp.isdir(outpath+'/train_loader'): - os.makedirs(outpath+'/train_loader') - if not osp.isdir(outpath+'/train_loader/resolution_plots'): - os.makedirs(outpath+'/train_loader/resolution_plots') - if not osp.isdir(outpath+'/train_loader/distribution_plots'): - os.makedirs(outpath+'/train_loader/distribution_plots') - if not osp.isdir(outpath+'/train_loader/multiplicity_plots'): - os.makedirs(outpath+'/train_loader/multiplicity_plots') - if not osp.isdir(outpath+'/train_loader/efficiency_plots'): - os.makedirs(outpath+'/train_loader/efficiency_plots') - - if args.make_predictions_train: - make_predictions(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") - if args.make_plots_train: - make_plots(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") - - # evaluate on validation data.. - if not osp.isdir(outpath+'/valid_loader'): - os.makedirs(outpath+'/valid_loader') - if not osp.isdir(outpath+'/valid_loader/resolution_plots'): - os.makedirs(outpath+'/valid_loader/resolution_plots') - if not osp.isdir(outpath+'/valid_loader/distribution_plots'): - os.makedirs(outpath+'/valid_loader/distribution_plots') - if not osp.isdir(outpath+'/valid_loader/multiplicity_plots'): - os.makedirs(outpath+'/valid_loader/multiplicity_plots') - if not osp.isdir(outpath+'/valid_loader/efficiency_plots'): - os.makedirs(outpath+'/valid_loader/efficiency_plots') - - if args.make_predictions_valid: - make_predictions(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") - if args.make_plots_valid: - make_plots(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") - - # evaluate on testing data.. 
- if not osp.isdir(outpath+'/test_loader'): - os.makedirs(outpath+'/test_loader') - if not osp.isdir(outpath+'/test_loader/resolution_plots'): - os.makedirs(outpath+'/test_loader/resolution_plots') - if not osp.isdir(outpath+'/test_loader/distribution_plots'): - os.makedirs(outpath+'/test_loader/distribution_plots') - if not osp.isdir(outpath+'/test_loader/multiplicity_plots'): - os.makedirs(outpath+'/test_loader/multiplicity_plots') - if not osp.isdir(outpath+'/test_loader/efficiency_plots'): - os.makedirs(outpath+'/test_loader/efficiency_plots') - - if args.make_predictions_test: - if args.load: - make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") - else: - make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") - if args.make_plots_test: - if args.load: - make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") - else: - make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") - - -## ----------------------------------------------------------- -# to retrieve a stored variable in pkl file -# import pickle as pkl -# with open('../../test_tmp_delphes/experiments/PFNet7_gen_ntrain_2_nepochs_3_batch_size_3_lr_0.0001/confusion_matrix_plots/cmT_normed_epoch_0.pkl', 'rb') as f: # Python 3: open(..., 'rb') -# a = pkl.load(f) -# -# with open('../../data/pythia8_qcd/raw/tev14_pythia8_qcd_10_0.pkl', 'rb') as pickle_file: -# data = pkl.load(pickle_file) -# -# data.keys() diff --git a/scripts/local_test_delphes_pytorch.sh b/scripts/local_test_delphes_pytorch.sh index c0b7cd410..d3ab69d47 100755 --- a/scripts/local_test_delphes_pytorch.sh +++ b/scripts/local_test_delphes_pytorch.sh @@ -45,8 +45,8 @@ rm -Rf experiments/* cd ../mlpf/pytorch_delphes/ #run the pytorch training -echo Begining the training.. -python3 training.py \ +echo Beginning the training.. +python3 pipeline.py \ --n_epochs=10 --n_train=1 --n_valid=1 --n_test=1 --batch_size=4 \ --dataset='../../test_tmp_delphes/data/pythia8_ttbar' \ --dataset_qcd='../../test_tmp_delphes/data/pythia8_qcd' \ From 43aa41b0c78f632163a23eb7d679a67ca93bb7fc Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 27 Jul 2021 13:06:56 +0200 Subject: [PATCH 023/157] add multi-gpu flag for data_preprocessing --- mlpf/pytorch_delphes/data_preprocessing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlpf/pytorch_delphes/data_preprocessing.py b/mlpf/pytorch_delphes/data_preprocessing.py index f504620d3..6e414f690 100644 --- a/mlpf/pytorch_delphes/data_preprocessing.py +++ b/mlpf/pytorch_delphes/data_preprocessing.py @@ -4,6 +4,8 @@ # if not multigpu we have to pass batches that are stacked as "batch.type() = Batch" (not list) so that pytorch can access attributes like ygen_id through batch.ygen_id # if multigpu we have to pass list of "Data" elements.. 
then behind the scene, pytorch DP will convert the list to appropriate Batches to fit on the gpus available so that batch.ygen_id works out of the box +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 # define a function that casts the ttbar dataset into a dataloader for efficient NN training def data_to_loader_ttbar(full_dataset, n_train, n_valid, batch_size): From ea41fad5e902b7cdec8de397d9e763e9c49b6d17 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Wed, 28 Jul 2021 15:10:03 +0200 Subject: [PATCH 024/157] feat: Add early stopping to hypertune Also - simplify JUWELS and Flatiron slurm scripts - turn off histogram writing in CustomTensorboard when hypertuning --- mlpf/flatiron/hypertune.slurm | 4 ++-- mlpf/hypertune_scripts/run_chief.sh | 2 +- mlpf/hypertune_scripts/run_tuner.sh | 2 +- mlpf/juwels/hypertune.slurm | 6 +++--- mlpf/pipeline.py | 9 +++++++-- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/mlpf/flatiron/hypertune.slurm b/mlpf/flatiron/hypertune.slurm index 9ff411e7c..4c9596f73 100644 --- a/mlpf/flatiron/hypertune.slurm +++ b/mlpf/flatiron/hypertune.slurm @@ -55,7 +55,7 @@ export ip_head echo "IP Head: $ip_head" echo "Starting HEAD at $head_node" -srun --nodes=1 --ntasks=1 -w "$head_node" mlpf/hypertune_scripts/run_chief.sh "chief" $head_node_ip $port "/mnt/ceph/users/ewulff/hypertune_out_${SLURM_JOB_ID}" &> logs_slurm/chief_${SLURM_JOB_ID} & +srun --nodes=1 --ntasks=1 -w "$head_node" mlpf/hypertune_scripts/run_chief.sh "chief" $head_node_ip $port $1 "/mnt/ceph/users/ewulff/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}" &> logs_slurm/chief_${SLURM_JOB_ID} & sleep 5 # number of nodes other than the head node @@ -66,7 +66,7 @@ for ((i = 1; i <= worker_num; i++)); do echo "Starting WORKER $i at $node_i" tunerID="tuner$i" srun --nodes=1 --ntasks=1 -w "$node_i" \ - mlpf/hypertune_scripts/run_tuner.sh $tunerID $head_node_ip $port "/mnt/ceph/users/ewulff/hypertune_out_${SLURM_JOB_ID}" &> logs_slurm/tuner_${SLURM_JOB_ID}_${i} & + mlpf/hypertune_scripts/run_tuner.sh $tunerID $head_node_ip $port $1 "/mnt/ceph/users/ewulff/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}" &> logs_slurm/tuner_${SLURM_JOB_ID}_${i} & sleep 1 done wait # keep the wait statement, it is important diff --git a/mlpf/hypertune_scripts/run_chief.sh b/mlpf/hypertune_scripts/run_chief.sh index cbc040cb7..f91a8512f 100755 --- a/mlpf/hypertune_scripts/run_chief.sh +++ b/mlpf/hypertune_scripts/run_chief.sh @@ -13,5 +13,5 @@ echo $KERASTUNER_ORACLE_PORT nvidia-smi echo 'Starting chief.' -CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py hypertune -c parameters/cms-gnn-dense-short.yaml -o $4 +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py hypertune -c $4 -o $5 echo 'Chief done.' \ No newline at end of file diff --git a/mlpf/hypertune_scripts/run_tuner.sh b/mlpf/hypertune_scripts/run_tuner.sh index 663e2902c..efcf648ae 100755 --- a/mlpf/hypertune_scripts/run_tuner.sh +++ b/mlpf/hypertune_scripts/run_tuner.sh @@ -13,5 +13,5 @@ echo $KERASTUNER_ORACLE_PORT nvidia-smi echo 'Starting tuner.' -CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py hypertune -c parameters/cms-gnn-dense-short.yaml -o $4 +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py hypertune -c $4 -o $5 echo 'Tuner done.' 
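The chief and tuner wrapper scripts above coordinate a single distributed keras-tuner study through the KERASTUNER_* environment variables (both scripts echo KERASTUNER_ORACLE_PORT before launching the pipeline). As a rough sketch, each node is assumed to run something like the following after this change; the mapping of the positional arguments $1-$3 onto the environment variables is an assumption for illustration, and only the hypertune invocation with -c $4 -o $5 is taken from the diffs above:

# sketch only: assumed per-node setup for the distributed keras-tuner search
export KERASTUNER_TUNER_ID="$1"        # "chief" on the head node, "tuner1", "tuner2", ... on the workers
export KERASTUNER_ORACLE_IP="$2"       # head-node IP distributed by the slurm script
export KERASTUNER_ORACLE_PORT="$3"     # oracle port chosen by the slurm script
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py hypertune -c "$4" -o "$5"
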
diff --git a/mlpf/juwels/hypertune.slurm b/mlpf/juwels/hypertune.slurm index 723bbd953..ecacda15f 100644 --- a/mlpf/juwels/hypertune.slurm +++ b/mlpf/juwels/hypertune.slurm @@ -2,7 +2,7 @@ #SBATCH --account=prcoe12 #SBATCH --partition=booster -#SBATCH --time 0:59:59 +#SBATCH --time 23:59:59 #SBATCH --nodes 4 #SBATCH --tasks-per-node=1 #SBATCH --gres=gpu:4 @@ -61,7 +61,7 @@ export ip_head echo "IP Head: $ip_head" echo "Starting HEAD at $head_node" -srun --nodes=1 --ntasks=1 -w "$head_node" mlpf/hypertune_scripts/run_chief.sh "chief" $head_node_ip $port "/p/project/prcoe12/wulff1/hypertune_out_${SLURM_JOB_ID}" &> logs_slurm/chief_${SLURM_JOB_ID} & +srun --nodes=1 --ntasks=1 -w "$head_node" mlpf/hypertune_scripts/run_chief.sh "chief" $head_node_ip $port $1 "/p/project/prcoe12/wulff1/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}" &> logs_slurm/chief_${SLURM_JOB_ID} & sleep 5 # number of nodes other than the head node @@ -72,7 +72,7 @@ for ((i = 1; i <= worker_num; i++)); do echo "Starting WORKER $i at $node_i" tunerID="tuner$i" srun --nodes=1 --ntasks=1 -w "$node_i" \ - mlpf/hypertune_scripts/run_tuner.sh $tunerID $head_node_ip $port "/p/project/prcoe12/wulff1/hypertune_out_${SLURM_JOB_ID}" &> logs_slurm/tuner_${SLURM_JOB_ID}_${i} & + mlpf/hypertune_scripts/run_tuner.sh $tunerID $head_node_ip $port $1 "/p/project/prcoe12/wulff1/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}" &> logs_slurm/tuner_${SLURM_JOB_ID}_${i} & sleep 1 done wait # keep the wait statement, it is important! diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 956758b52..05ed3ac31 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -319,6 +319,7 @@ def delete_all_but_best_ckpt(train_dir, dry_run): @click.option("--ntest", default=None, help="override the number of testing events", type=int) @click.option("-r", "--recreate", help="overwrite old hypertune results", is_flag=True, default=False) def hypertune(config, outdir, ntrain, ntest, recreate): + config_file_path = config config, _, global_batch_size, n_train, n_test, n_epochs, _ = parse_config(config, ntrain, ntest) # Override number of epochs with max_epochs from Hyperband config if specified @@ -335,13 +336,16 @@ def hypertune(config, outdir, ntrain, ntest, recreate): model_builder, optim_callbacks = hypertuning.get_model_builder(config, total_steps) tb = CustomTensorBoard( - log_dir=outdir + "/tensorboard_logs", histogram_freq=1, write_graph=False, write_images=False, + log_dir=outdir + "/tensorboard_logs", histogram_freq=0, write_graph=False, write_images=False, update_freq=1, ) # Change the class name of CustomTensorBoard TensorBoard to make keras_tuner recognise it tb.__class__.__name__ = "TensorBoard" tuner = get_tuner(config["hypertune"], model_builder, outdir, recreate, strategy) + tuner.search_space_summary() + + callbacks = [tb] + optim_callbacks + [tf.keras.callbacks.EarlyStopping(patience=20, monitor='val_loss')] tuner.search( ds_train_r, @@ -350,9 +354,10 @@ def hypertune(config, outdir, ntrain, ntest, recreate): steps_per_epoch=n_train // global_batch_size, validation_steps=n_test // global_batch_size, #callbacks=[tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss')] - callbacks=[tb] + optim_callbacks, + callbacks=callbacks, ) print("Hyperparamter search complete.") + shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference tuner.results_summary() for trial in tuner.oracle.get_best_trials(num_trials=10): From 5cb60020975a3b69f67c3f120993aa3e16982a6f Mon Sep 17 00:00:00 2001 From: 
Joosep Pata Date: Sun, 1 Aug 2021 11:47:13 +0300 Subject: [PATCH 025/157] add single class recall, detailed plots --- mlpf/pipeline.py | 6 ++ mlpf/tfmodel/model_setup.py | 159 +++++++++++++++++++++--------------- 2 files changed, 100 insertions(+), 65 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 742e9b99b..2a7e6335a 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -23,6 +23,7 @@ LearningRateLoggingCallback, prepare_callbacks, FlattenedCategoricalAccuracy, + SingleClassRecall, eval_model, freeze_model, ) @@ -129,6 +130,11 @@ def train(config, weights, ntrain, ntest, recreate, prefix): "cls": [ FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), + ] + [ + SingleClassRecall( + icls, + name="rec_cls{}".format(icls), + dtype=tf.float64) for icls in range(config["dataset"]["num_output_classes"]) ] }, ) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index e2a7a4af4..2526394aa 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -61,8 +61,13 @@ class CustomCallback(tf.keras.callbacks.Callback): def __init__(self, outpath, X, y, dataset_transform, num_output_classes): super(CustomCallback, self).__init__() self.X = X + self.y = y - self.dataset_transform = dataset_transform + + #transform the prediction target from an array into a dictionary for easier access + self.ytrue = dataset_transform(self.X, self.y, None)[1] + self.ytrue_id = np.argmax(self.ytrue["cls"], axis=-1) + self.outpath = outpath self.num_output_classes = num_output_classes @@ -81,107 +86,131 @@ def __init__(self, outpath, X, y, dataset_transform, num_output_classes): 11: "gray" } - def on_epoch_end(self, epoch, logs=None): - - with open("{}/history_{}.json".format(self.outpath, epoch), "w") as fi: - json.dump(logs, fi) + self.reg_bins = { + "pt": np.linspace(0, 50, 100), + "eta": np.linspace(-5, 5, 100), + "sin_phi": np.linspace(-1,1,100), + "cos_phi": np.linspace(-1,1,100), + "energy": np.linspace(0,100,100), + } - ypred = self.model(self.X, training=False) - #ypred["cls"] = np.clip(ypred["cls"], 0.5, 1.0) - - ypred_id = np.argmax(ypred["cls"], axis=-1) + def plot_cm(self, outpath, ypred_id, msk): - ibatch = 0 - - msk = self.X[:, :, 0] != 0 - # cm = sklearn.metrics.confusion_matrix( - # self.y[msk][:, 0].astype(np.int64).flatten(), - # ypred_id[msk].flatten(), labels=list(range(self.num_output_classes)) - # ) - # figure = plot_confusion_matrix(cm) - # plt.savefig("{}/cm_{}.pdf".format(self.outpath, epoch), bbox_inches="tight") - # plt.close("all") + ytrue_id_flat = self.ytrue_id[msk].astype(np.int64).flatten() + ypred_id_flat = ypred_id[msk].flatten() cm = sklearn.metrics.confusion_matrix( - self.y[msk][:, 0].astype(np.int64).flatten(), - ypred_id[msk].flatten(), labels=list(range(self.num_output_classes)), normalize="true" + ytrue_id_flat, + ypred_id_flat, labels=list(range(self.num_output_classes)), normalize="true" ) figure = plot_confusion_matrix(cm) acc = sklearn.metrics.accuracy_score( - self.y[msk][:, 0].astype(np.int64).flatten(), - ypred_id[msk].flatten() + ytrue_id_flat, + ypred_id_flat ) balanced_acc = sklearn.metrics.balanced_accuracy_score( - self.y[msk][:, 0].astype(np.int64).flatten(), - ypred_id[msk].flatten() + ytrue_id_flat, + ypred_id_flat ) plt.title("acc={:.3f} bacc={:.3f}".format(acc, balanced_acc)) - plt.savefig("{}/cm_normed_{}.pdf".format(self.outpath, epoch), bbox_inches="tight") + plt.savefig(str(outpath / 
"cm_normed.pdf"), bbox_inches="tight") plt.close("all") - # for icls in range(self.num_output_classes): - # fig = plt.figure(figsize=(4,4)) - # msk = self.y[:, :, 0] == icls - # msk = msk.flatten() - # b = np.linspace(0,1,21) - # ids = ypred["cls"][:, :, icls].numpy().flatten() - # plt.hist(ids[msk], bins=b, density=True, histtype="step", lw=2) - # plt.hist(ids[~msk], bins=b, density=True, histtype="step", lw=2) - # plt.savefig("{}/cls{}_{}.pdf".format(self.outpath, icls, epoch), bbox_inches="tight") - # for icls in range(self.num_output_classes): - # n_pred = np.sum(self.y[:, :, 0]==icls, axis=1) - # n_true = np.sum(ypred_id==icls, axis=1) - # figure = plot_num_particle(n_pred, n_true, icls) - # plt.savefig("{}/num_cls{}_{}.pdf".format(self.outpath, icls, epoch), bbox_inches="tight") + def plot_event_visualization(self, outpath, ypred, ypred_id, msk, ievent=0): fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(3*5, 5)) + #Plot the input PFElements plt.axes(ax1) - msk = self.X[ibatch, :, 0] != 0 - eta = self.X[ibatch][msk][:, 2] - phi = self.X[ibatch][msk][:, 3] - energy = self.X[ibatch][msk][:, 4] - typ = self.X[ibatch][msk][:, 0] + msk = self.X[ievent, :, 0] != 0 + eta = self.X[ievent][msk][:, 2] + phi = self.X[ievent][msk][:, 3] + energy = self.X[ievent][msk][:, 4] + typ = self.X[ievent][msk][:, 0] plt.scatter(eta, phi, marker="o", s=energy, c=[self.color_map[p] for p in typ], alpha=0.5, linewidths=0) plt.xlim(-8,8) plt.ylim(-4,4) - plt.axes(ax3) #Plot the predicted particles - msk = ypred_id[ibatch] != 0 - eta = ypred["eta"][ibatch][msk] - sphi = ypred["sin_phi"][ibatch][msk] - cphi = ypred["cos_phi"][ibatch][msk] + plt.axes(ax3) + msk = ypred_id[ievent] != 0 + eta = ypred["eta"][ievent][msk] + sphi = ypred["sin_phi"][ievent][msk] + cphi = ypred["cos_phi"][ievent][msk] phi = np.arctan2(sphi, cphi) - energy = ypred["energy"][ibatch][msk] - pdgid = ypred_id[ibatch][msk] + energy = ypred["energy"][ievent][msk] + pdgid = ypred_id[ievent][msk] plt.scatter(eta, phi, marker="o", s=energy, c=[self.color_map[p] for p in pdgid], alpha=0.5, linewidths=0) plt.xlim(-8,8) plt.ylim(-4,4) - # Xconcat = np.concatenate([self.X[ibatch], ypred["cls"][ibatch]], axis=-1) - # np.savez(self.outpath + "/event_{}.npz".format(epoch), Xconcat[Xconcat[:, 0]!=0]) - #Plot the target particles plt.axes(ax2) - y = self.dataset_transform(self.X, self.y, None)[1] - y_id = np.argmax(y["cls"], axis=-1) - msk = y_id[ibatch] != 0 - eta = y["eta"][ibatch][msk] - sphi = y["sin_phi"][ibatch][msk] - cphi = y["cos_phi"][ibatch][msk] + + msk = self.ytrue_id[ievent] != 0 + eta = self.ytrue["eta"][ievent][msk] + sphi = self.ytrue["sin_phi"][ievent][msk] + cphi = self.ytrue["cos_phi"][ievent][msk] phi = np.arctan2(sphi, cphi) - energy = y["energy"][ibatch][msk] - pdgid = y_id[ibatch][msk] + energy = self.ytrue["energy"][ievent][msk] + pdgid = self.ytrue_id[ievent][msk] plt.scatter(eta, phi, marker="o", s=energy, c=[self.color_map[p] for p in pdgid], alpha=0.5, linewidths=0) plt.xlim(-8,8) plt.ylim(-4,4) - plt.savefig("{}/event_{}.pdf".format(self.outpath, epoch), bbox_inches="tight") + plt.savefig(str(outpath / "event_iev{}.pdf".format(ievent)), bbox_inches="tight") + plt.close("all") + + def plot_reg_distribution(self, outpath, ypred, ypred_id, msk, icls, reg_variable): + vals_pred = ypred[reg_variable].numpy()[msk][ypred_id[msk]==icls].flatten() + vals_true = self.ytrue[reg_variable][msk][self.ytrue_id[msk]==icls].flatten() + + bins = self.reg_bins[reg_variable] + plt.hist(vals_true, bins=bins, histtype="step", lw=2, 
label="true") + plt.hist(vals_pred, bins=bins, histtype="step", lw=2, label="predicted") + + if reg_variable in ["pt", "energy"]: + plt.yscale("log") + plt.ylim(bottom=1e-2) + + plt.xlabel(reg_variable) + plt.ylabel("Number of particles") + plt.legend(loc="best") + plt.title("Regression output, cls {}".format(icls)) + plt.savefig(str(outpath / "{}_cls{}.pdf".format(reg_variable, icls)), bbox_inches="tight") plt.close("all") - np.savez("{}/pred_{}.npz".format(self.outpath, epoch), X=self.X, ytrue=self.y, **ypred) + def on_epoch_end(self, epoch, logs=None): + + #save the training logs (losses) for this epoch + with open("{}/history_{}.json".format(self.outpath, epoch), "w") as fi: + json.dump(logs, fi) + + cp_dir = Path(self.outpath) / "epoch_{}".format(epoch) + cp_dir.mkdir(parents=True, exist_ok=True) + + #run the model inference on the small validation dataset + ypred = self.model(self.X, training=False) + + #choose the class with the highest probability as the prediction + #this is a shortcut, in actual inference, we may want to apply additional per-class thresholds + ypred_id = np.argmax(ypred["cls"], axis=-1) + + #exclude padded elements from the plotting + msk = self.X[:, :, 0] != 0 + + self.plot_cm(cp_dir, ypred_id, msk) + for ievent in range(min(5, self.X.shape[0])): + self.plot_event_visualization(cp_dir, ypred, ypred_id, msk, ievent=ievent) + + for icls in range(1, self.num_output_classes): + cp_dir_cls = cp_dir / "cls_{}".format(icls) + cp_dir_cls.mkdir(parents=True, exist_ok=True) + for variable in ["pt", "eta", "sin_phi", "cos_phi", "energy"]: + self.plot_reg_distribution(cp_dir_cls, ypred, ypred_id, msk, icls, variable) + + np.savez(str(cp_dir/"pred.npz"), X=self.X, ytrue=self.y, **ypred) def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output_classes): callbacks = [] From cb0712953d8edb91f6706926d48fd1f262deacfd Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sun, 1 Aug 2021 17:35:23 +0300 Subject: [PATCH 026/157] added learnable kernel --- mlpf/tfmodel/model.py | 48 +++++++++++++++---- mlpf/tfmodel/model_setup.py | 1 + ...-dense-big.yaml => cms-gnn-dense-dev.yaml} | 33 +++++++++---- 3 files changed, 64 insertions(+), 18 deletions(-) rename parameters/{cms-gnn-dense-big.yaml => cms-gnn-dense-dev.yaml} (76%) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 9e148a6ef..006ef73e6 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -36,6 +36,23 @@ def pairwise_gaussian_dist(A, B): D = tf.sqrt(tf.maximum(na - 2*tf.matmul(A, B, False, True) + nb, 1e-6)) return D +def pairwise_learnable_dist(A, B, ffn): + shp = tf.shape(A) + + #stack node feature vectors of src[i], dst[j] into a matrix res[i,j] = (src[i], dst[j]) + a, b, c, d = tf.meshgrid(tf.range(shp[0]), tf.range(shp[1]), tf.range(shp[2]), tf.range(shp[2]), indexing="ij") + inds1 = tf.stack([a,b,c], axis=-1) + inds2 = tf.stack([a,b,d], axis=-1) + res = tf.concat([ + tf.gather_nd(A, inds1), + tf.gather_nd(B, inds2)], axis=-1 + ) #(batch, bin, elem, elem, feat) + + #run a feedforward net on (src, dst) -> 1 + res_transformed = tf.squeeze(ffn(res), axis=-1) + + return res_transformed + def pairwise_sigmoid_dist(A, B): return tf.nn.sigmoid(tf.matmul(A, tf.transpose(B, perm=[0,2,1]))) @@ -400,15 +417,24 @@ def construct_sparse_dm_batch(self, points): return bins_split, sparse_distance_matrix -class ExponentialLSHDistanceDense(tf.keras.layers.Layer): +class GraphBuilderDense(tf.keras.layers.Layer): def __init__(self, clip_value_low=0.0, distance_dim=128, max_num_bins=200, 
bin_size=128, dist_mult=0.1, **kwargs): - super(ExponentialLSHDistanceDense, self).__init__(**kwargs) self.dist_mult = dist_mult self.distance_dim = distance_dim self.max_num_bins = max_num_bins self.bin_size = bin_size self.clip_value_low = clip_value_low - + + self.kernel = kwargs.pop("kernel") + + if self.kernel == "learnable": + self.ffn_dist = point_wise_feed_forward_network(1, 32, num_layers=2, activation="elu") + elif self.kernel == "gaussian": + pass + + super(GraphBuilderDense, self).__init__(**kwargs) + + def build(self, input_shape): #(n_batch, n_points, n_features) @@ -436,9 +462,12 @@ def call(self, x_dist, x_features, msk): x_features_binned = tf.gather(x_features, bins_split, batch_dims=1) msk_f_binned = tf.gather(msk_f, bins_split, batch_dims=1) - dm = pairwise_gaussian_dist(x_dist_binned, x_dist_binned) - dm = tf.exp(-self.dist_mult*dm) - + if self.kernel == "learnable": + dm = pairwise_learnable_dist(x_dist_binned, x_dist_binned, self.ffn_dist) + elif self.kernel == "gaussian": + dm = pairwise_gaussian_dist(x_dist_binned, x_dist_binned) + dm = tf.exp(-self.dist_mult*dm) + #set the distance matrix to 0 for masked elements dm *= msk_f_binned shp = tf.shape(msk_f_binned) @@ -861,12 +890,13 @@ def __init__(self, *args, **kwargs): self.num_conv = kwargs.pop("num_conv") self.normalize_degrees = kwargs.pop("normalize_degrees") self.dropout = kwargs.pop("dropout") + self.kernel = kwargs.pop("kernel") if self.do_layernorm: self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6) self.ffn_dist = point_wise_feed_forward_network(self.distance_dim, self.distance_dim) - self.dist = ExponentialLSHDistanceDense(clip_value_low=self.clip_value_low, distance_dim=self.distance_dim, max_num_bins=self.max_num_bins , bin_size=self.bin_size, dist_mult=self.dist_mult) + self.dist = GraphBuilderDense(clip_value_low=self.clip_value_low, distance_dim=self.distance_dim, max_num_bins=self.max_num_bins , bin_size=self.bin_size, dist_mult=self.dist_mult, kernel=self.kernel) self.convs = [GHConvDense( activation=tf.keras.activations.elu, output_dim=self.output_dim, @@ -916,6 +946,7 @@ def __init__(self, separate_momentum=True, input_encoding="cms", focal_loss_from_logits=False, + graph_kernel="gaussian", debug=False ): super(PFNetDense, self).__init__() @@ -949,7 +980,8 @@ def __init__(self, "clip_value_low": clip_value_low, "num_conv": num_conv, "normalize_degrees": normalize_degrees, - "dropout": dropout + "dropout": dropout, + "kernel": graph_kernel } self.cg_id = [CombinedGraphLayer(**kwargs_cg) for i in range(num_gsl)] self.cg_reg = [CombinedGraphLayer(**kwargs_cg) for i in range(num_gsl)] diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 2526394aa..74ae626c1 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -324,6 +324,7 @@ def make_gnn_dense(config, dtype): "dropout", "separate_momentum", "input_encoding", + "graph_kernel", "debug" ] diff --git a/parameters/cms-gnn-dense-big.yaml b/parameters/cms-gnn-dense-dev.yaml similarity index 76% rename from parameters/cms-gnn-dense-big.yaml rename to parameters/cms-gnn-dense-dev.yaml index aa3de4a6f..958663dcd 100644 --- a/parameters/cms-gnn-dense-big.yaml +++ b/parameters/cms-gnn-dense-dev.yaml @@ -23,13 +23,13 @@ dataset: padded_num_elem_size: 6400 #(pt, eta, sin phi, cos phi, E) num_momentum_outputs: 5 - classification_loss_coef: 1.0 + classification_loss_coef: 5.0 charge_loss_coef: 0.1 pt_loss_coef: 1.0 eta_loss_coef: 0.1 sin_phi_loss_coef: 1.0 cos_phi_loss_coef: 1.0 - 
energy_loss_coef: 1.0 + energy_loss_coef: 0.01 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 @@ -41,30 +41,43 @@ tensorflow: setup: train: yes weights: - lr: 1e-5 - batch_size: 1 + weights_config: + lr: 1e-4 + batch_size: 5 num_events_train: 80000 num_events_test: 10000 num_epochs: 500 num_val_files: 100 dtype: float32 - sample_weights: inverse_sqrt trainable: all classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + parameters: model: gnn_dense activation: elu layernorm: no hidden_dim: 256 - bin_size: 3200 + bin_size: 40 clip_value_low: 0.0 - num_conv: 3 - num_gsl: 3 + num_conv: 2 + num_gsl: 2 normalize_degrees: yes - distance_dim: 128 - dropout: 0.0 + distance_dim: 8 + dropout: 0.2 + separate_momentum: yes + input_encoding: cms + graph_kernel: learnable #gaussian, learnable + debug: no timing: num_ev: 100 From 1645d626755706e4fc28ba45ed7a1cce82115f00 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sun, 1 Aug 2021 17:57:30 +0300 Subject: [PATCH 027/157] up --- parameters/cms-gnn-dense.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index 49d2e1abc..e3c60308f 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -76,6 +76,7 @@ parameters: dropout: 0.2 separate_momentum: yes input_encoding: cms + graph_kernel: gaussian #gaussian, learnable debug: no timing: From 117bc6ee0c287797a7c8925f6a68cbf6f0c88c67 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sun, 1 Aug 2021 18:04:48 +0300 Subject: [PATCH 028/157] update cls mult --- mlpf/tallinn/cms-gnn-dense-dev.sh | 10 ++++++++++ parameters/cms-gnn-dense-dev.yaml | 2 +- parameters/cms-gnn-dense.yaml | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100755 mlpf/tallinn/cms-gnn-dense-dev.sh diff --git a/mlpf/tallinn/cms-gnn-dense-dev.sh b/mlpf/tallinn/cms-gnn-dense-dev.sh new file mode 100755 index 000000000..f189465cf --- /dev/null +++ b/mlpf/tallinn/cms-gnn-dense-dev.sh @@ -0,0 +1,10 @@ +#!/bin/bash +#SBATCH -p gpu +#SBATCH --gpus 5 +#SBATCH --mem-per-gpu=8G + +IMG=/home/software/singularity/base.simg:latest +cd ~/particleflow + +#TF training +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/cms-gnn-dense-dev.yaml diff --git a/parameters/cms-gnn-dense-dev.yaml b/parameters/cms-gnn-dense-dev.yaml index 958663dcd..dedba952a 100644 --- a/parameters/cms-gnn-dense-dev.yaml +++ b/parameters/cms-gnn-dense-dev.yaml @@ -23,7 +23,7 @@ dataset: padded_num_elem_size: 6400 #(pt, eta, sin phi, cos phi, E) num_momentum_outputs: 5 - classification_loss_coef: 5.0 + classification_loss_coef: 10.0 charge_loss_coef: 0.1 pt_loss_coef: 1.0 eta_loss_coef: 0.1 diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index e3c60308f..5b16bc050 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -23,7 +23,7 @@ dataset: padded_num_elem_size: 6400 #(pt, eta, sin phi, cos phi, E) num_momentum_outputs: 5 - classification_loss_coef: 5.0 + classification_loss_coef: 10.0 charge_loss_coef: 0.1 pt_loss_coef: 1.0 eta_loss_coef: 0.1 From 37fc2b8d00d19f9d4e98577cdaa45fe56bd289c2 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sun, 1 Aug 2021 18:42:09 +0300 Subject: [PATCH 
029/157] apply exp --- mlpf/tfmodel/model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 006ef73e6..4d5ff8154 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -463,10 +463,11 @@ def call(self, x_dist, x_features, msk): msk_f_binned = tf.gather(msk_f, bins_split, batch_dims=1) if self.kernel == "learnable": - dm = pairwise_learnable_dist(x_dist_binned, x_dist_binned, self.ffn_dist) + dm = tf.keras.activations.relu(pairwise_learnable_dist(x_dist_binned, x_dist_binned, self.ffn_dist)) elif self.kernel == "gaussian": dm = pairwise_gaussian_dist(x_dist_binned, x_dist_binned) - dm = tf.exp(-self.dist_mult*dm) + + dm = tf.exp(-self.dist_mult*dm) #set the distance matrix to 0 for masked elements dm *= msk_f_binned From ab70ee4974a013da1bfe1495de07fae5890ef11a Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 3 Aug 2021 12:02:07 +0200 Subject: [PATCH 030/157] feat: Choose if to draw events during training or not Also, history json files are written by CustomTensorBoard instead of CustomCallback since Hypertune should write the history files but don't need the rest of CustomCallback. --- mlpf/pipeline.py | 13 ++++------ mlpf/tfmodel/callbacks.py | 8 ++++++- mlpf/tfmodel/model_setup.py | 33 +++++++++++++------------- parameters/cms-gnn-dense-onecycle.yaml | 7 ++++++ parameters/cms-gnn-dense.yaml | 7 ++++++ parameters/test-cms-v2.yaml | 7 ++++++ parameters/test-cms.yaml | 7 ++++++ parameters/test-delphes.yaml | 7 ++++++ 8 files changed, 63 insertions(+), 26 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 05ed3ac31..8cb9c29c2 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -138,7 +138,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix): model.summary() callbacks = prepare_callbacks( - model, + config["callbacks"], outdir, X_val[: config["setup"]["batch_size"]], ycand_val[: config["setup"]["batch_size"]], @@ -335,18 +335,13 @@ def hypertune(config, outdir, ntrain, ntest, recreate): model_builder, optim_callbacks = hypertuning.get_model_builder(config, total_steps) - tb = CustomTensorBoard( - log_dir=outdir + "/tensorboard_logs", histogram_freq=0, write_graph=False, write_images=False, - update_freq=1, - ) - # Change the class name of CustomTensorBoard TensorBoard to make keras_tuner recognise it - tb.__class__.__name__ = "TensorBoard" + callbacks = prepare_callbacks(config["callbacks"], outdir) + callbacks.append(optim_callbacks) + callbacks.append(tf.keras.callbacks.EarlyStopping(patience=20, monitor='val_loss')) tuner = get_tuner(config["hypertune"], model_builder, outdir, recreate, strategy) tuner.search_space_summary() - callbacks = [tb] + optim_callbacks + [tf.keras.callbacks.EarlyStopping(patience=20, monitor='val_loss')] - tuner.search( ds_train_r, epochs=n_epochs, diff --git a/mlpf/tfmodel/callbacks.py b/mlpf/tfmodel/callbacks.py index 6edfddcda..5ad2f760a 100644 --- a/mlpf/tfmodel/callbacks.py +++ b/mlpf/tfmodel/callbacks.py @@ -4,7 +4,7 @@ from tensorflow.keras.callbacks import ModelCheckpoint from pathlib import Path import numpy as np - +import json class CustomTensorBoard(TensorBoard): """ @@ -16,6 +16,9 @@ class CustomTensorBoard(TensorBoard): Also logs momemtum for supported optimizers that use momemtum. 
""" + def __init__(self, *args, **kwargs): + self.dump_history = kwargs.pop("dump_history") + super().__init__(*args, **kwargs) def _collect_learning_rate(self, logs): logs = logs or {} @@ -40,6 +43,9 @@ def _collect_learning_rate(self, logs): def on_epoch_end(self, epoch, logs): logs = logs or {} logs.update(self._collect_learning_rate(logs)) + if self.dump_history: + with open("{}/history_{}.json".format(self.log_dir, epoch), "w") as fi: + json.dump(logs, fi) super().on_epoch_end(epoch, logs) def on_train_batch_end(self, batch, logs): diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 374f1c804..e8894deb0 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -57,6 +57,7 @@ def plot_to_image(figure): return image + class CustomCallback(tf.keras.callbacks.Callback): def __init__(self, outpath, X, y, dataset_transform, num_output_classes): super(CustomCallback, self).__init__() @@ -83,9 +84,6 @@ def __init__(self, outpath, X, y, dataset_transform, num_output_classes): def on_epoch_end(self, epoch, logs=None): - with open("{}/history_{}.json".format(self.outpath, epoch), "w") as fi: - json.dump(logs, fi) - ypred = self.model(self.X, training=False) #ypred["cls"] = np.clip(ypred["cls"], 0.5, 1.0) @@ -183,15 +181,17 @@ def on_epoch_end(self, epoch, logs=None): np.savez("{}/pred_{}.npz".format(self.outpath, epoch), X=self.X, ytrue=self.y, **ypred) -def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output_classes): +def prepare_callbacks(callbacks_cfg, outdir, X_val=None, y_val=None, dataset_transform=None, num_output_classes=None): callbacks = [] tb = CustomTensorBoard( log_dir=outdir + "/tensorboard_logs", histogram_freq=1, write_graph=False, write_images=False, update_freq='epoch', #profile_batch=(10,90), profile_batch=0, + dump_history=callbacks_cfg["tensorboard"]["dump_history"], ) - tb.set_model(model) + # Change the class name of CustomTensorBoard TensorBoard to make keras_tuner recognise it + tb.__class__.__name__ = "TensorBoard" callbacks += [tb] terminate_cb = tf.keras.callbacks.TerminateOnNaN() @@ -201,19 +201,19 @@ def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output cp_dir.mkdir(parents=True, exist_ok=True) cp_callback = tf.keras.callbacks.ModelCheckpoint( filepath=str(cp_dir / "weights-{epoch:02d}-{val_loss:.6f}.hdf5"), - save_weights_only=True, - verbose=0 + save_weights_only=callbacks_cfg["checkpoint"]["save_weights_only"], + verbose=0, + monitor=callbacks_cfg["checkpoint"]["monitor"], + save_best_only=callbacks_cfg["checkpoint"]["save_best_only"], ) - cp_callback.set_model(model) callbacks += [cp_callback] - history_path = Path(outdir) / "history" - history_path.mkdir(parents=True, exist_ok=True) - history_path = str(history_path) - cb = CustomCallback(history_path, X_val, y_val, dataset_transform, num_output_classes) - cb.set_model(model) - - callbacks += [cb] + if callbacks_cfg["draw_events"]: + history_path = Path(outdir) / "history" + history_path.mkdir(parents=True, exist_ok=True) + history_path = str(history_path) + cb = CustomCallback(history_path, X_val, y_val, dataset_transform, num_output_classes) + callbacks += [cb] return callbacks @@ -707,7 +707,8 @@ def main(args, yaml_path, config): if args.action=="train": #file_writer_cm = tf.summary.create_file_writer(outdir + '/val_extra') callbacks = prepare_callbacks( - model, outdir, X_val[:config['setup']['batch_size']], ycand_val[:config['setup']['batch_size']], + config["callbacks"], + outdir, 
X_val[:config['setup']['batch_size']], ycand_val[:config['setup']['batch_size']], dataset_transform, config["dataset"]["num_output_classes"] ) callbacks.append(optim_callbacks) diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index 3313d875c..aab3c3cb6 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -107,6 +107,13 @@ onecycle: div_factor: 25.0 final_div: 100000.0 +callbacks: + checkpoint: + save_weights_only: yes + monitor: "val_loss" + save_best_only: no + draw_events: no + hypertune: algorithm: hyperband # random, bayesian, hyperband random: diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index ae5f6768f..3276e9ea9 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -98,6 +98,13 @@ exponentialdecay: decay_rate: 0.99 staircase: yes +callbacks: + checkpoint: + save_weights_only: yes + monitor: "val_loss" + save_best_only: no + draw_events: no + hypertune: algorithm: hyperband # random, bayesian, hyperband random: diff --git a/parameters/test-cms-v2.yaml b/parameters/test-cms-v2.yaml index 10ad0e44e..d19cfcf9f 100644 --- a/parameters/test-cms-v2.yaml +++ b/parameters/test-cms-v2.yaml @@ -84,3 +84,10 @@ exponentialdecay: decay_steps: 10000 decay_rate: 0.99 staircase: yes + +callbacks: + checkpoint: + save_weights_only: yes + monitor: "val_loss" + save_best_only: no + draw_events: no diff --git a/parameters/test-cms.yaml b/parameters/test-cms.yaml index 4de5275e7..a17910e08 100644 --- a/parameters/test-cms.yaml +++ b/parameters/test-cms.yaml @@ -86,3 +86,10 @@ exponentialdecay: decay_steps: 10000 decay_rate: 0.99 staircase: yes + +callbacks: + checkpoint: + save_weights_only: yes + monitor: "val_loss" + save_best_only: no + draw_events: no diff --git a/parameters/test-delphes.yaml b/parameters/test-delphes.yaml index caccb02fc..d1d2e434e 100644 --- a/parameters/test-delphes.yaml +++ b/parameters/test-delphes.yaml @@ -85,3 +85,10 @@ exponentialdecay: decay_steps: 10000 decay_rate: 0.99 staircase: yes + +callbacks: + checkpoint: + save_weights_only: yes + monitor: "val_loss" + save_best_only: no + draw_events: no From bd964aa28f058ae9aa8112b94e40072bf9f4c382 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 3 Aug 2021 16:03:57 +0200 Subject: [PATCH 031/157] chore: CustomTensorBoard saves history files in a subfolder --- mlpf/tfmodel/callbacks.py | 5 ++++- mlpf/tfmodel/hypertuning.py | 12 ++++++------ parameters/cms-gnn-dense-onecycle.yaml | 2 ++ parameters/cms-gnn-dense.yaml | 2 ++ parameters/test-cms-v2.yaml | 2 ++ parameters/test-cms.yaml | 2 ++ parameters/test-delphes.yaml | 2 ++ 7 files changed, 20 insertions(+), 7 deletions(-) diff --git a/mlpf/tfmodel/callbacks.py b/mlpf/tfmodel/callbacks.py index 5ad2f760a..f73c0e5aa 100644 --- a/mlpf/tfmodel/callbacks.py +++ b/mlpf/tfmodel/callbacks.py @@ -44,7 +44,10 @@ def on_epoch_end(self, epoch, logs): logs = logs or {} logs.update(self._collect_learning_rate(logs)) if self.dump_history: - with open("{}/history_{}.json".format(self.log_dir, epoch), "w") as fi: + history_path = Path(self.log_dir) / "history" + history_path.mkdir(parents=True, exist_ok=True) + history_path = str(history_path) + with open("{}/history_{}.json".format(history_path, epoch), "w") as fi: json.dump(logs, fi) super().on_epoch_end(epoch, logs) diff --git a/mlpf/tfmodel/hypertuning.py b/mlpf/tfmodel/hypertuning.py index e57512fc3..4f5e9b486 100644 --- a/mlpf/tfmodel/hypertuning.py +++ b/mlpf/tfmodel/hypertuning.py @@ -18,16 
+18,16 @@ def get_model_builder(config, total_steps): _, optim_callbacks = get_lr_schedule(config, steps=total_steps) def model_builder(hp): - config["parameters"]["hidden_dim"] = hp.Choice("hidden_dim", values=[256]) - config["parameters"]["distance_dim"] = hp.Choice("distance_dim", values=[128]) + # config["parameters"]["hidden_dim"] = hp.Choice("hidden_dim", values=[256]) + # config["parameters"]["distance_dim"] = hp.Choice("distance_dim", values=[128]) config["parameters"]["num_conv"] = hp.Choice("num_conv", [2, 3]) config["parameters"]["num_gsl"] = hp.Choice("num_gsl", [2, 3]) - config["parameters"]["dropout"] = hp.Choice("dropout", values=[0.2]) - config["parameters"]["bin_size"] = hp.Choice("bin_size", values=[640]) + # config["parameters"]["dropout"] = hp.Choice("dropout", values=[0.2]) + # config["parameters"]["bin_size"] = hp.Choice("bin_size", values=[640]) config["setup"]["lr"] = hp.Choice("lr", values=[1e-4, 3e-4]) - config["setup"]["lr_schedule"] = hp.Choice("lr_schedule", values=["exponentialdecay"]) - config["setup"]["optimizer"] = hp.Choice("optimizer", values=["adam"]) + # config["setup"]["lr_schedule"] = hp.Choice("lr_schedule", values=["exponentialdecay"]) + # config["setup"]["optimizer"] = hp.Choice("optimizer", values=["adam"]) model = make_model(config, dtype="float32") diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index aab3c3cb6..35c3b6f47 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -113,6 +113,8 @@ callbacks: monitor: "val_loss" save_best_only: no draw_events: no + tensorboard: + dump_history: yes hypertune: algorithm: hyperband # random, bayesian, hyperband diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index 3276e9ea9..c1739ab09 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -104,6 +104,8 @@ callbacks: monitor: "val_loss" save_best_only: no draw_events: no + tensorboard: + dump_history: yes hypertune: algorithm: hyperband # random, bayesian, hyperband diff --git a/parameters/test-cms-v2.yaml b/parameters/test-cms-v2.yaml index d19cfcf9f..73efbb412 100644 --- a/parameters/test-cms-v2.yaml +++ b/parameters/test-cms-v2.yaml @@ -91,3 +91,5 @@ callbacks: monitor: "val_loss" save_best_only: no draw_events: no + tensorboard: + dump_history: yes diff --git a/parameters/test-cms.yaml b/parameters/test-cms.yaml index a17910e08..69a34043a 100644 --- a/parameters/test-cms.yaml +++ b/parameters/test-cms.yaml @@ -93,3 +93,5 @@ callbacks: monitor: "val_loss" save_best_only: no draw_events: no + tensorboard: + dump_history: yes diff --git a/parameters/test-delphes.yaml b/parameters/test-delphes.yaml index d1d2e434e..dd8fb4ce7 100644 --- a/parameters/test-delphes.yaml +++ b/parameters/test-delphes.yaml @@ -92,3 +92,5 @@ callbacks: monitor: "val_loss" save_best_only: no draw_events: no + tensorboard: + dump_history: yes From c10cd67198b236432b9e126cce860307ed3418fa Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 3 Aug 2021 16:44:03 +0200 Subject: [PATCH 032/157] fix: get history path from tensorboard callback in train scripts --- mlpf/pipeline.py | 2 +- mlpf/tfmodel/model_setup.py | 5 +++-- parameters/cms-gnn-dense-onecycle.yaml | 1 + parameters/cms-gnn-dense.yaml | 1 + parameters/test-cms-v2.yaml | 1 + parameters/test-cms.yaml | 1 + parameters/test-delphes.yaml | 1 + 7 files changed, 9 insertions(+), 3 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 8cb9c29c2..07d0c6424 100644 --- 
a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -156,7 +156,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix): validation_steps=n_test // global_batch_size, initial_epoch=initial_epoch, ) - history_path = Path(outdir) / "history" + history_path = Path(callbacks[0].log_dir) / "history" history_path = str(history_path) with open("{}/history.json".format(history_path), "w") as fi: json.dump(fit_result.history, fi) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index e8894deb0..5b324c551 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -182,9 +182,10 @@ def on_epoch_end(self, epoch, logs=None): np.savez("{}/pred_{}.npz".format(self.outpath, epoch), X=self.X, ytrue=self.y, **ypred) def prepare_callbacks(callbacks_cfg, outdir, X_val=None, y_val=None, dataset_transform=None, num_output_classes=None): + # This should return a list with the CustomTensorBoard callback as the first element callbacks = [] tb = CustomTensorBoard( - log_dir=outdir + "/tensorboard_logs", histogram_freq=1, write_graph=False, write_images=False, + log_dir=outdir + "/logs", histogram_freq=callbacks_cfg["tensorboard"]["hist_freq"], write_graph=False, write_images=False, update_freq='epoch', #profile_batch=(10,90), profile_batch=0, @@ -718,7 +719,7 @@ def main(args, yaml_path, config): steps_per_epoch=n_train//global_batch_size, validation_steps=n_test//global_batch_size, initial_epoch=initial_epoch ) - history_path = Path(outdir) / "history" + history_path = Path(callbacks[0].log_dir) / "history" history_path = str(history_path) with open("{}/history.json".format(history_path), "w") as fi: json.dump(fit_result.history, fi) diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index 35c3b6f47..b62a9c34e 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -115,6 +115,7 @@ callbacks: draw_events: no tensorboard: dump_history: yes + hist_freq: 1 hypertune: algorithm: hyperband # random, bayesian, hyperband diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index c1739ab09..7bd292e6c 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -106,6 +106,7 @@ callbacks: draw_events: no tensorboard: dump_history: yes + hist_freq: 1 hypertune: algorithm: hyperband # random, bayesian, hyperband diff --git a/parameters/test-cms-v2.yaml b/parameters/test-cms-v2.yaml index 73efbb412..fa71a8ef8 100644 --- a/parameters/test-cms-v2.yaml +++ b/parameters/test-cms-v2.yaml @@ -93,3 +93,4 @@ callbacks: draw_events: no tensorboard: dump_history: yes + hist_freq: 1 diff --git a/parameters/test-cms.yaml b/parameters/test-cms.yaml index 69a34043a..6dc64faa5 100644 --- a/parameters/test-cms.yaml +++ b/parameters/test-cms.yaml @@ -95,3 +95,4 @@ callbacks: draw_events: no tensorboard: dump_history: yes + hist_freq: 1 diff --git a/parameters/test-delphes.yaml b/parameters/test-delphes.yaml index dd8fb4ce7..f006d0b31 100644 --- a/parameters/test-delphes.yaml +++ b/parameters/test-delphes.yaml @@ -94,3 +94,4 @@ callbacks: draw_events: no tensorboard: dump_history: yes + hist_freq: 1 From c17e91c1cb0f0887a4cae200214b1bbe50fe1df1 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 3 Aug 2021 19:04:21 +0200 Subject: [PATCH 033/157] fix: convert logs dict values to float --- mlpf/tfmodel/callbacks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlpf/tfmodel/callbacks.py b/mlpf/tfmodel/callbacks.py index f73c0e5aa..2545a0e7f 100644 --- 
a/mlpf/tfmodel/callbacks.py +++ b/mlpf/tfmodel/callbacks.py @@ -48,7 +48,8 @@ def on_epoch_end(self, epoch, logs): history_path.mkdir(parents=True, exist_ok=True) history_path = str(history_path) with open("{}/history_{}.json".format(history_path, epoch), "w") as fi: - json.dump(logs, fi) + converted_logs = {k: float(v) for k, v in logs.items()} + json.dump(converted_logs, fi) super().on_epoch_end(epoch, logs) def on_train_batch_end(self, batch, logs): From ad5085957139f4840e778a4dacb1a6ec5d59d2e3 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 27 Jul 2021 20:53:42 +0200 Subject: [PATCH 034/157] feat: Add raytune command to pipeline Perform hyperparameter optimization using Ray Tune. Support for multi-worker search on HPC systems not yet implemented. --- mlpf/pipeline.py | 168 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 07d0c6424..0c6f29d51 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -11,6 +11,7 @@ import click from tqdm import tqdm import shutil +from functools import partial import tensorflow as tf from tensorflow.keras import mixed_precision @@ -53,6 +54,12 @@ from tfmodel.callbacks import CustomTensorBoard from tfmodel import hypertuning +from ray import tune +from ray.tune.integration.keras import TuneReportCheckpointCallback +from ray.tune.schedulers import AsyncHyperBandScheduler +from ray.tune.integration.tensorflow import DistributedTrainableCreator +from ray.tune.logger import TBXLoggerCallback + @click.group() @click.help_option("-h", "--help") @@ -359,5 +366,166 @@ def hypertune(config, outdir, ntrain, ntest, recreate): print(trial.hyperparameters.values, trial.score) +def set_raytune_search_parameters(search_space, config): + config["parameters"]["hidden_dim"] = search_space["hidden_dim"] + config["parameters"]["distance_dim"] = search_space["distance_dim"] + config["parameters"]["num_conv"] = search_space["num_conv"] + config["parameters"]["num_gsl"] = search_space["num_gsl"] + config["parameters"]["dropout"] = search_space["dropout"] + config["parameters"]["bin_size"] = search_space["bin_size"] + config["parameters"]["clip_value_low"] = search_space["clip_value_low"] + config["parameters"]["normalize_degrees"] = search_space["normalize_degrees"] + + config["setup"]["lr"] = search_space["lr"] + return config + + +def build_model_and_train(config, checkpoint_dir=None, full_config=None): + full_config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = parse_config(full_config) + + if config is not None: + full_config = set_raytune_search_parameters(search_space=config, config=full_config) + + ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(full_config, global_batch_size, n_train, n_test) + + strategy, maybe_global_batch_size = get_strategy(global_batch_size) + if maybe_global_batch_size is not None: + global_batch_size = maybe_global_batch_size + total_steps = n_epochs * n_train // global_batch_size + + with strategy.scope(): + lr_schedule, optim_callbacks = get_lr_schedule(full_config, steps=total_steps) + opt = get_optimizer(full_config, lr_schedule) + + model = make_model(full_config, dtype=tf.dtypes.float32) + + # Run model once to build the layers + model.build((1, full_config["dataset"]["padded_num_elem_size"], full_config["dataset"]["num_input_features"])) + + full_config = set_config_loss(full_config, full_config["setup"]["trainable"]) + configure_model_weights(model, full_config["setup"]["trainable"]) + model.build((1, 
full_config["dataset"]["padded_num_elem_size"], full_config["dataset"]["num_input_features"])) + + loss_dict, loss_weights = get_loss_dict(full_config) + model.compile( + loss=loss_dict, + optimizer=opt, + sample_weight_mode="temporal", + loss_weights=loss_weights, + metrics={ + "cls": [ + FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), + FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), + ] + }, + ) + model.summary() + + # TODO: Use the prepare_callback() function, possibly change its behaviour first + callbacks = [] + tb = CustomTensorBoard( + log_dir=tune.get_trial_dir() + "/tensorboard_logs", + histogram_freq=0, write_graph=False, write_images=False, + update_freq='batch', + #profile_batch=(10,90), + profile_batch=0, + ) + callbacks += [tb] + terminate_cb = tf.keras.callbacks.TerminateOnNaN() + callbacks.append(terminate_cb) + callbacks.append(optim_callbacks) # Will be empty if using expdecay + callbacks.append(TuneReportCheckpointCallback( + metrics=[ + "adam_beta_1", + 'charge_loss', + "cls_acc_unweighted", + "cls_loss", + "cos_phi_loss", + "energy_loss", + "eta_loss", + "learning_rate", + "loss", + "pt_loss", + "sin_phi_loss", + "val_charge_loss", + "val_cls_acc_unweighted", + "val_cls_acc_weighted", + "val_cls_loss", + "val_cos_phi_loss", + "val_energy_loss", + "val_eta_loss", + "val_loss", + "val_pt_loss", + "val_sin_phi_loss", + ], + ), + ) + + fit_result = model.fit( + ds_train_r, + validation_data=ds_test_r, + epochs=n_epochs, + callbacks=callbacks, + steps_per_epoch=n_train // global_batch_size, + validation_steps=n_test // global_batch_size, + ) + + +@main.command() +@click.help_option("-h", "--help") +@click.option("-c", "--config", help="configuration file", type=click.Path()) +@click.option("-n", "--name", help="experiment name", type=str, default="test_exp") +def raytune(config, name): + config_file_path = config + + search_space = { + "lr": tune.grid_search([1e-4]), + + "hidden_dim": tune.grid_search([256]), + "distance_dim": tune.grid_search([128]), + "num_conv": tune.grid_search([2, 3]), + "num_gsl": tune.grid_search([2]), + "dropout": tune.grid_search([0.0, 0.1]), + "bin_size": tune.grid_search([640]), + "clip_value_low": tune.grid_search([0.0]), + "normalize_degrees": tune.grid_search([True]), + } + + sched = AsyncHyperBandScheduler( + metric="val_loss", + mode="min", + time_attr="training_iteration", + max_t=550, + grace_period=20, + reduction_factor=3, + brackets=1, + ) + + distributed_trainable = DistributedTrainableCreator( + partial(build_model_and_train, full_config=config_file_path), + num_workers=1, # Number of hosts that each trial is expected to use. + num_cpus_per_worker=32, + num_gpus_per_worker=4, + num_workers_per_host=1, # Number of workers to colocate per host. None if not specified. 
+ ) + + analysis = tune.run( + distributed_trainable, + config=search_space, + name=name, + scheduler=sched, + # metric="val_loss", + # mode="min", + stop={"training_iteration": 32}, + num_samples=1, + # resources_per_trial={ + # "cpu": 16, + # "gpu": 4 + # }, + local_dir="./ray_results", + callbacks=[TBXLoggerCallback()] + ) + print("Best hyperparameters found were: ", analysis.get_best_config("val_loss", "min")) + if __name__ == "__main__": main() From b033d94298dfaadfc01623675ece2140ecf953cd Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 3 Aug 2021 19:46:51 +0200 Subject: [PATCH 035/157] feat: Distributed hyperparameter search on Flatiron with Ray Tune --- mlpf/flatiron/raytune.sh | 66 +++++++++++++++++++++++++++++++++++ mlpf/flatiron/start-head.sh | 9 +++++ mlpf/flatiron/start-worker.sh | 8 +++++ mlpf/pipeline.py | 8 ++++- 4 files changed, 90 insertions(+), 1 deletion(-) create mode 100755 mlpf/flatiron/raytune.sh create mode 100755 mlpf/flatiron/start-head.sh create mode 100755 mlpf/flatiron/start-worker.sh diff --git a/mlpf/flatiron/raytune.sh b/mlpf/flatiron/raytune.sh new file mode 100755 index 000000000..a30cbdb9a --- /dev/null +++ b/mlpf/flatiron/raytune.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +#SBATCH -t 168:00:00 +#SBATCH -N 4 +#SBATCH --tasks-per-node=1 +#SBATCH -p gpu +#SBATCH --constraint=a100,sxm4 +#SBATCH --gpus-per-task=4 +#SBATCH --cpus-per-task=16 + +# Job name +#SBATCH -J raytune + +# Output and error logs +#SBATCH -o logs_slurm/log_%x_%j.out +#SBATCH -e logs_slurm/log_%x_%j.err + +# Add jobscript to job output +echo "#################### Job submission script. #############################" +cat $0 +echo "################# End of job submission script. #########################" + + +module purge +module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 +nvidia-smi + +source ~/miniconda3/bin/activate tf2 +which python3 +python3 --version + + +################# DON NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ############### +# This script is a modification to the implementation suggest by gregSchwartz18 here: +# https://github.com/ray-project/ray/issues/826#issuecomment-522116599 +redis_password=$(uuidgen) +export redis_password +echo "Redis password: ${redis_password}" + +nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names +nodes_array=( $nodes ) + +node_1=${nodes_array[0]} +ip=$(srun --nodes=1 --ntasks=1 -w $node_1 hostname --ip-address) # making redis-address +port=6379 +ip_head=$ip:$port +export ip_head +echo "IP Head: $ip_head" + +echo "STARTING HEAD at $node_1" +srun --nodes=1 --ntasks=1 -w $node_1 mlpf/flatiron/start-head.sh $ip & +sleep 30 + +worker_num=$(($SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node +for (( i=1; i<=$worker_num; i++ )) +do + node_i=${nodes_array[$i]} + echo "STARTING WORKER $i at $node_i" + srun --nodes=1 --ntasks=1 -w $node_i mlpf/flatiron/start-worker.sh $ip_head & + sleep 5 +done +############################################################################################## + +#### call your code below +python3 mlpf/pipeline.py raytune -c $1 -n $2 +exit diff --git a/mlpf/flatiron/start-head.sh b/mlpf/flatiron/start-head.sh new file mode 100755 index 000000000..59f8cdc24 --- /dev/null +++ b/mlpf/flatiron/start-head.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 + +echo "starting ray head node" +# Launch the head node +ray start --head --node-ip-address=$1 --port=6379 +sleep infinity diff --git a/mlpf/flatiron/start-worker.sh 
b/mlpf/flatiron/start-worker.sh new file mode 100755 index 000000000..ce7a6d009 --- /dev/null +++ b/mlpf/flatiron/start-worker.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 + +echo "starting ray worker node" +ray start --address $1 +sleep infinity diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 0c6f29d51..7acd60e81 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -54,6 +54,7 @@ from tfmodel.callbacks import CustomTensorBoard from tfmodel import hypertuning +import ray from ray import tune from ray.tune.integration.keras import TuneReportCheckpointCallback from ray.tune.schedulers import AsyncHyperBandScheduler @@ -476,6 +477,7 @@ def build_model_and_train(config, checkpoint_dir=None, full_config=None): @click.option("-c", "--config", help="configuration file", type=click.Path()) @click.option("-n", "--name", help="experiment name", type=str, default="test_exp") def raytune(config, name): + ray.init(address='auto') config_file_path = config search_space = { @@ -523,9 +525,13 @@ def raytune(config, name): # "gpu": 4 # }, local_dir="./ray_results", - callbacks=[TBXLoggerCallback()] + callbacks=[TBXLoggerCallback()], + log_to_file=True, ) print("Best hyperparameters found were: ", analysis.get_best_config("val_loss", "min")) + ray.shutdown() + + if __name__ == "__main__": main() From d35b2ec944976d5b0de09b01106d0954448e9b6f Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 3 Aug 2021 22:11:59 +0200 Subject: [PATCH 036/157] chore: Use prepare_callbacks() in raytune command --- mlpf/pipeline.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 7acd60e81..ed9cb01d1 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -422,19 +422,9 @@ def build_model_and_train(config, checkpoint_dir=None, full_config=None): ) model.summary() - # TODO: Use the prepare_callback() function, possibly change its behaviour first - callbacks = [] - tb = CustomTensorBoard( - log_dir=tune.get_trial_dir() + "/tensorboard_logs", - histogram_freq=0, write_graph=False, write_images=False, - update_freq='batch', - #profile_batch=(10,90), - profile_batch=0, - ) - callbacks += [tb] - terminate_cb = tf.keras.callbacks.TerminateOnNaN() - callbacks.append(terminate_cb) - callbacks.append(optim_callbacks) # Will be empty if using expdecay + callbacks = prepare_callbacks(full_config["callbacks"], tune.get_trial_dir()) + callbacks.append(optim_callbacks) + callbacks.append(TuneReportCheckpointCallback( metrics=[ "adam_beta_1", @@ -476,8 +466,12 @@ def build_model_and_train(config, checkpoint_dir=None, full_config=None): @click.help_option("-h", "--help") @click.option("-c", "--config", help="configuration file", type=click.Path()) @click.option("-n", "--name", help="experiment name", type=str, default="test_exp") -def raytune(config, name): - ray.init(address='auto') +@click.option("-l", "--local", help="run locally", is_flag=True) +@click.option("--cpus", help="number of cpus per worker", type=int, default=1) +@click.option("--gpus", help="number of gpus per worker", type=int, default=0) +def raytune(config, name, local, cpus, gpus): + if not local: + ray.init(address='auto') config_file_path = config search_space = { @@ -506,8 +500,8 @@ def raytune(config, name): distributed_trainable = DistributedTrainableCreator( partial(build_model_and_train, full_config=config_file_path), num_workers=1, # Number of hosts that each trial is expected to use. 
- num_cpus_per_worker=32, - num_gpus_per_worker=4, + num_cpus_per_worker=cpus, + num_gpus_per_worker=gpus, num_workers_per_host=1, # Number of workers to colocate per host. None if not specified. ) From 9391a574002e505e7a03708a9b9e6a08012e5066 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 3 Aug 2021 22:23:55 +0200 Subject: [PATCH 037/157] chore: add ray to github python installations --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e870ef5d2..6b02e2a0b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -44,7 +44,7 @@ jobs: sudo apt install python3 python3-pip wget sudo python3 -m pip install --upgrade pip sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click + sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click 'ray[default]' - name: Run delphes TF model run: ./scripts/local_test_delphes_pipeline.sh @@ -57,7 +57,7 @@ jobs: sudo apt install python3 python3-pip wget sudo python3 -m pip install --upgrade pip sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click + sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click 'ray[default]' - name: Run CMS TF model using the pipeline run: ./scripts/local_test_cms_pipeline.sh From 54aef72adba8bfef27de2f179467b0d91e2ce8b0 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 3 Aug 2021 22:33:37 +0200 Subject: [PATCH 038/157] chore: Add ray[tune] to github tests python deps --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6b02e2a0b..6c273c3e8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -44,7 +44,7 @@ jobs: sudo apt install python3 python3-pip wget sudo python3 -m pip install --upgrade pip sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click 'ray[default]' + sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click 'ray[default]' 'ray[tune]' - name: Run delphes TF model run: ./scripts/local_test_delphes_pipeline.sh @@ -57,7 +57,7 @@ jobs: sudo apt install python3 python3-pip wget sudo python3 -m pip install --upgrade pip sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click 
'ray[default]' + sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click 'ray[default]' 'ray[tune]' - name: Run CMS TF model using the pipeline run: ./scripts/local_test_cms_pipeline.sh From ba5ca387b78939ed13864f97f1cacbd5ea5c65c1 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Thu, 5 Aug 2021 12:39:29 +0200 Subject: [PATCH 039/157] feat: Choose between ASHA and Hyperband in raytune config --- mlpf/flatiron/raytune.sh | 2 +- mlpf/pipeline.py | 17 +++++---------- mlpf/tfmodel/utils.py | 24 +++++++++++++++++++++ parameters/cms-gnn-dense-onecycle.yaml | 25 ++++++++++++++++++++++ parameters/cms-gnn-dense.yaml | 29 ++++++++++++++++++++++++-- 5 files changed, 82 insertions(+), 15 deletions(-) diff --git a/mlpf/flatiron/raytune.sh b/mlpf/flatiron/raytune.sh index a30cbdb9a..06ab48580 100755 --- a/mlpf/flatiron/raytune.sh +++ b/mlpf/flatiron/raytune.sh @@ -62,5 +62,5 @@ done ############################################################################################## #### call your code below -python3 mlpf/pipeline.py raytune -c $1 -n $2 +python3 mlpf/pipeline.py raytune -c $1 -n $2 --cpus "${SLURM_CPUS_PER_TASK}" --gpus "${SLURM_GPUS_PER_TASK}" exit diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index ed9cb01d1..c30362e2e 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -48,6 +48,7 @@ get_best_checkpoint, delete_all_but_best_checkpoint, get_tuner, + get_raytune_schedule, ) from tfmodel.lr_finder import LRFinder @@ -57,7 +58,6 @@ import ray from ray import tune from ray.tune.integration.keras import TuneReportCheckpointCallback -from ray.tune.schedulers import AsyncHyperBandScheduler from ray.tune.integration.tensorflow import DistributedTrainableCreator from ray.tune.logger import TBXLoggerCallback @@ -470,6 +470,7 @@ def build_model_and_train(config, checkpoint_dir=None, full_config=None): @click.option("--cpus", help="number of cpus per worker", type=int, default=1) @click.option("--gpus", help="number of gpus per worker", type=int, default=0) def raytune(config, name, local, cpus, gpus): + cfg = load_config(config) if not local: ray.init(address='auto') config_file_path = config @@ -487,15 +488,7 @@ def raytune(config, name, local, cpus, gpus): "normalize_degrees": tune.grid_search([True]), } - sched = AsyncHyperBandScheduler( - metric="val_loss", - mode="min", - time_attr="training_iteration", - max_t=550, - grace_period=20, - reduction_factor=3, - brackets=1, - ) + sched = get_raytune_schedule(cfg["raytune"]) distributed_trainable = DistributedTrainableCreator( partial(build_model_and_train, full_config=config_file_path), @@ -512,13 +505,13 @@ def raytune(config, name, local, cpus, gpus): scheduler=sched, # metric="val_loss", # mode="min", - stop={"training_iteration": 32}, + # stop={"training_iteration": 32}, num_samples=1, # resources_per_trial={ # "cpu": 16, # "gpu": 4 # }, - local_dir="./ray_results", + local_dir=cfg["raytune"]["local_dir"], callbacks=[TBXLoggerCallback()], log_to_file=True, ) diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index a9a3161d9..5830cb374 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -16,6 +16,8 @@ from tfmodel.data import Dataset from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler +from ray.tune.schedulers import AsyncHyperBandScheduler, HyperBandScheduler + def load_config(config_file_path): with open(config_file_path, "r") as 
ymlfile: @@ -155,6 +157,7 @@ def get_optimizer(config, lr_schedule=None): else: raise ValueError("Only 'adam' and 'sgd' are supported optimizers, got {}".format(config["setup"]["optimizer"])) + def get_tuner(cfg_hypertune, model_builder, outdir, recreate, strategy): if cfg_hypertune["algorithm"] == "random": print("Keras Tuner: Using RandomSearch") @@ -194,6 +197,27 @@ def get_tuner(cfg_hypertune, model_builder, outdir, recreate, strategy): ) +def get_raytune_schedule(raytune_cfg): + if raytune_cfg["sched"] == "asha": + return AsyncHyperBandScheduler( + metric="val_loss", + mode="min", + time_attr="training_iteration", + max_t=raytune_cfg["asha"]["max_t"], + grace_period=raytune_cfg["asha"]["grace_period"], + reduction_factor=raytune_cfg["asha"]["reduction_factor"], + brackets=raytune_cfg["asha"]["brackets"], + ) + if raytune_cfg["sched"] == "hyperband": + return HyperBandScheduler( + metric="val_loss", + mode="min", + time_attr="training_iteration", + max_t=raytune_cfg["hyperband"]["max_t"], + reduction_factor=raytune_cfg["hyperband"]["reduction_factor"], + ) + + def compute_weights_invsqrt(X, y, w): wn = tf.cast(tf.shape(w)[-1], tf.float32) / tf.sqrt(w) wn *= tf.cast(X[:, 0] != 0, tf.float32) diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index b62a9c34e..a003d6952 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -132,3 +132,28 @@ hypertune: factor: 2 iterations: 1 executions_per_trial: 1 + +raytune: + local_dir: + sched: "asha" + parameters: + # optimizer parameters + lr: [1e-4] + # model parameters + hidden_dim: [256] + distance_dim: [128, 256] + num_conv: [2, 3, 4] + num_gsl: [2, 3, 4] + dropout: [0.0, 0.1] + bin_size: [640] + clip_value_low: [0.0] + normalize_degrees: [True] + # Tune schedule specific parameters + asha: + max_t: 10 + reduction_factor: 3 + brackets: 1 + grace_period: 5 + hyperband: + max_t: 10 + reduction_factor: 3 diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index 7bd292e6c..d05f2d6b5 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -112,10 +112,10 @@ hypertune: algorithm: hyperband # random, bayesian, hyperband random: objective: val_loss - max_trials: 1000 + max_trials: 100 bayesian: objective: val_loss - max_trials: 1000 + max_trials: 100 num_initial_points: 2 hyperband: objective: val_loss @@ -123,3 +123,28 @@ hypertune: factor: 2 iterations: 1 executions_per_trial: 1 + +raytune: + local_dir: + sched: "asha" + parameters: + # optimizer parameters + lr: [1e-4] + # model parameters + hidden_dim: [256] + distance_dim: [128, 256] + num_conv: [2, 3, 4] + num_gsl: [2, 3, 4] + dropout: [0.0, 0.1] + bin_size: [640] + clip_value_low: [0.0] + normalize_degrees: [True] + # Tune schedule specific parameters + asha: + max_t: 10 + reduction_factor: 3 + brackets: 1 + grace_period: 5 + hyperband: + max_t: 10 + reduction_factor: 3 From 99c8814951700584e4f7d162f7742e7a4a55792f Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Thu, 5 Aug 2021 15:13:56 +0200 Subject: [PATCH 040/157] feat: Add expdecay_decay_steps and layernorm to raytune params --- mlpf/pipeline.py | 27 ++++++++++++++++---------- parameters/cms-gnn-dense-onecycle.yaml | 2 ++ parameters/cms-gnn-dense.yaml | 2 ++ 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index c30362e2e..a1aadb0f1 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -368,6 +368,7 @@ def hypertune(config, outdir, ntrain, ntest, 
recreate): def set_raytune_search_parameters(search_space, config): + config["parameters"]["layernorm"] = search_space["layernorm"] config["parameters"]["hidden_dim"] = search_space["hidden_dim"] config["parameters"]["distance_dim"] = search_space["distance_dim"] config["parameters"]["num_conv"] = search_space["num_conv"] @@ -378,6 +379,8 @@ def set_raytune_search_parameters(search_space, config): config["parameters"]["normalize_degrees"] = search_space["normalize_degrees"] config["setup"]["lr"] = search_space["lr"] + + config["exponentialdecay"]["decay_steps"] = search_space["expdecay_decay_steps"] return config @@ -476,16 +479,20 @@ def raytune(config, name, local, cpus, gpus): config_file_path = config search_space = { - "lr": tune.grid_search([1e-4]), - - "hidden_dim": tune.grid_search([256]), - "distance_dim": tune.grid_search([128]), - "num_conv": tune.grid_search([2, 3]), - "num_gsl": tune.grid_search([2]), - "dropout": tune.grid_search([0.0, 0.1]), - "bin_size": tune.grid_search([640]), - "clip_value_low": tune.grid_search([0.0]), - "normalize_degrees": tune.grid_search([True]), + # Optimizer parameters + "lr": tune.grid_search(cfg["raytune"]["parameters"]["lr"]), + "expdecay_decay_steps": tune.grid_search(cfg["raytune"]["parameters"]["expdecay_decay_steps"]), + + # Model parameters + "layernorm": tune.grid_search(cfg["raytune"]["parameters"]["layernorm"]), + "hidden_dim": tune.grid_search(cfg["raytune"]["parameters"]["hidden_dim"]), + "distance_dim": tune.grid_search(cfg["raytune"]["parameters"]["distance_dim"]), + "num_conv": tune.grid_search(cfg["raytune"]["parameters"]["num_conv"]), + "num_gsl": tune.grid_search(cfg["raytune"]["parameters"]["num_gsl"]), + "dropout": tune.grid_search(cfg["raytune"]["parameters"]["dropout"]), + "bin_size": tune.grid_search(cfg["raytune"]["parameters"]["bin_size"]), + "clip_value_low": tune.grid_search(cfg["raytune"]["parameters"]["clip_value_low"]), + "normalize_degrees": tune.grid_search(cfg["raytune"]["parameters"]["normalize_degrees"]), } sched = get_raytune_schedule(cfg["raytune"]) diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index a003d6952..9b3ae06e9 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -139,7 +139,9 @@ raytune: parameters: # optimizer parameters lr: [1e-4] + expdecay_decay_steps: [10000] # model parameters + layernorm: [False] hidden_dim: [256] distance_dim: [128, 256] num_conv: [2, 3, 4] diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index d05f2d6b5..13a282d40 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -130,7 +130,9 @@ raytune: parameters: # optimizer parameters lr: [1e-4] + expdecay_decay_steps: [10000] # model parameters + layernorm: [False] hidden_dim: [256] distance_dim: [128, 256] num_conv: [2, 3, 4] From dd2a60d93871fcdb9d46ce10a11777b359761218 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Thu, 5 Aug 2021 16:09:28 +0200 Subject: [PATCH 041/157] feat: Add batch_size to raytune hyperparameters --- mlpf/pipeline.py | 2 ++ parameters/cms-gnn-dense-onecycle.yaml | 1 + parameters/cms-gnn-dense.yaml | 1 + 3 files changed, 4 insertions(+) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index a1aadb0f1..ff0d94172 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -379,6 +379,7 @@ def set_raytune_search_parameters(search_space, config): config["parameters"]["normalize_degrees"] = search_space["normalize_degrees"] config["setup"]["lr"] = search_space["lr"] + 
config["setup"]["batch_size"] = search_space["batch_size"] config["exponentialdecay"]["decay_steps"] = search_space["expdecay_decay_steps"] return config @@ -481,6 +482,7 @@ def raytune(config, name, local, cpus, gpus): search_space = { # Optimizer parameters "lr": tune.grid_search(cfg["raytune"]["parameters"]["lr"]), + "batch_size": tune.grid_search(cfg["raytune"]["parameters"]["batch_size"]), "expdecay_decay_steps": tune.grid_search(cfg["raytune"]["parameters"]["expdecay_decay_steps"]), # Model parameters diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index 9b3ae06e9..a5cdf21f6 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -139,6 +139,7 @@ raytune: parameters: # optimizer parameters lr: [1e-4] + batch_size: [32] expdecay_decay_steps: [10000] # model parameters layernorm: [False] diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index 13a282d40..7a9d53ad6 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -130,6 +130,7 @@ raytune: parameters: # optimizer parameters lr: [1e-4] + batch_size: [32] expdecay_decay_steps: [10000] # model parameters layernorm: [False] From 0fa17fd05e9e6b85b364f7f93548c00451b0d6e1 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 12 Aug 2021 12:52:02 +0300 Subject: [PATCH 042/157] updates --- mlpf/pipeline.py | 4 +-- mlpf/tfmodel/lr_finder.py | 2 +- mlpf/tfmodel/model.py | 44 ++++++++++++++++++++++++------- mlpf/tfmodel/model_setup.py | 27 ++++++++++++++++--- notebooks/cms-mlpf.ipynb | 24 ++++++++++++++--- parameters/cms-gnn-dense-dev.yaml | 6 +++-- parameters/cms-gnn-dense.yaml | 20 +++++++------- parameters/test-cms-v2.yaml | 3 +++ 8 files changed, 100 insertions(+), 30 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 2a7e6335a..ad14b6786 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -143,8 +143,8 @@ def train(config, weights, ntrain, ntest, recreate, prefix): callbacks = prepare_callbacks( model, outdir, - X_val[: config["setup"]["batch_size"]], - ycand_val[: config["setup"]["batch_size"]], + X_val, + ycand_val, dataset_transform, config["dataset"]["num_output_classes"], ) diff --git a/mlpf/tfmodel/lr_finder.py b/mlpf/tfmodel/lr_finder.py index 152b69417..4b2338581 100644 --- a/mlpf/tfmodel/lr_finder.py +++ b/mlpf/tfmodel/lr_finder.py @@ -14,7 +14,7 @@ class LRFinder(Callback): paper: https://arxiv.org/pdf/1803.09820.pdf. 
""" - def __init__(self, start_lr: float = 1e-7, end_lr: float = 3, max_steps: int = 200, smoothing=0.9): + def __init__(self, start_lr: float = 1e-7, end_lr: float = 1e-2, max_steps: int = 200, smoothing=0.9): super(LRFinder, self).__init__() self.start_lr, self.end_lr = start_lr, end_lr self.max_steps = max_steps diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 4d5ff8154..6db611f9c 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -10,7 +10,7 @@ import numpy as np from numpy.lib.recfunctions import append_fields -regularizer_weight = 1e-8 +regularizer_weight = 0.0 def split_indices_to_bins(cmul, nbins, bin_size): bin_idx = tf.argmax(cmul, axis=-1) @@ -651,11 +651,11 @@ def call(self, inputs, training=True): ret = { "cls": out_id_softmax, "charge": out_charge, - "pt": pred_momentum[:, :, 0:1], + "pt": tf.exp(tf.clip_by_value(pred_momentum[:, :, 0:1], -4, 4)), "eta": pred_momentum[:, :, 1:2], "sin_phi": pred_momentum[:, :, 2:3], "cos_phi": pred_momentum[:, :, 3:4], - "energy": pred_momentum[:, :, 4:5] + "energy": tf.exp(tf.clip_by_value(pred_momentum[:, :, 4:5], -5, 6)) } if self.return_matrix: ret["dm"] = dm @@ -948,6 +948,8 @@ def __init__(self, input_encoding="cms", focal_loss_from_logits=False, graph_kernel="gaussian", + skip_connection=False, + regression_use_classification=True, debug=False ): super(PFNetDense, self).__init__() @@ -959,6 +961,9 @@ def __init__(self, self.focal_loss_from_logits = focal_loss_from_logits self.debug = debug + self.skip_connection = skip_connection + self.regression_use_classification = regression_use_classification + self.num_conv = num_conv self.num_gsl = num_gsl @@ -1026,7 +1031,16 @@ def call(self, inputs, training=False): debugging_data[cg.name] = enc_reg_all encs_reg.append(enc_reg) - dec_output_id = tf.concat([enc] + encs_id, axis=-1) + dec_input_cls = [] + if self.skip_connection: + dec_input_cls.append(enc) + dec_input_cls += encs_id + + graph_sum = tf.reduce_sum(encs_id[-1], axis=-2)/tf.cast(tf.shape(X)[1], X.dtype) + graph_sum = tf.tile(tf.expand_dims(graph_sum, 1), [1, tf.shape(X)[1], 1]) + dec_input_cls.append(graph_sum) + + dec_output_id = tf.concat(dec_input_cls, axis=-1) out_id_logits = self.ffn_id(dec_output_id)*msk_input @@ -1036,8 +1050,20 @@ def call(self, inputs, training=False): out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) out_charge = self.ffn_charge(dec_output_id)*msk_input - dec_output_reg = tf.concat([enc, tf.cast(out_id_logits, X.dtype)] + encs_reg, axis=-1) - + + dec_input_reg = [] + if self.skip_connection: + dec_input_reg.append(enc) + if self.regression_use_classification: + dec_input_reg.append(tf.cast(out_id_logits, X.dtype)) + dec_input_reg += encs_reg + + graph_sum = tf.reduce_sum(encs_reg[-1], axis=-2)/tf.cast(tf.shape(X)[1], X.dtype) + graph_sum = tf.tile(tf.expand_dims(graph_sum, 1), [1, tf.shape(X)[1], 1]) + dec_input_reg.append(graph_sum) + + dec_output_reg = tf.concat(dec_input_reg, axis=-1) + if self.separate_momentum: pred_momentum = [ffn(dec_output_reg) for ffn in self.ffn_momentum] pred_momentum = tf.concat(pred_momentum, axis=-1)*msk_input @@ -1050,11 +1076,11 @@ def call(self, inputs, training=False): ret = { "cls": out_id_softmax, "charge": out_charge, - "pt": pred_momentum[:, :, 0:1], - "eta": pred_momentum[:, :, 1:2], + "pt": X[:, :, 1:2] + pred_momentum[:, :, 0:1], + "eta": X[:, :, 2:3] + pred_momentum[:, :, 1:2], "sin_phi": pred_momentum[:, :, 2:3], "cos_phi": pred_momentum[:, :, 3:4], - "energy": pred_momentum[:, :, 4:5], + "energy": X[:, :, 4:5] 
+ pred_momentum[:, :, 4:5], } if self.debug: for k in debugging_data.keys(): diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 74ae626c1..87e4ec9c2 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -61,7 +61,6 @@ class CustomCallback(tf.keras.callbacks.Callback): def __init__(self, outpath, X, y, dataset_transform, num_output_classes): super(CustomCallback, self).__init__() self.X = X - self.y = y #transform the prediction target from an array into a dictionary for easier access @@ -163,7 +162,7 @@ def plot_event_visualization(self, outpath, ypred, ypred_id, msk, ievent=0): plt.close("all") def plot_reg_distribution(self, outpath, ypred, ypred_id, msk, icls, reg_variable): - vals_pred = ypred[reg_variable].numpy()[msk][ypred_id[msk]==icls].flatten() + vals_pred = ypred[reg_variable][msk][ypred_id[msk]==icls].flatten() vals_true = self.ytrue[reg_variable][msk][self.ytrue_id[msk]==icls].flatten() bins = self.reg_bins[reg_variable] @@ -181,6 +180,23 @@ def plot_reg_distribution(self, outpath, ypred, ypred_id, msk, icls, reg_variabl plt.savefig(str(outpath / "{}_cls{}.pdf".format(reg_variable, icls)), bbox_inches="tight") plt.close("all") + def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable): + sel = (ypred_id[msk]==icls) & (self.ytrue_id[msk]==icls) + vals_pred = ypred[reg_variable][msk][sel].flatten() + vals_true = self.ytrue[reg_variable][msk][sel].flatten() + + plt.scatter(vals_pred, vals_true, marker=".") + if len(vals_true) > 0: + minval = np.min(vals_true) + maxval = np.max(vals_true) + plt.plot([minval, maxval], [minval, maxval], color="black", ls="--") + + plt.xlabel("predicted") + plt.ylabel("true") + plt.title(reg_variable) + plt.savefig(str(outpath / "{}_cls{}_corr.pdf".format(reg_variable, icls)), bbox_inches="tight") + plt.close("all") + def on_epoch_end(self, epoch, logs=None): #save the training logs (losses) for this epoch @@ -191,7 +207,7 @@ def on_epoch_end(self, epoch, logs=None): cp_dir.mkdir(parents=True, exist_ok=True) #run the model inference on the small validation dataset - ypred = self.model(self.X, training=False) + ypred = self.model.predict(self.X, batch_size=1) #choose the class with the highest probability as the prediction #this is a shortcut, in actual inference, we may want to apply additional per-class thresholds @@ -209,6 +225,7 @@ def on_epoch_end(self, epoch, logs=None): cp_dir_cls.mkdir(parents=True, exist_ok=True) for variable in ["pt", "eta", "sin_phi", "cos_phi", "energy"]: self.plot_reg_distribution(cp_dir_cls, ypred, ypred_id, msk, icls, variable) + self.plot_corr(cp_dir_cls, ypred, ypred_id, msk, icls, variable) np.savez(str(cp_dir/"pred.npz"), X=self.X, ytrue=self.y, **ypred) @@ -325,6 +342,8 @@ def make_gnn_dense(config, dtype): "separate_momentum", "input_encoding", "graph_kernel", + "skip_connection", + "regression_use_classification", "debug" ] @@ -738,7 +757,7 @@ def main(args, yaml_path, config): if args.action=="train": #file_writer_cm = tf.summary.create_file_writer(outdir + '/val_extra') callbacks = prepare_callbacks( - model, outdir, X_val[:config['setup']['batch_size']], ycand_val[:config['setup']['batch_size']], + model, outdir, X_val, ycand_val, dataset_transform, config["dataset"]["num_output_classes"] ) callbacks.append(optim_callbacks) diff --git a/notebooks/cms-mlpf.ipynb b/notebooks/cms-mlpf.ipynb index 9b0cf0907..9271963ba 100644 --- a/notebooks/cms-mlpf.ipynb +++ b/notebooks/cms-mlpf.ipynb @@ -124,7 +124,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"path = \"../experiments/cms-gnn-dense-a301aa09.gpu0.local/\"" + "path = \"../experiments/cms-gnn-dense-dev_20210805_123408.gpu0.local/evaluation/\"" ] }, { @@ -138,7 +138,7 @@ "ycands = []\n", "ypreds = []\n", "ypreds_raw = []\n", - "for fi in glob.glob(path + \"/pred_batch*.npz\")[:100]:\n", + "for fi in glob.glob(path + \"/pred_batch*.npz\"):\n", " dd = np.load(fi)\n", " Xs.append(dd[\"X\"])\n", " ygens.append(dd[\"ygen\"])\n", @@ -162,6 +162,24 @@ "ypred_raw_f = ypred_raw.reshape((ypred_raw.shape[0]*ypred_raw.shape[1], ypred_raw.shape[2]))" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.std(ycand_f[ycand_f[:, 0]!=0, 4])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(np.log(ycand_f[ycand_f[:, 0]!=0, 6]), bins=100);" + ] + }, { "cell_type": "code", "execution_count": null, @@ -287,7 +305,7 @@ "metadata": {}, "outputs": [], "source": [ - "history = load_history(path + \"/history_*.json\")" + "history = load_history(path + \"/../history/history_*.json\")" ] }, { diff --git a/parameters/cms-gnn-dense-dev.yaml b/parameters/cms-gnn-dense-dev.yaml index dedba952a..7b9f1c1a9 100644 --- a/parameters/cms-gnn-dense-dev.yaml +++ b/parameters/cms-gnn-dense-dev.yaml @@ -47,7 +47,7 @@ setup: num_events_train: 80000 num_events_test: 10000 num_epochs: 500 - num_val_files: 100 + num_val_files: 10 dtype: float32 trainable: all classification_loss_type: categorical_cross_entropy @@ -73,10 +73,12 @@ parameters: num_gsl: 2 normalize_degrees: yes distance_dim: 8 - dropout: 0.2 + dropout: 0.0 separate_momentum: yes input_encoding: cms graph_kernel: learnable #gaussian, learnable + skip_connection: yes + regression_use_classification: yes debug: no timing: diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index 5b16bc050..164c34571 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -43,11 +43,11 @@ setup: weights: weights_config: lr: 1e-4 - batch_size: 5 + batch_size: 1 num_events_train: 80000 num_events_test: 10000 num_epochs: 500 - num_val_files: 100 + num_val_files: 10 dtype: float32 trainable: all classification_loss_type: categorical_cross_entropy @@ -66,17 +66,19 @@ parameters: model: gnn_dense activation: elu layernorm: no - hidden_dim: 256 - bin_size: 640 - clip_value_low: 0.01 - num_conv: 2 - num_gsl: 2 + hidden_dim: 512 + bin_size: 3200 + clip_value_low: 0.00 + num_conv: 3 + num_gsl: 1 normalize_degrees: yes - distance_dim: 128 - dropout: 0.2 + distance_dim: 512 + dropout: 0.0 separate_momentum: yes input_encoding: cms graph_kernel: gaussian #gaussian, learnable + skip_connection: yes + regression_use_classification: yes debug: no timing: diff --git a/parameters/test-cms-v2.yaml b/parameters/test-cms-v2.yaml index 3b14e661a..23569e4ee 100644 --- a/parameters/test-cms-v2.yaml +++ b/parameters/test-cms-v2.yaml @@ -63,6 +63,9 @@ parameters: dropout: 0.0 separate_momentum: yes input_encoding: cms + graph_kernel: gaussian #gaussian, learnable + skip_connection: yes + regression_use_classification: yes debug: no timing: From 2c0210d2ff9dcc6d562eecd4a0067cef69ab465a Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 13 Aug 2021 11:42:29 +0300 Subject: [PATCH 043/157] remove additive momentum --- mlpf/tfmodel/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 6db611f9c..8a52330a3 100644 --- a/mlpf/tfmodel/model.py +++ 
b/mlpf/tfmodel/model.py @@ -1076,11 +1076,11 @@ def call(self, inputs, training=False): ret = { "cls": out_id_softmax, "charge": out_charge, - "pt": X[:, :, 1:2] + pred_momentum[:, :, 0:1], - "eta": X[:, :, 2:3] + pred_momentum[:, :, 1:2], + "pt": pred_momentum[:, :, 0:1], + "eta": pred_momentum[:, :, 1:2], "sin_phi": pred_momentum[:, :, 2:3], "cos_phi": pred_momentum[:, :, 3:4], - "energy": X[:, :, 4:5] + pred_momentum[:, :, 4:5], + "energy": pred_momentum[:, :, 4:5], } if self.debug: for k in debugging_data.keys(): From 127d4e126aaa1ce82b0703d558feada6986e4a15 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sun, 15 Aug 2021 17:50:36 +0300 Subject: [PATCH 044/157] up --- mlpf/tallinn/cms-gnn-dense-dev.sh | 2 +- mlpf/tfmodel/model.py | 2 +- parameters/cms-gnn-dense-dev.yaml | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mlpf/tallinn/cms-gnn-dense-dev.sh b/mlpf/tallinn/cms-gnn-dense-dev.sh index f189465cf..7d1eda4a8 100755 --- a/mlpf/tallinn/cms-gnn-dense-dev.sh +++ b/mlpf/tallinn/cms-gnn-dense-dev.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH -p gpu -#SBATCH --gpus 5 +#SBATCH --gpus 2 #SBATCH --mem-per-gpu=8G IMG=/home/software/singularity/base.simg:latest diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 8a52330a3..d69a7e3c9 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -10,7 +10,7 @@ import numpy as np from numpy.lib.recfunctions import append_fields -regularizer_weight = 0.0 +regularizer_weight = 1e-9 def split_indices_to_bins(cmul, nbins, bin_size): bin_idx = tf.argmax(cmul, axis=-1) diff --git a/parameters/cms-gnn-dense-dev.yaml b/parameters/cms-gnn-dense-dev.yaml index 7b9f1c1a9..b1d4016f3 100644 --- a/parameters/cms-gnn-dense-dev.yaml +++ b/parameters/cms-gnn-dense-dev.yaml @@ -42,7 +42,7 @@ setup: train: yes weights: weights_config: - lr: 1e-4 + lr: 5e-4 batch_size: 5 num_events_train: 80000 num_events_test: 10000 @@ -68,11 +68,11 @@ parameters: layernorm: no hidden_dim: 256 bin_size: 40 - clip_value_low: 0.0 + clip_value_low: 0.00 num_conv: 2 num_gsl: 2 normalize_degrees: yes - distance_dim: 8 + distance_dim: 16 dropout: 0.0 separate_momentum: yes input_encoding: cms From 308aad3b834c3169787d5767a44558e97e0c80b7 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Mon, 16 Aug 2021 21:14:54 +0300 Subject: [PATCH 045/157] update adv training --- mlpf/adv_training.py | 127 ++++++++++++++++++++++-------------- mlpf/tfmodel/model_setup.py | 6 +- 2 files changed, 84 insertions(+), 49 deletions(-) diff --git a/mlpf/adv_training.py b/mlpf/adv_training.py index b63474906..1b077b892 100644 --- a/mlpf/adv_training.py +++ b/mlpf/adv_training.py @@ -4,31 +4,49 @@ import glob import random +from tqdm import tqdm from tfmodel.model_setup import make_model, targets_multi_output, CustomCallback from tfmodel.data import Dataset +#A deep sets conditional discriminator def make_disc_model(config, reco_features): input_elems = tf.keras.layers.Input(shape=(config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) input_reco = tf.keras.layers.Input(shape=(config["dataset"]["padded_num_elem_size"], reco_features)) - da1 = tf.keras.layers.Dense(128, activation="elu")(input_elems) - da2 = tf.keras.layers.Dense(128, activation="elu")(da1) - da3 = tf.keras.layers.Dense(128, activation="elu")(da2) - db1 = tf.keras.layers.Dense(128, activation="elu")(input_reco) - db2 = tf.keras.layers.Dense(128, activation="elu")(db1) - db3 = tf.keras.layers.Dense(128, activation="elu")(db2) + + nhidden = 512 + #process the input elements + da1 = 
tf.keras.layers.Dense(nhidden, activation="elu")(input_elems) + da2 = tf.keras.layers.Dense(nhidden, activation="elu")(da1) + da3 = tf.keras.layers.Dense(nhidden, activation="elu")(da2) + + #process the target reco particles + db1 = tf.keras.layers.Dense(nhidden, activation="elu")(input_reco) + db2 = tf.keras.layers.Dense(nhidden, activation="elu")(db1) + db3 = tf.keras.layers.Dense(nhidden, activation="elu")(db2) + + #concatenate the input element and reco target c = tf.keras.layers.Concatenate()([da3, db3]) - dc1 = tf.keras.layers.Dense(128, activation="elu")(c) - dc2 = tf.keras.layers.Dense(128, activation="elu")(dc1) - - sc = tf.keras.layers.Lambda(lambda x: tf.reduce_sum(x, axis=-2))(dc2) - c1 = tf.keras.layers.Dense(128, activation="elu")(sc) - c2 = tf.keras.layers.Dense(128, activation="elu")(c1) - c3 = tf.keras.layers.Dense(1, activation="linear")(c2) - model_disc = tf.keras.models.Model(inputs=[input_elems, input_reco], outputs=[c3]) + + #encode the (element, target) pairs using a feedforward net + dc1 = tf.keras.layers.Dense(nhidden, activation="elu")(c) + dc2 = tf.keras.layers.Dense(nhidden/2, activation="elu")(dc1) + + #sum across the encoded (element, target) pairs in the event to create an event encoding + msk = tf.keras.layers.Lambda(lambda x: tf.cast(x[:, :, 0:1]!=0, tf.float32))(input_elems) + sc = tf.keras.layers.Lambda(lambda args: tf.reduce_sum(args[0]*args[1], axis=-2))([dc2, msk]) + + #classify the embedded event as real (true target) or fake (MLPF reconstructed) + c1 = tf.keras.layers.Dense(nhidden/2, activation="elu")(sc) + c2 = tf.keras.layers.Dense(nhidden/4, activation="elu")(c1) + c3 = tf.keras.layers.Dense(nhidden/8, activation="elu")(c2) + + #classification output + c4 = tf.keras.layers.Dense(1, activation="linear")(c3) + model_disc = tf.keras.models.Model(inputs=[input_elems, input_reco], outputs=[c4]) return model_disc def concat_pf(ypred): - return tf.concat([ypred["cls"], ypred["charge"], ypred["pt"], ypred["eta"], ypred["sin_phi"], ypred["cos_phi"], ypred["energy"]], axis=-1) + return tf.concat([tf.keras.activations.softmax(ypred["cls"]*10), ypred["charge"], ypred["pt"], ypred["eta"], ypred["sin_phi"], ypred["cos_phi"], ypred["energy"]], axis=-1) def main(config): tf.config.run_functions_eagerly(False) @@ -51,10 +69,9 @@ def main(config): x = np.random.randn(1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]) ypred = concat_pf(model_pf(x)) - model_pf.load_weights("./experiments/cms-gnn-dense-lite-e0108f63.gpu0.local/weights-254-41.512947.hdf5") + model_pf.load_weights("./experiments/cms-gnn-dense-dev_20210814_123346.gpu0.local/weights/weights-65-198.290390.hdf5") model_disc = make_disc_model(config, ypred.shape[-1]) - model_disc.summary() cds = config["dataset"] dataset_def = Dataset( @@ -78,7 +95,7 @@ def main(config): ycand_val = np.concatenate(ycands) dataset_transform = targets_multi_output(config['dataset']['num_output_classes']) - cb = CustomCallback("logs", X_val, ycand_val, dataset_transform, config['dataset']['num_output_classes']) + cb = CustomCallback("logs", X_val, ycand_val, dataset_transform, config['dataset']['num_output_classes'], freq=1) cb.set_model(model_pf) tfr_files = sorted(glob.glob(dataset_def.processed_path)) @@ -91,9 +108,9 @@ def main(config): tf.TensorShape([dataset_def.padded_num_elem_size, ]) ) - n_train = 1000 - n_test = 1000 - batch_size = 2 + n_train = 500 + n_test = 500 + batch_size = 4 ds_train = dataset.take(n_train).padded_batch(batch_size, padded_shapes=ps) ds_test = 
dataset.skip(n_train).take(n_test).padded_batch(batch_size, padded_shapes=ps) @@ -109,45 +126,51 @@ def main(config): m1 = tf.keras.models.Model(inputs=[input_elems], outputs=[disc_out1]) m2 = tf.keras.models.Model(inputs=[input_elems, input_reco], outputs=[disc_out2]) - optimizer1 = tf.keras.optimizers.Adam(lr=1e-5) + def loss(x,y): + return tf.keras.losses.binary_crossentropy(x,y, from_logits=True) + + #The MLPF reconstruction model (generator) is optimized to confuse the classifier + optimizer1 = tf.keras.optimizers.Adam(lr=1e-6) model_pf.trainable = True model_disc.trainable = False - m1.compile(loss=lambda x,y: tf.keras.losses.binary_crossentropy(x,y, from_logits=True), optimizer=optimizer1) + m1.compile(loss=loss, optimizer=optimizer1) + m1.summary() - optimizer2 = tf.keras.optimizers.Adam(lr=1e-5) + #The discriminator model (adversarial) is optimized to distinguish between the true target and MLPF-reconstructed events + optimizer2 = tf.keras.optimizers.Adam(lr=1e-6) model_pf.trainable = False model_disc.trainable = True - m2.compile(loss=lambda x,y: tf.keras.losses.binary_crossentropy(x,y, from_logits=True), optimizer=optimizer2) + m2.compile(loss=loss, optimizer=optimizer2) + m2.summary() - epochs = 100 - from itertools import cycle - disc_or_pf = 2*[True] + 8*[False] - disc_or_pf = cycle(disc_or_pf) + epochs = 1000 for epoch in range(epochs): - loss_tot1 = 0.0 loss_tot2 = 0.0 loss_tot1_test = 0.0 loss_tot2_test = 0.0 - train_disc = next(disc_or_pf) - for step, (xb, yb, _) in enumerate(ds_train): + + for step, (xb, yb, wb) in tqdm(enumerate(ds_train), desc="Training"): yp = concat_pf(model_pf(xb)) - xb = tf.concat([xb, xb], axis=0) yid = tf.one_hot(tf.cast(yb[:, :, 0], tf.int32), cds["num_output_classes"]) yb = tf.concat([yid, yb[:, :, 1:]], axis=-1) yb = tf.concat([yb, yp], axis=0) + + #Train the MLPF reconstruction (generative) model with an inverted target + yt = tf.concat([batch_size*[1]], axis=0) + loss1 = m1.train_on_batch(xb, yt) + + xb = tf.concat([xb, xb], axis=0) + #Train the discriminative (adversarial) model + #true target particles have a classification target of 1, MLPF reconstructed a target of 0 yt = tf.concat([batch_size*[1], batch_size*[0]], axis=0) + loss2 = m2.train_on_batch([xb, yb], yt) - if train_disc: - loss_tot1 += m1.test_on_batch(xb, yt) - loss_tot2 += m2.train_on_batch([xb, yb], yt) - else: - yt = tf.concat([batch_size*[1], batch_size*[1]], axis=0) - loss_tot1 += m1.train_on_batch(xb, yt) - loss_tot2 += m2.test_on_batch([xb, yb], yt) + loss_tot1 += loss1 + loss_tot2 += loss2 import boost_histogram as bh import mplhep @@ -156,23 +179,29 @@ def main(config): preds_0 = [] preds_1 = [] - for step, (xb, yb, _) in enumerate(ds_test): + for step, (xb, yb, wb) in tqdm(enumerate(ds_test), desc="Testing"): yp = concat_pf(model_pf(xb)) - xb = tf.concat([xb, xb], axis=0) yid = tf.one_hot(tf.cast(yb[:, :, 0], tf.int32), cds["num_output_classes"]) yb = tf.concat([yid, yb[:, :, 1:]], axis=-1) yb = tf.concat([yb, yp], axis=0) - yt = tf.concat([batch_size*[1], batch_size*[0]], axis=0) - loss_tot1_test += m1.test_on_batch(xb, yt) - loss_tot2_test += m2.test_on_batch([xb, yb], yt) + yt = tf.concat([batch_size*[1]], axis=0) + loss1 = m1.test_on_batch(xb, yt) + + xb = tf.concat([xb, xb], axis=0) + yt = tf.concat([batch_size*[1], batch_size*[0]], axis=0) + loss2 = m2.test_on_batch([xb, yb], yt) p = m2.predict_on_batch([xb, yb]) preds_0 += list(p[yt==0, 0]) preds_1 += list(p[yt==1, 0]) + loss_tot1_test += loss1 + loss_tot2_test += loss2 + print("Epoch {}, l1={:.5E}/{:.5E}, 
l2={:.5E}/{:.5E}".format(epoch, loss_tot1, loss_tot1_test, loss_tot2, loss_tot2_test)) + #Draw histograms of the discriminator outputs for monitoring minval = np.min(preds_0 + preds_1) maxval = np.max(preds_0 + preds_1) h0 = bh.Histogram(bh.axis.Regular(50, minval, maxval)) @@ -181,8 +210,10 @@ def main(config): h1.fill(preds_1) fig = plt.figure(figsize=(4,4)) - mplhep.histplot(h0) - mplhep.histplot(h1) + mplhep.histplot(h0, label="MLPF") + mplhep.histplot(h1, label="Target") + plt.xlabel("Adversarial classification output") + plt.legend(loc="best", frameon=False) plt.savefig("logs/disc_{}.pdf".format(epoch), bbox_inches="tight") plt.close("all") @@ -195,5 +226,5 @@ def main(config): cb.on_epoch_end(epoch) if __name__ == "__main__": - config = yaml.load(open("parameters/cms-gnn-dense-lite.yaml")) + config = yaml.load(open("parameters/cms-gnn-dense-dev.yaml")) main(config) \ No newline at end of file diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 87e4ec9c2..365e37a99 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -58,10 +58,11 @@ def plot_to_image(figure): return image class CustomCallback(tf.keras.callbacks.Callback): - def __init__(self, outpath, X, y, dataset_transform, num_output_classes): + def __init__(self, outpath, X, y, dataset_transform, num_output_classes, freq=1): super(CustomCallback, self).__init__() self.X = X self.y = y + self.freq = freq #transform the prediction target from an array into a dictionary for easier access self.ytrue = dataset_transform(self.X, self.y, None)[1] @@ -199,6 +200,9 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable): def on_epoch_end(self, epoch, logs=None): + if epoch%self.freq!=0: + return + #save the training logs (losses) for this epoch with open("{}/history_{}.json".format(self.outpath, epoch), "w") as fi: json.dump(logs, fi) From 69efa1e91c6fad9d8c16bd6bbbbdfa8862cfc18f Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 17 Aug 2021 21:27:18 +0300 Subject: [PATCH 046/157] up --- mlpf/adv_training.py | 8 +-- mlpf/tfmodel/model.py | 44 +++++++------ mlpf/tfmodel/model_setup.py | 26 +++++--- notebooks/pfnet-debug.ipynb | 106 +++++++++++++++++++++++++++--- parameters/cms-gnn-dense-dev.yaml | 14 ++-- 5 files changed, 151 insertions(+), 47 deletions(-) diff --git a/mlpf/adv_training.py b/mlpf/adv_training.py index 1b077b892..940e067d9 100644 --- a/mlpf/adv_training.py +++ b/mlpf/adv_training.py @@ -69,7 +69,7 @@ def main(config): x = np.random.randn(1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]) ypred = concat_pf(model_pf(x)) - model_pf.load_weights("./experiments/cms-gnn-dense-dev_20210814_123346.gpu0.local/weights/weights-65-198.290390.hdf5") + model_pf.load_weights("./experiments/cms-gnn-dense-dev_20210814_123346.gpu0.local/weights/weights-100-195.351807.hdf5") model_disc = make_disc_model(config, ypred.shape[-1]) @@ -108,8 +108,8 @@ def main(config): tf.TensorShape([dataset_def.padded_num_elem_size, ]) ) - n_train = 500 - n_test = 500 + n_train = 10000 + n_test = 1000 batch_size = 4 ds_train = dataset.take(n_train).padded_batch(batch_size, padded_shapes=ps) @@ -137,7 +137,7 @@ def loss(x,y): m1.summary() #The discriminator model (adversarial) is optimized to distinguish between the true target and MLPF-reconstructed events - optimizer2 = tf.keras.optimizers.Adam(lr=1e-6) + optimizer2 = tf.keras.optimizers.Adam(lr=1e-4) model_pf.trainable = False model_disc.trainable = True m2.compile(loss=loss, optimizer=optimizer2) diff 
--git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index d69a7e3c9..a04ab1f70 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -992,21 +992,23 @@ def __init__(self, self.cg_id = [CombinedGraphLayer(**kwargs_cg) for i in range(num_gsl)] self.cg_reg = [CombinedGraphLayer(**kwargs_cg) for i in range(num_gsl)] - self.ffn_id = point_wise_feed_forward_network(num_output_classes, dff, name="ffn_cls", dtype=tf.dtypes.float32, num_layers=3, activation=activation) - self.ffn_charge = point_wise_feed_forward_network(1, dff, name="ffn_charge", dtype=tf.dtypes.float32, num_layers=3, activation=activation) + self.ffn_id = point_wise_feed_forward_network(num_output_classes, dff, name="ffn_cls", dtype=tf.dtypes.float32, num_layers=4, activation=activation) + self.ffn_charge = point_wise_feed_forward_network(1, dff, name="ffn_charge", dtype=tf.dtypes.float32, num_layers=2, activation=activation) if self.separate_momentum: self.ffn_momentum = [ point_wise_feed_forward_network( 1, dff, name="ffn_momentum{}".format(imomentum), - dtype=tf.dtypes.float32, num_layers=3, activation=activation + dtype=tf.dtypes.float32, num_layers=4, activation=activation ) for imomentum in range(num_momentum_outputs) ] else: - self.ffn_momentum = point_wise_feed_forward_network(num_momentum_outputs, dff, name="ffn_momentum", dtype=tf.dtypes.float32, num_layers=3, activation=activation) + self.ffn_momentum = point_wise_feed_forward_network(num_momentum_outputs, dff, name="ffn_momentum", dtype=tf.dtypes.float32, num_layers=4, activation=activation) def call(self, inputs, training=False): X = inputs + + #mask padded elements msk = X[:, :, 0] != 0 msk_input = tf.expand_dims(tf.cast(msk, tf.float32), -1) @@ -1015,6 +1017,8 @@ def call(self, inputs, training=False): encs_id = [] debugging_data = {} + + #encode the elements for classification (id) for cg in self.cg_id: enc_id_all = cg(enc_id, msk, training) enc_id = enc_id_all["enc"] @@ -1022,6 +1026,7 @@ def call(self, inputs, training=False): debugging_data[cg.name] = enc_id_all encs_id.append(enc_id) + #encode the elements for regression enc_reg = self.activation(self.ffn_enc_reg(enc)) encs_reg = [] for cg in self.cg_reg: @@ -1040,7 +1045,7 @@ def call(self, inputs, training=False): graph_sum = tf.tile(tf.expand_dims(graph_sum, 1), [1, tf.shape(X)[1], 1]) dec_input_cls.append(graph_sum) - dec_output_id = tf.concat(dec_input_cls, axis=-1) + dec_output_id = tf.concat(dec_input_cls, axis=-1)*msk_input out_id_logits = self.ffn_id(dec_output_id)*msk_input @@ -1062,7 +1067,7 @@ def call(self, inputs, training=False): graph_sum = tf.tile(tf.expand_dims(graph_sum, 1), [1, tf.shape(X)[1], 1]) dec_input_reg.append(graph_sum) - dec_output_reg = tf.concat(dec_input_reg, axis=-1) + dec_output_reg = tf.concat(dec_input_reg, axis=-1)*msk_input if self.separate_momentum: pred_momentum = [ffn(dec_output_reg) for ffn in self.ffn_momentum] @@ -1072,22 +1077,23 @@ def call(self, inputs, training=False): out_charge = tf.clip_by_value(out_charge, -2, 2) + ret = { + "cls": out_id_softmax, + "charge": out_charge, + "pt": tf.exp(tf.clip_by_value(pred_momentum[:, :, 0:1], -6, 8)), + "eta": pred_momentum[:, :, 1:2], + "sin_phi": pred_momentum[:, :, 2:3], + "cos_phi": pred_momentum[:, :, 3:4], + "energy": tf.exp(tf.clip_by_value(pred_momentum[:, :, 4:5], -6, 8)), + } + if self.debug: + for k in debugging_data.keys(): + ret[k] = debugging_data[k] + if self.multi_output: - ret = { - "cls": out_id_softmax, - "charge": out_charge, - "pt": pred_momentum[:, :, 0:1], - "eta": pred_momentum[:, :, 
1:2], - "sin_phi": pred_momentum[:, :, 2:3], - "cos_phi": pred_momentum[:, :, 3:4], - "energy": pred_momentum[:, :, 4:5], - } - if self.debug: - for k in debugging_data.keys(): - ret[k] = debugging_data[k] return ret else: - return tf.concat([out_id_softmax, out_charge, pred_momentum], axis=-1) + return tf.concat([ret["cls"], ret["charge"], ret["pt"], ret["eta"], ret["sin_phi"], ret["cos_phi"], ret["energy"]], axis=-1) def set_trainable_classification(self): self.trainable = True diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 365e37a99..1855bcf6a 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -159,12 +159,17 @@ def plot_event_visualization(self, outpath, ypred, ypred_id, msk, ievent=0): plt.xlim(-8,8) plt.ylim(-4,4) - plt.savefig(str(outpath / "event_iev{}.pdf".format(ievent)), bbox_inches="tight") + plt.savefig(str(outpath / "event_iev{}.png".format(ievent)), bbox_inches="tight") plt.close("all") def plot_reg_distribution(self, outpath, ypred, ypred_id, msk, icls, reg_variable): - vals_pred = ypred[reg_variable][msk][ypred_id[msk]==icls].flatten() - vals_true = self.ytrue[reg_variable][msk][self.ytrue_id[msk]==icls].flatten() + + if icls==0: + vals_pred = ypred[reg_variable][msk][ypred_id[msk]!=icls].flatten() + vals_true = self.ytrue[reg_variable][msk][self.ytrue_id[msk]!=icls].flatten() + else: + vals_pred = ypred[reg_variable][msk][ypred_id[msk]==icls].flatten() + vals_true = self.ytrue[reg_variable][msk][self.ytrue_id[msk]==icls].flatten() bins = self.reg_bins[reg_variable] plt.hist(vals_true, bins=bins, histtype="step", lw=2, label="true") @@ -178,15 +183,20 @@ def plot_reg_distribution(self, outpath, ypred, ypred_id, msk, icls, reg_variabl plt.ylabel("Number of particles") plt.legend(loc="best") plt.title("Regression output, cls {}".format(icls)) - plt.savefig(str(outpath / "{}_cls{}.pdf".format(reg_variable, icls)), bbox_inches="tight") + plt.savefig(str(outpath / "{}_cls{}.png".format(reg_variable, icls)), bbox_inches="tight") plt.close("all") def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable): - sel = (ypred_id[msk]==icls) & (self.ytrue_id[msk]==icls) + + if icls==0: + sel = self.ytrue_id[msk]!=icls + else: + sel = (ypred_id[msk]==icls) & (self.ytrue_id[msk]==icls) + vals_pred = ypred[reg_variable][msk][sel].flatten() vals_true = self.ytrue[reg_variable][msk][sel].flatten() - plt.scatter(vals_pred, vals_true, marker=".") + plt.scatter(vals_pred, vals_true, marker=".", alpha=0.8) if len(vals_true) > 0: minval = np.min(vals_true) maxval = np.max(vals_true) @@ -195,7 +205,7 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable): plt.xlabel("predicted") plt.ylabel("true") plt.title(reg_variable) - plt.savefig(str(outpath / "{}_cls{}_corr.pdf".format(reg_variable, icls)), bbox_inches="tight") + plt.savefig(str(outpath / "{}_cls{}_corr.png".format(reg_variable, icls)), bbox_inches="tight") plt.close("all") def on_epoch_end(self, epoch, logs=None): @@ -224,7 +234,7 @@ def on_epoch_end(self, epoch, logs=None): for ievent in range(min(5, self.X.shape[0])): self.plot_event_visualization(cp_dir, ypred, ypred_id, msk, ievent=ievent) - for icls in range(1, self.num_output_classes): + for icls in range(self.num_output_classes): cp_dir_cls = cp_dir / "cls_{}".format(icls) cp_dir_cls.mkdir(parents=True, exist_ok=True) for variable in ["pt", "eta", "sin_phi", "cos_phi", "energy"]: diff --git a/notebooks/pfnet-debug.ipynb b/notebooks/pfnet-debug.ipynb index 26cc46ee6..b0a64a127 100644 --- 
a/notebooks/pfnet-debug.ipynb +++ b/notebooks/pfnet-debug.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -29,11 +29,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":2: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", + " config = yaml.load(f)\n" + ] + } + ], "source": [ - "with open(\"/home/joosep/particleflow/parameters/cms-gnn-dense.yaml\") as f:\n", + "with open(\"/home/joosep/particleflow/parameters/cms-gnn-dense-dev.yaml\") as f:\n", " config = yaml.load(f)\n", "config[\"setup\"][\"multi_output\"] = True\n", "config[\"parameters\"][\"debug\"] = True" @@ -41,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -50,9 +59,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "raw files: 0\n", + "val files: 1998\n" + ] + } + ], "source": [ "cds = config[\"dataset\"]\n", "\n", @@ -72,15 +90,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8201_0.pkl.bz2\n", + "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8202_0.pkl.bz2\n", + "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8203_0.pkl.bz2\n", + "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8204_0.pkl.bz2\n", + "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8205_0.pkl.bz2\n", + "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8206_0.pkl.bz2\n", + "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8207_0.pkl.bz2\n", + "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8208_0.pkl.bz2\n", + "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8209_0.pkl.bz2\n", + "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_820_0.pkl.bz2\n" + ] + } + ], "source": [ "Xs = []\n", "ygens = []\n", "ycands = []\n", "\n", - "for fi in dataset_def.val_filelist[:1]:\n", + "for fi in dataset_def.val_filelist[:10]:\n", " print(fi)\n", " X, ygen, ycand = dataset_def.prepare_data(fi)\n", "\n", @@ -95,6 +130,57 @@ "X_val, ycand_val, _ = dataset_transform(X_val, ycand_val, None)\n" ] }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "vals_a = np.sin(X_val[:, :, 3].flatten())" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "vals_b = ycand_val[\"sin_phi\"][:, :, 0].flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYIAAAD4CAYAAADhNOGaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAhrUlEQVR4nO3df4zc9X3n8ecriyGbVqkNuAQWjI3iOknrk92bg+gsJYQCJskJu5SC6aGYHpEvuaRVEsXKWkRtjiZiU6SQ3gVdYlES8uMCCU2crZzIBxguEhenrM8OBiKD+ZHghQQXcKQTLjXmfX/Md+Dr2Znd2fl+vzPzne/rIa125vtj5u3vjr/v+fxWRGBmZtX1hn4HYGZm/eVEYGZWcU4EZmYV50RgZlZxTgRmZhV3Qr8D6Mapp54aS5cu7XcYZmalsnv37n+OiMXN20uZCJYuXcrU1FS/wzAzKxVJv2i13VVDZmYV50RgZlZxTgRmZhXnRGBmVnFOBGZmFZdLryFJtwL/AXguIv6gxX4Bfwe8D3gJuCYi/m+ybyPw6eTQz0bEbXnElKdte6a5ccd+njl8hDMWjrJ57QrWrx7rd1hmNuQa957pw0dm7Htq4v25vU9e3Ue/BnwJ+Hqb/e8Flic/5wH/AzhP0snAXwM1IIDdkiYj4sWc4sps255ptnxvH0eOHgNg+vARtnxvH4CTgVkGZf+ClWf86dda+KYFRMDhI0dnPWfp+PbckkEuVUMR8WPghVkOWQd8Pep2AQslnQ6sBe6KiBeSm/9dwCV5xJSXG3fsfy0JNBw5eowbd+zvU0Rm5df4gjV9+AjB61+wtu2Z7ndoHckz/ubXevGlo3Mmgbz1qo1gDHg69fxgsq3d9hkkbZI0JWnq0KFDhQXa7JkWRbLZtpvZ3Mr+BSvP+Fu9Vq+VprE4IrZGRC0iaosXzxghXZgzFo7Oa7uZza3sX7DyjH8Q/s29SgTTwFmp52cm29ptL8S2PdOsmdjJsvHtrJnY2VExbvPaFYwuGDlu2+iCETavXVFUmGZDr+xfsDqJv9P7zSD8m3uVCCaBD6juncBvIuJZYAdwsaRFkhYBFyfbctdtnd761WPccNlKxhaOImBs4Sg3XLayVI1aZoOm7F+w5op/PvebVj2Cei2v7qPfBs4HTpV0kHpPoAUAEfFl4IfUu44eoN599M+TfS9I+hvggeSlro+I2RqduzZbnd5cN/X1q8d84zfLUeP/Uy97DeXZy2eu+Du53ywd357hXwMLRxdkOj8tl0QQEVfNsT+Aj7TZdytwax5xzKbsdZI2mMreBbKfevkFq4hu4LPFP9f9JmsSeAPwmUt/P9NrNL9eJZS9TtIGT9m7QFZJr3sptbuvBNmTgARfuHJVrkm0Momg33WS3TRU22AraxfIIj+Lg/o573WNQKv7TR5GF4xw0xX5JgEo6cI03ehHnWSDRycPpzJWNxb5WRzkz/kZC0dbNsrmVSPQqorwhstWtp0eYj7GFo4Wfs+qTCKA/jX6ZmmotsFV9M2lCEV+Fgf5c7557YrjkhTkVyPQLgHecNnKTEkgz7mE5lKZqqF+KuM3R5tbv6sbu1HkZ3GQP+dFdgNvlwA/dsferl+zl0kAKlYi6JcyfnMs2jD0tulndWO3ivwsDvrnvKgagTwTXa8TQINLBD1Qxm+ORRqm3jbrV49x//gFPDnxfu4fv2CgkwAU+1ms6uc8r0TXryQALhH0RBm/ORZpkOuSh12Rn8Wqfs6zNgb3MwE0qD7Wq1xqtVpMTU31Owzr0rLx7bT61Al4cgD+U5h1KuuYgF4nAUm7I6LWvN0lAuu5Qa9LNptL2RLAXNxGYD1X1bpkGw7DlgTAJQLrg6rWJVu5DWMCaHAisL7wjK5WJlmSwCAngAYnAjOzNoa5FJDmRFABwzB4y6zXsiaB0QUjbNszXYr/a04EQ26QJwIzG0RZE0BDmcbG5NJrSNIlkvZLOiBpvMX+myTtTX4elXQ4te9Yat9kHvHY68o6VbJZP+SVBBoGYZ6lTmQuEUgaAW4GLgIOAg9ImoyIRxrHRMTHU8f/BbA69RJHImJV1jistUGeCMxsUGRNAGMlHxuTR9XQucCBiHgCQNLtwDrgkTbHX0V9TWPrAQ/eMmsvr8bg5ipYKNfYmDyqhsaAp1PPDybbZpB0NrAM2Jna/EZJU5J2SVrf7k0kbUqOmzp06FAOYVeDB2+ZtZZnj6Aip7nuhV43Fm8A7oyIdKX12RExLekcYKekfRHxePOJEbEV2Ar1uYZ6E275efCW2fEyrxlMvUTd3COozGNj8kgE08BZqednJtta2QB8JL0hIqaT309Iuo96+8GMRGDdK/MH1CxPeTQGp6dOh+HofZdH1dADwHJJyySdSP1mP6P3j6S3AYuAn6S2LZJ0UvL4VGAN7dsWzMy6snR8e6YkMNaiTW2Yet9lTgQR8QrwUWAH8HPgOxHxsKTrJV2aOnQDcHscP+/124EpST8D7gUm0r2NzMyyypIAlv/ub/HUxPuHvvddLm0EEfFD4IdN2/6q6flnWpz3f4CVecRgZpaWZ2PwsPe+8zTUZjZ0sk4S1zxH0LD3vvMUE2Y2NIqaJG7Ye985EZjZUCh6quhh7n3nRGBmpVaVqaKL5ERgZqU17AvG9IoTgVmJeG2JOpcC8uVEYFYSXlvCCaAo7j5qVhJVX1vCSaA4LhGYlcSwj25txwmgeC4RmJVEu1GswzK6tRUngd5wicCsJDavXTGQi58U0YDtBNBbTgRmJTGIo1uLaMB2l9DecyKoKHdDLKdBG906WwP2fON0KaB/nAgqyN0QLS95NWC7FNBfTgQVlOe3OKu2bqdnbpRIW507H90mAZeIj+dEUEFV7YZo+eumAbu5RNqNLKUAl4hnyqX7qKRLJO2XdEDSeIv910g6JGlv8vPB1L6Nkh5LfjbmEY/NrordEK0Y61ePccNlKxlbOIqoL+l4w2UrZ72hfuyOvX1LAuCBea1kLhFIGgFuBi4CDgIPSJpsseTkHRHx0aZzTwb+GqhRXxN6d3Lui1njsvYGtRuilVOnDdif3raPb+76Zdfvk1dbgEvEM+VRNXQucCAingCQdDuwjs4WoV8L3BURLyTn3gVcAnw7h7isjUHshmjDbZB6BA37spPdyCMRjAFPp54fBM5rcdyfSHoX8Cjw8Yh4us25Le9GkjYBmwCWLFmSQ9jVNmjdEG04DVICaHCJeKZeTTHxj8DSiPg3wF3AbfN9gYjYGhG1iKgtXrw49wDNLF+DmASgu3aNYZdHiWAaOCv1/Mxk22si4vnU01uAv02de37TufflEJOZ9cmgJoA0l4iPl0eJ4AFguaRlkk4ENgCT6QMknZ56einw8+TxDuBiSYskLQIuTraZWQmVIQnYTJlLBBHxiqSPUr+BjwC3RsTDkq4HpiJiEvhLSZcCrwAvANck574g6W+oJxOA6xsNx2ZWvLwGVuWdADzgq7cUEf2OYd5qtVpMTU31OwyzUms1sGt0wci868vznh4ir7jaqXKSkbQ7ImrN2z2y2KxE8ryJZZ1qpKhqoCKnQPGo4t
acCMxKIu+bWJaBVUVOElfkgC/Ps9WaVygzK4m8p0boZqqRpePbC58ptMgpUDyquDUnArOSyOsmtm3PNGsmdjJ9+Ahq2jfbwKqsCaDTHkGb165gdMFIx3HNh+fZas1VQ2YlkcfUCM3VSwEo+T3Wps2h111Ci5wCxaOKW3MiMCuJPG5iraqXGkng/vELjtvezzEBRQ348jxbrTkRmJVEHjexTquXhnlgmEcVz+REYFYiWW9ic1UvDXMCsPbcWGxWIbM1xDoJVJdLBGYV0qp6afrwET52x96uX9MJoPycCMwqJl29lKUU8OaTRnjwv16SV1jWR04EZhXkaiBLcyKYRZUnp7Lhkv4sZ5lm0glgODkRtOHJqWxYtJrNsxtOAsPLiaANT05lw6LVZ3k+nACGXy7dRyVdImm/pAOSxlvs/4SkRyQ9KOkeSWen9h2TtDf5mWw+t188OZUNg6Xj21uOG+iUk0A1ZC4RSBoBbgYuAg4CD0iajIhHUoftAWoR8ZKkD1Nfs/jKZN+RiFiVNY685TGvi1k/FT1LqA2PPKqGzgUORMQTAJJuB9YBryWCiLg3dfwu4Ooc3rdQnpzKyqosPYLcGWNw5JEIxoCnU88PAufNcvy1wI9Sz98oaYr6esYTEbGt1UmSNgGbAJYsWZIl3o54ciorm7IkAHBnjEHT08ZiSVcDNeDdqc1nR8S0pHOAnZL2RcTjzedGxFZgK9TXLO5FvJ6cysqiTEkA3Blj0OSRCKaBs1LPz0y2HUfShcB1wLsj4uXG9oiYTn4/Iek+YDUwIxGY2UxlSwAN7owxWPLoNfQAsFzSMkknAhuA43r/SFoNfAW4NCKeS21fJOmk5PGpwBpSbQtm1l5ZkwB4pbBBk7lEEBGvSPoosAMYAW6NiIclXQ9MRcQkcCPw28B3JQH8MiIuBd4OfEXSq9ST0kRTbyMza1LmBNDgzhiDRRE9qW7PVa1Wi6mpqX6HYdZzWZLA8t/9Le76xPn5BZORew31nqTdEVFr3u6RxWYlMAylgGbujDE4nAjMBpwHhlnRnAjMBtQwlgJsMDkRmA0glwKsl5wIzAaISwHWD04EZgPCpQDrFycCsz5zKcD6zYnArE/eumU7r2QYxuMEYHlxIjDrA5cCbJA4EZj1UNYE8MUrV3kQluXOicCsR1wKsEHlRGBWMCcAG3S5LF5vZq1lSQJXv3OJk4D1hEsEZgVwKcDKxCUCs5xlSQKjC0b44pWr8gvGrAMuEZjlJGspYMxz8luf5JIIJF0C/B31FcpuiYiJpv0nAV8H/i3wPHBlRDyV7NsCXAscA/4yInbkEZNZL3l6CCuzzIlA0ghwM3ARcBB4QNJk05KT1wIvRsRbJW0APg9cKekd1Nc4/n3gDOBuSb8XEccwKwG3BdgwyKNEcC5wICKeAJB0O7CO4xehXwd8Jnl8J/Al1RcvXgfcHhEvA09KOpC83k9yiMusUC4F2LDIIxGMAU+nnh8Ezmt3TLLY/W+AU5Ltu5rObVlBKmkTsAlgyZIlOYRt1h2XAmzYlKaxOCK2Aluhvnh9n8OxCnICsGGVR/fRaeCs1PMzk20tj5F0AvA71BuNOznXrO+cBGyY5VEieABYLmkZ9Zv4BuDPmo6ZBDZSr/u/HNgZESFpEvifkr5AvbF4OfBPOcRklgsnAKuCzIkgqfP/KLCDevfRWyPiYUnXA1MRMQn8PfCNpDH4BerJguS471BvWH4F+Ih7DNmgcBKwqlBE+arba7VaTE1N9TsMG1JOADasJO2OiFrz9tI0Fpv1gruEWhU5EZjhUoBVmxOBVZ5LAVZ1TgRWWS4FmNU5EVgluRRg9jonAqsUlwLMZnIisEr49LZ9fHPXL7s+3wnAhpkTgQ09lwLMZudEYEPLCcCsM04ENvC27Znmxh37eebwEc7ocDlHJwGzzjkR2EDbtmeaLd/bx5Gj9Smopg8fYcv39gG0TAZOAGbzl8c01GaFuXHH/teSQMORo8e4ccf+GcdmSQLCScCqyyUCG2jPHD4y53aXAsyycSKwgXbGwlGmWySDMxaOAh4YZpYHVw3ZQNu8dgWjC0aO2za6YITpw0ecBMxy4hKBDbRGg3C611CrEkKnnADMZspUIpB0sqS7JD2W/F7U4phVkn4i6WFJD0q6MrXva5KelLQ3+VmVJR4bTutXj3H/+AUEOAmYFSBriWAcuCciJiSNJ88/1XTMS8AHIuIxSWcAuyXtiIjDyf7NEXFnxjhsyLkayKw4WRPBOuD85PFtwH00JYKIeDT1+BlJzwGLgcMZ39sqwD2CzIqXNRGcFhHPJo9/BZw228GSzgVOBB5Pbf6cpL8C7gHGI+LlNuduAjYBLFmyJGPYNujedt0P+Zdj3a+n7QRg1rk5E4Gku4G3tNh1XfpJRISktv9zJZ0OfAPYGBGvJpu3UE8gJwJbqZcmrm91fkRsTY6hVqt1f4fok26mSagqlwLMemvORBARF7bbJ+nXkk6PiGeTG/1zbY57M7AduC4idqVeu1GaeFnSV4FPziv6kpjvNAlV5QRg1h9ZxxFMAhuTxxuBHzQfIOlE4PvA15sbhZPkgSQB64GHMsYzkOYzTUJVOQmY9U/WNoIJ4DuSrgV+AVwBIKkGfCgiPphsexdwiqRrkvOuiYi9wLckLaY+1cte4EMZ4xlInUyTUFVOAGb9lykRRMTzwB+12D4FfDB5/E3gm23OvyDL+5fFXNMkVFXWJCBgzcROt7eYZeQpJnqg3TQJm9eu6FNE/bV0fHumJNC4lo0BZlu+t49te6Zzis6sepwIemD96jFuuGwlYwtHETC2cJQbLltZyW+xWQeGjS0cdXuLWc4811CPrF89Vskbf0NebQFubzHLnxOBFS7P6SHc3mKWP1cNWWGytgW06hHk9haz/LlEUHFFjXjOkgDGFo5y/3jrDmWtpqV2ryGzbJwIKqyIEc9Z2wI6+XZf9fYWs7w5EVTYbCOe53ujfeuW7bySYQYogb/dm/WJE0GF5dUDx6ODzcrNiaDCsvbAcQIwGw7uNVRhWXrgOAmYDY9KlwiqvkZANz1wnADMhk9lE4HXCKibTw+cLEngBMGBG5wEzAZRZRNBnj1mhp1LAfNX9dKmlUtlE4HnrOlM3iODq8ClTSubTI3Fkk6WdJekx5Lfi9ocd0zS3uRnMrV9maSfSjog6Y5kNbOeaNczxnPW1BUxPURVeEU6K5usvYbGgXsiYjlwT/K8lSMRsSr5uTS1/fPATRHxVuBF4NqM8XTMc9a0lzUBVDkJgEubVj5ZE8E64Lbk8W3U1x3uSLJO8QVAYx3jeZ2fldcImMmlgHy4tGllk7WN4LSIeDZ5/CvgtDbHvVHSFPAKMBER24BTgMMR8UpyzEGg7V1Y0iZgE8CSJUsyhl3nOWte5wSQn81rVxzXRgAubdpgmzMRSLobeEuLXdeln0RESGo328zZETEt6Rxgp6R9wG/mE2hEbAW2AtRqtQyz2liaewTlzzOkWtnMmQgi4sJ2+yT9WtLpEfGspNOB59q8xnTy+wlJ9wGrgX8AFko6ISkVnAl44dkecQIolkubViZZ2
wgmgY3J443AD5oPkLRI0knJ41OBNcAjERHAvcDls51v+XMSMLO0rG0EE8B3JF0L/AK4AkBSDfhQRHwQeDvwFUmvUk88ExHxSHL+p4DbJX0W2AP8fcZ4hk6eA5OcAMysFdW/mJdLrVaLqampfodRuOaBSVBvdOymd1NeScAjZs3KS9LuiKg1b6/syOIyyGMajKwJQMCTqSTgEbNmw8fTUA+wrAOTsiYBOL7vu0fMmg0nlwgGWLcLx+SRAGBm33ePmDUbTi4RDLDNa1ewYETHbVswolkHJmVJAl+8ctWsI609YtZsOLlEMOia2/LbtO3n1Rg8W12/R8yaDScnggF24479HH31+Dv/0VdjRmNxr6aH8IhZs+HkRDDA5qqT78e4AI+YNRs+TgQDrF1jsfAkcWaWHzcW52jbnmnWTOxk2fh21kzsZNuebFMntVozAeDVDK/pJGBmzVwiyEkRg63SdfKtSgbz4QRgZu24RJCTogZbrV895iRgZoVyiSAnRQy28iRxZtYLLhHkJO/BVk4CZtYrLhHkJK/BVk4AZtZrTgQ5yWOwlbuEmlk/OBHkqNvBVi4FmFk/ZUoEkk4G7gCWAk8BV0TEi03HvAe4KbXpbcCGiNgm6WvAu3l9IftrImJvlpjKxqUAM+u3rCWCceCeiJiQNJ48/1T6gIi4F1gFryWOA8D/Sh2yOSLuzBhHTwzyspFeOczMupU1EawDzk8e3wbcR1MiaHI58KOIeCnj+/ZcngPG8i4FtIrt43fsZeoXL/DZ9Ss7fm0nE7Nqytp99LSIeDZ5/CvgtDmO3wB8u2nb5yQ9KOkmSSe1O1HSJklTkqYOHTqUIeTu5DFgbOn49kKqglrFFsC3dv2y42kuGslk+vARgtcTXdZpMsxs8M2ZCCTdLemhFj/r0sdFRNB2tnyQdDqwEtiR2ryFepvBvwNOZpbSRERsjYhaRNQWL148V9i5yzJg7KIv3JcpAYwuGOGLV65qu79dDAEdJyovQ2lWXXNWDUXEhe32Sfq1pNMj4tnkRv/cLC91BfD9iDiaeu1GaeJlSV8FPtlh3D3Xz2Uj51qwvl1s0PnIZi9DaVZdWauGJoGNyeONwA9mOfYqmqqFkuSBJAHrgYcyxlOYVjOBzjZgLGs1ULPZbsib165AbfZ1OrLZy1CaVVfWRDABXCTpMeDC5DmSapJuaRwkaSlwFvC/m87/lqR9wD7gVOCzGeMpzPrVY9xw2cpZ1/RtyDMBNMx2Q16/eoz/+M4lM5LBfEY2zzfRmdnwUL1qv1xqtVpMTU31O4wZ8ugS2twDCOo35HZJJy1rrx/3GjIbbpJ2R0RtxnYngnzkOS7AN2QzK0K7ROApJjIqYnoIrwtsZr3kRNBGJ9/KPT2EmQ0DJ4IW5hpF3OtJ4lxVZGZFciJoYbbBVR+7Y2/Xr9tNKaCItZDNzNK8QlkL7frsZ1k7uNuqII/4NbOiuUTQwmwjdecra1uAR/yaWdGcCBLpeviFb1rAgjeIo69m61qbR4Nwt1NbmJl1yomAmfXwL750dI4zZpdnj6C81kLuNzd4mw2uSieCxs0pr2ogyL9baB5rIfebG7zNBltlE0GrqRyyKHJcQNkHmM3W4F3mf5fZsKhsr6FWN6dueXDY7NzgbTbYKlsiyOMm5IFhnXGDt1l7g3BfqEwi2LZnms9MPszhI9kaghvGFo6ybc90x3+wKteTD0uDt1neBuW+UImqoW17ptn83Z/llgRg/mv6Vnlg2HzWcjCrkkG5L1SiRHDjjv2ZxwSMtajemE+DZ9Xrycve4G1WhNlmMVgzsbNn1UWZSgSS/lTSw5JelTRjjuvUcZdI2i/pgKTx1PZlkn6abL9D0olZ4mkn681WtJ9eotPX9lKQr9u2Z5o1EztZNr6dNRM7Oy5VmQ2bdv//G/ecYP61D93IWjX0EHAZ8ON2B0gaAW4G3gu8A7hK0juS3Z8HboqItwIvAtdmjKelbssCjaUbZzu/0xu5l4Ksa9SJ9vJDbjaoWt0XxMx7TtHVRZkSQUT8PCLmiu5c4EBEPBER/wrcDqxLFqy/ALgzOe426gvYD4QRac7upfO5kbuevG5Q6kTNBkGr+0K7L55FViP3oo1gDHg69fwgcB5wCnA4Il5JbW97V5S0CdgEsGTJkmIiTTk2yxKegq7q7VxP7rYSs2bN94U1Ezt73t16zkQg6W7gLS12XRcRP8g/pNYiYiuwFeprFhf9fiNSy2QwtnCU+8cvKPrth5bHFJjNrh/dreesGoqICyPiD1r8dJoEpoGzUs/PTLY9DyyUdELT9p5rVXd/1XlnuU6/AG4rMZtdP6qRe1E19ACwXNIy6jf6DcCfRURIuhe4nHq7wUagkBLGUxPvb7u85FMT7287sq929sl9H/E3bIZhEj2zovW6GlkxS134nCdLfwz8d2AxcBjYGxFrJZ0B3BIR70uOex/wRWAEuDUiPpdsP4d6EjgZ2ANcHREvz/W+tVotpqamuo7bzKyKJO2OiBld/TMlgn5xIjAzm792iaASU0yYmVl7TgRmZhXnRGBmVnFOBGZmFVfKxmJJh4BfdHn6qcA/5xhOXhzX/Diu+XFc8zOscZ0dEYubN5YyEWQhaapVq3m/Oa75cVzz47jmp2pxuWrIzKzinAjMzCquiolga78DaMNxzY/jmh/HNT+ViqtybQRmZna8KpYIzMwsxYnAzKzihjIRSPpTSQ9LelVS265Wki6RtF/SAUnjqe3LJP002X6HpBNziutkSXdJeiz5vajFMe+RtDf18y+S1if7vibpydS+Vb2KKznuWOq9J1Pb+3m9Vkn6SfL3flDSlal9uV6vdp+X1P6Tkn//geR6LE3t25Js3y9pbZY4uojrE5IeSa7PPZLOTu1r+TftUVzXSDqUev8PpvZtTP7uj0na2OO4bkrF9Kikw6l9hVwvSbdKek7SQ232S9J/S2J+UNIfpvZlv1YRMXQ/wNuBFcB9QK3NMSPA48A5wInAz4B3JPu+A2xIHn8Z+HBOcf0tMJ48Hgc+P8fxJwMvAG9Knn8NuLyA69VRXMD/a7O9b9cL+D1gefL4DOBZYGHe12u2z0vqmP8CfDl5vAG4I3n8juT4k4BlyeuM9DCu96Q+Qx9uxDXb37RHcV0DfKnFuScDTyS/FyWPF/Uqrqbj/4L61PlFX693AX8IPNRm//uAH1FfKfedwE/zvFZDWSKIiJ9HxFyroZ8LHIiIJyLiX6mvi7BOkoALgDuT424D1ucU2rrk9Tp93cuBH0XESzm9fzvzjes1/b5eEfFoRDyWPH4GeI76+hh5a/l5mSXeO4E/Sq7POuD2iHg5Ip4EDiSv15O4IuLe1GdoF/XVAIvWyfVqZy1wV0S8EBEvAncBl/QprquAb+f03m1FxI+pf+lrZx3w9ajbRX11x9PJ6VoNZSLo0BjwdOr5wWTbKcDhiHilaXseTouIZ5PHvwJOm+P4Dcz8EH4uKRreJOmkHsf1RklTknY1qqsYoOsl6Vzq3/IeT23O63q1+7y0PCa5Hr+hfn06ObfIuNKupf7NsqHV37SXcf1J8ve5U1JjSduB
uF5JFdoyYGdqc1HXay7t4s7lWvViqcpCSLobeEuLXddF5+sp5262uNJPIiIkte27m2T7lcCO1OYt1G+IJ1LvT/wp4PoexnV2REyrvrLcTkn7qN/supbz9foGsDEiXk02d329hpGkq4Ea8O7U5hl/04h4vPUr5O4fgW9HxMuS/jP10tQFPXrvTmwA7oyIY6lt/bxehSltIoiICzO+xDRwVur5mcm256kXu05IvtU1tmeOS9KvJZ0eEc8mN67nZnmpK4DvR8TR1Gs3vh2/LOmrwCd7GVdETCe/n5B0H7Aa+Af6fL0kvRnYTv1LwK7Ua3d9vVpo93lpdcxBSScAv0P989TJuUXGhaQLqSfXd0dqOdg2f9M8bmxzxhURz6ee3kK9Tahx7vlN596XQ0wdxZWyAfhIekOB12su7eLO5VpVuWroAWC56j1eTqT+R5+MegvMvdTr5wE2AnmVMCaT1+vkdWfUTSY3w0a9/HqgZQ+DIuKStKhRtSLpVGAN8Ei/r1fyt/s+9frTO5v25Xm9Wn5eZon3cmBncn0mgQ2q9ypaBiwH/ilDLPOKS9Jq4CvApRHxXGp7y79pD+M6PfX0UuDnyeMdwMVJfIuAizm+ZFxoXElsb6Pe+PqT1LYir9dcJoEPJL2H3gn8Jvmik8+1KqIFvN8/wB9Tryt7Gfg1sCPZfgbww9Rx7wMepZ7Rr0ttP4f6f9QDwHeBk3KK6xTgHuAx4G7g5GR7DbglddxS6pn+DU3n7wT2Ub+hfRP47V7FBfz75L1/lvy+dhCuF3A1cBTYm/pZVcT1avV5oV7VdGny+I3Jv/9Acj3OSZ17XXLefuC9OX/e54rr7uT/QeP6TM71N+1RXDcADyfvfy/wttS5/ym5jgeAP+9lXMnzzwATTecVdr2of+l7NvksH6TelvMh4EPJfgE3JzHvI9UbMo9r5SkmzMwqrspVQ2ZmhhOBmVnlORGYmVWcE4GZWcU5EZiZVZwTgZlZxTkRmJlV3P8HgKVZJniedHAAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "msk =np.argmax(ycand_val[\"cls\"], axis=-1).flatten()==1\n", + "plt.scatter(vals_a[msk], vals_b[msk])" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/parameters/cms-gnn-dense-dev.yaml b/parameters/cms-gnn-dense-dev.yaml index b1d4016f3..6609a568a 100644 --- a/parameters/cms-gnn-dense-dev.yaml +++ b/parameters/cms-gnn-dense-dev.yaml @@ -23,7 +23,7 @@ dataset: padded_num_elem_size: 6400 #(pt, eta, sin phi, cos phi, E) num_momentum_outputs: 5 - classification_loss_coef: 10.0 + classification_loss_coef: 0.01 charge_loss_coef: 0.1 pt_loss_coef: 1.0 eta_loss_coef: 0.1 @@ -34,6 +34,8 @@ dataset: processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 + energy_loss: MeanSquaredLogarithmicError + pt_loss: MeanSquaredLogarithmicError tensorflow: eager: no @@ -42,11 +44,11 @@ setup: train: yes weights: weights_config: - lr: 5e-4 - batch_size: 5 + lr: 1e-5 + batch_size: 2 num_events_train: 80000 num_events_test: 10000 - num_epochs: 500 + num_epochs: 50 num_val_files: 10 dtype: float32 trainable: all @@ -65,8 +67,8 @@ sample_weights: parameters: model: gnn_dense activation: elu - layernorm: no - hidden_dim: 256 + layernorm: yes + hidden_dim: 512 bin_size: 40 clip_value_low: 0.00 num_conv: 2 From 71b80ffdbe0015530979f670f108def834be7b89 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Wed, 18 Aug 2021 10:26:38 +0300 Subject: [PATCH 047/157] up --- mlpf/tfmodel/model.py | 6 +++++- mlpf/tfmodel/model_setup.py | 2 +- mlpf/tfmodel/utils.py | 2 +- parameters/cms-gnn-dense-dev.yaml | 26 +++++++++++++------------- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index a04ab1f70..a61ce883f 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -10,7 +10,7 @@ import numpy as np from numpy.lib.recfunctions import append_fields -regularizer_weight = 1e-9 +regularizer_weight = 0.0 def split_indices_to_bins(cmul, nbins, bin_size): bin_idx = tf.argmax(cmul, axis=-1) @@ -976,6 +976,8 @@ def __init__(self, self.ffn_enc_id = point_wise_feed_forward_network(dff, dff, activation=activation, name="ffn_enc_id") self.ffn_enc_reg = point_wise_feed_forward_network(dff, dff, activation=activation, name="ffn_enc_reg") + self.momentum_mult = self.add_weight(shape=(num_momentum_outputs, ), initializer=tf.keras.initializers.Ones(), name="momentum_multiplication") + kwargs_cg = { "output_dim": dff, "max_num_bins": max_num_bins, @@ -1075,6 +1077,8 @@ def call(self, inputs, training=False): else: pred_momentum = self.ffn_momentum(dec_output_reg)*msk_input + pred_momentum = self.momentum_mult*pred_momentum + out_charge = tf.clip_by_value(out_charge, -2, 2) ret = { diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 1855bcf6a..9aaaaceb9 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -91,7 +91,7 @@ def __init__(self, outpath, X, y, dataset_transform, num_output_classes, freq=1) "eta": np.linspace(-5, 5, 100), "sin_phi": np.linspace(-1,1,100), "cos_phi": np.linspace(-1,1,100), - "energy": np.linspace(0,100,100), + "energy": np.linspace(0,1000,100), } def plot_cm(self, outpath, ypred_id, msk): diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 909e8c2f3..a10e1cbc8 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -152,7 +152,7 @@ def 
compute_weights_none(X, y, w): def make_weight_function(config): def weight_func(X,y,w): - w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0) + w_signal_only = tf.where(y[:, 0]==0, 0.0, tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)) w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32) w_none = tf.ones_like(w) diff --git a/parameters/cms-gnn-dense-dev.yaml b/parameters/cms-gnn-dense-dev.yaml index 6609a568a..8e82f5744 100644 --- a/parameters/cms-gnn-dense-dev.yaml +++ b/parameters/cms-gnn-dense-dev.yaml @@ -23,19 +23,19 @@ dataset: padded_num_elem_size: 6400 #(pt, eta, sin phi, cos phi, E) num_momentum_outputs: 5 - classification_loss_coef: 0.01 - charge_loss_coef: 0.1 + classification_loss_coef: 1.0 + charge_loss_coef: 1.0 pt_loss_coef: 1.0 - eta_loss_coef: 0.1 + eta_loss_coef: 1.0 sin_phi_loss_coef: 1.0 cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.01 + energy_loss_coef: 1.0 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - energy_loss: MeanSquaredLogarithmicError - pt_loss: MeanSquaredLogarithmicError + energy_loss: Huber + pt_loss: Huber tensorflow: eager: no @@ -44,11 +44,11 @@ setup: train: yes weights: weights_config: - lr: 1e-5 + lr: 1e-3 batch_size: 2 num_events_train: 80000 num_events_test: 10000 - num_epochs: 50 + num_epochs: 100 num_val_files: 10 dtype: float32 trainable: all @@ -65,17 +65,17 @@ sample_weights: energy: signal_only parameters: - model: gnn_dense + model: gnn_dense activation: elu layernorm: yes - hidden_dim: 512 + hidden_dim: 128 bin_size: 40 - clip_value_low: 0.00 + clip_value_low: 0.0 num_conv: 2 - num_gsl: 2 + num_gsl: 5 normalize_degrees: yes distance_dim: 16 - dropout: 0.0 + dropout: 0.1 separate_momentum: yes input_encoding: cms graph_kernel: learnable #gaussian, learnable From 72d8ee494863527b796a61b9eed7912815d5c975 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Wed, 18 Aug 2021 12:08:30 +0300 Subject: [PATCH 048/157] fix multi gpu training --- mlpf/pipeline.py | 63 ++++++++++++++++--------------- mlpf/tfmodel/utils.py | 20 +++++++--- parameters/cms-gnn-dense-dev.yaml | 12 ++++-- 3 files changed, 54 insertions(+), 41 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index ad14b6786..cda0eaae4 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -72,6 +72,12 @@ def train(config, weights, ntrain, ntest, recreate, prefix): config, ntrain, ntest, weights ) + # Decide tf.distribute.strategy depending on number of available GPUs + strategy, maybe_global_batch_size = get_strategy(global_batch_size) + # If using more than 1 GPU, we scale the batch size by the number of GPUs before the dataset is loaded + if maybe_global_batch_size is not None: + global_batch_size = maybe_global_batch_size + dataset_def = get_dataset_def(config) ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=False) @@ -82,11 +88,6 @@ def train(config, weights, ntrain, ntest, recreate, prefix): outdir = str(Path(weights).parent) shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference - # Decide tf.distribute.strategy depending on number of available GPUs - strategy, maybe_global_batch_size = get_strategy(global_batch_size) - # If using more than 1 GPU, we scale the batch size by the number of GPUs - if 
maybe_global_batch_size is not None: - global_batch_size = maybe_global_batch_size total_steps = n_epochs * n_train // global_batch_size lr = float(config["setup"]["lr"]) @@ -150,32 +151,32 @@ def train(config, weights, ntrain, ntest, recreate, prefix): ) callbacks.append(optim_callbacks) - fit_result = model.fit( - ds_train_r, - validation_data=ds_test_r, - epochs=initial_epoch + n_epochs, - callbacks=callbacks, - steps_per_epoch=n_train // global_batch_size, - validation_steps=n_test // global_batch_size, - initial_epoch=initial_epoch, - ) - history_path = Path(outdir) / "history" - history_path = str(history_path) - with open("{}/history.json".format(history_path), "w") as fi: - json.dump(fit_result.history, fi) - model.save(outdir + "/model_full", save_format="tf") - - print("Training done.") - - print("Starting evaluation...") - eval_dir = Path(outdir) / "evaluation" - eval_dir.mkdir() - eval_dir = str(eval_dir) - # TODO: change to use the evaluate() function below instead of eval_model() - eval_model(X_val, ygen_val, ycand_val, model, config, eval_dir, global_batch_size) - print("Evaluation done.") - - freeze_model(model, config, outdir) + fit_result = model.fit( + ds_train_r, + validation_data=ds_test_r, + epochs=initial_epoch + n_epochs, + callbacks=callbacks, + steps_per_epoch=n_train // global_batch_size, + validation_steps=n_test // global_batch_size, + initial_epoch=initial_epoch, + ) + history_path = Path(outdir) / "history" + history_path = str(history_path) + with open("{}/history.json".format(history_path), "w") as fi: + json.dump(fit_result.history, fi) + model.save(outdir + "/model_full", save_format="tf") + + print("Training done.") + + print("Starting evaluation...") + eval_dir = Path(outdir) / "evaluation" + eval_dir.mkdir() + eval_dir = str(eval_dir) + # TODO: change to use the evaluate() function below instead of eval_model() + eval_model(X_val, ygen_val, ycand_val, model, config, eval_dir, global_batch_size) + print("Evaluation done.") + + freeze_model(model, config, outdir) @main.command() diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index a10e1cbc8..3e390eb2e 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -311,16 +311,24 @@ def get_class_loss(config): return cls_loss +def get_loss_from_params(input_dict): + input_dict = input_dict.copy() + loss_type = input_dict.pop("type") + loss_cls = getattr(tf.keras.losses, loss_type) + return loss_cls(**input_dict) + def get_loss_dict(config): cls_loss = get_class_loss(config) + + default_loss = {"type": "MeanSquaredError"} loss_dict = { "cls": cls_loss, - "charge": getattr(tf.keras.losses, config["dataset"].get("charge_loss", "MeanSquaredError"))(), - "pt": getattr(tf.keras.losses, config["dataset"].get("pt_loss", "MeanSquaredError"))(), - "eta": getattr(tf.keras.losses, config["dataset"].get("eta_loss", "MeanSquaredError"))(), - "sin_phi": getattr(tf.keras.losses, config["dataset"].get("sin_phi_loss", "MeanSquaredError"))(), - "cos_phi": getattr(tf.keras.losses, config["dataset"].get("cos_phi_loss", "MeanSquaredError"))(), - "energy": getattr(tf.keras.losses, config["dataset"].get("energy_loss", "MeanSquaredError"))(), + "charge": get_loss_from_params(config["dataset"].get("charge_loss", default_loss)), + "pt": get_loss_from_params(config["dataset"].get("pt_loss", default_loss)), + "eta": get_loss_from_params(config["dataset"].get("eta_loss", default_loss)), + "sin_phi": get_loss_from_params(config["dataset"].get("sin_phi_loss", default_loss)), + "cos_phi": 
get_loss_from_params(config["dataset"].get("cos_phi_loss", default_loss)), + "energy": get_loss_from_params(config["dataset"].get("energy_loss", default_loss)), } loss_weights = { "cls": config["dataset"]["classification_loss_coef"], diff --git a/parameters/cms-gnn-dense-dev.yaml b/parameters/cms-gnn-dense-dev.yaml index 8e82f5744..8ee75c77e 100644 --- a/parameters/cms-gnn-dense-dev.yaml +++ b/parameters/cms-gnn-dense-dev.yaml @@ -29,13 +29,17 @@ dataset: eta_loss_coef: 1.0 sin_phi_loss_coef: 1.0 cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 + energy_loss_coef: 0.1 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - energy_loss: Huber - pt_loss: Huber + energy_loss: + type: Huber + delta: 10.0 + pt_loss: + type: Huber + delta: 10.0 tensorflow: eager: no @@ -44,7 +48,7 @@ setup: train: yes weights: weights_config: - lr: 1e-3 + lr: 1e-4 batch_size: 2 num_events_train: 80000 num_events_test: 10000 From 3a1af281f634ddcf5485b635376bd4d1f80e29fa Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 19 Aug 2021 13:53:33 +0300 Subject: [PATCH 049/157] remove outdated code --- mlpf/pipeline.py | 106 ++++- mlpf/tfmodel/model.py | 648 ++--------------------------- mlpf/tfmodel/model_setup.py | 64 +-- mlpf/tfmodel/utils.py | 13 +- notebooks/pfnet-debug.ipynb | 230 ++++++---- parameters/cms-gnn-dense-dev.yaml | 18 +- parameters/test-cms-v2.yaml | 6 +- parameters/test-cms.yaml | 77 ---- scripts/local_test_cms_pipeline.sh | 13 +- 9 files changed, 330 insertions(+), 845 deletions(-) delete mode 100644 parameters/test-cms.yaml diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index cda0eaae4..6cc03090c 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -141,15 +141,15 @@ def train(config, weights, ntrain, ntest, recreate, prefix): ) model.summary() - callbacks = prepare_callbacks( - model, - outdir, - X_val, - ycand_val, - dataset_transform, - config["dataset"]["num_output_classes"], - ) - callbacks.append(optim_callbacks) + callbacks = prepare_callbacks( + model, + outdir, + X_val, + ycand_val, + dataset_transform, + config["dataset"]["num_output_classes"], + ) + callbacks.append(optim_callbacks) fit_result = model.fit( ds_train_r, @@ -160,6 +160,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix): validation_steps=n_test // global_batch_size, initial_epoch=initial_epoch, ) + history_path = Path(outdir) / "history" history_path = str(history_path) with open("{}/history.json".format(history_path), "w") as fi: @@ -179,6 +180,93 @@ def train(config, weights, ntrain, ntest, recreate, prefix): freeze_model(model, config, outdir) +@main.command() +@click.help_option("-h", "--help") +@click.option("-c", "--config", help="configuration file", type=click.Path()) +@click.option("-w", "--weights", default=None, help="trained weights to load", type=click.Path()) +@click.option("--ntrain", default=None, help="override the number of training events", type=int) +@click.option("--ntest", default=None, help="override the number of testing events", type=int) +@click.option("-r", "--recreate", help="force creation of new experiment dir", is_flag=True) +@click.option("-p", "--prefix", default="", help="prefix to put at beginning of training dir name", type=str) +def train_reg(config, weights, ntrain, ntest, recreate, prefix): + config_file_path = config + config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = 
parse_config( + config, ntrain, ntest, weights + ) + + if recreate or (weights is None): + outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node()) + else: + outdir = str(Path(weights).parent) + shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference + + total_steps = n_epochs * n_train // global_batch_size + lr = float(config["setup"]["lr"]) + + dataset_def = get_dataset_def(config) + ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test, repeat=False) + X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=False) + + opt = tf.keras.optimizers.Adam(learning_rate=lr) + + if config["setup"]["dtype"] == "float16": + model_dtype = tf.dtypes.float16 + policy = mixed_precision.Policy("mixed_float16") + mixed_precision.set_global_policy(policy) + opt = mixed_precision.LossScaleOptimizer(opt) + else: + model_dtype = tf.dtypes.float32 + + model = make_model(config, model_dtype) + model(tf.cast(X_val[:1], model_dtype)) + + initial_epoch = 0 + if weights: + # We need to load the weights in the same trainable configuration as the model was set up + configure_model_weights(model, config["setup"].get("weights_config", "all")) + model.load_weights(weights, by_name=True) + initial_epoch = int(weights.split("/")[-1].split("-")[1]) + model(tf.cast(X_val[:1], model_dtype)) + + callbacks = prepare_callbacks( + model, + outdir, + X_val, + ycand_val, + dataset_transform, + config["dataset"]["num_output_classes"], + ) + + configure_model_weights(model, "regression") + for epoch in range(initial_epoch, initial_epoch+n_epochs): + + loss_vals = [] + for step, (xb, yb, wb) in tqdm(enumerate(ds_train_r)): + with tf.GradientTape() as tape: + res = model(xb, training=True) + + cls_true = tf.argmax(yb["cls"], axis=-1) + cls_pred = tf.argmax(res["cls"], axis=-1) + msk_x = xb[:, :, 0]!=0 + msk_correct = msk_x & (cls_true==cls_pred) & (cls_true==3) + + msk_correct_f = tf.expand_dims(tf.cast(msk_correct, tf.float32), axis=-1) + + loss_value = tf.keras.losses.mean_squared_error(tf.math.log(yb["energy"]*msk_correct_f + 1.0), tf.math.log(res["energy"]*msk_correct_f + 1.0)) + #loss_value = loss_value + tf.keras.losses.huber(yb["pt"]*msk_correct_f, res["pt"]*msk_correct_f, delta=5.0) + #loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["eta"]*msk_correct_f, res["eta"]*msk_correct_f) + #loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["sin_phi"]*msk_correct_f, res["sin_phi"]*msk_correct_f) + #loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["cos_phi"]*msk_correct_f, res["cos_phi"]*msk_correct_f) + loss_value = tf.reduce_mean(loss_value) + loss_vals.append(loss_value.numpy()) + #import pdb;pdb.set_trace() + + grads = tape.gradient(loss_value, model.trainable_weights) + opt.apply_gradients(zip(grads, model.trainable_weights)) + + print(epoch, np.mean(loss_vals)) + callbacks[-1].on_epoch_end(epoch, logs={}) + @main.command() @click.help_option("-h", "--help") @click.option("-t", "--train_dir", required=True, help="directory containing a completed training", type=click.Path()) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index a61ce883f..cc46559fb 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -49,7 +49,7 @@ def pairwise_learnable_dist(A, B, ffn): ) #(batch, bin, elem, elem, feat) #run a feedforward net on (src, dst) -> 1 - res_transformed = tf.squeeze(ffn(res), axis=-1) + 
res_transformed = ffn(res) return res_transformed @@ -170,40 +170,6 @@ def call(self, X): #https://arxiv.org/pdf/2004.04635.pdf #https://github.com/gcucurull/jax-ghnet/blob/master/models.py -class GHConv(tf.keras.layers.Layer): - def __init__(self, *args, **kwargs): - self.activation = kwargs.pop("activation") - - super(GHConv, self).__init__(*args, **kwargs) - - def build(self, input_shape): - self.hidden_dim = input_shape[0][-1] - self.nelem = input_shape[0][-2] - self.W_t = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="w_t", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - self.b_t = self.add_weight(shape=(self.hidden_dim,), name="b_t", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - self.W_h = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="w_h", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - self.theta = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="theta", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - - #@tf.function - def call(self, inputs): - x, adj = inputs - - #compute the normalization of the adjacency matrix - in_degrees = tf.sparse.reduce_sum(tf.abs(adj), axis=-1) - in_degrees = tf.reshape(in_degrees, (tf.shape(x)[0], tf.shape(x)[1])) - - #add epsilon to prevent numerical issues from 1/sqrt(x) - norm = tf.expand_dims(tf.pow(in_degrees + 1e-6, -0.5), -1) - - f_hom = tf.linalg.matmul(x, self.theta) - f_hom = sparse_dense_matmult_batch(adj, f_hom*norm)*norm - - f_het = tf.linalg.matmul(x, self.W_h) - gate = tf.nn.sigmoid(tf.linalg.matmul(x, self.W_t) + self.b_t) - - out = gate*f_hom + (1-gate)*f_het - return self.activation(out) - class GHConvDense(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): self.activation = kwargs.pop("activation") @@ -242,34 +208,22 @@ def call(self, inputs): out = gate*f_hom + (1.0-gate)*f_het return self.activation(out)*msk -class SGConv(tf.keras.layers.Layer): +class MPNNNodeFunction(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): - self.activation = kwargs.pop("activation") - self.k = kwargs.pop("k") - super(SGConv, self).__init__(*args, **kwargs) - - def build(self, input_shape): - hidden_dim = input_shape[0][-1] - self.W = self.add_weight(shape=(hidden_dim, hidden_dim), name="w", initializer="random_normal", trainable=True) - self.b = self.add_weight(shape=(hidden_dim,), name="b", initializer="random_normal", trainable=True) - #@tf.function - def call(self, inputs): - x, adj = inputs - #compute the normalization of the adjacency matrix - in_degrees = tf.sparse.reduce_sum(tf.abs(adj), axis=-1) - - #add epsilon to prevent numerical issues from 1/sqrt(x) - norm = tf.expand_dims(tf.pow(in_degrees + 1e-6, -0.5), -1) - norm_k = tf.pow(norm, self.k) + self.output_dim = kwargs.pop("output_dim") + self.hidden_dim = kwargs.pop("hidden_dim") + self.num_layers = kwargs.pop("num_layers") + self.activation = kwargs.pop("activation") - support = tf.linalg.matmul(x, self.W) - - #k-th power of the normalized adjacency matrix is nearly equivalent to k consecutive GCN layers - #adj_k = tf.pow(adj, self.k) - out = sparse_dense_matmult_batch(adj, support*norm)*norm + self.ffn = point_wise_feed_forward_network(self.output_dim, self.hidden_dim, num_layers=self.num_layers, activation=self.activation) + super(MPNNNodeFunction, self).__init__(*args, **kwargs) - return 
self.activation(out + self.b) + def call(self, inputs): + x, adj, msk = inputs + avg_message = tf.reduce_mean(adj, axis=-2) + x2 = tf.concat([x, avg_message], axis=-1)*msk + return self.ffn(x2) def point_wise_feed_forward_network(d_model, dff, num_layers=1, activation='elu', dtype=tf.dtypes.float32, name=None): bias_regularizer = tf.keras.regularizers.L1(regularizer_weight) @@ -280,142 +234,16 @@ def point_wise_feed_forward_network(d_model, dff, num_layers=1, activation='elu' name=name ) -class SparseHashedNNDistance(tf.keras.layers.Layer): - def __init__(self, distance_dim=128, max_num_bins=200, bin_size=500, num_neighbors=5, dist_mult=0.1, **kwargs): - super(SparseHashedNNDistance, self).__init__(**kwargs) - self.num_neighbors = tf.constant(num_neighbors) - self.dist_mult = dist_mult - self.distance_dim = distance_dim - - #generate the codebook for LSH hashing at model instantiation for up to this many bins - #set this to a high-enough value at model generation to take into account the largest possible input - self.max_num_bins = tf.constant(max_num_bins) - - #each bin will receive this many input elements, in total we can accept max_num_bins*bin_size input elements - #in each bin, we will do a dense top_k evaluation - self.bin_size = bin_size - self.layer_encoding = point_wise_feed_forward_network(distance_dim, 128) - self.layer_edge = point_wise_feed_forward_network(1, 128) - - def build(self, input_shape): - #(n_batch, n_points, n_features) - - #generate the LSH codebook for random rotations (num_features, max_num_bins/2) - self.codebook_random_rotations = self.add_weight( - shape=(self.distance_dim, self.max_num_bins//2), initializer="random_normal", trainable=False, name="lsh_projections" - ) - - #@tf.function - def call(self, inputs, training=True): - - #(n_batch, n_points, n_features) - point_embedding = self.layer_encoding(inputs) - - n_batches = tf.shape(point_embedding)[0] - n_points = tf.shape(point_embedding)[1] - #points_neighbors = n_points * self.num_neighbors - - #cannot concat sparse tensors directly as that incorrectly destroys the gradient, see - #https://github.com/tensorflow/tensorflow/blob/df3a3375941b9e920667acfe72fb4c33a8f45503/tensorflow/python/ops/sparse_grad.py#L33 - def func(args): - ibatch, points_batch = args[0], args[1] - bins_split, (inds, vals) = self.construct_sparse_dm_batch(points_batch) - inds = tf.concat([tf.expand_dims(tf.cast(ibatch, tf.int64)*tf.ones(tf.shape(inds)[0], dtype=tf.int64), -1), inds], axis=-1) - return inds, vals, bins_split - - elems = (tf.range(0, n_batches, delta=1, dtype=tf.int64), point_embedding) - ret = tf.map_fn(func, elems, - fn_output_signature=( - tf.TensorSpec((None, 3), tf.int64), - tf.TensorSpec((None, ), inputs.dtype), - tf.TensorSpec((None, self.bin_size), tf.int32), - ), - parallel_iterations=2, back_prop=True - ) - - # #now create a new SparseTensor that is a concatenation of the per-batch tensor indices and values - shp = tf.shape(ret[0]) - dms = tf.SparseTensor( - tf.reshape(ret[0], (shp[0]*shp[1], shp[2])), - tf.reshape(ret[1], (shp[0]*shp[1],)), - (n_batches, n_points, n_points) - ) - - dm = tf.sparse.reorder(dms) - - i1 = tf.transpose(tf.stack([dm.indices[:, 0], dm.indices[:, 1]])) - i2 = tf.transpose(tf.stack([dm.indices[:, 0], dm.indices[:, 2]])) - x1 = tf.gather_nd(inputs, i1) - x2 = tf.gather_nd(inputs, i2) - - #run an edge net on (src node, dst node, edge) - edge_vals = tf.nn.sigmoid(self.layer_edge(tf.concat([x1, x2, tf.expand_dims(dm.values, axis=-1)], axis=-1))) - dm2 = 
tf.sparse.SparseTensor(indices=dm.indices, values=edge_vals[:, 0], dense_shape=dm.dense_shape) - - return dm2, ret[2] - - #@tf.function - def subpoints_to_sparse_matrix(self, subindices, subpoints): - - #find the distance matrix between the given points in all the LSH bins - dm = pairwise_gaussian_dist(subpoints, subpoints) #(LSH_bins, points_per_bin, points_per_bin) - dm = tf.exp(-self.dist_mult*dm) - - #dm = pairwise_sigmoid_dist(subpoints, subpoints) #(LSH_bins, points_per_bin, points_per_bin) - - dmshape = tf.shape(dm) - nbins = dmshape[0] - nelems = dmshape[1] - - #run KNN in the dense distance matrix, accumulate each index pair into a sparse distance matrix - top_k = tf.nn.top_k(dm, k=self.num_neighbors) - top_k_vals = tf.reshape(top_k.values, (nbins*nelems, self.num_neighbors)) - - indices_gathered = tf.map_fn( - lambda i: tf.gather_nd(subindices, top_k.indices[:, :, i:i+1], batch_dims=1), - tf.range(self.num_neighbors, dtype=tf.int32), fn_output_signature=tf.TensorSpec(None, tf.int32) - ) - indices_gathered = tf.transpose(indices_gathered, [1,2,0]) +def get_conv_layer(config_dict): + config_dict = config_dict.copy() + class_name = config_dict.pop("type") + classes = { + "MPNNNodeFunction": MPNNNodeFunction, + "GHConvDense": GHConvDense + } + conv_cls = classes[class_name] - def func(i): - dst_ind = indices_gathered[:, :, i] #(nbins, nelems) - dst_ind = tf.reshape(dst_ind, (nbins*nelems, )) - src_ind = tf.reshape(tf.stack(subindices), (nbins*nelems, )) - src_dst_inds = tf.cast(tf.transpose(tf.stack([src_ind, dst_ind])), dtype=tf.int64) - return src_dst_inds, top_k_vals[:, i] - - ret = tf.map_fn(func, tf.range(0, self.num_neighbors, delta=1, dtype=tf.int32), fn_output_signature=(tf.int64, subpoints.dtype)) - - shp = tf.shape(ret[0]) - inds = tf.reshape(ret[0], (shp[0]*shp[1], 2)) - vals = tf.reshape(ret[1], (shp[0]*shp[1],)) - return inds, vals - - def construct_sparse_dm_batch(self, points): - #points: (n_points, n_features) input elements for graph construction - n_points = tf.shape(points)[0] - n_features = tf.shape(points)[1] - - #compute the number of LSH bins to divide the input points into on the fly - #n_points must be divisible by bin_size exactly due to the use of reshape - n_bins = tf.math.floordiv(n_points, self.bin_size) - - #put each input item into a bin defined by the softmax output across the LSH embedding - mul = tf.linalg.matmul(points, self.codebook_random_rotations[:, :n_bins//2]) - cmul = tf.concat([mul, -mul], axis=-1) - - #cmul is now an integer in [0..nbins) for each input point - #bins_split: (n_bins, bin_size) of integer bin indices, which puts each input point into a bin of size (n_points/n_bins) - bins_split = split_indices_to_bins(cmul, n_bins, self.bin_size) - - #parts: (n_bins, bin_size, n_features), the input points divided up into bins - parts = tf.gather(points, bins_split) - - #sparse_distance_matrix: (n_points, n_points) sparse distance matrix - #where higher values (closer to 1) are associated with points that are closely related - sparse_distance_matrix = self.subpoints_to_sparse_matrix(bins_split, parts) - - return bins_split, sparse_distance_matrix + return conv_cls(**config_dict) class GraphBuilderDense(tf.keras.layers.Layer): def __init__(self, clip_value_low=0.0, distance_dim=128, max_num_bins=200, bin_size=128, dist_mult=0.1, **kwargs): @@ -428,7 +256,7 @@ def __init__(self, clip_value_low=0.0, distance_dim=128, max_num_bins=200, bin_s self.kernel = kwargs.pop("kernel") if self.kernel == "learnable": - self.ffn_dist = 
point_wise_feed_forward_network(1, 32, num_layers=2, activation="elu") + self.ffn_dist = point_wise_feed_forward_network(32, 32, num_layers=2, activation="elu") elif self.kernel == "gaussian": pass @@ -463,419 +291,19 @@ def call(self, x_dist, x_features, msk): msk_f_binned = tf.gather(msk_f, bins_split, batch_dims=1) if self.kernel == "learnable": - dm = tf.keras.activations.relu(pairwise_learnable_dist(x_dist_binned, x_dist_binned, self.ffn_dist)) + dm = pairwise_learnable_dist(x_dist_binned, x_dist_binned, self.ffn_dist) + dm = tf.keras.activations.elu(dm) elif self.kernel == "gaussian": dm = pairwise_gaussian_dist(x_dist_binned, x_dist_binned) - - dm = tf.exp(-self.dist_mult*dm) + dm = tf.exp(-self.dist_mult*dm) + dm = tf.clip_by_value(dm, self.clip_value_low, 1) - #set the distance matrix to 0 for masked elements - dm *= msk_f_binned - shp = tf.shape(msk_f_binned) - dm *= tf.reshape(msk_f_binned, (shp[0], shp[1], shp[3], shp[2])) - - dm = tf.clip_by_value(dm, self.clip_value_low, 1) + #multiply the distance matrix row-wise and column-wise by the mask + dm = tf.einsum("abijk,abi->abijk", dm, tf.squeeze(msk_f_binned, axis=-1)) + dm = tf.einsum("abijk,abj->abijk", dm, tf.squeeze(msk_f_binned, axis=-1)) return bins_split, x_features_binned, dm, msk_f_binned -class EncoderDecoderGNN(tf.keras.layers.Layer): - def __init__(self, encoders, decoders, dropout, activation, conv, **kwargs): - super(EncoderDecoderGNN, self).__init__(**kwargs) - name = kwargs.get("name") - - #assert(encoders[-1] == decoders[0]) - self.encoders = encoders - self.decoders = decoders - - self.encoding_layers = [] - for ilayer, nunits in enumerate(encoders): - self.encoding_layers.append( - tf.keras.layers.Dense(nunits, activation=activation, - kernel_regularizer=tf.keras.regularizers.L1(regularizer_weight), - bias_regularizer=tf.keras.regularizers.L1(regularizer_weight), - name="encoding_{}_{}".format(name, ilayer))) - if dropout > 0.0: - self.encoding_layers.append(tf.keras.layers.Dropout(dropout)) - - self.conv = conv - - self.decoding_layers = [] - for ilayer, nunits in enumerate(decoders): - self.decoding_layers.append( - tf.keras.layers.Dense(nunits, activation=activation, - kernel_regularizer=tf.keras.regularizers.L1(regularizer_weight), - bias_regularizer=tf.keras.regularizers.L1(regularizer_weight), - name="decoding_{}_{}".format(name, ilayer))) - if dropout > 0.0: - self.decoding_layers.append(tf.keras.layers.Dropout(dropout)) - - @tf.function - def call(self, inputs, distance_matrix, training=True): - x = inputs - - for layer in self.encoding_layers: - x = layer(x) - - for convlayer in self.conv: - x = convlayer([x, distance_matrix]) - - for layer in self.decoding_layers: - x = layer(x) - - return x - -class AddSparse(tf.keras.layers.Layer): - def __init__(self, **kwargs): - super(AddSparse, self).__init__(**kwargs) - - def call(self, matrices): - ret = matrices[0] - for mat in matrices[1:]: - ret = tf.sparse.add(ret, mat) - return ret - -#Simple message passing based on a matrix multiplication -class PFNet(tf.keras.Model): - def __init__(self, - multi_output=False, - num_input_classes=8, - num_output_classes=3, - num_momentum_outputs=3, - activation=tf.nn.selu, - hidden_dim_id=256, - hidden_dim_reg=256, - distance_dim=256, - convlayer="ghconv", - dropout=0.1, - bin_size=10, - num_convs_id=1, - num_convs_reg=1, - num_hidden_id_enc=1, - num_hidden_id_dec=1, - num_hidden_reg_enc=1, - num_hidden_reg_dec=1, - num_neighbors=5, - dist_mult=0.1, - skip_connection=False, - return_matrix=False): - - super(PFNet, 
self).__init__() - self.activation = activation - self.num_dists = 1 - self.num_momentum_outputs = num_momentum_outputs - self.skip_connection = skip_connection - self.multi_output = multi_output - self.return_matrix = return_matrix - - encoding_id = [] - decoding_id = [] - encoding_reg = [] - decoding_reg = [] - - #the encoder outputs and decoder inputs have to have the hidden dim (convlayer size) - for ihidden in range(num_hidden_id_enc): - encoding_id.append(hidden_dim_id) - - for ihidden in range(num_hidden_id_dec): - decoding_id.append(hidden_dim_id) - - for ihidden in range(num_hidden_reg_enc): - encoding_reg.append(hidden_dim_reg) - - for ihidden in range(num_hidden_reg_dec): - decoding_reg.append(hidden_dim_reg) - - self.enc = InputEncoding(num_input_classes) - #self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6) - - self.dist = SparseHashedNNDistance(distance_dim=distance_dim, bin_size=bin_size, num_neighbors=num_neighbors, dist_mult=dist_mult) - - convs_id = [] - convs_reg = [] - if convlayer == "sgconv": - for iconv in range(num_convs_id): - convs_id.append(SGConv(k=1, activation=activation, name="conv_id{}".format(iconv))) - for iconv in range(num_convs_reg): - convs_reg.append(SGConv(k=1, activation=activation, name="conv_reg{}".format(iconv))) - elif convlayer == "ghconv": - for iconv in range(num_convs_id): - convs_id.append(GHConv(activation=activation, name="conv_id{}".format(iconv))) - for iconv in range(num_convs_reg): - convs_reg.append(GHConv(activation=activation, name="conv_reg{}".format(iconv))) - - self.gnn_id = EncoderDecoderGNN(encoding_id, decoding_id, dropout, activation, convs_id, name="gnn_id") - self.layer_id = point_wise_feed_forward_network(num_output_classes, hidden_dim_id, num_layers=3, activation=activation) - self.layer_charge = point_wise_feed_forward_network(1, hidden_dim_id, num_layers=3, activation=activation) - - self.gnn_reg = EncoderDecoderGNN(encoding_reg, decoding_reg, dropout, activation, convs_reg, name="gnn_reg") - self.layer_momentum = point_wise_feed_forward_network(num_momentum_outputs, hidden_dim_reg, num_layers=3, activation=activation) - - # def create_model(self, num_max_elems, num_input_features, training=True): - # inputs = tf.keras.Input(shape=(num_max_elems, num_input_features,)) - # return tf.keras.Model(inputs=[inputs], outputs=self.call(inputs, training), name="MLPFNet") - - def call(self, inputs, training=True): - X = inputs - msk_input = tf.expand_dims(tf.cast(X[:, :, 0] != 0, tf.dtypes.float32), -1) - - enc = self.enc(inputs) - - #create a graph structure from the encoded nodes - dm, bins = self.dist(enc, training) - - #run graph net for multiclass id prediction - x_id = self.gnn_id(enc, dm, training) - - if self.skip_connection: - to_decode = tf.concat([enc, x_id], axis=-1) - else: - to_decode = tf.concat([x_id], axis=-1) - - out_id_logits = self.layer_id(to_decode)*msk_input - out_charge = self.layer_charge(to_decode)*msk_input - - #run graph net for regression output prediction, taking as an additonal input the ID predictions - x_reg = self.gnn_reg(tf.concat([enc, tf.cast(out_id_logits, X.dtype)], axis=-1), dm, training) - - if self.skip_connection: - to_decode = tf.concat([enc, tf.cast(out_id_logits, X.dtype), x_reg], axis=-1) - else: - to_decode = tf.concat([tf.cast(out_id_logits, X.dtype), x_reg], axis=-1) - - pred_momentum = self.layer_momentum(to_decode)*msk_input - - out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) - out_charge = tf.clip_by_value(out_charge, -2, 2) - - if 
self.multi_output: - ret = { - "cls": out_id_softmax, - "charge": out_charge, - "pt": tf.exp(tf.clip_by_value(pred_momentum[:, :, 0:1], -4, 4)), - "eta": pred_momentum[:, :, 1:2], - "sin_phi": pred_momentum[:, :, 2:3], - "cos_phi": pred_momentum[:, :, 3:4], - "energy": tf.exp(tf.clip_by_value(pred_momentum[:, :, 4:5], -5, 6)) - } - if self.return_matrix: - ret["dm"] = dm - ret["bins"] = bins - return ret - else: - return tf.concat([out_id_softmax, out_charge, pred_momentum], axis=-1) - - def set_trainable_classification(self): - for layer in self.layers: - layer.trainable = False - self.gnn_id.trainable = True - self.layer_id.trainable = True - - def set_trainable_regression(self): - for layer in self.layers: - layer.trainable = False - self.gnn_reg.trainable = True - self.layer_momentum.trainable = True - - - -#Transformer code from the TF example -class EncoderLayer(tf.keras.layers.Layer): - def __init__(self, d_model, num_heads, dff, rate=0.1, support=8, dtype=tf.dtypes.float32): - super(EncoderLayer, self).__init__() - - self.mha = SelfAttention(d_model, num_heads, rate, projection_matrix_type=True, nb_random_features=support) - self.ffn = point_wise_feed_forward_network(d_model, dff) - - self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - - self.dropout1 = tf.keras.layers.Dropout(rate) - self.dropout2 = tf.keras.layers.Dropout(rate) - - def call(self, x, training): - - attn_output = self.mha(x, None, training=training) # (batch_size, input_seq_len, d_model) - attn_output = self.dropout1(attn_output, training=training) - out1 = self.layernorm1(x + attn_output) # (batch_size, input_seq_len, d_model) - - ffn_output = self.ffn(out1) # (batch_size, input_seq_len, d_model) - ffn_output = self.dropout2(ffn_output, training=training) - out2 = self.layernorm2(out1 + ffn_output) # (batch_size, input_seq_len, d_model) - - return out2 - -class DecoderLayer(tf.keras.layers.Layer): - def __init__(self, d_model, num_heads, dff, rate=0.1, support=8, dtype=tf.dtypes.float32): - super(DecoderLayer, self).__init__() - - self.mha1 = SelfAttention(d_model, num_heads, rate, projection_matrix_type=True, nb_random_features=support) - self.mha2 = Attention(d_model, num_heads, rate, projection_matrix_type=True, nb_random_features=support) - self.ffn = point_wise_feed_forward_network(d_model, dff) - - self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - - self.dropout1 = tf.keras.layers.Dropout(rate) - self.dropout2 = tf.keras.layers.Dropout(rate) - self.dropout3 = tf.keras.layers.Dropout(rate) - - - def call(self, x, enc_output, training): - # enc_output.shape == (batch_size, input_seq_len, d_model) - - attn1 = self.mha1(x, None, training=training) # (batch_size, target_seq_len, d_model) - attn1 = self.dropout1(attn1, training=training) - out1 = self.layernorm1(attn1 + x) - - attn2 = self.mha2(enc_output, out1, None, training=training) # (batch_size, target_seq_len, d_model) - attn2 = self.dropout2(attn2, training=training) - out2 = self.layernorm2(attn2 + out1) # (batch_size, target_seq_len, d_model) - - ffn_output = self.ffn(out2) # (batch_size, target_seq_len, d_model) - ffn_output = self.dropout3(ffn_output, training=training) - out3 = self.layernorm3(ffn_output + out2) # (batch_size, target_seq_len, d_model) - - return out3 - -class Encoder(tf.keras.layers.Layer): - def 
__init__(self, num_layers, d_model, num_heads, dff, support=32, rate=0.1, dtype=tf.dtypes.float32): - super(Encoder, self).__init__() - - self.d_model = d_model - self.num_layers = num_layers - - self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate, support=support, dtype=dtype) - for _ in range(num_layers)] - - self.dropout = tf.keras.layers.Dropout(rate) - - def call(self, x, training): - - for i in range(self.num_layers): - x = self.enc_layers[i](x, training) - - x = self.dropout(x, training=training) - return x # (batch_size, input_seq_len, d_model) - -class Decoder(tf.keras.layers.Layer): - def __init__(self, num_layers, d_model, num_heads, dff, support=32, rate=0.1, dtype=tf.dtypes.float32): - super(Decoder, self).__init__() - - self.d_model = d_model - self.num_layers = num_layers - - self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate, support=support, dtype=dtype) - for _ in range(num_layers)] - self.dropout = tf.keras.layers.Dropout(rate) - - def call(self, x, enc_output, training): - - for i in range(self.num_layers): - x = self.dec_layers[i](x, enc_output, training) - - x = self.dropout(x, training=training) - - # x.shape == (batch_size, target_seq_len, d_model) - return x - - - -class Transformer(tf.keras.Model): - def __init__(self, - num_layers, d_model, num_heads, dff, - dropout=0.1, - support=32, - num_input_classes=8, - num_output_classes=3, - num_momentum_outputs=3, - dtype=tf.dtypes.float32, - skip_connection=False, - multi_output=False): - super(Transformer, self).__init__() - - self.skip_connection = skip_connection - self.multi_output = multi_output - self.num_momentum_outputs = num_momentum_outputs - - self.enc = InputEncoding(num_input_classes) - self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - - self.ffn_embed_id = point_wise_feed_forward_network(d_model, dff) - self.ffn_embed_reg = point_wise_feed_forward_network(d_model, dff) - - self.encoder_id = Encoder(num_layers, d_model, num_heads, dff, support, dropout, dtype) - self.decoder_id = Decoder(num_layers, d_model, num_heads, dff, support, dropout, dtype) - - self.encoder_reg = Encoder(num_layers, d_model, num_heads, dff, support, dropout, dtype) - self.decoder_reg = Decoder(num_layers, d_model, num_heads, dff, support, dropout, dtype) - - self.ffn_id = point_wise_feed_forward_network(num_output_classes, dff, dtype=tf.dtypes.float32) - self.ffn_charge = point_wise_feed_forward_network(1, dff, dtype=tf.dtypes.float32) - self.ffn_momentum = point_wise_feed_forward_network(num_momentum_outputs, dff, dtype=tf.dtypes.float32) - - def call(self, inputs, training): - X = inputs - msk_input = tf.expand_dims(tf.cast(X[:, :, 0] != 0, inputs.dtype), -1) - - enc = self.enc(X) - enc = self.layernorm1(enc) - - enc_id = self.ffn_embed_id(enc) - enc_reg = self.ffn_embed_reg(enc) - - enc_output_id = self.encoder_id(enc_id, training) - enc_output_id = self.layernorm2(enc_output_id) - dec_output_id = self.decoder_id(enc_id, enc_output_id, training) - - if self.skip_connection: - dec_output_id = tf.concat([enc_id, dec_output_id], axis=-1) - - enc_output_reg = self.encoder_reg(enc_reg, training) - enc_output_reg = self.layernorm3(enc_output_reg) - dec_output_reg = self.decoder_reg(enc_reg, enc_output_reg, training) - - out_id_logits = self.ffn_id(dec_output_id)*msk_input - out_charge = self.ffn_charge(dec_output_id)*msk_input - - if self.skip_connection: - 
dec_output_reg = tf.concat([enc_reg, tf.cast(out_id_logits, X.dtype), dec_output_reg], axis=-1) - else: - dec_output_reg = tf.concat([tf.cast(out_id_logits, X.dtype), dec_output_reg], axis=-1) - pred_momentum = self.ffn_momentum(dec_output_reg)*msk_input - - out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) - out_charge = tf.clip_by_value(out_charge, -2, 2) - if self.multi_output: - return { - "cls": out_id_softmax, "charge": out_charge, - "pt": pred_momentum[:, :, 0:1], - "eta": pred_momentum[:, :, 1:2], - "sin_phi": pred_momentum[:, :, 2:3], - "cos_phi": pred_momentum[:, :, 3:4], - "energy": pred_momentum[:, :, 4:5], - } - else: - return tf.concat([out_id_softmax, out_charge, pred_momentum], axis=-1) - - def set_trainable_classification(self): - for layer in self.layers: - layer.trainable = True - self.encoder_reg.trainable = False - self.decoder_reg.trainable = False - self.ffn_momentum.trainable = False - - def set_trainable_regression(self): - for layer in self.layers: - layer.trainable = False - self.encoder_id.trainable = False - self.decoder_id.trainable = False - self.ffn_id.trainable = False - self.ffn_charge.trainable = False - - class CombinedGraphLayer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): @@ -892,17 +320,15 @@ def __init__(self, *args, **kwargs): self.normalize_degrees = kwargs.pop("normalize_degrees") self.dropout = kwargs.pop("dropout") self.kernel = kwargs.pop("kernel") + self.conv_config = kwargs.pop("conv_config") if self.do_layernorm: self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6) self.ffn_dist = point_wise_feed_forward_network(self.distance_dim, self.distance_dim) self.dist = GraphBuilderDense(clip_value_low=self.clip_value_low, distance_dim=self.distance_dim, max_num_bins=self.max_num_bins , bin_size=self.bin_size, dist_mult=self.dist_mult, kernel=self.kernel) - self.convs = [GHConvDense( - activation=tf.keras.activations.elu, - output_dim=self.output_dim, - normalize_degrees=self.normalize_degrees - ) for iconv in range(self.num_conv) + self.convs = [ + get_conv_layer(self.conv_config) for iconv in range(self.num_conv) ] self.dropout_layer = None if self.dropout: @@ -950,6 +376,7 @@ def __init__(self, graph_kernel="gaussian", skip_connection=False, regression_use_classification=True, + conv_config={"type": "GHConvDense", "activation": "elu", "output_dim": 128, "normalize_degrees": True}, debug=False ): super(PFNetDense, self).__init__() @@ -989,7 +416,8 @@ def __init__(self, "num_conv": num_conv, "normalize_degrees": normalize_degrees, "dropout": dropout, - "kernel": graph_kernel + "kernel": graph_kernel, + "conv_config": conv_config } self.cg_id = [CombinedGraphLayer(**kwargs_cg) for i in range(num_gsl)] self.cg_reg = [CombinedGraphLayer(**kwargs_cg) for i in range(num_gsl)] @@ -1048,6 +476,8 @@ def call(self, inputs, training=False): dec_input_cls.append(graph_sum) dec_output_id = tf.concat(dec_input_cls, axis=-1)*msk_input + if self.debug: + debugging_data["dec_output_id"] = dec_output_id out_id_logits = self.ffn_id(dec_output_id)*msk_input @@ -1070,6 +500,8 @@ def call(self, inputs, training=False): dec_input_reg.append(graph_sum) dec_output_reg = tf.concat(dec_input_reg, axis=-1)*msk_input + if self.debug: + debugging_data["dec_output_reg"] = dec_output_reg if self.separate_momentum: pred_momentum = [ffn(dec_output_reg) for ffn in self.ffn_momentum] diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 9aaaaceb9..c2a5fe558 100644 --- a/mlpf/tfmodel/model_setup.py +++ 
b/mlpf/tfmodel/model_setup.py @@ -1,4 +1,4 @@ -from .model import PFNet, Transformer, DummyNet, PFNetDense +from .model import DummyNet, PFNetDense import tensorflow as tf import tensorflow_probability @@ -186,7 +186,7 @@ def plot_reg_distribution(self, outpath, ypred, ypred_id, msk, icls, reg_variabl plt.savefig(str(outpath / "{}_cls{}.png".format(reg_variable, icls)), bbox_inches="tight") plt.close("all") - def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable): + def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False): if icls==0: sel = self.ytrue_id[msk]!=icls @@ -196,16 +196,24 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable): vals_pred = ypred[reg_variable][msk][sel].flatten() vals_true = self.ytrue[reg_variable][msk][sel].flatten() + s = "" + if log: + vals_pred = np.log(vals_pred) + vals_true = np.log(vals_true) + s = "_log" + plt.scatter(vals_pred, vals_true, marker=".", alpha=0.8) if len(vals_true) > 0: minval = np.min(vals_true) maxval = np.max(vals_true) plt.plot([minval, maxval], [minval, maxval], color="black", ls="--") + plt.xlim(minval, maxval) + plt.ylim(minval, maxval) plt.xlabel("predicted") plt.ylabel("true") plt.title(reg_variable) - plt.savefig(str(outpath / "{}_cls{}_corr.png".format(reg_variable, icls)), bbox_inches="tight") + plt.savefig(str(outpath / "{}_cls{}_corr{}.png".format(reg_variable, icls, s)), bbox_inches="tight") plt.close("all") def on_epoch_end(self, epoch, logs=None): @@ -240,6 +248,7 @@ def on_epoch_end(self, epoch, logs=None): for variable in ["pt", "eta", "sin_phi", "cos_phi", "energy"]: self.plot_reg_distribution(cp_dir_cls, ypred, ypred_id, msk, icls, variable) self.plot_corr(cp_dir_cls, ypred, ypred_id, msk, icls, variable) + self.plot_corr(cp_dir_cls, ypred, ypred_id, msk, icls, "energy", log=True) np.savez(str(cp_dir/"pred.npz"), X=self.X, ytrue=self.y, **ypred) @@ -309,38 +318,6 @@ def make_model(config, dtype): return make_gnn_dense(config, dtype) raise KeyError("Unknown model type {}".format(model)) -def make_gnn(config, dtype): - activation = getattr(tf.nn, config['parameters']['activation']) - - parameters = [ - 'bin_size', - 'num_convs_id', - 'num_convs_reg', - 'num_hidden_id_enc', - 'num_hidden_id_dec', - 'num_hidden_reg_enc', - 'num_hidden_reg_dec', - 'num_neighbors', - 'hidden_dim_id', - 'hidden_dim_reg', - 'dist_mult', - 'distance_dim', - 'dropout', - 'skip_connection' - ] - kwargs = {par: config['parameters'][par] for par in parameters} - - model = PFNet( - multi_output=config["setup"]["multi_output"], - num_input_classes=config["dataset"]["num_input_classes"], - num_output_classes=config["dataset"]["num_output_classes"], - num_momentum_outputs=config["dataset"]["num_momentum_outputs"], - activation=activation, - **kwargs - ) - - return model - def make_gnn_dense(config, dtype): parameters = [ @@ -358,6 +335,7 @@ def make_gnn_dense(config, dtype): "graph_kernel", "skip_connection", "regression_use_classification", + "conv_config", "debug" ] @@ -373,22 +351,6 @@ def make_gnn_dense(config, dtype): return model -def make_transformer(config, dtype): - parameters = [ - 'num_layers', 'd_model', 'num_heads', 'dff', 'support', 'dropout' - ] - kwargs = {par: config['parameters'][par] for par in parameters} - - model = Transformer( - multi_output=config["setup"]["multi_output"], - num_input_classes=config["dataset"]["num_input_classes"], - num_output_classes=config["dataset"]["num_output_classes"], - num_momentum_outputs=config["dataset"]["num_momentum_outputs"], - 
dtype=dtype, - **kwargs - ) - return model - def make_dense(config, dtype): model = DummyNet( num_input_classes=config["dataset"]["num_input_classes"], diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 3e390eb2e..946f72099 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -209,7 +209,7 @@ def get_dataset_def(config): ) -def get_train_val_datasets(config, global_batch_size, n_train, n_test): +def get_train_val_datasets(config, global_batch_size, n_train, n_test, repeat=True): dataset_def = get_dataset_def(config) tfr_files = sorted(glob.glob(dataset_def.processed_path)) @@ -255,11 +255,12 @@ def get_train_val_datasets(config, global_batch_size, n_train, n_test): else: dataset_transform = None - ds_train_r = ds_train.repeat(config["setup"]["num_epochs"]) - ds_test_r = ds_test.repeat(config["setup"]["num_epochs"]) - - return ds_train_r, ds_test_r, dataset_transform - + if repeat: + ds_train_r = ds_train.repeat(config["setup"]["num_epochs"]) + ds_test_r = ds_test.repeat(config["setup"]["num_epochs"]) + return ds_train_r, ds_test_r, dataset_transform + else: + return ds_train, ds_test, dataset_transform def prepare_val_data(config, dataset_def, single_file=False): if single_file: diff --git a/notebooks/pfnet-debug.ipynb b/notebooks/pfnet-debug.ipynb index b0a64a127..6378de620 100644 --- a/notebooks/pfnet-debug.ipynb +++ b/notebooks/pfnet-debug.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -29,18 +29,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":2: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
Please read https://msg.pyyaml.org/load for full details.\n", - " config = yaml.load(f)\n" - ] - } - ], + "outputs": [], "source": [ "with open(\"/home/joosep/particleflow/parameters/cms-gnn-dense-dev.yaml\") as f:\n", " config = yaml.load(f)\n", @@ -50,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -59,18 +50,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "raw files: 0\n", - "val files: 1998\n" - ] - } - ], + "outputs": [], "source": [ "cds = config[\"dataset\"]\n", "\n", @@ -90,26 +72,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8201_0.pkl.bz2\n", - "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8202_0.pkl.bz2\n", - "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8203_0.pkl.bz2\n", - "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8204_0.pkl.bz2\n", - "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8205_0.pkl.bz2\n", - "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8206_0.pkl.bz2\n", - "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8207_0.pkl.bz2\n", - "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8208_0.pkl.bz2\n", - "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_8209_0.pkl.bz2\n", - "data/TTbar_14TeV_TuneCUETP8M1_cfi/val/pfntuple_820_0.pkl.bz2\n" - ] - } - ], + "outputs": [], "source": [ "Xs = []\n", "ygens = []\n", @@ -132,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -141,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -150,32 +115,9 @@ }, { "cell_type": "code", - "execution_count": 91, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 91, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYIAAAD4CAYAAADhNOGaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAhrUlEQVR4nO3df4zc9X3n8ecriyGbVqkNuAQWjI3iOknrk92bg+gsJYQCJskJu5SC6aGYHpEvuaRVEsXKWkRtjiZiU6SQ3gVdYlES8uMCCU2crZzIBxguEhenrM8OBiKD+ZHghQQXcKQTLjXmfX/Md+Dr2Znd2fl+vzPzne/rIa125vtj5u3vjr/v+fxWRGBmZtX1hn4HYGZm/eVEYGZWcU4EZmYV50RgZlZxTgRmZhV3Qr8D6Mapp54aS5cu7XcYZmalsnv37n+OiMXN20uZCJYuXcrU1FS/wzAzKxVJv2i13VVDZmYV50RgZlZxTgRmZhXnRGBmVnFOBGZmFZdLryFJtwL/AXguIv6gxX4Bfwe8D3gJuCYi/m+ybyPw6eTQz0bEbXnElKdte6a5ccd+njl8hDMWjrJ57QrWrx7rd1hmNuQa957pw0dm7Htq4v25vU9e3Ue/BnwJ+Hqb/e8Flic/5wH/AzhP0snAXwM1IIDdkiYj4sWc4sps255ptnxvH0eOHgNg+vARtnxvH4CTgVkGZf+ClWf86dda+KYFRMDhI0dnPWfp+PbckkEuVUMR8WPghVkOWQd8Pep2AQslnQ6sBe6KiBeSm/9dwCV5xJSXG3fsfy0JNBw5eowbd+zvU0Rm5df4gjV9+AjB61+wtu2Z7ndoHckz/ubXevGlo3Mmgbz1qo1gDHg69fxgsq3d9hkkbZI0JWnq0KFDhQXa7JkWRbLZtpvZ3Mr+BSvP+Fu9Vq+VprE4IrZGRC0iaosXzxghXZgzFo7Oa7uZza3sX7DyjH8Q/s29SgTTwFmp52cm29ptL8S2PdOsmdjJsvHtrJnY2VExbvPaFYwuGDlu2+iCETavXVFUmGZDr+xfsDqJv9P7zSD8m3uVCCaBD6juncBvIuJZYAdwsaRFkhYBFyfbctdtnd761WPccNlKxhaOImBs4Sg3XLayVI1aZoOm7F+w5op/PvebVj2Cei2v7qPfBs4HTpV0kHpPoAUAEfFl4IfUu44eoN599M+TfS9I+hvggeSlro+I2RqduzZbnd5cN/X1q8d84zfLUeP/Uy97DeXZy2eu+Du53ywd357hXwMLRxdkOj8tl0QQEVfNsT+Aj7TZdytwax5xzKbsdZI2mMreBbKfevkFq4hu4LPFP9f9JmsSeAPwmUt/P9NrNL9eJZS9TtIGT9m7QFZJr3sptbuvBNmTgARfuHJVrkm0Momg33WS3TRU22AraxfIIj+Lg/o573WNQKv7TR5GF4xw0xX5JgEo6cI03ehHnWSDRycPpzJWNxb5WRzkz/kZC0dbNsrmVSPQqorwhstWtp0eYj7GFo4Wfs+qTCKA/jX6ZmmotsFV9M2lCEV+Fgf5c7557YrjkhTkVyPQLgHecNnKTEkgz7mE5lKZqqF+KuM3R5tbv6sbu1HkZ3GQP+dFdgNvlwA/dsferl+zl0kAKlYi6JcyfnMs2jD0tulndWO3ivwsDvrnvKgagTwTXa8TQINLBD1Qxm+ORRqm3jbrV49x//gFPDnxfu4fv2CgkwAU+1ms6uc8r0TXryQALhH0RBm/ORZpkOuSh12Rn8Wqfs6zNgb3MwE0qD7Wq1xqtVpMTU31Owzr0rLx7bT61Al4cgD+U5h1KuuYgF4nAUm7I6LWvN0lAuu5Qa9LNptL2RLAXNxGYD1X1bpkGw7DlgTAJQLrg6rWJVu5DWMCaHAisL7wjK5WJlmSwCAngAYnAjOzNoa5FJDmRFABwzB4y6zXsiaB0QUjbNszXYr/a04EQ26QJwIzG0RZE0BDmcbG5NJrSNIlkvZLOiBpvMX+myTtTX4elXQ4te9Yat9kHvHY68o6VbJZP+SVBBoGYZ6lTmQuEUgaAW4GLgIOAg9ImoyIRxrHRMTHU8f/BbA69RJHImJV1jistUGeCMxsUGRNAGMlHxuTR9XQucCBiHgCQNLtwDrgkTbHX0V9TWPrAQ/eMmsvr8bg5ipYKNfYmDyqhsaAp1PPDybbZpB0NrAM2Jna/EZJU5J2SVrf7k0kbUqOmzp06FAOYVeDB2+ZtZZnj6Aip7nuhV43Fm8A7oyIdKX12RExLekcYKekfRHxePOJEbEV2Ar1uYZ6E275efCW2fEyrxlMvUTd3COozGNj8kgE08BZqednJtta2QB8JL0hIqaT309Iuo96+8GMRGDdK/MH1CxPeTQGp6dOh+HofZdH1dADwHJJyySdSP1mP6P3j6S3AYuAn6S2LZJ0UvL4VGAN7dsWzMy6snR8e6YkMNaiTW2Yet9lTgQR8QrwUWAH8HPgOxHxsKTrJV2aOnQDcHscP+/124EpST8D7gUm0r2NzMyyypIAlv/ub/HUxPuHvvddLm0EEfFD4IdN2/6q6flnWpz3f4CVecRgZpaWZ2PwsPe+8zTUZjZ0sk4S1zxH0LD3vvMUE2Y2NIqaJG7Ye985EZjZUCh6quhh7n3nRGBmpVaVqaKL5ERgZqU17AvG9IoTgVmJeG2JOpcC8uVEYFYSXlvCCaAo7j5qVhJVX1vCSaA4LhGYlcSwj25txwmgeC4RmJVEu1GswzK6tRUngd5wicCsJDavXTGQi58U0YDtBNBbTgRmJTGIo1uLaMB2l9DecyKoKHdDLKdBG906WwP2fON0KaB/nAgqyN0QLS95NWC7FNBfTgQVlOe3OKu2bqdnbpRIW507H90mAZeIj+dEUEFV7YZo+eumAbu5RNqNLKUAl4hnyqX7qKRLJO2XdEDSeIv910g6JGlv8vPB1L6Nkh5LfjbmEY/NrordEK0Y61ePccNlKxlbOIqoL+l4w2UrZ72hfuyOvX1LAuCBea1kLhFIGgFuBi4CDgIPSJpsseTkHRHx0aZzTwb+GqhRXxN6d3Lui1njsvYGtRuilVOnDdif3raPb+76Zdfvk1dbgEvEM+VRNXQucCAingCQdDuwjs4WoV8L3BURLyTn3gVcAnw7h7isjUHshmjDbZB6BA37spPdyCMRjAFPp54fBM5rcdyfSHoX8Cjw8Yh4us25Le9GkjYBmwCWLFmSQ9jVNmjdEG04DVICaHCJeKZeTTHxj8DSiPg3wF3AbfN9gYjYGhG1iKgtXrw49wDNLF+DmASgu3aNYZdHiWAaOCv1/Mxk22si4vnU01uAv02de37TufflEJOZ9cmgJoA0l4iPl0eJ4AFguaRlkk4ENgCT6QMknZ56einw8+TxDuBiSYskLQIuTraZWQmVIQnYTJlLBBHxiqSPUr+BjwC3RsTDkq4HpiJiEvhLSZcCrwAvANck574g6W+oJxOA6xsNx2ZWvLwGVuWdADzgq7cUEf2OYd5qtVpMTU31OwyzUms1sGt0wci868vznh4ir7jaqXKSkbQ7ImrN2z2y2KxE8ryJZZ1qpKhqoCKnQPGo4t
acCMxKIu+bWJaBVUVOElfkgC/Ps9WaVygzK4m8p0boZqqRpePbC58ptMgpUDyquDUnArOSyOsmtm3PNGsmdjJ9+Ahq2jfbwKqsCaDTHkGb165gdMFIx3HNh+fZas1VQ2YlkcfUCM3VSwEo+T3Wps2h111Ci5wCxaOKW3MiMCuJPG5iraqXGkng/vELjtvezzEBRQ348jxbrTkRmJVEHjexTquXhnlgmEcVz+REYFYiWW9ic1UvDXMCsPbcWGxWIbM1xDoJVJdLBGYV0qp6afrwET52x96uX9MJoPycCMwqJl29lKUU8OaTRnjwv16SV1jWR04EZhXkaiBLcyKYRZUnp7Lhkv4sZ5lm0glgODkRtOHJqWxYtJrNsxtOAsPLiaANT05lw6LVZ3k+nACGXy7dRyVdImm/pAOSxlvs/4SkRyQ9KOkeSWen9h2TtDf5mWw+t188OZUNg6Xj21uOG+iUk0A1ZC4RSBoBbgYuAg4CD0iajIhHUoftAWoR8ZKkD1Nfs/jKZN+RiFiVNY685TGvi1k/FT1LqA2PPKqGzgUORMQTAJJuB9YBryWCiLg3dfwu4Ooc3rdQnpzKyqosPYLcGWNw5JEIxoCnU88PAufNcvy1wI9Sz98oaYr6esYTEbGt1UmSNgGbAJYsWZIl3o54ciorm7IkAHBnjEHT08ZiSVcDNeDdqc1nR8S0pHOAnZL2RcTjzedGxFZgK9TXLO5FvJ6cysqiTEkA3Blj0OSRCKaBs1LPz0y2HUfShcB1wLsj4uXG9oiYTn4/Iek+YDUwIxGY2UxlSwAN7owxWPLoNfQAsFzSMkknAhuA43r/SFoNfAW4NCKeS21fJOmk5PGpwBpSbQtm1l5ZkwB4pbBBk7lEEBGvSPoosAMYAW6NiIclXQ9MRcQkcCPw28B3JQH8MiIuBd4OfEXSq9ST0kRTbyMza1LmBNDgzhiDRRE9qW7PVa1Wi6mpqX6HYdZzWZLA8t/9Le76xPn5BZORew31nqTdEVFr3u6RxWYlMAylgGbujDE4nAjMBpwHhlnRnAjMBtQwlgJsMDkRmA0glwKsl5wIzAaISwHWD04EZgPCpQDrFycCsz5zKcD6zYnArE/eumU7r2QYxuMEYHlxIjDrA5cCbJA4EZj1UNYE8MUrV3kQluXOicCsR1wKsEHlRGBWMCcAG3S5LF5vZq1lSQJXv3OJk4D1hEsEZgVwKcDKxCUCs5xlSQKjC0b44pWr8gvGrAMuEZjlJGspYMxz8luf5JIIJF0C/B31FcpuiYiJpv0nAV8H/i3wPHBlRDyV7NsCXAscA/4yInbkEZNZL3l6CCuzzIlA0ghwM3ARcBB4QNJk05KT1wIvRsRbJW0APg9cKekd1Nc4/n3gDOBuSb8XEccwKwG3BdgwyKNEcC5wICKeAJB0O7CO4xehXwd8Jnl8J/Al1RcvXgfcHhEvA09KOpC83k9yiMusUC4F2LDIIxGMAU+nnh8Ezmt3TLLY/W+AU5Ltu5rObVlBKmkTsAlgyZIlOYRt1h2XAmzYlKaxOCK2Aluhvnh9n8OxCnICsGGVR/fRaeCs1PMzk20tj5F0AvA71BuNOznXrO+cBGyY5VEieABYLmkZ9Zv4BuDPmo6ZBDZSr/u/HNgZESFpEvifkr5AvbF4OfBPOcRklgsnAKuCzIkgqfP/KLCDevfRWyPiYUnXA1MRMQn8PfCNpDH4BerJguS471BvWH4F+Ih7DNmgcBKwqlBE+arba7VaTE1N9TsMG1JOADasJO2OiFrz9tI0Fpv1gruEWhU5EZjhUoBVmxOBVZ5LAVZ1TgRWWS4FmNU5EVgluRRg9jonAqsUlwLMZnIisEr49LZ9fHPXL7s+3wnAhpkTgQ09lwLMZudEYEPLCcCsM04ENvC27Znmxh37eebwEc7ocDlHJwGzzjkR2EDbtmeaLd/bx5Gj9Smopg8fYcv39gG0TAZOAGbzl8c01GaFuXHH/teSQMORo8e4ccf+GcdmSQLCScCqyyUCG2jPHD4y53aXAsyycSKwgXbGwlGmWySDMxaOAh4YZpYHVw3ZQNu8dgWjC0aO2za6YITpw0ecBMxy4hKBDbRGg3C611CrEkKnnADMZspUIpB0sqS7JD2W/F7U4phVkn4i6WFJD0q6MrXva5KelLQ3+VmVJR4bTutXj3H/+AUEOAmYFSBriWAcuCciJiSNJ88/1XTMS8AHIuIxSWcAuyXtiIjDyf7NEXFnxjhsyLkayKw4WRPBOuD85PFtwH00JYKIeDT1+BlJzwGLgcMZ39sqwD2CzIqXNRGcFhHPJo9/BZw228GSzgVOBB5Pbf6cpL8C7gHGI+LlNuduAjYBLFmyJGPYNujedt0P+Zdj3a+n7QRg1rk5E4Gku4G3tNh1XfpJRISktv9zJZ0OfAPYGBGvJpu3UE8gJwJbqZcmrm91fkRsTY6hVqt1f4fok26mSagqlwLMemvORBARF7bbJ+nXkk6PiGeTG/1zbY57M7AduC4idqVeu1GaeFnSV4FPziv6kpjvNAlV5QRg1h9ZxxFMAhuTxxuBHzQfIOlE4PvA15sbhZPkgSQB64GHMsYzkOYzTUJVOQmY9U/WNoIJ4DuSrgV+AVwBIKkGfCgiPphsexdwiqRrkvOuiYi9wLckLaY+1cte4EMZ4xlInUyTUFVOAGb9lykRRMTzwB+12D4FfDB5/E3gm23OvyDL+5fFXNMkVFXWJCBgzcROt7eYZeQpJnqg3TQJm9eu6FNE/bV0fHumJNC4lo0BZlu+t49te6Zzis6sepwIemD96jFuuGwlYwtHETC2cJQbLltZyW+xWQeGjS0cdXuLWc4811CPrF89Vskbf0NebQFubzHLnxOBFS7P6SHc3mKWP1cNWWGytgW06hHk9haz/LlEUHFFjXjOkgDGFo5y/3jrDmWtpqV2ryGzbJwIKqyIEc9Z2wI6+XZf9fYWs7w5EVTYbCOe53ujfeuW7bySYQYogb/dm/WJE0GF5dUDx6ODzcrNiaDCsvbAcQIwGw7uNVRhWXrgOAmYDY9KlwiqvkZANz1wnADMhk9lE4HXCKibTw+cLEngBMGBG5wEzAZRZRNBnj1mhp1LAfNX9dKmlUtlE4HnrOlM3iODq8ClTSubTI3Fkk6WdJekx5Lfi9ocd0zS3uRnMrV9maSfSjog6Y5kNbOeaNczxnPW1BUxPURVeEU6K5usvYbGgXsiYjlwT/K8lSMRsSr5uTS1/fPATRHxVuBF4NqM8XTMc9a0lzUBVDkJgEubVj5ZE8E64Lbk8W3U1x3uSLJO8QVAYx3jeZ2fldcImMmlgHy4tGllk7WN4LSIeDZ5/CvgtDbHvVHSFPAKMBER24BTgMMR8UpyzEGg7V1Y0iZgE8CSJUsyhl3nOWte5wSQn81rVxzXRgAubdpgmzMRSLobeEuLXdeln0RESGo328zZETEt6Rxgp6R9wG/mE2hEbAW2AtRqtQyz2liaewTlzzOkWtnMmQgi4sJ2+yT9WtLpEfGspNOB59q8xnTy+wlJ9wGrgX8AFko6ISkVnAl44dkecQIolkubViZZ2
wgmgY3J443AD5oPkLRI0knJ41OBNcAjERHAvcDls51v+XMSMLO0rG0EE8B3JF0L/AK4AkBSDfhQRHwQeDvwFUmvUk88ExHxSHL+p4DbJX0W2AP8fcZ4hk6eA5OcAMysFdW/mJdLrVaLqampfodRuOaBSVBvdOymd1NeScAjZs3KS9LuiKg1b6/syOIyyGMajKwJQMCTqSTgEbNmw8fTUA+wrAOTsiYBOL7vu0fMmg0nlwgGWLcLx+SRAGBm33ePmDUbTi4RDLDNa1ewYETHbVswolkHJmVJAl+8ctWsI609YtZsOLlEMOia2/LbtO3n1Rg8W12/R8yaDScnggF24479HH31+Dv/0VdjRmNxr6aH8IhZs+HkRDDA5qqT78e4AI+YNRs+TgQDrF1jsfAkcWaWHzcW52jbnmnWTOxk2fh21kzsZNuebFMntVozAeDVDK/pJGBmzVwiyEkRg63SdfKtSgbz4QRgZu24RJCTogZbrV895iRgZoVyiSAnRQy28iRxZtYLLhHkJO/BVk4CZtYrLhHkJK/BVk4AZtZrTgQ5yWOwlbuEmlk/OBHkqNvBVi4FmFk/ZUoEkk4G7gCWAk8BV0TEi03HvAe4KbXpbcCGiNgm6WvAu3l9IftrImJvlpjKxqUAM+u3rCWCceCeiJiQNJ48/1T6gIi4F1gFryWOA8D/Sh2yOSLuzBhHTwzyspFeOczMupU1EawDzk8e3wbcR1MiaHI58KOIeCnj+/ZcngPG8i4FtIrt43fsZeoXL/DZ9Ss7fm0nE7Nqytp99LSIeDZ5/CvgtDmO3wB8u2nb5yQ9KOkmSSe1O1HSJklTkqYOHTqUIeTu5DFgbOn49kKqglrFFsC3dv2y42kuGslk+vARgtcTXdZpMsxs8M2ZCCTdLemhFj/r0sdFRNB2tnyQdDqwEtiR2ryFepvBvwNOZpbSRERsjYhaRNQWL148V9i5yzJg7KIv3JcpAYwuGOGLV65qu79dDAEdJyovQ2lWXXNWDUXEhe32Sfq1pNMj4tnkRv/cLC91BfD9iDiaeu1GaeJlSV8FPtlh3D3Xz2Uj51qwvl1s0PnIZi9DaVZdWauGJoGNyeONwA9mOfYqmqqFkuSBJAHrgYcyxlOYVjOBzjZgLGs1ULPZbsib165AbfZ1OrLZy1CaVVfWRDABXCTpMeDC5DmSapJuaRwkaSlwFvC/m87/lqR9wD7gVOCzGeMpzPrVY9xw2cpZ1/RtyDMBNMx2Q16/eoz/+M4lM5LBfEY2zzfRmdnwUL1qv1xqtVpMTU31O4wZ8ugS2twDCOo35HZJJy1rrx/3GjIbbpJ2R0RtxnYngnzkOS7AN2QzK0K7ROApJjIqYnoIrwtsZr3kRNBGJ9/KPT2EmQ0DJ4IW5hpF3OtJ4lxVZGZFciJoYbbBVR+7Y2/Xr9tNKaCItZDNzNK8QlkL7frsZ1k7uNuqII/4NbOiuUTQwmwjdecra1uAR/yaWdGcCBLpeviFb1rAgjeIo69m61qbR4Nwt1NbmJl1yomAmfXwL750dI4zZpdnj6C81kLuNzd4mw2uSieCxs0pr2ogyL9baB5rIfebG7zNBltlE0GrqRyyKHJcQNkHmM3W4F3mf5fZsKhsr6FWN6dueXDY7NzgbTbYKlsiyOMm5IFhnXGDt1l7g3BfqEwi2LZnms9MPszhI9kaghvGFo6ybc90x3+wKteTD0uDt1neBuW+UImqoW17ptn83Z/llgRg/mv6Vnlg2HzWcjCrkkG5L1SiRHDjjv2ZxwSMtajemE+DZ9Xrycve4G1WhNlmMVgzsbNn1UWZSgSS/lTSw5JelTRjjuvUcZdI2i/pgKTx1PZlkn6abL9D0olZ4mkn681WtJ9eotPX9lKQr9u2Z5o1EztZNr6dNRM7Oy5VmQ2bdv//G/ecYP61D93IWjX0EHAZ8ON2B0gaAW4G3gu8A7hK0juS3Z8HboqItwIvAtdmjKelbssCjaUbZzu/0xu5l4Ksa9SJ9vJDbjaoWt0XxMx7TtHVRZkSQUT8PCLmiu5c4EBEPBER/wrcDqxLFqy/ALgzOe426gvYD4QRac7upfO5kbuevG5Q6kTNBkGr+0K7L55FViP3oo1gDHg69fwgcB5wCnA4Il5JbW97V5S0CdgEsGTJkmIiTTk2yxKegq7q7VxP7rYSs2bN94U1Ezt73t16zkQg6W7gLS12XRcRP8g/pNYiYiuwFeprFhf9fiNSy2QwtnCU+8cvKPrth5bHFJjNrh/dreesGoqICyPiD1r8dJoEpoGzUs/PTLY9DyyUdELT9p5rVXd/1XlnuU6/AG4rMZtdP6qRe1E19ACwXNIy6jf6DcCfRURIuhe4nHq7wUagkBLGUxPvb7u85FMT7287sq929sl9H/E3bIZhEj2zovW6GlkxS134nCdLfwz8d2AxcBjYGxFrJZ0B3BIR70uOex/wRWAEuDUiPpdsP4d6EjgZ2ANcHREvz/W+tVotpqamuo7bzKyKJO2OiBld/TMlgn5xIjAzm792iaASU0yYmVl7TgRmZhXnRGBmVnFOBGZmFVfKxmJJh4BfdHn6qcA/5xhOXhzX/Diu+XFc8zOscZ0dEYubN5YyEWQhaapVq3m/Oa75cVzz47jmp2pxuWrIzKzinAjMzCquiolga78DaMNxzY/jmh/HNT+ViqtybQRmZna8KpYIzMwsxYnAzKzihjIRSPpTSQ9LelVS265Wki6RtF/SAUnjqe3LJP002X6HpBNziutkSXdJeiz5vajFMe+RtDf18y+S1if7vibpydS+Vb2KKznuWOq9J1Pb+3m9Vkn6SfL3flDSlal9uV6vdp+X1P6Tkn//geR6LE3t25Js3y9pbZY4uojrE5IeSa7PPZLOTu1r+TftUVzXSDqUev8PpvZtTP7uj0na2OO4bkrF9Kikw6l9hVwvSbdKek7SQ232S9J/S2J+UNIfpvZlv1YRMXQ/wNuBFcB9QK3NMSPA48A5wInAz4B3JPu+A2xIHn8Z+HBOcf0tMJ48Hgc+P8fxJwMvAG9Knn8NuLyA69VRXMD/a7O9b9cL+D1gefL4DOBZYGHe12u2z0vqmP8CfDl5vAG4I3n8juT4k4BlyeuM9DCu96Q+Qx9uxDXb37RHcV0DfKnFuScDTyS/FyWPF/Uqrqbj/4L61PlFX693AX8IPNRm//uAH1FfKfedwE/zvFZDWSKIiJ9HxFyroZ8LHIiIJyLiX6mvi7BOkoALgDuT424D1ucU2rrk9Tp93cuBH0XESzm9fzvzjes1/b5eEfFoRDyWPH4GeI76+hh5a/l5mSXeO4E/Sq7POuD2iHg5Ip4EDiSv15O4IuLe1GdoF/XVAIvWyfVqZy1wV0S8EBEvAncBl/QprquAb+f03m1FxI+pf+lrZx3w9ajbRX11x9PJ6VoNZSLo0BjwdOr5wWTbKcDhiHilaXseTouIZ5PHvwJOm+P4Dcz8EH4uKRreJOmkHsf1RklTknY1qqsYoOsl6Vzq3/IeT23O63q1+7y0PCa5Hr+hfn06ObfIuNKupf7NsqHV37SXcf1J8ve5U1JjSduB
uF5JFdoyYGdqc1HXay7t4s7lWvViqcpCSLobeEuLXddF5+sp5262uNJPIiIkte27m2T7lcCO1OYt1G+IJ1LvT/wp4PoexnV2REyrvrLcTkn7qN/supbz9foGsDEiXk02d329hpGkq4Ea8O7U5hl/04h4vPUr5O4fgW9HxMuS/jP10tQFPXrvTmwA7oyIY6lt/bxehSltIoiICzO+xDRwVur5mcm256kXu05IvtU1tmeOS9KvJZ0eEc8mN67nZnmpK4DvR8TR1Gs3vh2/LOmrwCd7GVdETCe/n5B0H7Aa+Af6fL0kvRnYTv1LwK7Ua3d9vVpo93lpdcxBSScAv0P989TJuUXGhaQLqSfXd0dqOdg2f9M8bmxzxhURz6ee3kK9Tahx7vlN596XQ0wdxZWyAfhIekOB12su7eLO5VpVuWroAWC56j1eTqT+R5+MegvMvdTr5wE2AnmVMCaT1+vkdWfUTSY3w0a9/HqgZQ+DIuKStKhRtSLpVGAN8Ei/r1fyt/s+9frTO5v25Xm9Wn5eZon3cmBncn0mgQ2q9ypaBiwH/ilDLPOKS9Jq4CvApRHxXGp7y79pD+M6PfX0UuDnyeMdwMVJfIuAizm+ZFxoXElsb6Pe+PqT1LYir9dcJoEPJL2H3gn8Jvmik8+1KqIFvN8/wB9Tryt7Gfg1sCPZfgbww9Rx7wMepZ7Rr0ttP4f6f9QDwHeBk3KK6xTgHuAx4G7g5GR7DbglddxS6pn+DU3n7wT2Ub+hfRP47V7FBfz75L1/lvy+dhCuF3A1cBTYm/pZVcT1avV5oV7VdGny+I3Jv/9Acj3OSZ17XXLefuC9OX/e54rr7uT/QeP6TM71N+1RXDcADyfvfy/wttS5/ym5jgeAP+9lXMnzzwATTecVdr2of+l7NvksH6TelvMh4EPJfgE3JzHvI9UbMo9r5SkmzMwqrspVQ2ZmhhOBmVnlORGYmVWcE4GZWcU5EZiZVZwTgZlZxTkRmJlV3P8HgKVZJniedHAAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "msk =np.argmax(ycand_val[\"cls\"], axis=-1).flatten()==1\n", "plt.scatter(vals_a[msk], vals_b[msk])" @@ -188,9 +130,9 @@ "outputs": [], "source": [ "ret = model(X_val[:1])\n", - "model.set_trainable_classification()\n", - "model.load_weights(\"/home/joosep/particleflow/experiments/cms-gnn-dense-a301aa09.gpu0.local/weights-65-103.547722.hdf5\")\n", - "ret = model.predict(X_val, batch_size=10)" + "#model.set_trainable_classification()\n", + "model.load_weights(\"/home/joosep/particleflow/experiments/cms-gnn-dense-dev_20210819_101049.joosep-desktop/weights/weights-10-125.094849.hdf5\")\n", + "ret = model.predict(X_val, batch_size=1)" ] }, { @@ -223,7 +165,7 @@ "\n", " for ielem in range(6400):\n", " if X_val[0, ielem, 0] != 0:\n", - " for ibin in range(bs.shape[1]):\n", + " for ibin in range(bs.shape[0]):\n", " if ielem in bs[ibin]:\n", " bin_index.append(ibin)\n", " break\n", @@ -232,6 +174,71 @@ " return bin_index" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preds.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.cg_id[0].name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preds.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dd = preds[\"dec_output_id\"][0, :, 50:].numpy().flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(dd, bins=100);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,10))\n", + "plt.imshow(preds[\"dec_output_reg\"][0, :, 50:], cmap=\"Blues\")\n", + "plt.colorbar()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.ffn_momentum[4].summary()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -244,7 +251,7 @@ "typ = X_val[0][msk, 0]\n", "energy = X_val[0][msk, 4]\n", "\n", - "evenly_spaced_interval = np.linspace(0, 1, 10)\n", + "evenly_spaced_interval = np.linspace(0, 1, preds[\"combined_graph_layer\"][\"bins\"].shape[1])\n", "colorlist = [cm.rainbow(x) for x in evenly_spaced_interval]\n", "bin_idx = get_bin_index(preds[\"combined_graph_layer\"][\"bins\"][0].numpy())\n", "\n", @@ -262,7 +269,7 @@ "metadata": {}, "outputs": [], "source": [ - "evenly_spaced_interval = np.linspace(0, 1, 10)\n", + "evenly_spaced_interval = np.linspace(0, 1, preds[\"combined_graph_layer_1\"][\"bins\"].shape[1])\n", "colorlist = [cm.rainbow(x) for x in evenly_spaced_interval]\n", "bin_idx = get_bin_index(preds[\"combined_graph_layer_1\"][\"bins\"][0].numpy())\n", "\n", @@ -314,8 +321,8 @@ "source": [ "def plot_dms(dms):\n", " fig = plt.figure(figsize=(4*4, 3*4))\n", - " for i in range(dmn.shape[0]):\n", - " ax = plt.subplot(3,4,i+1)\n", + " for i in range(25):\n", + " ax = plt.subplot(5,5,i+1)\n", " plt.axes(ax)\n", " plt.imshow(dmn[i], interpolation=\"none\", norm=matplotlib.colors.Normalize(vmin=0, vmax=1), cmap=\"Blues\")\n", " plt.colorbar()\n", @@ -325,6 +332,34 @@ " plt.tight_layout()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "dmnf = dmn.flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(dmnf[dmnf!=0], bins=100);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(dmn[1])\n", + "plt.colorbar()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -377,6 +412,43 @@ "plt.savefig(\"dm_reg2.pdf\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "arr = tf.random.normal((2,160,40,40,32))\n", + "msk = tf.cast(tf.random.normal((2,160,40,))>0.5, tf.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(arr[0, 0, :, :, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(tf.einsum(\"abijk,abi->abijk\", arr, msk)[0,0, :, :, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(tf.einsum(\"abijk,abj->abijk\", arr, msk)[0,0,:, :, 0])" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/parameters/cms-gnn-dense-dev.yaml b/parameters/cms-gnn-dense-dev.yaml index 8ee75c77e..cf189a927 100644 --- a/parameters/cms-gnn-dense-dev.yaml +++ b/parameters/cms-gnn-dense-dev.yaml @@ -49,10 +49,10 @@ setup: weights: weights_config: lr: 1e-4 - batch_size: 2 + batch_size: 5 num_events_train: 80000 num_events_test: 10000 - num_epochs: 100 + num_epochs: 10 num_val_files: 10 dtype: float32 trainable: all @@ -71,12 +71,18 @@ sample_weights: parameters: model: gnn_dense activation: elu - layernorm: yes - hidden_dim: 128 + layernorm: no + hidden_dim: 256 bin_size: 40 clip_value_low: 0.0 - num_conv: 2 - num_gsl: 5 + num_conv: 1 + conv_config: + type: MPNNNodeFunction + output_dim: 128 + hidden_dim: 128 + num_layers: 2 + activation: elu + num_gsl: 3 normalize_degrees: yes distance_dim: 16 dropout: 0.1 diff --git a/parameters/test-cms-v2.yaml b/parameters/test-cms-v2.yaml index 23569e4ee..8bfb995a5 100644 --- a/parameters/test-cms-v2.yaml +++ b/parameters/test-cms-v2.yaml @@ -56,9 +56,13 @@ parameters: hidden_dim: 128 bin_size: 320 clip_value_low: 0.0 + conv_config: + type: GHConvDense + output_dim: 128 + activation: elu + normalize_degrees: yes num_conv: 1 num_gsl: 1 - normalize_degrees: yes distance_dim: 128 dropout: 0.0 separate_momentum: yes diff --git a/parameters/test-cms.yaml b/parameters/test-cms.yaml deleted file mode 100644 index a6e4f1967..000000000 --- a/parameters/test-cms.yaml +++ /dev/null @@ -1,77 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 - num_input_classes: 12 - #(none, ch.had, n.had, hfem, hfhad, gamma, e, mu) - num_output_classes: 8 - num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.001 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr/*.tfrecords - num_files_per_chunk: 5 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-6 - batch_size: 5 - num_events_train: 5 - num_events_test: 5 - num_epochs: 1 - num_val_files: 
1 - dtype: float32 - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn - bin_size: 64 - num_convs_id: 1 - num_convs_reg: 1 - num_hidden_id_enc: 1 - num_hidden_id_dec: 0 - num_hidden_reg_enc: 1 - num_hidden_reg_dec: 0 - num_neighbors: 16 - hidden_dim_id: 64 - hidden_dim_reg: 64 - distance_dim: 64 - dropout: 0.0 - dist_mult: 1.0 - activation: elu - skip_connection: True - -timing: - num_ev: 1 - num_iter: 1 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index 2f10ec7e2..93b022d84 100755 --- a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -32,15 +32,12 @@ rm -Rf experiments/test-* #Run a simple training on a few events rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr -python3 mlpf/launcher.py --model-spec parameters/test-cms.yaml --action data #Run a simple training on a few events -python3 mlpf/pipeline.py train -c parameters/test-cms.yaml -p test-cms- +python3 mlpf/pipeline.py train -c parameters/test-cms-v2.yaml -p test-cms- -#Generate the pred.npz file of predictions -python3 mlpf/pipeline.py evaluate -c parameters/test-cms.yaml -t ./experiments/test-cms-* +#Generate the predictions +python3 mlpf/pipeline.py evaluate -c parameters/test-cms-v2.yaml -t ./experiments/test-cms-* -python3 scripts/test_load_tfmodel.py ./experiments/test-cms-*/model_frozen/frozen_graph.pb - -python3 mlpf/pipeline.py train -c parameters/test-cms-v2.yaml -p test-cms-v2- -python3 mlpf/pipeline.py evaluate -c parameters/test-cms-v2.yaml -t ./experiments/test-cms-v2-* +#thest that the frozen graph can be generated and loaded +python3 scripts/test_load_tfmodel.py ./experiments/test-cms-v2-*/model_frozen/frozen_graph.pb From 6199a25d9b090134784d0d1fe07489c034fa5157 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 19 Aug 2021 14:16:37 +0300 Subject: [PATCH 050/157] readd previous net --- mlpf/tfmodel/model.py | 376 +++++++++++++++++++++++++++++++++- mlpf/tfmodel/model_setup.py | 34 ++- parameters/cms-gnn-dense.yaml | 5 + 3 files changed, 412 insertions(+), 3 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index cc46559fb..dac165eb9 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -169,7 +169,42 @@ def call(self, X): ) #https://arxiv.org/pdf/2004.04635.pdf -#https://github.com/gcucurull/jax-ghnet/blob/master/models.py +#https://github.com/gcucurull/jax-ghnet/blob/master/models.py +class GHConv(tf.keras.layers.Layer): + def __init__(self, *args, **kwargs): + self.activation = kwargs.pop("activation") + + super(GHConv, self).__init__(*args, **kwargs) + + def build(self, input_shape): + self.hidden_dim = input_shape[0][-1] + self.nelem = input_shape[0][-2] + self.W_t = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="w_t", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) + self.b_t = self.add_weight(shape=(self.hidden_dim,), name="b_t", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) + self.W_h = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="w_h", initializer="random_normal", trainable=True, 
regularizer=tf.keras.regularizers.L1(regularizer_weight)) + self.theta = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="theta", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) + + #@tf.function + def call(self, inputs): + x, adj = inputs + + #compute the normalization of the adjacency matrix + in_degrees = tf.sparse.reduce_sum(tf.abs(adj), axis=-1) + in_degrees = tf.reshape(in_degrees, (tf.shape(x)[0], tf.shape(x)[1])) + + #add epsilon to prevent numerical issues from 1/sqrt(x) + norm = tf.expand_dims(tf.pow(in_degrees + 1e-6, -0.5), -1) + + f_hom = tf.linalg.matmul(x, self.theta) + f_hom = sparse_dense_matmult_batch(adj, f_hom*norm)*norm + + f_het = tf.linalg.matmul(x, self.W_h) + gate = tf.nn.sigmoid(tf.linalg.matmul(x, self.W_t) + self.b_t) + + out = gate*f_hom + (1-gate)*f_het + return self.activation(out) + + class GHConvDense(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): self.activation = kwargs.pop("activation") @@ -245,6 +280,145 @@ def get_conv_layer(config_dict): return conv_cls(**config_dict) + +class SparseHashedNNDistance(tf.keras.layers.Layer): + def __init__(self, distance_dim=128, max_num_bins=200, bin_size=500, num_neighbors=5, dist_mult=0.1, **kwargs): + super(SparseHashedNNDistance, self).__init__(**kwargs) + self.num_neighbors = tf.constant(num_neighbors) + self.dist_mult = dist_mult + self.distance_dim = distance_dim + + #generate the codebook for LSH hashing at model instantiation for up to this many bins + #set this to a high-enough value at model generation to take into account the largest possible input + self.max_num_bins = tf.constant(max_num_bins) + + #each bin will receive this many input elements, in total we can accept max_num_bins*bin_size input elements + #in each bin, we will do a dense top_k evaluation + self.bin_size = bin_size + self.layer_encoding = point_wise_feed_forward_network(distance_dim, 128) + self.layer_edge = point_wise_feed_forward_network(1, 128) + + def build(self, input_shape): + #(n_batch, n_points, n_features) + + #generate the LSH codebook for random rotations (num_features, max_num_bins/2) + self.codebook_random_rotations = self.add_weight( + shape=(self.distance_dim, self.max_num_bins//2), initializer="random_normal", trainable=False, name="lsh_projections" + ) + + #@tf.function + def call(self, inputs, training=True): + + #(n_batch, n_points, n_features) + point_embedding = self.layer_encoding(inputs) + + n_batches = tf.shape(point_embedding)[0] + n_points = tf.shape(point_embedding)[1] + #points_neighbors = n_points * self.num_neighbors + + #cannot concat sparse tensors directly as that incorrectly destroys the gradient, see + #https://github.com/tensorflow/tensorflow/blob/df3a3375941b9e920667acfe72fb4c33a8f45503/tensorflow/python/ops/sparse_grad.py#L33 + def func(args): + ibatch, points_batch = args[0], args[1] + bins_split, (inds, vals) = self.construct_sparse_dm_batch(points_batch) + inds = tf.concat([tf.expand_dims(tf.cast(ibatch, tf.int64)*tf.ones(tf.shape(inds)[0], dtype=tf.int64), -1), inds], axis=-1) + return inds, vals, bins_split + + elems = (tf.range(0, n_batches, delta=1, dtype=tf.int64), point_embedding) + ret = tf.map_fn(func, elems, + fn_output_signature=( + tf.TensorSpec((None, 3), tf.int64), + tf.TensorSpec((None, ), inputs.dtype), + tf.TensorSpec((None, self.bin_size), tf.int32), + ), + parallel_iterations=2, back_prop=True + ) + + # #now create a new SparseTensor that is a concatenation of the per-batch tensor indices and 
values + shp = tf.shape(ret[0]) + dms = tf.SparseTensor( + tf.reshape(ret[0], (shp[0]*shp[1], shp[2])), + tf.reshape(ret[1], (shp[0]*shp[1],)), + (n_batches, n_points, n_points) + ) + + dm = tf.sparse.reorder(dms) + + i1 = tf.transpose(tf.stack([dm.indices[:, 0], dm.indices[:, 1]])) + i2 = tf.transpose(tf.stack([dm.indices[:, 0], dm.indices[:, 2]])) + x1 = tf.gather_nd(inputs, i1) + x2 = tf.gather_nd(inputs, i2) + + #run an edge net on (src node, dst node, edge) + edge_vals = tf.nn.sigmoid(self.layer_edge(tf.concat([x1, x2, tf.expand_dims(dm.values, axis=-1)], axis=-1))) + dm2 = tf.sparse.SparseTensor(indices=dm.indices, values=edge_vals[:, 0], dense_shape=dm.dense_shape) + + return dm2, ret[2] + + #@tf.function + def subpoints_to_sparse_matrix(self, subindices, subpoints): + + #find the distance matrix between the given points in all the LSH bins + dm = pairwise_gaussian_dist(subpoints, subpoints) #(LSH_bins, points_per_bin, points_per_bin) + dm = tf.exp(-self.dist_mult*dm) + + #dm = pairwise_sigmoid_dist(subpoints, subpoints) #(LSH_bins, points_per_bin, points_per_bin) + + dmshape = tf.shape(dm) + nbins = dmshape[0] + nelems = dmshape[1] + + #run KNN in the dense distance matrix, accumulate each index pair into a sparse distance matrix + top_k = tf.nn.top_k(dm, k=self.num_neighbors) + top_k_vals = tf.reshape(top_k.values, (nbins*nelems, self.num_neighbors)) + + indices_gathered = tf.map_fn( + lambda i: tf.gather_nd(subindices, top_k.indices[:, :, i:i+1], batch_dims=1), + tf.range(self.num_neighbors, dtype=tf.int32), fn_output_signature=tf.TensorSpec(None, tf.int32) + ) + indices_gathered = tf.transpose(indices_gathered, [1,2,0]) + + def func(i): + dst_ind = indices_gathered[:, :, i] #(nbins, nelems) + dst_ind = tf.reshape(dst_ind, (nbins*nelems, )) + src_ind = tf.reshape(tf.stack(subindices), (nbins*nelems, )) + src_dst_inds = tf.cast(tf.transpose(tf.stack([src_ind, dst_ind])), dtype=tf.int64) + return src_dst_inds, top_k_vals[:, i] + + ret = tf.map_fn(func, tf.range(0, self.num_neighbors, delta=1, dtype=tf.int32), fn_output_signature=(tf.int64, subpoints.dtype)) + + shp = tf.shape(ret[0]) + inds = tf.reshape(ret[0], (shp[0]*shp[1], 2)) + vals = tf.reshape(ret[1], (shp[0]*shp[1],)) + return inds, vals + + def construct_sparse_dm_batch(self, points): + #points: (n_points, n_features) input elements for graph construction + n_points = tf.shape(points)[0] + n_features = tf.shape(points)[1] + + #compute the number of LSH bins to divide the input points into on the fly + #n_points must be divisible by bin_size exactly due to the use of reshape + n_bins = tf.math.floordiv(n_points, self.bin_size) + + #put each input item into a bin defined by the softmax output across the LSH embedding + mul = tf.linalg.matmul(points, self.codebook_random_rotations[:, :n_bins//2]) + cmul = tf.concat([mul, -mul], axis=-1) + + #cmul is now an integer in [0..nbins) for each input point + #bins_split: (n_bins, bin_size) of integer bin indices, which puts each input point into a bin of size (n_points/n_bins) + bins_split = split_indices_to_bins(cmul, n_bins, self.bin_size) + + #parts: (n_bins, bin_size, n_features), the input points divided up into bins + parts = tf.gather(points, bins_split) + + #sparse_distance_matrix: (n_points, n_points) sparse distance matrix + #where higher values (closer to 1) are associated with points that are closely related + sparse_distance_matrix = self.subpoints_to_sparse_matrix(bins_split, parts) + + return bins_split, sparse_distance_matrix + + class 
GraphBuilderDense(tf.keras.layers.Layer): def __init__(self, clip_value_low=0.0, distance_dim=128, max_num_bins=200, bin_size=128, dist_mult=0.1, **kwargs): self.dist_mult = dist_mult @@ -304,6 +478,206 @@ def call(self, x_dist, x_features, msk): return bins_split, x_features_binned, dm, msk_f_binned + +class EncoderDecoderGNN(tf.keras.layers.Layer): + def __init__(self, encoders, decoders, dropout, activation, conv, **kwargs): + super(EncoderDecoderGNN, self).__init__(**kwargs) + name = kwargs.get("name") + + #assert(encoders[-1] == decoders[0]) + self.encoders = encoders + self.decoders = decoders + + self.encoding_layers = [] + for ilayer, nunits in enumerate(encoders): + self.encoding_layers.append( + tf.keras.layers.Dense(nunits, activation=activation, + kernel_regularizer=tf.keras.regularizers.L1(regularizer_weight), + bias_regularizer=tf.keras.regularizers.L1(regularizer_weight), + name="encoding_{}_{}".format(name, ilayer))) + if dropout > 0.0: + self.encoding_layers.append(tf.keras.layers.Dropout(dropout)) + + self.conv = conv + + self.decoding_layers = [] + for ilayer, nunits in enumerate(decoders): + self.decoding_layers.append( + tf.keras.layers.Dense(nunits, activation=activation, + kernel_regularizer=tf.keras.regularizers.L1(regularizer_weight), + bias_regularizer=tf.keras.regularizers.L1(regularizer_weight), + name="decoding_{}_{}".format(name, ilayer))) + if dropout > 0.0: + self.decoding_layers.append(tf.keras.layers.Dropout(dropout)) + + @tf.function + def call(self, inputs, distance_matrix, training=True): + x = inputs + + for layer in self.encoding_layers: + x = layer(x) + + for convlayer in self.conv: + x = convlayer([x, distance_matrix]) + + for layer in self.decoding_layers: + x = layer(x) + + return x + +class AddSparse(tf.keras.layers.Layer): + def __init__(self, **kwargs): + super(AddSparse, self).__init__(**kwargs) + + def call(self, matrices): + ret = matrices[0] + for mat in matrices[1:]: + ret = tf.sparse.add(ret, mat) + return ret + +#Simple message passing based on a matrix multiplication +class PFNet(tf.keras.Model): + def __init__(self, + multi_output=False, + num_input_classes=8, + num_output_classes=3, + num_momentum_outputs=3, + activation=tf.nn.selu, + hidden_dim_id=256, + hidden_dim_reg=256, + distance_dim=256, + convlayer="ghconv", + dropout=0.1, + bin_size=10, + num_convs_id=1, + num_convs_reg=1, + num_hidden_id_enc=1, + num_hidden_id_dec=1, + num_hidden_reg_enc=1, + num_hidden_reg_dec=1, + num_neighbors=5, + dist_mult=0.1, + skip_connection=False, + return_matrix=False): + + super(PFNet, self).__init__() + self.activation = activation + self.num_dists = 1 + self.num_momentum_outputs = num_momentum_outputs + self.skip_connection = skip_connection + self.multi_output = multi_output + self.return_matrix = return_matrix + + encoding_id = [] + decoding_id = [] + encoding_reg = [] + decoding_reg = [] + + #the encoder outputs and decoder inputs have to have the hidden dim (convlayer size) + for ihidden in range(num_hidden_id_enc): + encoding_id.append(hidden_dim_id) + + for ihidden in range(num_hidden_id_dec): + decoding_id.append(hidden_dim_id) + + for ihidden in range(num_hidden_reg_enc): + encoding_reg.append(hidden_dim_reg) + + for ihidden in range(num_hidden_reg_dec): + decoding_reg.append(hidden_dim_reg) + + self.enc = InputEncoding(num_input_classes) + #self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6) + + self.dist = SparseHashedNNDistance(distance_dim=distance_dim, bin_size=bin_size, num_neighbors=num_neighbors, 
dist_mult=dist_mult) + + convs_id = [] + convs_reg = [] + if convlayer == "sgconv": + for iconv in range(num_convs_id): + convs_id.append(SGConv(k=1, activation=activation, name="conv_id{}".format(iconv))) + for iconv in range(num_convs_reg): + convs_reg.append(SGConv(k=1, activation=activation, name="conv_reg{}".format(iconv))) + elif convlayer == "ghconv": + for iconv in range(num_convs_id): + convs_id.append(GHConv(activation=activation, name="conv_id{}".format(iconv))) + for iconv in range(num_convs_reg): + convs_reg.append(GHConv(activation=activation, name="conv_reg{}".format(iconv))) + + self.gnn_id = EncoderDecoderGNN(encoding_id, decoding_id, dropout, activation, convs_id, name="gnn_id") + self.layer_id = point_wise_feed_forward_network(num_output_classes, hidden_dim_id, num_layers=3, activation=activation) + self.layer_charge = point_wise_feed_forward_network(1, hidden_dim_id, num_layers=3, activation=activation) + + self.gnn_reg = EncoderDecoderGNN(encoding_reg, decoding_reg, dropout, activation, convs_reg, name="gnn_reg") + self.layer_momentum = point_wise_feed_forward_network(num_momentum_outputs, hidden_dim_reg, num_layers=3, activation=activation) + + # def create_model(self, num_max_elems, num_input_features, training=True): + # inputs = tf.keras.Input(shape=(num_max_elems, num_input_features,)) + # return tf.keras.Model(inputs=[inputs], outputs=self.call(inputs, training), name="MLPFNet") + + def call(self, inputs, training=True): + X = inputs + msk_input = tf.expand_dims(tf.cast(X[:, :, 0] != 0, tf.dtypes.float32), -1) + + enc = self.enc(inputs) + + #create a graph structure from the encoded nodes + dm, bins = self.dist(enc, training) + + #run graph net for multiclass id prediction + x_id = self.gnn_id(enc, dm, training) + + if self.skip_connection: + to_decode = tf.concat([enc, x_id], axis=-1) + else: + to_decode = tf.concat([x_id], axis=-1) + + out_id_logits = self.layer_id(to_decode)*msk_input + out_charge = self.layer_charge(to_decode)*msk_input + + #run graph net for regression output prediction, taking as an additonal input the ID predictions + x_reg = self.gnn_reg(tf.concat([enc, tf.cast(out_id_logits, X.dtype)], axis=-1), dm, training) + + if self.skip_connection: + to_decode = tf.concat([enc, tf.cast(out_id_logits, X.dtype), x_reg], axis=-1) + else: + to_decode = tf.concat([tf.cast(out_id_logits, X.dtype), x_reg], axis=-1) + + pred_momentum = self.layer_momentum(to_decode)*msk_input + + out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) + out_charge = tf.clip_by_value(out_charge, -2, 2) + + if self.multi_output: + ret = { + "cls": out_id_softmax, + "charge": out_charge, + "pt": tf.exp(tf.clip_by_value(pred_momentum[:, :, 0:1], -4, 4)), + "eta": pred_momentum[:, :, 1:2], + "sin_phi": pred_momentum[:, :, 2:3], + "cos_phi": pred_momentum[:, :, 3:4], + "energy": tf.exp(tf.clip_by_value(pred_momentum[:, :, 4:5], -5, 6)) + } + if self.return_matrix: + ret["dm"] = dm + ret["bins"] = bins + return ret + else: + return tf.concat([out_id_softmax, out_charge, pred_momentum], axis=-1) + + def set_trainable_classification(self): + for layer in self.layers: + layer.trainable = False + self.gnn_id.trainable = True + self.layer_id.trainable = True + + def set_trainable_regression(self): + for layer in self.layers: + layer.trainable = False + self.gnn_reg.trainable = True + self.layer_momentum.trainable = True + + class CombinedGraphLayer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): diff --git a/mlpf/tfmodel/model_setup.py 
b/mlpf/tfmodel/model_setup.py index c2a5fe558..d8e9384c3 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -310,14 +310,44 @@ def make_model(config, dtype): model = config['parameters']['model'] if model == 'gnn': return make_gnn(config, dtype) - elif model == 'transformer': - return make_transformer(config, dtype) elif model == 'dense': return make_dense(config, dtype) elif model == 'gnn_dense': return make_gnn_dense(config, dtype) raise KeyError("Unknown model type {}".format(model)) +def make_gnn(config, dtype): + activation = getattr(tf.nn, config['parameters']['activation']) + + parameters = [ + 'bin_size', + 'num_convs_id', + 'num_convs_reg', + 'num_hidden_id_enc', + 'num_hidden_id_dec', + 'num_hidden_reg_enc', + 'num_hidden_reg_dec', + 'num_neighbors', + 'hidden_dim_id', + 'hidden_dim_reg', + 'dist_mult', + 'distance_dim', + 'dropout', + 'skip_connection' + ] + kwargs = {par: config['parameters'][par] for par in parameters} + + model = PFNet( + multi_output=config["setup"]["multi_output"], + num_input_classes=config["dataset"]["num_input_classes"], + num_output_classes=config["dataset"]["num_output_classes"], + num_momentum_outputs=config["dataset"]["num_momentum_outputs"], + activation=activation, + **kwargs + ) + + return model + def make_gnn_dense(config, dtype): parameters = [ diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index 164c34571..8156d9683 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -69,6 +69,11 @@ parameters: hidden_dim: 512 bin_size: 3200 clip_value_low: 0.00 + conv_config: + type: GHConvDense + output_dim: 128 + activation: elu + normalize_degrees: yes num_conv: 3 num_gsl: 1 normalize_degrees: yes From 2f502dc4c724a9c960b4902674799711409ad13f Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 19 Aug 2021 14:21:11 +0300 Subject: [PATCH 051/157] fix --- mlpf/tfmodel/model_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index d8e9384c3..922130987 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -1,4 +1,4 @@ -from .model import DummyNet, PFNetDense +from .model import DummyNet, PFNet, PFNetDense import tensorflow as tf import tensorflow_probability From ad0a2885a68a8c0510552f001bab220464453f69 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 19 Aug 2021 14:28:13 +0300 Subject: [PATCH 052/157] readd --- mlpf/tfmodel/model_setup.py | 7 ++-- parameters/test-cms.yaml | 77 +++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 parameters/test-cms.yaml diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 922130987..a8d16e6ca 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -206,9 +206,10 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False if len(vals_true) > 0: minval = np.min(vals_true) maxval = np.max(vals_true) - plt.plot([minval, maxval], [minval, maxval], color="black", ls="--") - plt.xlim(minval, maxval) - plt.ylim(minval, maxval) + if not (math.isnan(minval) or math.isnan(maxval) or math.isinf(minval) or math.isinf(maxval)): + plt.plot([minval, maxval], [minval, maxval], color="black", ls="--") + plt.xlim(minval, maxval) + plt.ylim(minval, maxval) plt.xlabel("predicted") plt.ylabel("true") diff --git a/parameters/test-cms.yaml b/parameters/test-cms.yaml new file mode 100644 index 000000000..a6e4f1967 --- /dev/null +++ 
b/parameters/test-cms.yaml @@ -0,0 +1,77 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 + num_input_classes: 12 + #(none, ch.had, n.had, hfem, hfhad, gamma, e, mu) + num_output_classes: 8 + num_momentum_outputs: 5 + padded_num_elem_size: 6400 + classification_loss_coef: 1.0 + momentum_loss_coef: 1.0 + charge_loss_coef: 1.0 + pt_loss_coef: 1.0 + eta_loss_coef: 1.0 + sin_phi_loss_coef: 1.0 + cos_phi_loss_coef: 1.0 + energy_loss_coef: 0.001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr/*.tfrecords + num_files_per_chunk: 5 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl + +tensorflow: + eager: no + +setup: + train: yes + weights: + lr: 1e-6 + batch_size: 5 + num_events_train: 5 + num_events_test: 5 + num_epochs: 1 + num_val_files: 1 + dtype: float32 + trainable: all + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: none + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn + bin_size: 64 + num_convs_id: 1 + num_convs_reg: 1 + num_hidden_id_enc: 1 + num_hidden_id_dec: 0 + num_hidden_reg_enc: 1 + num_hidden_reg_dec: 0 + num_neighbors: 16 + hidden_dim_id: 64 + hidden_dim_reg: 64 + distance_dim: 64 + dropout: 0.0 + dist_mult: 1.0 + activation: elu + skip_connection: True + +timing: + num_ev: 1 + num_iter: 1 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes From 9025f3f9be031e8a595b41cee62556285a39e183 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 19 Aug 2021 14:32:03 +0300 Subject: [PATCH 053/157] add missing import --- mlpf/tfmodel/model_setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index a8d16e6ca..8915f8e08 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -20,6 +20,7 @@ import time import json import random +import math import platform from tqdm import tqdm from pathlib import Path From 4ef7be3d91c909c1d9843bb24c6442222ce98af6 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 19 Aug 2021 14:37:42 +0300 Subject: [PATCH 054/157] revert local test --- scripts/local_test_cms_pipeline.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index 93b022d84..2f10ec7e2 100755 --- a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -32,12 +32,15 @@ rm -Rf experiments/test-* #Run a simple training on a few events rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr +python3 mlpf/launcher.py --model-spec parameters/test-cms.yaml --action data #Run a simple training on a few events -python3 mlpf/pipeline.py train -c parameters/test-cms-v2.yaml -p test-cms- +python3 mlpf/pipeline.py train -c parameters/test-cms.yaml -p test-cms- -#Generate the predictions -python3 mlpf/pipeline.py evaluate -c parameters/test-cms-v2.yaml -t ./experiments/test-cms-* +#Generate the pred.npz file of predictions +python3 mlpf/pipeline.py evaluate -c parameters/test-cms.yaml -t ./experiments/test-cms-* -#thest that the frozen graph can be generated and loaded -python3 scripts/test_load_tfmodel.py ./experiments/test-cms-v2-*/model_frozen/frozen_graph.pb +python3 scripts/test_load_tfmodel.py 
./experiments/test-cms-*/model_frozen/frozen_graph.pb + +python3 mlpf/pipeline.py train -c parameters/test-cms-v2.yaml -p test-cms-v2- +python3 mlpf/pipeline.py evaluate -c parameters/test-cms-v2.yaml -t ./experiments/test-cms-v2-* From 5c8ae82c16a6f7c2010a54fe6fc37b0ff7734e93 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 19 Aug 2021 14:50:53 +0300 Subject: [PATCH 055/157] fix --- mlpf/tfmodel/model_setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 8915f8e08..49a7f859f 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -359,7 +359,6 @@ def make_gnn_dense(config, dtype): "clip_value_low", "num_conv", "num_gsl", - "normalize_degrees", "distance_dim", "dropout", "separate_momentum", From 3f2ead6702e52a2a39d27545d4c5e26d5c5979bc Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 19 Aug 2021 15:23:00 +0300 Subject: [PATCH 056/157] fix activation --- mlpf/tfmodel/model.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index dac165eb9..562ba8a9a 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -207,7 +207,7 @@ def call(self, inputs): class GHConvDense(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): - self.activation = kwargs.pop("activation") + self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) self.output_dim = kwargs.pop("output_dim") self.normalize_degrees = kwargs.pop("normalize_degrees", True) @@ -224,6 +224,9 @@ def build(self, input_shape): #@tf.function def call(self, inputs): x, adj, msk = inputs + + adj = tf.squeeze(adj) + #compute the normalization of the adjacency matrix if self.normalize_degrees: in_degrees = tf.clip_by_value(tf.reduce_sum(tf.abs(adj), axis=-1), 0, 1000) @@ -249,7 +252,7 @@ def __init__(self, *args, **kwargs): self.output_dim = kwargs.pop("output_dim") self.hidden_dim = kwargs.pop("hidden_dim") self.num_layers = kwargs.pop("num_layers") - self.activation = kwargs.pop("activation") + self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) self.ffn = point_wise_feed_forward_network(self.output_dim, self.hidden_dim, num_layers=self.num_layers, activation=self.activation) super(MPNNNodeFunction, self).__init__(*args, **kwargs) @@ -468,7 +471,7 @@ def call(self, x_dist, x_features, msk): dm = pairwise_learnable_dist(x_dist_binned, x_dist_binned, self.ffn_dist) dm = tf.keras.activations.elu(dm) elif self.kernel == "gaussian": - dm = pairwise_gaussian_dist(x_dist_binned, x_dist_binned) + dm = tf.expand_dims(pairwise_gaussian_dist(x_dist_binned, x_dist_binned), axis=-1) dm = tf.exp(-self.dist_mult*dm) dm = tf.clip_by_value(dm, self.clip_value_low, 1) @@ -677,7 +680,7 @@ def set_trainable_regression(self): self.gnn_reg.trainable = True self.layer_momentum.trainable = True - + class CombinedGraphLayer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): From f8e65f95ffa153b943665235a51e09d2f0691fad Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 19 Aug 2021 15:51:26 +0300 Subject: [PATCH 057/157] revert cms-gnn-dense parameters --- mlpf/tfmodel/model.py | 11 +++++++++-- parameters/cms-gnn-dense-dev.yaml | 5 +++-- parameters/cms-gnn-dense.yaml | 16 ++++++++-------- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 562ba8a9a..76326469c 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -253,14 +253,21 @@ def __init__(self, *args, 
**kwargs): self.hidden_dim = kwargs.pop("hidden_dim") self.num_layers = kwargs.pop("num_layers") self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) + self.aggregation_direction = kwargs.pop("aggregation_direction") + + if self.aggregation_direction == "dst": + self.agg_dim = -2 + elif self.aggregation_direction == "src": + self.agg_dim = -3 self.ffn = point_wise_feed_forward_network(self.output_dim, self.hidden_dim, num_layers=self.num_layers, activation=self.activation) super(MPNNNodeFunction, self).__init__(*args, **kwargs) def call(self, inputs): x, adj, msk = inputs - avg_message = tf.reduce_mean(adj, axis=-2) - x2 = tf.concat([x, avg_message], axis=-1)*msk + avg_message = tf.reduce_mean(adj, axis=self.agg_dim) + max_message = tf.reduce_max(adj, axis=self.agg_dim) + x2 = tf.concat([x, avg_message, max_message], axis=-1)*msk return self.ffn(x2) def point_wise_feed_forward_network(d_model, dff, num_layers=1, activation='elu', dtype=tf.dtypes.float32, name=None): diff --git a/parameters/cms-gnn-dense-dev.yaml b/parameters/cms-gnn-dense-dev.yaml index cf189a927..b11c966f1 100644 --- a/parameters/cms-gnn-dense-dev.yaml +++ b/parameters/cms-gnn-dense-dev.yaml @@ -73,15 +73,16 @@ parameters: activation: elu layernorm: no hidden_dim: 256 - bin_size: 40 + bin_size: 32 clip_value_low: 0.0 num_conv: 1 conv_config: type: MPNNNodeFunction - output_dim: 128 + output_dim: 256 hidden_dim: 128 num_layers: 2 activation: elu + aggregation_direction: dst num_gsl: 3 normalize_degrees: yes distance_dim: 16 diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index 8156d9683..4b6b423a6 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -43,7 +43,7 @@ setup: weights: weights_config: lr: 1e-4 - batch_size: 1 + batch_size: 5 num_events_train: 80000 num_events_test: 10000 num_epochs: 500 @@ -66,19 +66,19 @@ parameters: model: gnn_dense activation: elu layernorm: no - hidden_dim: 512 - bin_size: 3200 - clip_value_low: 0.00 + hidden_dim: 256 + bin_size: 640 + clip_value_low: 0.01 conv_config: type: GHConvDense output_dim: 128 activation: elu normalize_degrees: yes - num_conv: 3 - num_gsl: 1 + num_conv: 2 + num_gsl: 2 normalize_degrees: yes - distance_dim: 512 - dropout: 0.0 + distance_dim: 128 + dropout: 0.2 separate_momentum: yes input_encoding: cms graph_kernel: gaussian #gaussian, learnable From a737ed183a78b3a43b06f44983154c8964d4ea06 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 19 Aug 2021 21:17:03 +0300 Subject: [PATCH 058/157] use additive regression --- mlpf/tfmodel/model.py | 44 ++++++++++++++++++------------- mlpf/tfmodel/model_setup.py | 4 +-- mlpf/tfmodel/utils.py | 2 +- notebooks/pfnet-debug.ipynb | 24 +++++++++++++++-- parameters/cms-gnn-dense-dev.yaml | 23 ++++++++-------- 5 files changed, 61 insertions(+), 36 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 76326469c..19a1a7ffc 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -270,14 +270,20 @@ def call(self, inputs): x2 = tf.concat([x, avg_message, max_message], axis=-1)*msk return self.ffn(x2) -def point_wise_feed_forward_network(d_model, dff, num_layers=1, activation='elu', dtype=tf.dtypes.float32, name=None): +def point_wise_feed_forward_network(d_model, dff, num_layers=1, activation='elu', dtype=tf.dtypes.float32, name=None, dim_decrease=False): bias_regularizer = tf.keras.regularizers.L1(regularizer_weight) kernel_regularizer = tf.keras.regularizers.L1(regularizer_weight) - return tf.keras.Sequential( - 
[tf.keras.layers.Dense(dff, activation=activation, bias_regularizer=bias_regularizer, kernel_regularizer=kernel_regularizer) for i in range(num_layers)] + - [tf.keras.layers.Dense(d_model, dtype=dtype)], - name=name - ) + + layers = [] + for ilayer in range(num_layers): + layers.append(tf.keras.layers.Dense( + dff, activation=activation, bias_regularizer=bias_regularizer, + kernel_regularizer=kernel_regularizer)) + if dim_decrease: + dff = dff // 2 + + layers.append(tf.keras.layers.Dense(d_model, dtype=dtype)) + return tf.keras.Sequential(layers, name=name) def get_conv_layer(config_dict): config_dict = config_dict.copy() @@ -787,8 +793,6 @@ def __init__(self, self.ffn_enc_id = point_wise_feed_forward_network(dff, dff, activation=activation, name="ffn_enc_id") self.ffn_enc_reg = point_wise_feed_forward_network(dff, dff, activation=activation, name="ffn_enc_reg") - self.momentum_mult = self.add_weight(shape=(num_momentum_outputs, ), initializer=tf.keras.initializers.Ones(), name="momentum_multiplication") - kwargs_cg = { "output_dim": dff, "max_num_bins": max_num_bins, @@ -806,18 +810,18 @@ def __init__(self, self.cg_id = [CombinedGraphLayer(**kwargs_cg) for i in range(num_gsl)] self.cg_reg = [CombinedGraphLayer(**kwargs_cg) for i in range(num_gsl)] - self.ffn_id = point_wise_feed_forward_network(num_output_classes, dff, name="ffn_cls", dtype=tf.dtypes.float32, num_layers=4, activation=activation) - self.ffn_charge = point_wise_feed_forward_network(1, dff, name="ffn_charge", dtype=tf.dtypes.float32, num_layers=2, activation=activation) + self.ffn_id = point_wise_feed_forward_network(num_output_classes, dff, name="ffn_cls", dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True) + self.ffn_charge = point_wise_feed_forward_network(1, dff, name="ffn_charge", dtype=tf.dtypes.float32, num_layers=2, activation=activation, dim_decrease=True) if self.separate_momentum: self.ffn_momentum = [ point_wise_feed_forward_network( 1, dff, name="ffn_momentum{}".format(imomentum), - dtype=tf.dtypes.float32, num_layers=4, activation=activation + dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True ) for imomentum in range(num_momentum_outputs) ] else: - self.ffn_momentum = point_wise_feed_forward_network(num_momentum_outputs, dff, name="ffn_momentum", dtype=tf.dtypes.float32, num_layers=4, activation=activation) + self.ffn_momentum = point_wise_feed_forward_network(num_momentum_outputs, dff, name="ffn_momentum", dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True) def call(self, inputs, training=False): X = inputs @@ -893,18 +897,20 @@ def call(self, inputs, training=False): else: pred_momentum = self.ffn_momentum(dec_output_reg)*msk_input - pred_momentum = self.momentum_mult*pred_momentum - out_charge = tf.clip_by_value(out_charge, -2, 2) + pred_eta = X[:, :, 2:3]*(1.0 + pred_momentum[:, :, 1:2]/100.0) + pred_energy = X[:, :, 4:5]*(1.0 + pred_momentum[:, :, 4:5]) + pred_pt = pred_momentum[:, :, 0:1] * pred_energy + ret = { "cls": out_id_softmax, "charge": out_charge, - "pt": tf.exp(tf.clip_by_value(pred_momentum[:, :, 0:1], -6, 8)), - "eta": pred_momentum[:, :, 1:2], - "sin_phi": pred_momentum[:, :, 2:3], - "cos_phi": pred_momentum[:, :, 3:4], - "energy": tf.exp(tf.clip_by_value(pred_momentum[:, :, 4:5], -6, 8)), + "pt": pred_pt, + "eta": pred_eta, + "sin_phi": tf.math.sin(X[:, :, 3:4])*(1.0 + pred_momentum[:, :, 2:3]/100.0), + "cos_phi": tf.math.cos(X[:, :, 3:4])*(1.0 + pred_momentum[:, :, 3:4]/100.0), + "energy": pred_energy, } 
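# A standalone sketch of the correction-based ("additive") regression decoding used in this hunk,
# assuming the input feature layout implied by the indices above (eta at X[..., 2], phi at X[..., 3],
# energy at X[..., 4]): the regression head predicts small relative offsets that rescale the input
# element's kinematics rather than predicting absolute values.
import tensorflow as tf

def decode_momentum(X, pred_momentum):
    # X: (batch, elems, num_input_features); pred_momentum: (batch, elems, 5) raw head outputs
    pred_eta = X[:, :, 2:3] * (1.0 + pred_momentum[:, :, 1:2] / 100.0)
    pred_energy = X[:, :, 4:5] * (1.0 + pred_momentum[:, :, 4:5])
    pred_pt = pred_momentum[:, :, 0:1] * pred_energy
    return {
        "pt": pred_pt,
        "eta": pred_eta,
        "sin_phi": tf.math.sin(X[:, :, 3:4]) * (1.0 + pred_momentum[:, :, 2:3] / 100.0),
        "cos_phi": tf.math.cos(X[:, :, 3:4]) * (1.0 + pred_momentum[:, :, 3:4] / 100.0),
        "energy": pred_energy,
    }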
if self.debug: for k in debugging_data.keys(): diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 49a7f859f..b7132ffdc 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -209,8 +209,8 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False maxval = np.max(vals_true) if not (math.isnan(minval) or math.isnan(maxval) or math.isinf(minval) or math.isinf(maxval)): plt.plot([minval, maxval], [minval, maxval], color="black", ls="--") - plt.xlim(minval, maxval) - plt.ylim(minval, maxval) + plt.xlim(0.9*minval, 1.1*maxval) + plt.ylim(0.9*minval, 1.1*maxval) plt.xlabel("predicted") plt.ylabel("true") diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 946f72099..fd19e2f67 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -152,7 +152,7 @@ def compute_weights_none(X, y, w): def make_weight_function(config): def weight_func(X,y,w): - w_signal_only = tf.where(y[:, 0]==0, 0.0, tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)) + w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0) w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32) w_none = tf.ones_like(w) diff --git a/notebooks/pfnet-debug.ipynb b/notebooks/pfnet-debug.ipynb index 6378de620..730f149c3 100644 --- a/notebooks/pfnet-debug.ipynb +++ b/notebooks/pfnet-debug.ipynb @@ -119,8 +119,28 @@ "metadata": {}, "outputs": [], "source": [ - "msk =np.argmax(ycand_val[\"cls\"], axis=-1).flatten()==1\n", - "plt.scatter(vals_a[msk], vals_b[msk])" + "msk =np.argmax(ycand_val[\"cls\"], axis=-1).flatten()==5\n", + "plt.scatter(vals_a[msk], vals_b[msk], marker=\".\", alpha=0.2)\n", + "plt.xlim(-1,1)\n", + "plt.ylim(-1,1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vals_a[msk]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vals_b[msk]" ] }, { diff --git a/parameters/cms-gnn-dense-dev.yaml b/parameters/cms-gnn-dense-dev.yaml index b11c966f1..22f7610fd 100644 --- a/parameters/cms-gnn-dense-dev.yaml +++ b/parameters/cms-gnn-dense-dev.yaml @@ -26,9 +26,9 @@ dataset: classification_loss_coef: 1.0 charge_loss_coef: 1.0 pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 100.0 + cos_phi_loss_coef: 100.0 energy_loss_coef: 0.1 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords @@ -48,14 +48,14 @@ setup: train: yes weights: weights_config: - lr: 1e-4 - batch_size: 5 + lr: 5e-4 + batch_size: 10 num_events_train: 80000 num_events_test: 10000 num_epochs: 10 num_val_files: 10 dtype: float32 - trainable: all + trainable: classification classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle @@ -71,20 +71,19 @@ sample_weights: parameters: model: gnn_dense activation: elu - layernorm: no + layernorm: yes hidden_dim: 256 bin_size: 32 clip_value_low: 0.0 - num_conv: 1 conv_config: type: MPNNNodeFunction - output_dim: 256 + output_dim: 128 hidden_dim: 128 - num_layers: 2 + num_layers: 3 activation: elu aggregation_direction: dst - num_gsl: 3 - normalize_degrees: yes + num_conv: 1 + num_gsl: 2 distance_dim: 16 dropout: 0.1 separate_momentum: yes From 2c93578f90f1c552c1f30b86511e04e21a45197b Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 20 Aug 2021 13:38:45 +0300 Subject: [PATCH 059/157] up --- mlpf/pipeline.py | 12 ++++----- 
mlpf/tfmodel/model.py | 44 +++++++++++++++++-------------- mlpf/tfmodel/model_setup.py | 6 ++--- parameters/cms-gnn-dense-dev.yaml | 12 ++++----- 4 files changed, 39 insertions(+), 35 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 6cc03090c..ae85b064f 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -248,15 +248,15 @@ def train_reg(config, weights, ntrain, ntest, recreate, prefix): cls_true = tf.argmax(yb["cls"], axis=-1) cls_pred = tf.argmax(res["cls"], axis=-1) msk_x = xb[:, :, 0]!=0 - msk_correct = msk_x & (cls_true==cls_pred) & (cls_true==3) + msk_correct = msk_x & (cls_true==cls_pred) msk_correct_f = tf.expand_dims(tf.cast(msk_correct, tf.float32), axis=-1) - loss_value = tf.keras.losses.mean_squared_error(tf.math.log(yb["energy"]*msk_correct_f + 1.0), tf.math.log(res["energy"]*msk_correct_f + 1.0)) - #loss_value = loss_value + tf.keras.losses.huber(yb["pt"]*msk_correct_f, res["pt"]*msk_correct_f, delta=5.0) - #loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["eta"]*msk_correct_f, res["eta"]*msk_correct_f) - #loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["sin_phi"]*msk_correct_f, res["sin_phi"]*msk_correct_f) - #loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["cos_phi"]*msk_correct_f, res["cos_phi"]*msk_correct_f) + loss_value = tf.keras.losses.huber(yb["energy"]*msk_correct_f, res["energy"]*msk_correct_f, delta=10.0) + loss_value = loss_value + tf.keras.losses.huber(yb["pt"]*msk_correct_f, res["pt"]*msk_correct_f, delta=5.0) + loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["eta"]*msk_correct_f, res["eta"]*msk_correct_f) + loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["sin_phi"]*msk_correct_f, res["sin_phi"]*msk_correct_f) + loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["cos_phi"]*msk_correct_f, res["cos_phi"]*msk_correct_f) loss_value = tf.reduce_mean(loss_value) loss_vals.append(loss_value.numpy()) #import pdb;pdb.set_trace() diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 19a1a7ffc..b18dc62f6 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -446,7 +446,7 @@ def __init__(self, clip_value_low=0.0, distance_dim=128, max_num_bins=200, bin_s self.kernel = kwargs.pop("kernel") if self.kernel == "learnable": - self.ffn_dist = point_wise_feed_forward_network(32, 32, num_layers=2, activation="elu") + self.ffn_node_pair = point_wise_feed_forward_network(32, 32, num_layers=2, activation="elu") elif self.kernel == "gaussian": pass @@ -481,7 +481,7 @@ def call(self, x_dist, x_features, msk): msk_f_binned = tf.gather(msk_f, bins_split, batch_dims=1) if self.kernel == "learnable": - dm = pairwise_learnable_dist(x_dist_binned, x_dist_binned, self.ffn_dist) + dm = pairwise_learnable_dist(x_dist_binned, x_dist_binned, self.ffn_node_pair) dm = tf.keras.activations.elu(dm) elif self.kernel == "gaussian": dm = tf.expand_dims(pairwise_gaussian_dist(x_dist_binned, x_dist_binned), axis=-1) @@ -715,7 +715,7 @@ def __init__(self, *args, **kwargs): if self.do_layernorm: self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6) - self.ffn_dist = point_wise_feed_forward_network(self.distance_dim, self.distance_dim) + self.ffn_dist = point_wise_feed_forward_network(self.distance_dim, 128, num_layers=2, activation="elu") self.dist = GraphBuilderDense(clip_value_low=self.clip_value_low, distance_dim=self.distance_dim, max_num_bins=self.max_num_bins , bin_size=self.bin_size, dist_mult=self.dist_mult, kernel=self.kernel) self.convs = [ 
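# --- editorial note (descriptive comment, not part of the original patch) ---
# The train_reg hunk above widens the fine-tuning objective: previously only a
# log-MSE on the energy of elements with cls_true==3 was used (the other terms
# were commented out); now Huber/MSE terms are summed over every non-padded
# element whose predicted class matches the true class:
#   msk  = (X[..., 0] != 0) & (argmax(y_cls_true) == argmax(y_cls_pred))
#   loss = Huber_10(E) + Huber_5(pT) + MSE(eta) + MSE(sin phi) + MSE(cos phi)
# with each term evaluated on the msk-masked tensors and averaged over the batch.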
get_conv_layer(self.conv_config) for iconv in range(self.num_conv) @@ -731,6 +731,7 @@ def call(self, x, msk, training): if self.do_layernorm: x = self.layernorm(x) + #compute node features for graph building x_dist = self.ffn_dist(x) bins_split, x_binned, dm, msk_binned = self.dist(x_dist, x, msk) for conv in self.convs: @@ -859,9 +860,9 @@ def call(self, inputs, training=False): dec_input_cls.append(enc) dec_input_cls += encs_id - graph_sum = tf.reduce_sum(encs_id[-1], axis=-2)/tf.cast(tf.shape(X)[1], X.dtype) - graph_sum = tf.tile(tf.expand_dims(graph_sum, 1), [1, tf.shape(X)[1], 1]) - dec_input_cls.append(graph_sum) + # graph_sum = tf.reduce_sum(encs_id[-1], axis=-2)/tf.cast(tf.shape(X)[1], X.dtype) + # graph_sum = tf.tile(tf.expand_dims(graph_sum, 1), [1, tf.shape(X)[1], 1]) + # dec_input_cls.append(graph_sum) dec_output_id = tf.concat(dec_input_cls, axis=-1)*msk_input if self.debug: @@ -880,12 +881,12 @@ def call(self, inputs, training=False): if self.skip_connection: dec_input_reg.append(enc) if self.regression_use_classification: - dec_input_reg.append(tf.cast(out_id_logits, X.dtype)) + dec_input_reg.append(tf.cast(tf.stop_gradient(out_id_logits), X.dtype)) dec_input_reg += encs_reg - graph_sum = tf.reduce_sum(encs_reg[-1], axis=-2)/tf.cast(tf.shape(X)[1], X.dtype) - graph_sum = tf.tile(tf.expand_dims(graph_sum, 1), [1, tf.shape(X)[1], 1]) - dec_input_reg.append(graph_sum) + # graph_sum = tf.reduce_sum(encs_reg[-1], axis=-2)/tf.cast(tf.shape(X)[1], X.dtype) + # graph_sum = tf.tile(tf.expand_dims(graph_sum, 1), [1, tf.shape(X)[1], 1]) + # dec_input_reg.append(graph_sum) dec_output_reg = tf.concat(dec_input_reg, axis=-1)*msk_input if self.debug: @@ -893,25 +894,28 @@ def call(self, inputs, training=False): if self.separate_momentum: pred_momentum = [ffn(dec_output_reg) for ffn in self.ffn_momentum] - pred_momentum = tf.concat(pred_momentum, axis=-1)*msk_input + pred_momentum = tf.concat(pred_momentum, axis=-1) else: - pred_momentum = self.ffn_momentum(dec_output_reg)*msk_input + pred_momentum = self.ffn_momentum(dec_output_reg) out_charge = tf.clip_by_value(out_charge, -2, 2) - pred_eta = X[:, :, 2:3]*(1.0 + pred_momentum[:, :, 1:2]/100.0) + pred_eta = X[:, :, 2:3] + pred_momentum[:, :, 1:2]/100.0 + pred_sin_phi = tf.math.sin(X[:, :, 3:4]) + pred_momentum[:, :, 2:3]/100.0 + pred_cos_phi = tf.math.cos(X[:, :, 3:4]) + pred_momentum[:, :, 3:4]/100.0 pred_energy = X[:, :, 4:5]*(1.0 + pred_momentum[:, :, 4:5]) - pred_pt = pred_momentum[:, :, 0:1] * pred_energy + pred_pt = pred_momentum[:, :, 0:1] * tf.stop_gradient(pred_energy / tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8))) ret = { "cls": out_id_softmax, - "charge": out_charge, - "pt": pred_pt, - "eta": pred_eta, - "sin_phi": tf.math.sin(X[:, :, 3:4])*(1.0 + pred_momentum[:, :, 2:3]/100.0), - "cos_phi": tf.math.cos(X[:, :, 3:4])*(1.0 + pred_momentum[:, :, 3:4]/100.0), - "energy": pred_energy, + "charge": out_charge*msk_input, + "pt": pred_pt*msk_input, + "eta": pred_eta*msk_input, + "sin_phi": pred_sin_phi*msk_input, + "cos_phi": pred_cos_phi*msk_input, + "energy": pred_energy*msk_input, } + if self.debug: for k in debugging_data.keys(): ret[k] = debugging_data[k] diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index b7132ffdc..fdcd97309 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -190,7 +190,7 @@ def plot_reg_distribution(self, outpath, ypred, ypred_id, msk, icls, reg_variabl def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False): if icls==0: - sel = 
self.ytrue_id[msk]!=icls + sel = (self.ytrue_id[msk]!=0) & (ypred_id[msk]!=0) else: sel = (ypred_id[msk]==icls) & (self.ytrue_id[msk]==icls) @@ -209,8 +209,8 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False maxval = np.max(vals_true) if not (math.isnan(minval) or math.isnan(maxval) or math.isinf(minval) or math.isinf(maxval)): plt.plot([minval, maxval], [minval, maxval], color="black", ls="--") - plt.xlim(0.9*minval, 1.1*maxval) - plt.ylim(0.9*minval, 1.1*maxval) + plt.xlim(minval, maxval) + plt.ylim(minval, maxval) plt.xlabel("predicted") plt.ylabel("true") diff --git a/parameters/cms-gnn-dense-dev.yaml b/parameters/cms-gnn-dense-dev.yaml index 22f7610fd..5ceb866f8 100644 --- a/parameters/cms-gnn-dense-dev.yaml +++ b/parameters/cms-gnn-dense-dev.yaml @@ -48,14 +48,14 @@ setup: train: yes weights: weights_config: - lr: 5e-4 - batch_size: 10 + lr: 1e-4 + batch_size: 5 num_events_train: 80000 num_events_test: 10000 - num_epochs: 10 + num_epochs: 100 num_val_files: 10 dtype: float32 - trainable: classification + trainable: classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle @@ -83,9 +83,9 @@ parameters: activation: elu aggregation_direction: dst num_conv: 1 - num_gsl: 2 + num_gsl: 3 distance_dim: 16 - dropout: 0.1 + dropout: 0.0 separate_momentum: yes input_encoding: cms graph_kernel: learnable #gaussian, learnable From 47a50d32250c0c4c7e16321d0e6181affcd741e9 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 20 Aug 2021 22:13:40 +0300 Subject: [PATCH 060/157] moomentum layers explicit --- mlpf/pipeline.py | 1 - mlpf/tfmodel/model.py | 103 ++++++++++++++++++++---------- mlpf/tfmodel/model_setup.py | 12 +++- mlpf/tfmodel/utils.py | 2 +- notebooks/pfnet-debug.ipynb | 10 ++- parameters/cms-gnn-dense-dev.yaml | 20 ++++-- parameters/cms-gnn-dense.yaml | 1 - parameters/test-cms-v2.yaml | 1 - 8 files changed, 99 insertions(+), 51 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index ae85b064f..974090b80 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -259,7 +259,6 @@ def train_reg(config, weights, ntrain, ntest, recreate, prefix): loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["cos_phi"]*msk_correct_f, res["cos_phi"]*msk_correct_f) loss_value = tf.reduce_mean(loss_value) loss_vals.append(loss_value.numpy()) - #import pdb;pdb.set_trace() grads = tape.gradient(loss_value, model.trainable_weights) opt.apply_gradients(zip(grads, model.trainable_weights)) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index b18dc62f6..d856de575 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -5,8 +5,6 @@ import tensorflow as tf -from .fast_attention import Attention, SelfAttention - import numpy as np from numpy.lib.recfunctions import append_fields @@ -268,7 +266,7 @@ def call(self, inputs): avg_message = tf.reduce_mean(adj, axis=self.agg_dim) max_message = tf.reduce_max(adj, axis=self.agg_dim) x2 = tf.concat([x, avg_message, max_message], axis=-1)*msk - return self.ffn(x2) + return self.activation(self.ffn(x2)) def point_wise_feed_forward_network(d_model, dff, num_layers=1, activation='elu', dtype=tf.dtypes.float32, name=None, dim_decrease=False): bias_regularizer = tf.keras.regularizers.L1(regularizer_weight) @@ -814,15 +812,22 @@ def __init__(self, self.ffn_id = point_wise_feed_forward_network(num_output_classes, dff, name="ffn_cls", dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True) self.ffn_charge = 
point_wise_feed_forward_network(1, dff, name="ffn_charge", dtype=tf.dtypes.float32, num_layers=2, activation=activation, dim_decrease=True) - if self.separate_momentum: - self.ffn_momentum = [ - point_wise_feed_forward_network( - 1, dff, name="ffn_momentum{}".format(imomentum), - dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True - ) for imomentum in range(num_momentum_outputs) - ] - else: - self.ffn_momentum = point_wise_feed_forward_network(num_momentum_outputs, dff, name="ffn_momentum", dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True) + self.ffn_pt = point_wise_feed_forward_network( + 2, dff, name="ffn_pt", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True + ) + self.ffn_eta = point_wise_feed_forward_network( + 2, dff, name="ffn_eta", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True + ) + self.ffn_phi = point_wise_feed_forward_network( + 4, dff, name="ffn_phi", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True + ) + self.ffn_energy = point_wise_feed_forward_network( + 2, dff, name="ffn_energy", + dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True + ) def call(self, inputs, training=False): X = inputs @@ -860,10 +865,6 @@ def call(self, inputs, training=False): dec_input_cls.append(enc) dec_input_cls += encs_id - # graph_sum = tf.reduce_sum(encs_id[-1], axis=-2)/tf.cast(tf.shape(X)[1], X.dtype) - # graph_sum = tf.tile(tf.expand_dims(graph_sum, 1), [1, tf.shape(X)[1], 1]) - # dec_input_cls.append(graph_sum) - dec_output_id = tf.concat(dec_input_cls, axis=-1)*msk_input if self.debug: debugging_data["dec_output_id"] = dec_output_id @@ -881,30 +882,37 @@ def call(self, inputs, training=False): if self.skip_connection: dec_input_reg.append(enc) if self.regression_use_classification: - dec_input_reg.append(tf.cast(tf.stop_gradient(out_id_logits), X.dtype)) + dec_input_reg.append(tf.cast(out_id_logits, X.dtype)) dec_input_reg += encs_reg - # graph_sum = tf.reduce_sum(encs_reg[-1], axis=-2)/tf.cast(tf.shape(X)[1], X.dtype) - # graph_sum = tf.tile(tf.expand_dims(graph_sum, 1), [1, tf.shape(X)[1], 1]) - # dec_input_reg.append(graph_sum) - dec_output_reg = tf.concat(dec_input_reg, axis=-1)*msk_input if self.debug: debugging_data["dec_output_reg"] = dec_output_reg - if self.separate_momentum: - pred_momentum = [ffn(dec_output_reg) for ffn in self.ffn_momentum] - pred_momentum = tf.concat(pred_momentum, axis=-1) - else: - pred_momentum = self.ffn_momentum(dec_output_reg) - out_charge = tf.clip_by_value(out_charge, -2, 2) - pred_eta = X[:, :, 2:3] + pred_momentum[:, :, 1:2]/100.0 - pred_sin_phi = tf.math.sin(X[:, :, 3:4]) + pred_momentum[:, :, 2:3]/100.0 - pred_cos_phi = tf.math.cos(X[:, :, 3:4]) + pred_momentum[:, :, 3:4]/100.0 - pred_energy = X[:, :, 4:5]*(1.0 + pred_momentum[:, :, 4:5]) - pred_pt = pred_momentum[:, :, 0:1] * tf.stop_gradient(pred_energy / tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8))) + orig_pt = X[:, :, 1:2] + orig_eta = X[:, :, 2:3] + orig_sin_phi = tf.math.sin(X[:, :, 3:4]) + orig_cos_phi = tf.math.cos(X[:, :, 3:4]) + orig_energy = X[:, :, 4:5] + + pred_eta_corr = self.ffn_eta(dec_output_reg) + pred_phi_corr = self.ffn_phi(dec_output_reg) + pred_energy_corr = self.ffn_energy(dec_output_reg) + pred_pt_corr = self.ffn_pt(dec_output_reg) + + eta_sigmoid = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) + sin_phi_sigmoid = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 0:1]) + 
cos_phi_sigmoid = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 2:3]) + energy_sigmoid = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) + pt_sigmoid = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) + + pred_eta = orig_eta*eta_sigmoid + (1.0 - eta_sigmoid)*pred_eta_corr[:, :, 1:2] + pred_sin_phi = orig_sin_phi*sin_phi_sigmoid + (1.0 - sin_phi_sigmoid)*pred_phi_corr[:, :, 1:2] + pred_cos_phi = orig_cos_phi*cos_phi_sigmoid + (1.0 - cos_phi_sigmoid)*pred_phi_corr[:, :, 3:4] + pred_energy = orig_energy*energy_sigmoid + tf.exp(tf.clip_by_value((1.0 - energy_sigmoid)*pred_energy_corr[:, :, 1:2], -8, 8)) + pred_pt = orig_pt*pt_sigmoid + tf.exp(tf.clip_by_value((1.0 - pt_sigmoid)*pred_pt_corr[:, :, 1:2], -8, 8)) ret = { "cls": out_id_softmax, @@ -955,6 +963,37 @@ def set_trainable_named(self, layer_names): for layer in layer_names: self.get_layer(layer).trainable = True + # def train_step(self, data): + # # Unpack the data. Its structure depends on your model and + # # on what you pass to `fit()`. + # x, y, sample_weights = data + + # with tf.GradientTape() as tape: + # y_pred = self(x, training=True) # Forward pass + # # Compute the loss value + # # (the loss function is configured in `compile()`) + # loss = self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) + + # ya = {k: v.numpy() for k, v in y.items()} + # yb = {k: v.numpy() for k, v in y_pred.items()} + # sw = {k: v.numpy() for k, v in sample_weights.items()} + + # np.savez("ytrue.npz", **ya) + # np.savez("ypred.npz", **yb) + # np.savez("x.npz", x=x) + # np.savez("sample_weights.npz", **sample_weights) + + # # Compute gradients + # trainable_vars = self.trainable_variables + # gradients = tape.gradient(loss, trainable_vars) + # # Update weights + # self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + # # Update metrics (includes the metric that tracks the loss) + # self.compiled_metrics.update_state(y, y_pred) + # # Return a dict mapping metric names to current value + # return {m.name: m.result() for m in self.metrics} + + class DummyNet(tf.keras.Model): def __init__(self, num_input_classes=8, diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index fdcd97309..e1a9434a9 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -197,13 +197,20 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False vals_pred = ypred[reg_variable][msk][sel].flatten() vals_true = self.ytrue[reg_variable][msk][sel].flatten() + #manually as in configuration, later can propagate + delta = 0.1 + if reg_variable == "energy" or reg_variable == "pt": + delta = 1.0 + hub = tf.keras.losses.Huber(delta=delta, reduction=tf.keras.losses.Reduction.NONE) + hub_loss = hub(np.expand_dims(vals_true, -1), np.expand_dims(vals_pred, axis=-1)).numpy() + s = "" if log: vals_pred = np.log(vals_pred) vals_true = np.log(vals_true) s = "_log" - plt.scatter(vals_pred, vals_true, marker=".", alpha=0.8) + plt.scatter(vals_pred, vals_true, marker=".", alpha=0.8, s=(1.0+hub_loss)) if len(vals_true) > 0: minval = np.min(vals_true) maxval = np.max(vals_true) @@ -214,7 +221,7 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False plt.xlabel("predicted") plt.ylabel("true") - plt.title(reg_variable) + plt.title("{}, HL={:.4f}".format(reg_variable, np.sum(hub_loss))) plt.savefig(str(outpath / "{}_cls{}_corr{}.png".format(reg_variable, icls, s)), bbox_inches="tight") plt.close("all") @@ -361,7 +368,6 @@ def make_gnn_dense(config, dtype): 
"num_gsl", "distance_dim", "dropout", - "separate_momentum", "input_encoding", "graph_kernel", "skip_connection", diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index fd19e2f67..946f72099 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -152,7 +152,7 @@ def compute_weights_none(X, y, w): def make_weight_function(config): def weight_func(X,y,w): - w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0) + w_signal_only = tf.where(y[:, 0]==0, 0.0, tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)) w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32) w_none = tf.ones_like(w) diff --git a/notebooks/pfnet-debug.ipynb b/notebooks/pfnet-debug.ipynb index 730f149c3..41d0a9f19 100644 --- a/notebooks/pfnet-debug.ipynb +++ b/notebooks/pfnet-debug.ipynb @@ -101,7 +101,7 @@ "metadata": {}, "outputs": [], "source": [ - "vals_a = np.sin(X_val[:, :, 3].flatten())" + "vals_a = X_val[:, :, 4].flatten()" ] }, { @@ -110,7 +110,7 @@ "metadata": {}, "outputs": [], "source": [ - "vals_b = ycand_val[\"sin_phi\"][:, :, 0].flatten()" + "vals_b = ycand_val[\"energy\"][:, :, 0].flatten()" ] }, { @@ -119,10 +119,8 @@ "metadata": {}, "outputs": [], "source": [ - "msk =np.argmax(ycand_val[\"cls\"], axis=-1).flatten()==5\n", - "plt.scatter(vals_a[msk], vals_b[msk], marker=\".\", alpha=0.2)\n", - "plt.xlim(-1,1)\n", - "plt.ylim(-1,1)" + "msk =np.argmax(ycand_val[\"cls\"], axis=-1).flatten()==4\n", + "plt.scatter(vals_a[msk], vals_b[msk], marker=\".\", alpha=0.2)" ] }, { diff --git a/parameters/cms-gnn-dense-dev.yaml b/parameters/cms-gnn-dense-dev.yaml index 5ceb866f8..472d0e1fb 100644 --- a/parameters/cms-gnn-dense-dev.yaml +++ b/parameters/cms-gnn-dense-dev.yaml @@ -29,17 +29,26 @@ dataset: eta_loss_coef: 100.0 sin_phi_loss_coef: 100.0 cos_phi_loss_coef: 100.0 - energy_loss_coef: 0.1 + energy_loss_coef: 1.0 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: type: Huber - delta: 10.0 + delta: 1.0 pt_loss: type: Huber - delta: 10.0 + delta: 1.0 + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 tensorflow: eager: no @@ -71,13 +80,13 @@ sample_weights: parameters: model: gnn_dense activation: elu - layernorm: yes + layernorm: no hidden_dim: 256 bin_size: 32 clip_value_low: 0.0 conv_config: type: MPNNNodeFunction - output_dim: 128 + output_dim: 256 hidden_dim: 128 num_layers: 3 activation: elu @@ -86,7 +95,6 @@ parameters: num_gsl: 3 distance_dim: 16 dropout: 0.0 - separate_momentum: yes input_encoding: cms graph_kernel: learnable #gaussian, learnable skip_connection: yes diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index 4b6b423a6..0649b4cfd 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -79,7 +79,6 @@ parameters: normalize_degrees: yes distance_dim: 128 dropout: 0.2 - separate_momentum: yes input_encoding: cms graph_kernel: gaussian #gaussian, learnable skip_connection: yes diff --git a/parameters/test-cms-v2.yaml b/parameters/test-cms-v2.yaml index 8bfb995a5..bf0c6f590 100644 --- a/parameters/test-cms-v2.yaml +++ b/parameters/test-cms-v2.yaml @@ -65,7 +65,6 @@ parameters: num_gsl: 1 distance_dim: 128 dropout: 0.0 - separate_momentum: yes input_encoding: cms graph_kernel: gaussian #gaussian, learnable skip_connection: yes From b7e9cae19c7c3eff297f062eaa4f083f568114d1 Mon Sep 17 00:00:00 2001 
From: Joosep Pata Date: Sat, 21 Aug 2021 16:12:44 +0300 Subject: [PATCH 061/157] big cleanup --- .github/workflows/test.yml | 26 -- mlpf/launcher.py | 57 --- mlpf/pipeline.py | 115 ++--- mlpf/tfmodel/data.py | 18 + mlpf/tfmodel/model.py | 412 ++++++++++-------- mlpf/tfmodel/model_setup.py | 320 +------------- mlpf/tfmodel/utils.py | 1 + .../{cms-gnn-dense-dev.yaml => cms-dev.yaml} | 28 +- parameters/cms-gnn-dense-focal.yaml | 91 ---- parameters/cms-gnn-dense-onecycle.yaml | 97 ----- parameters/cms-gnn-dense-transfer.yaml | 76 ---- parameters/cms-gnn-skipconn-v2.yaml | 81 ---- parameters/cms-gnn-skipconn.yaml | 81 ---- parameters/cms-transformer-skipconn-gun.yaml | 74 ---- parameters/cms-transformer-skipconn.yaml | 72 --- parameters/{cms-gnn-dense.yaml => cms.yaml} | 50 ++- parameters/delphes-gnn-skipconn-onecycle.yaml | 92 ---- parameters/delphes-gnn-skipconn.yaml | 79 ---- parameters/delphes-transformer-skipconn.yaml | 65 --- parameters/delphes.yaml | 102 +++++ parameters/test-cms-v2.yaml | 81 ---- parameters/test-cms.yaml | 38 +- parameters/test-delphes.yaml | 49 ++- scripts/local_test_cms_pipeline.sh | 8 +- scripts/local_test_delphes_pipeline.sh | 2 +- 25 files changed, 513 insertions(+), 1602 deletions(-) delete mode 100644 mlpf/launcher.py rename parameters/{cms-gnn-dense-dev.yaml => cms-dev.yaml} (88%) delete mode 100644 parameters/cms-gnn-dense-focal.yaml delete mode 100644 parameters/cms-gnn-dense-onecycle.yaml delete mode 100644 parameters/cms-gnn-dense-transfer.yaml delete mode 100644 parameters/cms-gnn-skipconn-v2.yaml delete mode 100644 parameters/cms-gnn-skipconn.yaml delete mode 100644 parameters/cms-transformer-skipconn-gun.yaml delete mode 100644 parameters/cms-transformer-skipconn.yaml rename parameters/{cms-gnn-dense.yaml => cms.yaml} (74%) delete mode 100644 parameters/delphes-gnn-skipconn-onecycle.yaml delete mode 100644 parameters/delphes-gnn-skipconn.yaml delete mode 100644 parameters/delphes-transformer-skipconn.yaml create mode 100644 parameters/delphes.yaml delete mode 100644 parameters/test-cms-v2.yaml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e870ef5d2..ed45c924c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,32 +9,6 @@ on: workflow_dispatch: jobs: - delphes-tf: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Install python deps - run: | - sudo apt install python3 python3-pip wget - sudo python3 -m pip install --upgrade pip - sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm - - name: Run delphes TF model - run: ./scripts/local_test_delphes_tf.sh - - cms-tf: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Install python deps - run: | - sudo apt install python3 python3-pip wget - sudo python3 -m pip install --upgrade pip - sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm - - name: Run CMS TF model - run: ./scripts/local_test_cms_tf.sh - delphes-pipeline: runs-on: ubuntu-latest steps: diff --git a/mlpf/launcher.py b/mlpf/launcher.py deleted file mode 100644 index 4f2eb0674..000000000 --- a/mlpf/launcher.py +++ /dev/null @@ -1,57 +0,0 @@ 
-import yaml -import tfmodel -import tfmodel.model_setup - -def load_config(yaml_path): - with open(yaml_path) as f: - config = yaml.load(f) - return config - -def parse_args(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--model-spec", type=str, default="parameters/delphes-gnn-skipconn.yaml", help="the model specification") - parser.add_argument("--action", type=str, choices=["data", "train", "eval", "time"], help="Run training, validation or timing", default="train") - parser.add_argument("--modifier", type=str, choices=["retrain_energy", None], help="Apply a modification on the standard training", default=None) - parser.add_argument("--weights", type=str, help="weight file to load", default=None) - parser.add_argument("--ntrain", type=int, help="override the number of training events", default=None) - parser.add_argument("--ntest", type=int, help="override the number of testing events", default=None) - parser.add_argument("--recreate", action="store_true", help="recreate a new output dir", default=None) - parser.add_argument("--raw-path", type=str, help="Override the dataset raw files path", default=None) - parser.add_argument("--processed-path", type=str, help="Override the dataset processed files path", default=None) - args = parser.parse_args() - return args - -def apply_modifier_retrain_energy(config): - assert(config["parameters"]["model"] == "gnn_dense") - config["setup"]["trainable"] = "ffn_momentum4" - for loss in [ - "classification_loss_coef", - "charge_loss_coef", - "pt_loss_coef", - "eta_loss_coef", - "sin_phi_loss_coef", - "cos_phi_loss_coef"]: - config["dataset"][loss] = 0.0 - - config["dataset"]["energy_loss_coef"] = 1.0 - config["setup"]["batch_size"] = 20 - return config - -modifiers = { - "retrain_energy": apply_modifier_retrain_energy -} - -if __name__ == "__main__": - args = parse_args() - yaml_path = args.model_spec - - config = load_config(yaml_path) - - if args.modifier: - config = modifiers[args.modifier](config) - - if config["backend"] == "tensorflow": - tfmodel.model_setup.main(args, yaml_path, config) - elif config["backend"] == "pytorch": - pass diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 974090b80..988b592ca 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -57,6 +57,30 @@ def main(): pass +@main.command() +@click.help_option("-h", "--help") +@click.option("-c", "--config", help="configuration file", type=click.Path()) +def data(config): + + config, _, _, _, _, _, _ = parse_config(config) + cds = config["dataset"] + + dataset_def = Dataset( + num_input_features=int(cds["num_input_features"]), + num_output_features=int(cds["num_output_features"]), + padded_num_elem_size=int(cds["padded_num_elem_size"]), + raw_path=cds.get("raw_path"), + raw_files=cds.get("raw_files", None), + processed_path=cds.get("processed_path"), + validation_file_path=cds["validation_file_path"], + schema=cds["schema"] + ) + + dataset_def.process( + config["dataset"]["num_files_per_chunk"] + ) + + @main.command() @click.help_option("-h", "--help") @click.option("-c", "--config", help="configuration file", type=click.Path()) @@ -65,7 +89,8 @@ def main(): @click.option("--ntest", default=None, help="override the number of testing events", type=int) @click.option("-r", "--recreate", help="force creation of new experiment dir", is_flag=True) @click.option("-p", "--prefix", default="", help="prefix to put at beginning of training dir name", type=str) -def train(config, weights, ntrain, ntest, recreate, prefix): +@click.option("--plot-freq", 
default=1, help="Plot detailed validation every N epochs", type=str) +def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq): """Train a model defined by config""" config_file_path = config config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = parse_config( @@ -148,6 +173,8 @@ def train(config, weights, ntrain, ntest, recreate, prefix): ycand_val, dataset_transform, config["dataset"]["num_output_classes"], + dataset_def, + plot_freq ) callbacks.append(optim_callbacks) @@ -180,92 +207,6 @@ def train(config, weights, ntrain, ntest, recreate, prefix): freeze_model(model, config, outdir) -@main.command() -@click.help_option("-h", "--help") -@click.option("-c", "--config", help="configuration file", type=click.Path()) -@click.option("-w", "--weights", default=None, help="trained weights to load", type=click.Path()) -@click.option("--ntrain", default=None, help="override the number of training events", type=int) -@click.option("--ntest", default=None, help="override the number of testing events", type=int) -@click.option("-r", "--recreate", help="force creation of new experiment dir", is_flag=True) -@click.option("-p", "--prefix", default="", help="prefix to put at beginning of training dir name", type=str) -def train_reg(config, weights, ntrain, ntest, recreate, prefix): - config_file_path = config - config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = parse_config( - config, ntrain, ntest, weights - ) - - if recreate or (weights is None): - outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node()) - else: - outdir = str(Path(weights).parent) - shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference - - total_steps = n_epochs * n_train // global_batch_size - lr = float(config["setup"]["lr"]) - - dataset_def = get_dataset_def(config) - ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test, repeat=False) - X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=False) - - opt = tf.keras.optimizers.Adam(learning_rate=lr) - - if config["setup"]["dtype"] == "float16": - model_dtype = tf.dtypes.float16 - policy = mixed_precision.Policy("mixed_float16") - mixed_precision.set_global_policy(policy) - opt = mixed_precision.LossScaleOptimizer(opt) - else: - model_dtype = tf.dtypes.float32 - - model = make_model(config, model_dtype) - model(tf.cast(X_val[:1], model_dtype)) - - initial_epoch = 0 - if weights: - # We need to load the weights in the same trainable configuration as the model was set up - configure_model_weights(model, config["setup"].get("weights_config", "all")) - model.load_weights(weights, by_name=True) - initial_epoch = int(weights.split("/")[-1].split("-")[1]) - model(tf.cast(X_val[:1], model_dtype)) - - callbacks = prepare_callbacks( - model, - outdir, - X_val, - ycand_val, - dataset_transform, - config["dataset"]["num_output_classes"], - ) - - configure_model_weights(model, "regression") - for epoch in range(initial_epoch, initial_epoch+n_epochs): - - loss_vals = [] - for step, (xb, yb, wb) in tqdm(enumerate(ds_train_r)): - with tf.GradientTape() as tape: - res = model(xb, training=True) - - cls_true = tf.argmax(yb["cls"], axis=-1) - cls_pred = tf.argmax(res["cls"], axis=-1) - msk_x = xb[:, :, 0]!=0 - msk_correct = msk_x & (cls_true==cls_pred) - - msk_correct_f = tf.expand_dims(tf.cast(msk_correct, tf.float32), axis=-1) - - loss_value = 
tf.keras.losses.huber(yb["energy"]*msk_correct_f, res["energy"]*msk_correct_f, delta=10.0) - loss_value = loss_value + tf.keras.losses.huber(yb["pt"]*msk_correct_f, res["pt"]*msk_correct_f, delta=5.0) - loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["eta"]*msk_correct_f, res["eta"]*msk_correct_f) - loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["sin_phi"]*msk_correct_f, res["sin_phi"]*msk_correct_f) - loss_value = loss_value + tf.keras.losses.mean_squared_error(yb["cos_phi"]*msk_correct_f, res["cos_phi"]*msk_correct_f) - loss_value = tf.reduce_mean(loss_value) - loss_vals.append(loss_value.numpy()) - - grads = tape.gradient(loss_value, model.trainable_weights) - opt.apply_gradients(zip(grads, model.trainable_weights)) - - print(epoch, np.mean(loss_vals)) - callbacks[-1].on_epoch_end(epoch, logs={}) - @main.command() @click.help_option("-h", "--help") @click.option("-t", "--train_dir", required=True, help="directory containing a completed training", type=click.Path()) diff --git a/mlpf/tfmodel/data.py b/mlpf/tfmodel/data.py index 9cdb56dbc..d89786b8b 100644 --- a/mlpf/tfmodel/data.py +++ b/mlpf/tfmodel/data.py @@ -51,10 +51,14 @@ def __init__(self, **kwargs): print("val files: {}".format(len(self.val_filelist))) self.schema = kwargs.get("schema") + + #FIXME: refactor this if self.schema == "delphes": self.prepare_data = self.prepare_data_delphes + self.get_X_eta_phi_energy = self.get_X_eta_phi_energy_delphes elif self.schema == "cms": self.prepare_data = self.prepare_data_cms + self.get_X_eta_phi_energy = self.get_X_eta_phi_energy_cms # NONE = 0, # TRACK = 1, @@ -259,3 +263,17 @@ def process(self, num_files_per_tfr): for ichunk, files in enumerate(chunks(self.raw_filelist, num_files_per_tfr)): print(files) self.serialize_chunk(processed_path, files, ichunk) + + #FIXME: schema + def get_X_eta_phi_energy_delphes(self, X): + eta = X[:, :, 2] + sphi = X[:, :, 3] + cphi = X[:, :, 4] + energy = X[:, :, 5] + return eta, np.arctan2(sphi, cphi), energy + + def get_X_eta_phi_energy_cms(self, X): + eta = X[:, :, 2] + phi = X[:, :, 3] + energy = X[:, :, 4] + return eta, phi, energy \ No newline at end of file diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index d856de575..daaf181cf 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -244,7 +244,7 @@ def call(self, inputs): out = gate*f_hom + (1.0-gate)*f_het return self.activation(out)*msk -class MPNNNodeFunction(tf.keras.layers.Layer): +class NodeMessageLearnable(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): self.output_dim = kwargs.pop("output_dim") @@ -258,8 +258,8 @@ def __init__(self, *args, **kwargs): elif self.aggregation_direction == "src": self.agg_dim = -3 - self.ffn = point_wise_feed_forward_network(self.output_dim, self.hidden_dim, num_layers=self.num_layers, activation=self.activation) - super(MPNNNodeFunction, self).__init__(*args, **kwargs) + self.ffn = point_wise_feed_forward_network(self.output_dim, self.hidden_dim, num_layers=self.num_layers, activation=self.activation, name=kwargs.get("name")+"_ffn") + super(NodeMessageLearnable, self).__init__(*args, **kwargs) def call(self, inputs): x, adj, msk = inputs @@ -268,31 +268,37 @@ def call(self, inputs): x2 = tf.concat([x, avg_message, max_message], axis=-1)*msk return self.activation(self.ffn(x2)) -def point_wise_feed_forward_network(d_model, dff, num_layers=1, activation='elu', dtype=tf.dtypes.float32, name=None, dim_decrease=False): +def point_wise_feed_forward_network(d_model, dff, name, num_layers=1, 
activation='elu', dtype=tf.dtypes.float32, dim_decrease=False, dropout=0.0): bias_regularizer = tf.keras.regularizers.L1(regularizer_weight) kernel_regularizer = tf.keras.regularizers.L1(regularizer_weight) layers = [] for ilayer in range(num_layers): + _name = name + "_dense_{}".format(ilayer) + layers.append(tf.keras.layers.Dense( dff, activation=activation, bias_regularizer=bias_regularizer, - kernel_regularizer=kernel_regularizer)) + kernel_regularizer=kernel_regularizer, name=_name)) + + if dropout>0.0: + layers.append(tf.keras.layers.Dropout(dropout)) + if dim_decrease: dff = dff // 2 - layers.append(tf.keras.layers.Dense(d_model, dtype=dtype)) + layers.append(tf.keras.layers.Dense(d_model, dtype=dtype, name="{}_dense_{}".format(name, ilayer+1))) return tf.keras.Sequential(layers, name=name) -def get_conv_layer(config_dict): +def get_message_layer(config_dict, name): config_dict = config_dict.copy() class_name = config_dict.pop("type") classes = { - "MPNNNodeFunction": MPNNNodeFunction, + "NodeMessageLearnable": NodeMessageLearnable, "GHConvDense": GHConvDense } conv_cls = classes[class_name] - return conv_cls(**config_dict) + return conv_cls(name=name, **config_dict) class SparseHashedNNDistance(tf.keras.layers.Layer): @@ -309,8 +315,8 @@ def __init__(self, distance_dim=128, max_num_bins=200, bin_size=500, num_neighbo #each bin will receive this many input elements, in total we can accept max_num_bins*bin_size input elements #in each bin, we will do a dense top_k evaluation self.bin_size = bin_size - self.layer_encoding = point_wise_feed_forward_network(distance_dim, 128) - self.layer_edge = point_wise_feed_forward_network(1, 128) + self.layer_encoding = point_wise_feed_forward_network(distance_dim, 128, "enc") + self.layer_edge = point_wise_feed_forward_network(1, 128, "edge") def build(self, input_shape): #(n_batch, n_points, n_features) @@ -432,24 +438,69 @@ def construct_sparse_dm_batch(self, points): return bins_split, sparse_distance_matrix - -class GraphBuilderDense(tf.keras.layers.Layer): - def __init__(self, clip_value_low=0.0, distance_dim=128, max_num_bins=200, bin_size=128, dist_mult=0.1, **kwargs): - self.dist_mult = dist_mult - self.distance_dim = distance_dim - self.max_num_bins = max_num_bins - self.bin_size = bin_size +class NodePairGaussianKernel(tf.keras.layers.Layer): + def __init__(self, clip_value_low=0.0, dist_mult=0.1, **kwargs): self.clip_value_low = clip_value_low + self.dist_mult = dist_mult + super(NodePairGaussianKernel, self).__init__(**kwargs) - self.kernel = kwargs.pop("kernel") + """ + x_msg_binned: (n_batch, n_bins, n_points, n_msg_features) + + returns: (n_batch, n_bins, n_points, n_points, 1) message matrix + """ + def call(self, x_msg_binned): + dm = tf.expand_dims(pairwise_gaussian_dist(x_msg_binned, x_msg_binned), axis=-1) + dm = tf.exp(-self.dist_mult*dm) + dm = tf.clip_by_value(dm, self.clip_value_low, 1) + return dm + +class NodePairTrainableKernel(tf.keras.layers.Layer): + def __init__(self, output_dim=32, hidden_dim=32, num_layers=2, activation="elu", **kwargs): + self.output_dim = output_dim + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.activation = getattr(tf.keras.activations, activation) + + self.ffn_kernel = point_wise_feed_forward_network( + self.output_dim, + self.hidden_dim, + kwargs.get("name") + "_" + "ffn", + num_layers=self.num_layers, + activation=self.activation + ) + + super(NodePairTrainableKernel, self).__init__(**kwargs) + + """ + x_msg_binned: (n_batch, n_bins, n_points, n_msg_features) - if 
self.kernel == "learnable": - self.ffn_node_pair = point_wise_feed_forward_network(32, 32, num_layers=2, activation="elu") - elif self.kernel == "gaussian": - pass + returns: (n_batch, n_bins, n_points, n_points, output_dim) message matrix + """ + def call(self, x_msg_binned): + dm = pairwise_learnable_dist(x_msg_binned, x_msg_binned, self.ffn_kernel) + dm = self.activation(dm) + return dm + +def build_kernel_from_conf(kernel_dict, name): + kernel_dict = kernel_dict.copy() + + cls_type = kernel_dict.pop("type") + clss = { + "NodePairGaussianKernel": NodePairGaussianKernel, + "NodePairTrainableKernel": NodePairTrainableKernel + } - super(GraphBuilderDense, self).__init__(**kwargs) + return clss[cls_type](name=name, **kernel_dict) +class MessageBuildingLayerLSH(tf.keras.layers.Layer): + def __init__(self, distance_dim=128, max_num_bins=200, bin_size=128, kernel=NodePairGaussianKernel(), **kwargs): + self.distance_dim = distance_dim + self.max_num_bins = max_num_bins + self.bin_size = bin_size + self.kernel = kernel + + super(MessageBuildingLayerLSH, self).__init__(**kwargs) def build(self, input_shape): #(n_batch, n_points, n_features) @@ -459,34 +510,35 @@ def build(self, input_shape): shape=(self.distance_dim, self.max_num_bins//2), initializer="random_normal", trainable=False, name="lsh_projections" ) - - def call(self, x_dist, x_features, msk): - msk_f = tf.expand_dims(tf.cast(msk, x_dist.dtype), -1) - n_batches = tf.shape(x_dist)[0] - n_points = tf.shape(x_dist)[1] - n_features = tf.shape(x_dist)[2] + + """ + x_msg: (n_batch, n_points, n_msg_features) + x_node: (n_batch, n_points, n_node_features) + """ + def call(self, x_msg, x_node, msk): + msk_f = tf.expand_dims(tf.cast(msk, x_msg.dtype), -1) + + shp = tf.shape(x_msg) + n_batches = shp[0] + n_points = shp[1] + n_message_features = shp[2] #compute the number of LSH bins to divide the input points into on the fly #n_points must be divisible by bin_size exactly due to the use of reshape n_bins = tf.math.floordiv(n_points, self.bin_size) #put each input item into a bin defined by the argmax output across the LSH embedding - mul = tf.linalg.matmul(x_dist, self.codebook_random_rotations[:, :n_bins//2]) + mul = tf.linalg.matmul(x_msg, self.codebook_random_rotations[:, :n_bins//2]) cmul = tf.concat([mul, -mul], axis=-1) bins_split = split_indices_to_bins_batch(cmul, n_bins, self.bin_size, msk) - x_dist_binned = tf.gather(x_dist, bins_split, batch_dims=1) - x_features_binned = tf.gather(x_features, bins_split, batch_dims=1) + x_msg_binned = tf.gather(x_msg, bins_split, batch_dims=1) + x_features_binned = tf.gather(x_node, bins_split, batch_dims=1) msk_f_binned = tf.gather(msk_f, bins_split, batch_dims=1) - if self.kernel == "learnable": - dm = pairwise_learnable_dist(x_dist_binned, x_dist_binned, self.ffn_node_pair) - dm = tf.keras.activations.elu(dm) - elif self.kernel == "gaussian": - dm = tf.expand_dims(pairwise_gaussian_dist(x_dist_binned, x_dist_binned), axis=-1) - dm = tf.exp(-self.dist_mult*dm) - dm = tf.clip_by_value(dm, self.clip_value_low, 1) + #Run the node-to-node kernel (distance computation / graph building / attention) + dm = self.kernel(x_msg_binned) - #multiply the distance matrix row-wise and column-wise by the mask + #remove the masked points row-wise and column-wise dm = tf.einsum("abijk,abi->abijk", dm, tf.squeeze(msk_f_binned, axis=-1)) dm = tf.einsum("abijk,abj->abijk", dm, tf.squeeze(msk_f_binned, axis=-1)) @@ -539,17 +591,108 @@ def call(self, inputs, distance_matrix, training=True): return x -class 
AddSparse(tf.keras.layers.Layer): - def __init__(self, **kwargs): - super(AddSparse, self).__init__(**kwargs) - def call(self, matrices): - ret = matrices[0] - for mat in matrices[1:]: - ret = tf.sparse.add(ret, mat) +class OutputDecoding(tf.keras.layers.Layer): + def __init__(self, activation, hidden_dim, regression_use_classification, num_output_classes, schema, **kwargs): + super(OutputDecoding, self).__init__(**kwargs) + + self.regression_use_classification = regression_use_classification + self.schema = schema + + self.ffn_id = point_wise_feed_forward_network( + num_output_classes, hidden_dim, + "ffn_cls", + dtype=tf.dtypes.float32, + num_layers=4, + activation=activation, + dim_decrease=True + ) + self.ffn_charge = point_wise_feed_forward_network( + 1, hidden_dim, + "ffn_charge", + dtype=tf.dtypes.float32, + num_layers=2, + activation=activation, + dim_decrease=True + ) + + self.ffn_pt = point_wise_feed_forward_network( + 2, hidden_dim, "ffn_pt", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True + ) + self.ffn_eta = point_wise_feed_forward_network( + 2, hidden_dim, "ffn_eta", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True + ) + self.ffn_phi = point_wise_feed_forward_network( + 4, hidden_dim, "ffn_phi", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True + ) + self.ffn_energy = point_wise_feed_forward_network( + 2, hidden_dim, "ffn_energy", + dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True + ) + + """ + X_input: (n_batch, n_elements, n_input_features) + X_encoded_id: (n_batch, n_elements, n_encoded_features) + X_encoded_reg: (n_batch, n_elements, n_encoded_features) + msk_input: (n_batch, n_elements) boolean mask + """ + def call(self, X_input, X_encoded_id, X_encoded_reg, msk_input): + + out_id_logits = self.ffn_id(X_encoded_id)*msk_input + out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) + out_charge = self.ffn_charge(X_encoded_id)*msk_input + + orig_pt = X_input[:, :, 1:2] + orig_eta = X_input[:, :, 2:3] + + #FIXME: schema + if self.schema == "cms": + orig_sin_phi = tf.math.sin(X_input[:, :, 3:4]) + orig_cos_phi = tf.math.cos(X_input[:, :, 3:4]) + orig_energy = X_input[:, :, 4:5] + elif self.schema == "delphes": + orig_sin_phi = X_input[:, :, 3:4] + orig_cos_phi = X_input[:, :, 4:5] + orig_energy = X_input[:, :, 5:6] + + if self.regression_use_classification: + X_encoded_reg = tf.concat([X_encoded_reg, out_id_logits], axis=-1) + + pred_eta_corr = self.ffn_eta(X_encoded_reg) + pred_phi_corr = self.ffn_phi(X_encoded_reg) + pred_energy_corr = self.ffn_energy(X_encoded_reg) + pred_pt_corr = self.ffn_pt(X_encoded_reg) + + eta_sigmoid = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) + sin_phi_sigmoid = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 0:1]) + cos_phi_sigmoid = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 2:3]) + energy_sigmoid = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) + pt_sigmoid = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) + + pred_eta = orig_eta*eta_sigmoid + (1.0 - eta_sigmoid)*pred_eta_corr[:, :, 1:2] + pred_sin_phi = orig_sin_phi*sin_phi_sigmoid + (1.0 - sin_phi_sigmoid)*pred_phi_corr[:, :, 1:2] + pred_cos_phi = orig_cos_phi*cos_phi_sigmoid + (1.0 - cos_phi_sigmoid)*pred_phi_corr[:, :, 3:4] + pred_energy = orig_energy*energy_sigmoid + tf.exp(tf.clip_by_value((1.0 - energy_sigmoid)*pred_energy_corr[:, :, 1:2], -8, 8)) + pred_pt = orig_pt*pt_sigmoid + tf.exp(tf.clip_by_value((1.0 
- pt_sigmoid)*pred_pt_corr[:, :, 1:2], -8, 8)) + + ret = { + "cls": out_id_softmax, + "charge": out_charge*msk_input, + "pt": pred_pt*msk_input, + "eta": pred_eta*msk_input, + "sin_phi": pred_sin_phi*msk_input, + "cos_phi": pred_cos_phi*msk_input, + "energy": pred_energy*msk_input, + } + return ret + #Simple message passing based on a matrix multiplication +#LEGACY, for Delphes paper compatibility! class PFNet(tf.keras.Model): def __init__(self, multi_output=False, @@ -563,8 +706,8 @@ def __init__(self, convlayer="ghconv", dropout=0.1, bin_size=10, - num_convs_id=1, - num_convs_reg=1, + num_node_messagess_id=1, + num_node_messagess_reg=1, num_hidden_id_enc=1, num_hidden_id_dec=1, num_hidden_reg_enc=1, @@ -608,26 +751,22 @@ def __init__(self, convs_id = [] convs_reg = [] if convlayer == "sgconv": - for iconv in range(num_convs_id): + for iconv in range(num_node_messagess_id): convs_id.append(SGConv(k=1, activation=activation, name="conv_id{}".format(iconv))) - for iconv in range(num_convs_reg): + for iconv in range(num_node_messagess_reg): convs_reg.append(SGConv(k=1, activation=activation, name="conv_reg{}".format(iconv))) elif convlayer == "ghconv": - for iconv in range(num_convs_id): + for iconv in range(num_node_messagess_id): convs_id.append(GHConv(activation=activation, name="conv_id{}".format(iconv))) - for iconv in range(num_convs_reg): + for iconv in range(num_node_messagess_reg): convs_reg.append(GHConv(activation=activation, name="conv_reg{}".format(iconv))) self.gnn_id = EncoderDecoderGNN(encoding_id, decoding_id, dropout, activation, convs_id, name="gnn_id") - self.layer_id = point_wise_feed_forward_network(num_output_classes, hidden_dim_id, num_layers=3, activation=activation) - self.layer_charge = point_wise_feed_forward_network(1, hidden_dim_id, num_layers=3, activation=activation) + self.layer_id = point_wise_feed_forward_network(num_output_classes, hidden_dim_id, "id", num_layers=3, activation=activation) + self.layer_charge = point_wise_feed_forward_network(1, hidden_dim_id, "charge", num_layers=3, activation=activation) self.gnn_reg = EncoderDecoderGNN(encoding_reg, decoding_reg, dropout, activation, convs_reg, name="gnn_reg") - self.layer_momentum = point_wise_feed_forward_network(num_momentum_outputs, hidden_dim_reg, num_layers=3, activation=activation) - - # def create_model(self, num_max_elems, num_input_features, training=True): - # inputs = tf.keras.Input(shape=(num_max_elems, num_input_features,)) - # return tf.keras.Model(inputs=[inputs], outputs=self.call(inputs, training), name="MLPFNet") + self.layer_momentum = point_wise_feed_forward_network(num_momentum_outputs, hidden_dim_reg, "momentum", num_layers=3, activation=activation) def call(self, inputs, training=True): X = inputs @@ -691,32 +830,31 @@ def set_trainable_regression(self): self.gnn_reg.trainable = True self.layer_momentum.trainable = True - class CombinedGraphLayer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): self.max_num_bins = kwargs.pop("max_num_bins") self.bin_size = kwargs.pop("bin_size") - self.dist_mult = kwargs.pop("dist_mult") - self.distance_dim = kwargs.pop("distance_dim") - self.output_dim = kwargs.pop("output_dim") - self.do_layernorm = kwargs.pop("layernorm") - self.clip_value_low = kwargs.pop("clip_value_low") - self.num_conv = kwargs.pop("num_conv") - self.normalize_degrees = kwargs.pop("normalize_degrees") + self.num_node_messages = kwargs.pop("num_node_messages") self.dropout = kwargs.pop("dropout") self.kernel = kwargs.pop("kernel") - self.conv_config = 
kwargs.pop("conv_config") + self.node_message = kwargs.pop("node_message") + self.hidden_dim = kwargs.pop("hidden_dim") if self.do_layernorm: self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6) - self.ffn_dist = point_wise_feed_forward_network(self.distance_dim, 128, num_layers=2, activation="elu") - self.dist = GraphBuilderDense(clip_value_low=self.clip_value_low, distance_dim=self.distance_dim, max_num_bins=self.max_num_bins , bin_size=self.bin_size, dist_mult=self.dist_mult, kernel=self.kernel) - self.convs = [ - get_conv_layer(self.conv_config) for iconv in range(self.num_conv) + self.ffn_dist = point_wise_feed_forward_network(self.distance_dim, self.hidden_dim, kwargs.get("name") + "_ffn_dist", num_layers=2, activation="elu", dropout=self.dropout) + self.message_building_layer = MessageBuildingLayerLSH( + distance_dim=self.distance_dim, + max_num_bins=self.max_num_bins, + bin_size=self.bin_size, + kernel=build_kernel_from_conf(self.kernel, kwargs.get("name")+"_kernel") + ) + self.message_passing_layers = [ + get_message_layer(self.node_message, "{}_msg_{}".format(kwargs.get("name"), iconv)) for iconv in range(self.num_node_messages) ] self.dropout_layer = None if self.dropout: @@ -731,9 +869,13 @@ def call(self, x, msk, training): #compute node features for graph building x_dist = self.ffn_dist(x) - bins_split, x_binned, dm, msk_binned = self.dist(x_dist, x, msk) - for conv in self.convs: - x_binned = conv((x_binned, dm, msk_binned)) + + #compute the element-to-element messages / distance matrix / graph structure + bins_split, x_binned, dm, msk_binned = self.message_building_layer(x_dist, x, msk) + + #run the node update with message passing + for msg in self.message_passing_layers: + x_binned = msg((x_binned, dm, msk_binned)) if self.dropout_layer: x_binned = self.dropout_layer(x_binned, training) @@ -746,88 +888,59 @@ def __init__(self, multi_output=False, num_input_classes=8, num_output_classes=3, - num_momentum_outputs=3, max_num_bins=200, bin_size=320, - dist_mult=0.1, distance_dim=128, hidden_dim=256, layernorm=False, - clip_value_low=0.0, activation=tf.keras.activations.elu, - num_conv=2, - num_gsl=1, - normalize_degrees=False, + num_node_messages=2, + num_graph_layers=1, dropout=0.0, - separate_momentum=True, input_encoding="cms", focal_loss_from_logits=False, - graph_kernel="gaussian", - skip_connection=False, + graph_kernel={"type": "NodePairGaussianKernel"}, + skip_connection=True, regression_use_classification=True, - conv_config={"type": "GHConvDense", "activation": "elu", "output_dim": 128, "normalize_degrees": True}, - debug=False + node_message={"type": "GHConvDense", "activation": "elu", "output_dim": 128, "normalize_degrees": True}, + debug=False, + schema="cms" ): super(PFNetDense, self).__init__() self.multi_output = multi_output - self.num_momentum_outputs = num_momentum_outputs self.activation = activation - self.separate_momentum = separate_momentum self.focal_loss_from_logits = focal_loss_from_logits self.debug = debug self.skip_connection = skip_connection - self.regression_use_classification = regression_use_classification - self.num_conv = num_conv - self.num_gsl = num_gsl + self.num_node_messages = num_node_messages + self.num_graph_layers = num_graph_layers if input_encoding == "cms": self.enc = InputEncodingCMS(num_input_classes) elif input_encoding == "default": self.enc = InputEncoding(num_input_classes) - dff = hidden_dim - self.ffn_enc_id = point_wise_feed_forward_network(dff, dff, activation=activation, name="ffn_enc_id") - 
self.ffn_enc_reg = point_wise_feed_forward_network(dff, dff, activation=activation, name="ffn_enc_reg") + self.ffn_enc_id = point_wise_feed_forward_network(hidden_dim, hidden_dim, "ffn_enc_id", activation=activation) + self.ffn_enc_reg = point_wise_feed_forward_network(hidden_dim, hidden_dim, "ffn_enc_reg", activation=activation) kwargs_cg = { - "output_dim": dff, "max_num_bins": max_num_bins, "bin_size": bin_size, - "dist_mult": dist_mult, "distance_dim": distance_dim, "layernorm": layernorm, - "clip_value_low": clip_value_low, - "num_conv": num_conv, - "normalize_degrees": normalize_degrees, + "num_node_messages": num_node_messages, "dropout": dropout, "kernel": graph_kernel, - "conv_config": conv_config + "node_message": node_message, + "hidden_dim": hidden_dim } - self.cg_id = [CombinedGraphLayer(**kwargs_cg) for i in range(num_gsl)] - self.cg_reg = [CombinedGraphLayer(**kwargs_cg) for i in range(num_gsl)] + self.cg_id = [CombinedGraphLayer(name="cg_id_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] + self.cg_reg = [CombinedGraphLayer(name="cg_reg_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] - self.ffn_id = point_wise_feed_forward_network(num_output_classes, dff, name="ffn_cls", dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True) - self.ffn_charge = point_wise_feed_forward_network(1, dff, name="ffn_charge", dtype=tf.dtypes.float32, num_layers=2, activation=activation, dim_decrease=True) - - self.ffn_pt = point_wise_feed_forward_network( - 2, dff, name="ffn_pt", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True - ) - self.ffn_eta = point_wise_feed_forward_network( - 2, dff, name="ffn_eta", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True - ) - self.ffn_phi = point_wise_feed_forward_network( - 4, dff, name="ffn_phi", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True - ) - self.ffn_energy = point_wise_feed_forward_network( - 2, dff, name="ffn_energy", - dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True - ) + self.output_dec = OutputDecoding(self.activation, hidden_dim, regression_use_classification, num_output_classes, schema) def call(self, inputs, training=False): X = inputs @@ -864,65 +977,19 @@ def call(self, inputs, training=False): if self.skip_connection: dec_input_cls.append(enc) dec_input_cls += encs_id - dec_output_id = tf.concat(dec_input_cls, axis=-1)*msk_input if self.debug: debugging_data["dec_output_id"] = dec_output_id - out_id_logits = self.ffn_id(dec_output_id)*msk_input - - if self.focal_loss_from_logits: - out_id_softmax = out_id_logits - else: - out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) - - out_charge = self.ffn_charge(dec_output_id)*msk_input - dec_input_reg = [] if self.skip_connection: dec_input_reg.append(enc) - if self.regression_use_classification: - dec_input_reg.append(tf.cast(out_id_logits, X.dtype)) dec_input_reg += encs_reg - dec_output_reg = tf.concat(dec_input_reg, axis=-1)*msk_input if self.debug: debugging_data["dec_output_reg"] = dec_output_reg - out_charge = tf.clip_by_value(out_charge, -2, 2) - - orig_pt = X[:, :, 1:2] - orig_eta = X[:, :, 2:3] - orig_sin_phi = tf.math.sin(X[:, :, 3:4]) - orig_cos_phi = tf.math.cos(X[:, :, 3:4]) - orig_energy = X[:, :, 4:5] - - pred_eta_corr = self.ffn_eta(dec_output_reg) - pred_phi_corr = self.ffn_phi(dec_output_reg) - pred_energy_corr = self.ffn_energy(dec_output_reg) - pred_pt_corr = 
self.ffn_pt(dec_output_reg) - - eta_sigmoid = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) - sin_phi_sigmoid = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 0:1]) - cos_phi_sigmoid = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 2:3]) - energy_sigmoid = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) - pt_sigmoid = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) - - pred_eta = orig_eta*eta_sigmoid + (1.0 - eta_sigmoid)*pred_eta_corr[:, :, 1:2] - pred_sin_phi = orig_sin_phi*sin_phi_sigmoid + (1.0 - sin_phi_sigmoid)*pred_phi_corr[:, :, 1:2] - pred_cos_phi = orig_cos_phi*cos_phi_sigmoid + (1.0 - cos_phi_sigmoid)*pred_phi_corr[:, :, 3:4] - pred_energy = orig_energy*energy_sigmoid + tf.exp(tf.clip_by_value((1.0 - energy_sigmoid)*pred_energy_corr[:, :, 1:2], -8, 8)) - pred_pt = orig_pt*pt_sigmoid + tf.exp(tf.clip_by_value((1.0 - pt_sigmoid)*pred_pt_corr[:, :, 1:2], -8, 8)) - - ret = { - "cls": out_id_softmax, - "charge": out_charge*msk_input, - "pt": pred_pt*msk_input, - "eta": pred_eta*msk_input, - "sin_phi": pred_sin_phi*msk_input, - "cos_phi": pred_cos_phi*msk_input, - "energy": pred_energy*msk_input, - } + ret = self.output_dec(X, dec_output_id, dec_output_reg, msk_input) if self.debug: for k in debugging_data.keys(): @@ -941,7 +1008,10 @@ def set_trainable_classification(self): self.ffn_enc_reg.trainable = False for cg in self.cg_reg: cg.trainable = False - self.ffn_momentum.trainable = False + self.ffn_pt.trainable = False + self.ffn_eta.trainable = False + self.ffn_phi.trainable = False + self.ffn_energy.trainable = False def set_trainable_regression(self): self.trainable = True diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index e1a9434a9..9f65f0b9d 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -59,11 +59,13 @@ def plot_to_image(figure): return image class CustomCallback(tf.keras.callbacks.Callback): - def __init__(self, outpath, X, y, dataset_transform, num_output_classes, freq=1): + def __init__(self, dataset_def, outpath, X, y, dataset_transform, num_output_classes, plot_freq=1): super(CustomCallback, self).__init__() self.X = X self.y = y - self.freq = freq + self.plot_freq = plot_freq + + self.dataset_def = dataset_def #transform the prediction target from an array into a dictionary for easier access self.ytrue = dataset_transform(self.X, self.y, None)[1] @@ -120,14 +122,16 @@ def plot_cm(self, outpath, ypred_id, msk): def plot_event_visualization(self, outpath, ypred, ypred_id, msk, ievent=0): + X_eta, X_phi, X_energy = self.dataset_def.get_X_eta_phi_energy(self.X) + fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(3*5, 5)) #Plot the input PFElements plt.axes(ax1) msk = self.X[ievent, :, 0] != 0 - eta = self.X[ievent][msk][:, 2] - phi = self.X[ievent][msk][:, 3] - energy = self.X[ievent][msk][:, 4] + eta = X_eta[ievent][msk] + phi = X_phi[ievent][msk] + energy = X_energy[ievent][msk] typ = self.X[ievent][msk][:, 0] plt.scatter(eta, phi, marker="o", s=energy, c=[self.color_map[p] for p in typ], alpha=0.5, linewidths=0) plt.xlim(-8,8) @@ -227,7 +231,7 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False def on_epoch_end(self, epoch, logs=None): - if epoch%self.freq!=0: + if epoch%self.plot_freq!=0: return #save the training logs (losses) for this epoch @@ -237,7 +241,7 @@ def on_epoch_end(self, epoch, logs=None): cp_dir = Path(self.outpath) / "epoch_{}".format(epoch) cp_dir.mkdir(parents=True, exist_ok=True) - #run the model inference on the small validation 
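Note: the block removed here gated every regression target between the corresponding raw input quantity and a learned correction, with energy and pT additionally passed through a clipped exponential so the correction stays positive and bounded; presumably the same arithmetic now lives inside OutputDecoding. Isolated for reference:

    import tensorflow as tf

    def gated_correction(orig, corr):
        # e.g. eta: pred = orig*sigmoid(corr[...,0]) + (1-sigmoid(corr[...,0]))*corr[...,1]
        gate = tf.keras.activations.sigmoid(corr[..., 0:1])
        return orig * gate + (1.0 - gate) * corr[..., 1:2]

    def gated_positive_correction(orig, corr):
        # energy and pT: the correction term goes through a clipped exponential
        gate = tf.keras.activations.sigmoid(corr[..., 0:1])
        return orig * gate + tf.exp(tf.clip_by_value((1.0 - gate) * corr[..., 1:2], -8, 8))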
dataset + #run the model inference on the validation dataset ypred = self.model.predict(self.X, batch_size=1) #choose the class with the highest probability as the prediction @@ -261,7 +265,7 @@ def on_epoch_end(self, epoch, logs=None): np.savez(str(cp_dir/"pred.npz"), X=self.X, ytrue=self.y, **ypred) -def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output_classes): +def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output_classes, dataset_def, plot_freq=1): callbacks = [] tb = CustomTensorBoard( log_dir=outdir + "/tensorboard_logs", histogram_freq=1, write_graph=False, write_images=False, @@ -288,7 +292,7 @@ def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output history_path = Path(outdir) / "history" history_path.mkdir(parents=True, exist_ok=True) history_path = str(history_path) - cb = CustomCallback(history_path, X_val, y_val, dataset_transform, num_output_classes) + cb = CustomCallback(dataset_def, history_path, X_val, y_val, dataset_transform, num_output_classes, plot_freq=plot_freq) cb.set_model(model) callbacks += [cb] @@ -330,8 +334,8 @@ def make_gnn(config, dtype): parameters = [ 'bin_size', - 'num_convs_id', - 'num_convs_reg', + 'num_node_messagess_id', + 'num_node_messagess_reg', 'num_hidden_id_enc', 'num_hidden_id_dec', 'num_hidden_reg_enc', @@ -363,16 +367,15 @@ def make_gnn_dense(config, dtype): "layernorm", "hidden_dim", "bin_size", - "clip_value_low", - "num_conv", - "num_gsl", + "num_node_messages", + "num_graph_layers", "distance_dim", "dropout", "input_encoding", "graph_kernel", "skip_connection", "regression_use_classification", - "conv_config", + "node_message", "debug" ] @@ -382,7 +385,7 @@ def make_gnn_dense(config, dtype): multi_output=config["setup"]["multi_output"], num_input_classes=config["dataset"]["num_input_classes"], num_output_classes=config["dataset"]["num_output_classes"], - num_momentum_outputs=config["dataset"]["num_momentum_outputs"], + schema=config["dataset"]["schema"], **kwargs ) @@ -392,7 +395,6 @@ def make_dense(config, dtype): model = DummyNet( num_input_classes=config["dataset"]["num_input_classes"], num_output_classes=config["dataset"]["num_output_classes"], - num_momentum_outputs=config["dataset"]["num_momentum_outputs"], ) return model @@ -533,286 +535,4 @@ def loss(x,y): gamma=float(config["setup"].get("focal_loss_gamma", 2.0)), from_logits=bool(config["setup"].get("focal_loss_from_logits", False)) ) - return loss - -def main(args, yaml_path, config): - #tf.debugging.enable_check_numerics() - - #Switch off multi-output for the evaluation for backwards compatibility - multi_output = True - if args.action == "eval": - multi_output = False - - tf.config.run_functions_eagerly(config['tensorflow']['eager']) - - from tfmodel.data import Dataset - cds = config["dataset"] - - raw_path = cds.get("raw_path", None) - if args.raw_path: - raw_path = args.raw_path - - processed_path = cds.get("processed_path", None) - if args.processed_path: - processed_path = args.processed_path - - dataset_def = Dataset( - num_input_features=int(cds["num_input_features"]), - num_output_features=int(cds["num_output_features"]), - padded_num_elem_size=int(cds["padded_num_elem_size"]), - raw_path=raw_path, - raw_files=cds.get("raw_files", None), - processed_path=processed_path, - validation_file_path=cds["validation_file_path"], - schema=cds["schema"] - ) - - if args.action == "data": - dataset_def.process( - config["dataset"]["num_files_per_chunk"] - ) - return - - global_batch_size = 
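Note: plot_event_visualization no longer hard-codes the CMS column layout (eta, phi, energy in columns 2, 3, 4) but asks the dataset definition for them. A minimal accessor with the same behaviour for the CMS schema (the method body is an assumption; only the old column indices are taken from the removed lines):

    import numpy as np

    def get_X_eta_phi_energy(X):
        # assumption: CMS-schema layout with eta, phi, energy in columns 2, 3, 4,
        # matching the indices the callback used before this change
        return X[..., 2], X[..., 3], X[..., 4]

    X = np.zeros((1, 6400, 15), dtype=np.float32)
    X_eta, X_phi, X_energy = get_X_eta_phi_energy(X)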
config['setup']['batch_size'] - config['setup']['multi_output'] = multi_output - - model_name = os.path.splitext(os.path.basename(yaml_path))[0] + "-" + str(uuid.uuid4())[:8] + "." + platform.node() - print("model_name=", model_name) - - tfr_files = sorted(glob.glob(dataset_def.processed_path)) - if len(tfr_files) == 0: - raise Exception("Could not find any files in {}".format(dataset_def.processed_path)) - - random.shuffle(tfr_files) - dataset = tf.data.TFRecordDataset(tfr_files).map(dataset_def.parse_tfr_element, num_parallel_calls=tf.data.experimental.AUTOTUNE) - - num_events = 0 - for i in dataset: - num_events += 1 - print("dataset loaded, len={}".format(num_events)) - - n_train = config['setup']['num_events_train'] - n_test = config['setup']['num_events_test'] - - if args.ntrain: - n_train = args.ntrain - if args.ntest: - n_test = args.ntest - - n_epochs = config['setup']['num_epochs'] - weight_func = make_weight_function(config) - assert(n_train + n_test <= num_events) - - ps = ( - tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_input_features]), - tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_output_features]), - { - "cls": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "charge": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "energy": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "pt": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "eta": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "sin_phi": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "cos_phi": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - } - ) - - ds_train = dataset.take(n_train).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps) - ds_test = dataset.skip(n_train).take(n_test).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps) - - dataset_transform = None - if multi_output: - dataset_transform = targets_multi_output(config['dataset']['num_output_classes']) - ds_train = ds_train.map(dataset_transform) - ds_test = ds_test.map(dataset_transform) - - ds_train_r = ds_train.repeat(n_epochs) - ds_test_r = ds_test.repeat(n_epochs) - - #small test dataset used in the callback for making monitoring plots - #X_test = np.concatenate(list(ds_test.take(100).map(lambda x,y,w: x).as_numpy_iterator())) - #y_test = np.concatenate(list(ds_test.take(100).map(lambda x,y,w: tf.concat(y, axis=-1)).as_numpy_iterator())) - - weights = config['setup']['weights'] - if args.weights: - weights = args.weights - - if args.recreate or (weights is None): - outdir = 'experiments/{}'.format(model_name) - if os.path.isdir(outdir): - print("Output directory exists: {}".format(outdir), file=sys.stderr) - sys.exit(1) - else: - outdir = str(Path(weights).parent.parent) - - try: - gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")] - num_gpus = len(gpus) - print("num_gpus=", num_gpus) - if num_gpus > 1: - strategy = tf.distribute.MirroredStrategy() - global_batch_size = num_gpus * global_batch_size - else: - strategy = tf.distribute.OneDeviceStrategy("gpu:0") - except Exception as e: - print("fallback to CPU", e) - strategy = tf.distribute.OneDeviceStrategy("cpu") - num_gpus = 0 - - Xs = [] - ygens = [] - ycands = [] - #for faster loading - if args.action == "train": - val_filelist = dataset_def.val_filelist[:1] - else: - val_filelist = dataset_def.val_filelist - if config['setup']['num_val_files']>0: - val_filelist = val_filelist[:config['setup']['num_val_files']] - - for fi in val_filelist: - X, 
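Note: the removed main() picked the distribution strategy from CUDA_VISIBLE_DEVICES and scaled the batch size with the number of GPUs; condensed into a helper, the logic reads:

    import os
    import tensorflow as tf

    def get_strategy(global_batch_size):
        # condensed from the removed block in main()
        try:
            gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")]
            if len(gpus) > 1:
                return tf.distribute.MirroredStrategy(), len(gpus) * global_batch_size
            return tf.distribute.OneDeviceStrategy("gpu:0"), global_batch_size
        except Exception:
            return tf.distribute.OneDeviceStrategy("cpu"), global_batch_size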
ygen, ycand = dataset_def.prepare_data(fi) - - Xs.append(np.concatenate(X)) - ygens.append(np.concatenate(ygen)) - ycands.append(np.concatenate(ycand)) - - assert(len(Xs) > 0) - X_val = np.concatenate(Xs) - ygen_val = np.concatenate(ygens) - ycand_val = np.concatenate(ycands) - - lr = float(config['setup']['lr']) - with strategy.scope(): - total_steps = n_epochs * n_train // global_batch_size - lr_schedule, optim_callbacks = get_lr_schedule(config, lr, steps=total_steps) - opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule) - if config['setup']['dtype'] == 'float16': - model_dtype = tf.dtypes.float16 - from tensorflow.keras import mixed_precision - policy = mixed_precision.Policy('mixed_float16') - mixed_precision.set_global_policy(policy) - opt = mixed_precision.LossScaleOptimizer(opt) - else: - model_dtype = tf.dtypes.float32 - - if args.action=="train" or args.action=="eval": - model = make_model(config, model_dtype) - - #Evaluate model once to build the layers - print(X_val.shape) - model(tf.cast(X_val[:1], model_dtype)) - - initial_epoch = 0 - if weights: - #need to load the weights in the same trainable configuration as the model was set up - configure_model_weights(model, config["setup"].get("weights_config", "all")) - model.load_weights(weights, by_name=True) - initial_epoch = int(weights.split("/")[-1].split("-")[1]) - model(tf.cast(X_val[:1], model_dtype)) - - if config["setup"]["trainable"] == "classification": - config["dataset"]["pt_loss_coef"] = 0.0 - config["dataset"]["eta_loss_coef"] = 0.0 - config["dataset"]["sin_phi_loss_coef"] = 0.0 - config["dataset"]["cos_phi_loss_coef"] = 0.0 - config["dataset"]["energy_loss_coef"] = 0.0 - elif config["setup"]["trainable"] == "regression": - config["dataset"]["classification_loss_coef"] = 0.0 - config["dataset"]["charge_loss_coef"] = 0.0 - - #now set the desirable layers as trainable for the optimization - configure_model_weights(model, config["setup"]["trainable"]) - model(tf.cast(X_val[:1], model_dtype)) - - if config["setup"]["classification_loss_type"] == "categorical_cross_entropy": - cls_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False) - elif config["setup"]["classification_loss_type"] == "sigmoid_focal_crossentropy": - cls_loss = make_focal_loss(config) - else: - raise KeyError("Unknown classification loss type: {}".format(config["setup"]["classification_loss_type"])) - - model.compile( - loss={ - "cls": cls_loss, - "charge": getattr(tf.keras.losses, config["dataset"].get("charge_loss", "MeanSquaredError"))(), - "pt": getattr(tf.keras.losses, config["dataset"].get("pt_loss", "MeanSquaredError"))(), - "eta": getattr(tf.keras.losses, config["dataset"].get("eta_loss", "MeanSquaredError"))(), - "sin_phi": getattr(tf.keras.losses, config["dataset"].get("sin_phi_loss", "MeanSquaredError"))(), - "cos_phi": getattr(tf.keras.losses, config["dataset"].get("cos_phi_loss", "MeanSquaredError"))(), - "energy": getattr(tf.keras.losses, config["dataset"].get("energy_loss", "MeanSquaredError"))(), - }, - optimizer=opt, - sample_weight_mode='temporal', - loss_weights={ - "cls": config["dataset"]["classification_loss_coef"], - "charge": config["dataset"]["charge_loss_coef"], - "pt": config["dataset"]["pt_loss_coef"], - "eta": config["dataset"]["eta_loss_coef"], - "sin_phi": config["dataset"]["sin_phi_loss_coef"], - "cos_phi": config["dataset"]["cos_phi_loss_coef"], - "energy": config["dataset"]["energy_loss_coef"], - }, - metrics={ - "cls": [ - FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), - 
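Note: the float16 path in the removed code switches the global Keras mixed-precision policy and wraps the optimizer in a loss-scale optimizer to avoid float16 underflow; the essential calls are:

    import tensorflow as tf
    from tensorflow.keras import mixed_precision

    mixed_precision.set_global_policy(mixed_precision.Policy("mixed_float16"))
    opt = mixed_precision.LossScaleOptimizer(tf.keras.optimizers.Adam(learning_rate=1e-4))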
FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), - ] + [ - SingleClassRecall( - icls, - name="rec_cls{}".format(icls), - dtype=tf.float64) for icls in range(config["dataset"]["num_output_classes"]) - ] - } - ) - model.summary() - - if args.action=="train": - #file_writer_cm = tf.summary.create_file_writer(outdir + '/val_extra') - callbacks = prepare_callbacks( - model, outdir, X_val, ycand_val, - dataset_transform, config["dataset"]["num_output_classes"] - ) - callbacks.append(optim_callbacks) - - fit_result = model.fit( - ds_train_r, validation_data=ds_test_r, epochs=initial_epoch+n_epochs, callbacks=callbacks, - steps_per_epoch=n_train//global_batch_size, validation_steps=n_test//global_batch_size, - initial_epoch=initial_epoch - ) - history_path = Path(outdir) / "history" - history_path = str(history_path) - with open("{}/history.json".format(history_path), "w") as fi: - json.dump(fit_result.history, fi) - model.save(outdir + "/model_full", save_format="tf") - - if args.action=="eval": - eval_model(X_val, ygen_val, ycand_val, model, config, outdir, global_batch_size) - freeze_model(model, config, outdir) - - if args.action=="time": - synthetic_timing_data = [] - for iteration in range(config["timing"]["num_iter"]): - numev = config["timing"]["num_ev"] - for evsize in [128*10, 128*20, 128*30, 128*40, 128*50, 128*60, 128*70, 128*80, 128*90, 128*100]: - for batch_size in [1,2,3,4]: - x = np.random.randn(batch_size, evsize, config["dataset"]["num_input_features"]).astype(np.float32) - - model = make_model(config, model_dtype) - model(x) - - if weights: - model.load_weights(weights) - - t0 = time.time() - for i in range(numev//batch_size): - model(x) - t1 = time.time() - dt = t1 - t0 - - time_per_event = 1000.0*(dt / numev) - synthetic_timing_data.append( - [{"iteration": iteration, "batch_size": batch_size, "event_size": evsize, "time_per_event": time_per_event}]) - print("Synthetic random data: batch_size={} event_size={}, time={:.2f} ms/ev".format(batch_size, evsize, time_per_event)) - with open("{}/synthetic_timing.json".format(outdir), "w") as fi: - json.dump(synthetic_timing_data, fi) + return loss \ No newline at end of file diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 946f72099..65892dd13 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -56,6 +56,7 @@ def create_experiment_dir(prefix=None, suffix=None): train_dir = train_dir.with_name(train_dir.name + "." 
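Note: the removed "time" action benchmarks the model on synthetic random events; its core loop, condensed into a helper:

    import time
    import numpy as np

    def time_model(model, num_ev=100, batch_size=2, event_size=1280, num_features=15):
        # condensed from the removed timing loop: milliseconds per event on random data
        x = np.random.randn(batch_size, event_size, num_features).astype(np.float32)
        model(x)  # warm-up / build
        t0 = time.time()
        for _ in range(num_ev // batch_size):
            model(x)
        return 1000.0 * (time.time() - t0) / num_ev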
+ platform.node()) train_dir.mkdir(parents=True) + print("Creating experiment dir {}".format(train_dir)) return str(train_dir) diff --git a/parameters/cms-gnn-dense-dev.yaml b/parameters/cms-dev.yaml similarity index 88% rename from parameters/cms-gnn-dense-dev.yaml rename to parameters/cms-dev.yaml index 472d0e1fb..5c14c0f94 100644 --- a/parameters/cms-gnn-dense-dev.yaml +++ b/parameters/cms-dev.yaml @@ -61,8 +61,8 @@ setup: batch_size: 5 num_events_train: 80000 num_events_test: 10000 - num_epochs: 100 - num_val_files: 10 + num_epochs: 10 + num_val_files: 1 dtype: float32 trainable: classification_loss_type: categorical_cross_entropy @@ -78,25 +78,29 @@ sample_weights: energy: signal_only parameters: - model: gnn_dense + model: gnn_dense + input_encoding: cms activation: elu layernorm: no hidden_dim: 256 bin_size: 32 - clip_value_low: 0.0 - conv_config: - type: MPNNNodeFunction + distance_dim: 16 + dropout: 0.0 + graph_kernel: + type: NodePairTrainableKernel + output_dim: 32 + hidden_dim: 32 + num_layers: 2 + activation: elu + num_graph_layers: 3 + node_message: + type: NodeMessageLearnable output_dim: 256 hidden_dim: 128 num_layers: 3 activation: elu aggregation_direction: dst - num_conv: 1 - num_gsl: 3 - distance_dim: 16 - dropout: 0.0 - input_encoding: cms - graph_kernel: learnable #gaussian, learnable + num_node_messages: 1 skip_connection: yes regression_use_classification: yes debug: no diff --git a/parameters/cms-gnn-dense-focal.yaml b/parameters/cms-gnn-dense-focal.yaml deleted file mode 100644 index 5db4d2177..000000000 --- a/parameters/cms-gnn-dense-focal.yaml +++ /dev/null @@ -1,91 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 5.0 - charge_loss_coef: 0.1 - pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.01 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 2e-5 - batch_size: 5 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 500 - num_val_files: 100 - dtype: float32 - trainable: classification - classification_loss_type: sigmoid_focal_crossentropy - focal_loss_alpha: 0.25 - focal_loss_gamma: 3.0 - focal_loss_from_logits: False - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - activation: elu - layernorm: no - hidden_dim: 256 - bin_size: 640 - clip_value_low: 0.01 - num_conv: 2 - num_gsl: 2 - normalize_degrees: yes - distance_dim: 128 - dropout: 0.2 - separate_momentum: yes - input_encoding: cms - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-gnn-dense-onecycle.yaml 
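Note: in the renamed cms-dev.yaml the message-passing layer is now fully described by the node_message block, whose keys are forwarded to the layer constructor. A small sketch of reading it (the file path and key names come from the diff; exactly how the keys are consumed is up to get_message_layer, shown further down in this patch series):

    import yaml

    with open("parameters/cms-dev.yaml") as f:
        config = yaml.safe_load(f)

    msg_conf = dict(config["parameters"]["node_message"])
    msg_type = msg_conf.pop("type")  # e.g. "NodeMessageLearnable" or "GHConvDense"
    # the remaining keys (output_dim, hidden_dim, num_layers, activation,
    # aggregation_direction, ...) are passed on to the layer constructor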
b/parameters/cms-gnn-dense-onecycle.yaml deleted file mode 100644 index ce6fcc2fb..000000000 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ /dev/null @@ -1,97 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - pt_loss: MeanSquaredLogarithmicError - energy_loss: MeanSquaredLogarithmicError - classification_loss_coef: 1.0 - charge_loss_coef: 0.1 - pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - raw_path: ../data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 - processed_path: ../data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: ../data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: all - lr: 3e-4 - batch_size: 32 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 400 - num_val_files: 100 - dtype: float32 - trainable: all - classification_loss_type: categorical_cross_entropy # categorical_cross_entropy, sigmoid_focal_crossentropy - lr_schedule: onecycle # exponentialdecay, onecycle - -sample_weights: - cls: inverse_sqrt - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - activation: elu - layernorm: no - hidden_dim: 256 - bin_size: 640 - clip_value_low: 0.0 - num_conv: 2 - num_gsl: 2 - normalize_degrees: yes - distance_dim: 128 - dropout: 0.0 - separate_momentum: yes - input_encoding: cms - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes - -onecycle: - mom_min: 0.85 - mom_max: 0.95 - warmup_ratio: 0.3 - div_factor: 25.0 - final_div: 100000.0 \ No newline at end of file diff --git a/parameters/cms-gnn-dense-transfer.yaml b/parameters/cms-gnn-dense-transfer.yaml deleted file mode 100644 index 8b735f859..000000000 --- a/parameters/cms-gnn-dense-transfer.yaml +++ /dev/null @@ -1,76 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.1 - pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: experiments/cms-gnn-dense-2cc4e7f9.gpu0.local/weights-500-40.204285.hdf5 - lr: 1e-5 - batch_size: 20 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 
100 - num_val_files: 100 - dtype: float32 - sample_weights: inverse_sqrt - trainable: transfer - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -parameters: - model: gnn_dense - activation: elu - layernorm: no - hidden_dim: 256 - bin_size: 640 - clip_value_low: 0.0 - num_conv: 2 - num_gsl: 2 - normalize_degrees: yes - distance_dim: 128 - dropout: 0.0 - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-gnn-skipconn-v2.yaml b/parameters/cms-gnn-skipconn-v2.yaml deleted file mode 100644 index e69919342..000000000 --- a/parameters/cms-gnn-skipconn-v2.yaml +++ /dev/null @@ -1,81 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.1 - pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-5 - batch_size: 5 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 500 - num_val_files: -1 - dtype: float32 - sample_weights: inverse_sqrt - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -parameters: - model: gnn - bin_size: 640 - num_convs_id: 2 - num_convs_reg: 2 - num_hidden_id_enc: 2 - num_hidden_id_dec: 2 - num_hidden_reg_enc: 2 - num_hidden_reg_dec: 2 - num_neighbors: 16 - hidden_dim_id: 512 - hidden_dim_reg: 512 - distance_dim: 32 - dropout: 0.0 - dist_mult: 1.0 - activation: elu - skip_connection: yes - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-gnn-skipconn.yaml b/parameters/cms-gnn-skipconn.yaml deleted file mode 100644 index b1d2e50f0..000000000 --- a/parameters/cms-gnn-skipconn.yaml +++ /dev/null @@ -1,81 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.1 - pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 1.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 - -tensorflow: - eager: no - 
-setup: - train: yes - weights: - lr: 1e-5 - batch_size: 5 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 500 - num_val_files: -1 - dtype: float32 - sample_weights: none - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -parameters: - model: gnn - bin_size: 640 - num_convs_id: 2 - num_convs_reg: 2 - num_hidden_id_enc: 2 - num_hidden_id_dec: 2 - num_hidden_reg_enc: 2 - num_hidden_reg_dec: 2 - num_neighbors: 16 - hidden_dim_id: 512 - hidden_dim_reg: 512 - distance_dim: 32 - dropout: 0.0 - dist_mult: 1.0 - activation: elu - skip_connection: yes - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-transformer-skipconn-gun.yaml b/parameters/cms-transformer-skipconn-gun.yaml deleted file mode 100644 index f1fdd39e9..000000000 --- a/parameters/cms-transformer-skipconn-gun.yaml +++ /dev/null @@ -1,74 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 256 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 10000.0 - momentum_loss_coef: 0.0 - charge_loss_coef: 0.0 - momentum_loss_coefs: - - 1.0 - - 10.0 - - 100.0 - - 100.0 - - 1.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl - processed_path: data/gun/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-5 - batch_size: 100 - num_events_train: 250000 - num_events_test: 50000 - num_epochs: 100 - dtype: float32 - sample_weights: inverse_sqrt - trainable: all - multi_output: yes - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -parameters: - model: transformer - num_layers: 2 - d_model: 256 - num_heads: 2 - dff: 256 - support: 32 - skip_connection: yes - dropout: 0.0 - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-transformer-skipconn.yaml b/parameters/cms-transformer-skipconn.yaml deleted file mode 100644 index 0cb6eeb31..000000000 --- a/parameters/cms-transformer-skipconn.yaml +++ /dev/null @@ -1,72 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 0.1 - pt_loss_coef: 0.001 - eta_loss_coef: 0.001 - sin_phi_loss_coef: 0.001 - cos_phi_loss_coef: 0.001 - energy_loss_coef: 0.001 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl - -tensorflow: - eager: no - -setup: - 
train: yes - weights: - lr: 1e-5 - batch_size: 5 - num_events_train: 80000 - num_events_test: 10000 - num_epochs: 1000 - dtype: float32 - sample_weights: none - trainable: cls - multi_output: yes - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -parameters: - model: transformer - num_layers: 2 - d_model: 512 - num_heads: 2 - dff: 512 - support: 32 - skip_connection: yes - dropout: 0.0 - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms.yaml similarity index 74% rename from parameters/cms-gnn-dense.yaml rename to parameters/cms.yaml index 0649b4cfd..a3ab880f1 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms.yaml @@ -23,17 +23,32 @@ dataset: padded_num_elem_size: 6400 #(pt, eta, sin phi, cos phi, E) num_momentum_outputs: 5 - classification_loss_coef: 10.0 - charge_loss_coef: 0.1 + classification_loss_coef: 1.0 + charge_loss_coef: 1.0 pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.01 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 100.0 + cos_phi_loss_coef: 100.0 + energy_loss_coef: 1.0 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 + energy_loss: + type: Huber + delta: 1.0 + pt_loss: + type: Huber + delta: 1.0 + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 tensorflow: eager: no @@ -46,10 +61,10 @@ setup: batch_size: 5 num_events_train: 80000 num_events_test: 10000 - num_epochs: 500 + num_epochs: 10 num_val_files: 10 dtype: float32 - trainable: all + trainable: classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle @@ -64,23 +79,24 @@ sample_weights: parameters: model: gnn_dense + input_encoding: cms activation: elu layernorm: no hidden_dim: 256 bin_size: 640 - clip_value_low: 0.01 - conv_config: + distance_dim: 128 + dropout: 0.0 + graph_kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.1 + num_graph_layers: 2 + node_message: type: GHConvDense output_dim: 128 activation: elu normalize_degrees: yes - num_conv: 2 - num_gsl: 2 - normalize_degrees: yes - distance_dim: 128 - dropout: 0.2 - input_encoding: cms - graph_kernel: gaussian #gaussian, learnable + num_node_messages: 2 skip_connection: yes regression_use_classification: yes debug: no diff --git a/parameters/delphes-gnn-skipconn-onecycle.yaml b/parameters/delphes-gnn-skipconn-onecycle.yaml deleted file mode 100644 index 16259d6b6..000000000 --- a/parameters/delphes-gnn-skipconn-onecycle.yaml +++ /dev/null @@ -1,92 +0,0 @@ -backend: tensorflow - -dataset: - schema: delphes - target_particles: gen - num_input_features: 12 - num_output_features: 7 - #(none=0, track=1, cluster=2) - num_input_classes: 3 - num_output_classes: 6 - num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.001 - momentum_loss_coefs: - - 1.0 - - 1.0 - - 1.0 - - 1.0 - - 0.001 - raw_path: ../data/mlpf_zenodo/pythia8_ttbar/raw/*.pkl.bz2 - processed_path: ../data/mlpf_zenodo/pythia8_ttbar/tfr/*.tfrecords - num_files_per_chunk: 5 - 
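Note: cms.yaml now specifies the regression losses as nested blocks such as "type: Huber, delta: 1.0" instead of a bare class name. Assuming get_loss_dict maps these onto tf.keras.losses classes by name, a plausible helper is:

    import tensorflow as tf

    def loss_from_config(loss_conf):
        # assumption about how the nested loss entries are consumed:
        # {"type": "Huber", "delta": 1.0} -> tf.keras.losses.Huber(delta=1.0)
        conf = dict(loss_conf)
        loss_cls = getattr(tf.keras.losses, conf.pop("type"))
        return loss_cls(**conf)

    energy_loss = loss_from_config({"type": "Huber", "delta": 1.0})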
validation_file_path: ../data/mlpf_zenodo/pythia8_qcd/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: all - lr: 1e-5 - batch_size: 16 - num_events_train: 40000 - num_events_test: 5000 - num_epochs: 250 - num_val_files: -1 - dtype: float32 - trainable: all - multi_output: yes - classification_loss_type: categorical_cross_entropy - lr_schedule: onecycle # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: none - pt: none - eta: none - sin_phi: none - cos_phi: none - energy: none - -parameters: - model: gnn - bin_size: 128 - num_convs_id: 2 - num_convs_reg: 2 - num_hidden_id_enc: 2 - num_hidden_id_dec: 2 - num_hidden_reg_enc: 2 - num_hidden_reg_dec: 2 - num_neighbors: 16 - hidden_dim_id: 256 - hidden_dim_reg: 256 - distance_dim: 256 - dropout: 0.2 - dist_mult: 1.0 - activation: elu - skip_connection: True - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes - -onecycle: - mom_min: 0.85 - mom_max: 0.95 - warmup_ratio: 0.3 - div_factor: 25.0 - final_div: 100000.0 \ No newline at end of file diff --git a/parameters/delphes-gnn-skipconn.yaml b/parameters/delphes-gnn-skipconn.yaml deleted file mode 100644 index 0f83160a2..000000000 --- a/parameters/delphes-gnn-skipconn.yaml +++ /dev/null @@ -1,79 +0,0 @@ -backend: tensorflow - -dataset: - schema: delphes - target_particles: gen - num_input_features: 12 - num_output_features: 7 - #(none=0, track=1, cluster=2) - num_input_classes: 3 - num_output_classes: 6 - num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 1.0 - momentum_loss_coefs: - - 1.0 - - 1.0 - - 1.0 - - 1.0 - - 0.001 - raw_path: data/pythia8_ttbar/raw/*.pkl.bz2 - processed_path: data/pythia8_ttbar/tfr/*.tfrecords - num_files_per_chunk: 5 - validation_file_path: data/pythia8_qcd/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-6 - batch_size: 5 - num_events_train: 100 - num_events_test: 100 - num_epochs: 400 - num_val_files: -1 - dtype: float32 - trainable: all - multi_output: no - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: none - pt: none - eta: none - sin_phi: none - cos_phi: none - energy: none - -parameters: - model: gnn - bin_size: 128 - num_convs_id: 2 - num_convs_reg: 2 - num_hidden_id_enc: 2 - num_hidden_id_dec: 2 - num_hidden_reg_enc: 2 - num_hidden_reg_dec: 2 - num_neighbors: 16 - hidden_dim_id: 256 - hidden_dim_reg: 256 - distance_dim: 256 - dropout: 0.2 - dist_mult: 1.0 - activation: elu - skip_connection: True - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/delphes-transformer-skipconn.yaml b/parameters/delphes-transformer-skipconn.yaml deleted file mode 100644 index 9874e5289..000000000 --- a/parameters/delphes-transformer-skipconn.yaml +++ /dev/null @@ -1,65 +0,0 @@ -backend: tensorflow - -dataset: - schema: delphes - target_particles: gen - num_input_features: 12 - num_output_features: 7 - num_input_classes: 3 - num_output_classes: 6 - num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 1.0 - momentum_loss_coefs: - - 1.0 - - 1.0 - - 1.0 - - 1.0 - - 0.001 - raw_path: data/pythia8_ttbar/raw/*.pkl.bz2 - processed_path: data/pythia8_ttbar/tfr/*.tfrecords - 
num_files_per_chunk: 5 - validation_file_path: data/pythia8_ttbar/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-6 - batch_size: 5 - num_events_train: 40000 - num_events_test: 5000 - num_epochs: 300 - num_val_files: -1 - dtype: float16 - trainable: all - multi_output: no - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: none - pt: none - eta: none - sin_phi: none - cos_phi: none - energy: none - -parameters: - model: transformer - num_layers: 4 - d_model: 128 - num_heads: 4 - dff: 128 - support: 32 - skip_connection: yes - dropout: 0.2 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml new file mode 100644 index 000000000..10e9ac131 --- /dev/null +++ b/parameters/delphes.yaml @@ -0,0 +1,102 @@ +backend: tensorflow + +dataset: + schema: delphes + target_particles: gen + num_input_features: 12 + num_output_features: 7 + #(none=0, track=1, cluster=2) + num_input_classes: 3 + num_output_classes: 6 + num_momentum_outputs: 5 + padded_num_elem_size: 6400 + classification_loss_coef: 1.0 + charge_loss_coef: 0.1 + pt_loss_coef: 1.0 + eta_loss_coef: 0.1 + sin_phi_loss_coef: 1.0 + cos_phi_loss_coef: 1.0 + energy_loss_coef: 1.0 + raw_path: data/pythia8_ttbar/raw/*.pkl.bz2 + processed_path: data/pythia8_ttbar/tfr/*.tfrecords + num_files_per_chunk: 5 + validation_file_path: data/pythia8_qcd/val/*.pkl.bz2 + energy_loss: + type: Huber + delta: 1.0 + pt_loss: + type: Huber + delta: 1.0 + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + lr: 1e-4 + batch_size: 5 + num_events_train: 4000 + num_events_test: 10000 + num_epochs: 10 + num_val_files: 2 + dtype: float32 + trainable: all + multi_output: yes + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: default + activation: elu + layernorm: no + hidden_dim: 256 + bin_size: 32 + distance_dim: 16 + dropout: 0.0 + graph_kernel: + type: NodePairTrainableKernel + output_dim: 32 + hidden_dim: 32 + num_layers: 2 + activation: elu + num_graph_layers: 3 + node_message: + type: NodeMessageLearnable + output_dim: 256 + hidden_dim: 128 + num_layers: 3 + activation: elu + aggregation_direction: dst + num_node_messages: 1 + skip_connection: yes + regression_use_classification: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/test-cms-v2.yaml b/parameters/test-cms-v2.yaml deleted file mode 100644 index bf0c6f590..000000000 --- a/parameters/test-cms-v2.yaml +++ /dev/null @@ -1,81 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 - num_input_classes: 12 - #(none, ch.had, n.had, hfem, hfhad, gamma, e, mu) - num_output_classes: 8 - num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.001 - raw_path: 
data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr/*.tfrecords - num_files_per_chunk: 5 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-6 - batch_size: 5 - num_events_train: 5 - num_events_test: 5 - num_epochs: 1 - num_val_files: 1 - dtype: float32 - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - activation: elu - layernorm: no - hidden_dim: 128 - bin_size: 320 - clip_value_low: 0.0 - conv_config: - type: GHConvDense - output_dim: 128 - activation: elu - normalize_degrees: yes - num_conv: 1 - num_gsl: 1 - distance_dim: 128 - dropout: 0.0 - input_encoding: cms - graph_kernel: gaussian #gaussian, learnable - skip_connection: yes - regression_use_classification: yes - debug: no - -timing: - num_ev: 1 - num_iter: 1 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/test-cms.yaml b/parameters/test-cms.yaml index a6e4f1967..eba673909 100644 --- a/parameters/test-cms.yaml +++ b/parameters/test-cms.yaml @@ -48,24 +48,30 @@ sample_weights: sin_phi: signal_only cos_phi: signal_only energy: signal_only - + parameters: - model: gnn - bin_size: 64 - num_convs_id: 1 - num_convs_reg: 1 - num_hidden_id_enc: 1 - num_hidden_id_dec: 0 - num_hidden_reg_enc: 1 - num_hidden_reg_dec: 0 - num_neighbors: 16 - hidden_dim_id: 64 - hidden_dim_reg: 64 - distance_dim: 64 - dropout: 0.0 - dist_mult: 1.0 + model: gnn_dense + input_encoding: cms activation: elu - skip_connection: True + layernorm: no + hidden_dim: 128 + bin_size: 320 + distance_dim: 128 + dropout: 0.0 + graph_kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.1 + num_graph_layers: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: elu + normalize_degrees: yes + num_node_messages: 1 + regression_use_classification: yes + skip_connection: yes + debug: no timing: num_ev: 1 diff --git a/parameters/test-delphes.yaml b/parameters/test-delphes.yaml index 87c7208fe..7b93c2ff2 100644 --- a/parameters/test-delphes.yaml +++ b/parameters/test-delphes.yaml @@ -41,30 +41,36 @@ setup: sample_weights: cls: none - charge: none - pt: none - eta: none - sin_phi: none - cos_phi: none - energy: none + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only parameters: - model: gnn - bin_size: 64 - num_convs_id: 1 - num_convs_reg: 1 - num_hidden_id_enc: 1 - num_hidden_id_dec: 0 - num_hidden_reg_enc: 1 - num_hidden_reg_dec: 0 - num_neighbors: 16 - hidden_dim_id: 64 - hidden_dim_reg: 64 - distance_dim: 64 - dropout: 0.0 - dist_mult: 1.0 + model: gnn_dense + input_encoding: cms activation: elu - skip_connection: True + layernorm: no + hidden_dim: 128 + bin_size: 320 + distance_dim: 128 + dropout: 0.0 + graph_kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.1 + num_graph_layers: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: elu + normalize_degrees: yes + num_node_messages: 1 + regression_use_classification: yes + skip_connection: yes + debug: no timing: num_ev: 1 @@ -74,3 +80,4 @@ exponentialdecay: decay_steps: 10000 decay_rate: 0.99 staircase: yes + diff --git 
a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index 2f10ec7e2..870517f79 100755 --- a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -32,7 +32,7 @@ rm -Rf experiments/test-* #Run a simple training on a few events rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr -python3 mlpf/launcher.py --model-spec parameters/test-cms.yaml --action data +python3 mlpf/pipeline.py data -c parameters/test-cms.yaml #Run a simple training on a few events python3 mlpf/pipeline.py train -c parameters/test-cms.yaml -p test-cms- @@ -40,7 +40,5 @@ python3 mlpf/pipeline.py train -c parameters/test-cms.yaml -p test-cms- #Generate the pred.npz file of predictions python3 mlpf/pipeline.py evaluate -c parameters/test-cms.yaml -t ./experiments/test-cms-* -python3 scripts/test_load_tfmodel.py ./experiments/test-cms-*/model_frozen/frozen_graph.pb - -python3 mlpf/pipeline.py train -c parameters/test-cms-v2.yaml -p test-cms-v2- -python3 mlpf/pipeline.py evaluate -c parameters/test-cms-v2.yaml -t ./experiments/test-cms-v2-* +#Load the model +python3 scripts/test_load_tfmodel.py ./experiments/test-cms-*/model_frozen/frozen_graph.pb \ No newline at end of file diff --git a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh index 3117f8033..d01b4e04b 100755 --- a/scripts/local_test_delphes_pipeline.sh +++ b/scripts/local_test_delphes_pipeline.sh @@ -17,7 +17,7 @@ rm -Rf experiments/test-* #Run a simple training on a few events rm -Rf data/pythia8_ttbar/tfr -python3 mlpf/launcher.py --model-spec parameters/test-delphes.yaml --action data +python3 mlpf/pipeline.py data -c parameters/test-delphes.yaml #Run a simple training on a few events python3 mlpf/pipeline.py train -c parameters/test-delphes.yaml -p test-delphes- From 5cd29d3d476b72b42267c93f24fbe1dbfb690de3 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sat, 21 Aug 2021 16:26:36 +0300 Subject: [PATCH 062/157] fix --- mlpf/tfmodel/model.py | 361 ----------------------------------- parameters/test-delphes.yaml | 2 +- scripts/test_load_tfmodel.py | 4 +- 3 files changed, 3 insertions(+), 364 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index daaf181cf..56eda3eca 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -166,43 +166,6 @@ def call(self, X): X], axis=-1 ) -#https://arxiv.org/pdf/2004.04635.pdf -#https://github.com/gcucurull/jax-ghnet/blob/master/models.py -class GHConv(tf.keras.layers.Layer): - def __init__(self, *args, **kwargs): - self.activation = kwargs.pop("activation") - - super(GHConv, self).__init__(*args, **kwargs) - - def build(self, input_shape): - self.hidden_dim = input_shape[0][-1] - self.nelem = input_shape[0][-2] - self.W_t = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="w_t", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - self.b_t = self.add_weight(shape=(self.hidden_dim,), name="b_t", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - self.W_h = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="w_h", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - self.theta = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="theta", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - - #@tf.function - def call(self, inputs): - x, adj = inputs - - #compute the normalization of the 
adjacency matrix - in_degrees = tf.sparse.reduce_sum(tf.abs(adj), axis=-1) - in_degrees = tf.reshape(in_degrees, (tf.shape(x)[0], tf.shape(x)[1])) - - #add epsilon to prevent numerical issues from 1/sqrt(x) - norm = tf.expand_dims(tf.pow(in_degrees + 1e-6, -0.5), -1) - - f_hom = tf.linalg.matmul(x, self.theta) - f_hom = sparse_dense_matmult_batch(adj, f_hom*norm)*norm - - f_het = tf.linalg.matmul(x, self.W_h) - gate = tf.nn.sigmoid(tf.linalg.matmul(x, self.W_t) + self.b_t) - - out = gate*f_hom + (1-gate)*f_het - return self.activation(out) - - class GHConvDense(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) @@ -300,144 +263,6 @@ def get_message_layer(config_dict, name): return conv_cls(name=name, **config_dict) - -class SparseHashedNNDistance(tf.keras.layers.Layer): - def __init__(self, distance_dim=128, max_num_bins=200, bin_size=500, num_neighbors=5, dist_mult=0.1, **kwargs): - super(SparseHashedNNDistance, self).__init__(**kwargs) - self.num_neighbors = tf.constant(num_neighbors) - self.dist_mult = dist_mult - self.distance_dim = distance_dim - - #generate the codebook for LSH hashing at model instantiation for up to this many bins - #set this to a high-enough value at model generation to take into account the largest possible input - self.max_num_bins = tf.constant(max_num_bins) - - #each bin will receive this many input elements, in total we can accept max_num_bins*bin_size input elements - #in each bin, we will do a dense top_k evaluation - self.bin_size = bin_size - self.layer_encoding = point_wise_feed_forward_network(distance_dim, 128, "enc") - self.layer_edge = point_wise_feed_forward_network(1, 128, "edge") - - def build(self, input_shape): - #(n_batch, n_points, n_features) - - #generate the LSH codebook for random rotations (num_features, max_num_bins/2) - self.codebook_random_rotations = self.add_weight( - shape=(self.distance_dim, self.max_num_bins//2), initializer="random_normal", trainable=False, name="lsh_projections" - ) - - #@tf.function - def call(self, inputs, training=True): - - #(n_batch, n_points, n_features) - point_embedding = self.layer_encoding(inputs) - - n_batches = tf.shape(point_embedding)[0] - n_points = tf.shape(point_embedding)[1] - #points_neighbors = n_points * self.num_neighbors - - #cannot concat sparse tensors directly as that incorrectly destroys the gradient, see - #https://github.com/tensorflow/tensorflow/blob/df3a3375941b9e920667acfe72fb4c33a8f45503/tensorflow/python/ops/sparse_grad.py#L33 - def func(args): - ibatch, points_batch = args[0], args[1] - bins_split, (inds, vals) = self.construct_sparse_dm_batch(points_batch) - inds = tf.concat([tf.expand_dims(tf.cast(ibatch, tf.int64)*tf.ones(tf.shape(inds)[0], dtype=tf.int64), -1), inds], axis=-1) - return inds, vals, bins_split - - elems = (tf.range(0, n_batches, delta=1, dtype=tf.int64), point_embedding) - ret = tf.map_fn(func, elems, - fn_output_signature=( - tf.TensorSpec((None, 3), tf.int64), - tf.TensorSpec((None, ), inputs.dtype), - tf.TensorSpec((None, self.bin_size), tf.int32), - ), - parallel_iterations=2, back_prop=True - ) - - # #now create a new SparseTensor that is a concatenation of the per-batch tensor indices and values - shp = tf.shape(ret[0]) - dms = tf.SparseTensor( - tf.reshape(ret[0], (shp[0]*shp[1], shp[2])), - tf.reshape(ret[1], (shp[0]*shp[1],)), - (n_batches, n_points, n_points) - ) - - dm = tf.sparse.reorder(dms) - - i1 = tf.transpose(tf.stack([dm.indices[:, 0], dm.indices[:, 1]])) - 
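Note: the deleted sparse GHConv follows the gated graph convolution of arXiv:2004.04635, and the retained GHConvDense presumably applies the same update on dense, binned inputs: a degree-normalized propagated term and a per-node term are mixed by a learned sigmoid gate. Restating the removed call() densely for reference:

    import tensorflow as tf

    def gh_conv_dense(x, adj, W_t, b_t, W_h, theta):
        # symmetric degree normalization, "homophilic" propagated term,
        # "heterophilic" per-node term, mixed by a learned gate
        in_degrees = tf.reduce_sum(tf.abs(adj), axis=-1, keepdims=True)
        norm = tf.pow(in_degrees + 1e-6, -0.5)
        f_hom = norm * tf.linalg.matmul(adj, norm * tf.linalg.matmul(x, theta))
        f_het = tf.linalg.matmul(x, W_h)
        gate = tf.nn.sigmoid(tf.linalg.matmul(x, W_t) + b_t)
        # the real layer then applies the configured activation to this output
        return gate * f_hom + (1.0 - gate) * f_het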
i2 = tf.transpose(tf.stack([dm.indices[:, 0], dm.indices[:, 2]])) - x1 = tf.gather_nd(inputs, i1) - x2 = tf.gather_nd(inputs, i2) - - #run an edge net on (src node, dst node, edge) - edge_vals = tf.nn.sigmoid(self.layer_edge(tf.concat([x1, x2, tf.expand_dims(dm.values, axis=-1)], axis=-1))) - dm2 = tf.sparse.SparseTensor(indices=dm.indices, values=edge_vals[:, 0], dense_shape=dm.dense_shape) - - return dm2, ret[2] - - #@tf.function - def subpoints_to_sparse_matrix(self, subindices, subpoints): - - #find the distance matrix between the given points in all the LSH bins - dm = pairwise_gaussian_dist(subpoints, subpoints) #(LSH_bins, points_per_bin, points_per_bin) - dm = tf.exp(-self.dist_mult*dm) - - #dm = pairwise_sigmoid_dist(subpoints, subpoints) #(LSH_bins, points_per_bin, points_per_bin) - - dmshape = tf.shape(dm) - nbins = dmshape[0] - nelems = dmshape[1] - - #run KNN in the dense distance matrix, accumulate each index pair into a sparse distance matrix - top_k = tf.nn.top_k(dm, k=self.num_neighbors) - top_k_vals = tf.reshape(top_k.values, (nbins*nelems, self.num_neighbors)) - - indices_gathered = tf.map_fn( - lambda i: tf.gather_nd(subindices, top_k.indices[:, :, i:i+1], batch_dims=1), - tf.range(self.num_neighbors, dtype=tf.int32), fn_output_signature=tf.TensorSpec(None, tf.int32) - ) - indices_gathered = tf.transpose(indices_gathered, [1,2,0]) - - def func(i): - dst_ind = indices_gathered[:, :, i] #(nbins, nelems) - dst_ind = tf.reshape(dst_ind, (nbins*nelems, )) - src_ind = tf.reshape(tf.stack(subindices), (nbins*nelems, )) - src_dst_inds = tf.cast(tf.transpose(tf.stack([src_ind, dst_ind])), dtype=tf.int64) - return src_dst_inds, top_k_vals[:, i] - - ret = tf.map_fn(func, tf.range(0, self.num_neighbors, delta=1, dtype=tf.int32), fn_output_signature=(tf.int64, subpoints.dtype)) - - shp = tf.shape(ret[0]) - inds = tf.reshape(ret[0], (shp[0]*shp[1], 2)) - vals = tf.reshape(ret[1], (shp[0]*shp[1],)) - return inds, vals - - def construct_sparse_dm_batch(self, points): - #points: (n_points, n_features) input elements for graph construction - n_points = tf.shape(points)[0] - n_features = tf.shape(points)[1] - - #compute the number of LSH bins to divide the input points into on the fly - #n_points must be divisible by bin_size exactly due to the use of reshape - n_bins = tf.math.floordiv(n_points, self.bin_size) - - #put each input item into a bin defined by the softmax output across the LSH embedding - mul = tf.linalg.matmul(points, self.codebook_random_rotations[:, :n_bins//2]) - cmul = tf.concat([mul, -mul], axis=-1) - - #cmul is now an integer in [0..nbins) for each input point - #bins_split: (n_bins, bin_size) of integer bin indices, which puts each input point into a bin of size (n_points/n_bins) - bins_split = split_indices_to_bins(cmul, n_bins, self.bin_size) - - #parts: (n_bins, bin_size, n_features), the input points divided up into bins - parts = tf.gather(points, bins_split) - - #sparse_distance_matrix: (n_points, n_points) sparse distance matrix - #where higher values (closer to 1) are associated with points that are closely related - sparse_distance_matrix = self.subpoints_to_sparse_matrix(bins_split, parts) - - return bins_split, sparse_distance_matrix - class NodePairGaussianKernel(tf.keras.layers.Layer): def __init__(self, clip_value_low=0.0, dist_mult=0.1, **kwargs): self.clip_value_low = clip_value_low @@ -545,53 +370,6 @@ def call(self, x_msg, x_node, msk): return bins_split, x_features_binned, dm, msk_f_binned -class EncoderDecoderGNN(tf.keras.layers.Layer): - def 
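Note: both the removed SparseHashedNNDistance and, judging by its name and arguments, the retained MessageBuildingLayerLSH bucket the elements with locality-sensitive hashing before computing any pairwise distances, keeping the cost roughly linear in the number of input elements. The bin assignment, condensed from the removed construct_sparse_dm_batch:

    import tensorflow as tf

    def lsh_bin_assignment(points, codebook, bin_size):
        # project the points onto random rotations and mirror the projections so
        # that the argmax picks one of n_bins buckets; n_points must divide evenly
        n_points = tf.shape(points)[0]
        n_bins = tf.math.floordiv(n_points, bin_size)
        mul = tf.linalg.matmul(points, codebook[:, : n_bins // 2])
        cmul = tf.concat([mul, -mul], axis=-1)
        bin_idx = tf.argmax(cmul, axis=-1)  # preferred bin per point
        # the actual code then uses split_indices_to_bins(cmul, n_bins, bin_size)
        # to form (n_bins, bin_size) groups of equal size from this assignment
        return bin_idx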
__init__(self, encoders, decoders, dropout, activation, conv, **kwargs): - super(EncoderDecoderGNN, self).__init__(**kwargs) - name = kwargs.get("name") - - #assert(encoders[-1] == decoders[0]) - self.encoders = encoders - self.decoders = decoders - - self.encoding_layers = [] - for ilayer, nunits in enumerate(encoders): - self.encoding_layers.append( - tf.keras.layers.Dense(nunits, activation=activation, - kernel_regularizer=tf.keras.regularizers.L1(regularizer_weight), - bias_regularizer=tf.keras.regularizers.L1(regularizer_weight), - name="encoding_{}_{}".format(name, ilayer))) - if dropout > 0.0: - self.encoding_layers.append(tf.keras.layers.Dropout(dropout)) - - self.conv = conv - - self.decoding_layers = [] - for ilayer, nunits in enumerate(decoders): - self.decoding_layers.append( - tf.keras.layers.Dense(nunits, activation=activation, - kernel_regularizer=tf.keras.regularizers.L1(regularizer_weight), - bias_regularizer=tf.keras.regularizers.L1(regularizer_weight), - name="decoding_{}_{}".format(name, ilayer))) - if dropout > 0.0: - self.decoding_layers.append(tf.keras.layers.Dropout(dropout)) - - @tf.function - def call(self, inputs, distance_matrix, training=True): - x = inputs - - for layer in self.encoding_layers: - x = layer(x) - - for convlayer in self.conv: - x = convlayer([x, distance_matrix]) - - for layer in self.decoding_layers: - x = layer(x) - - return x - - class OutputDecoding(tf.keras.layers.Layer): def __init__(self, activation, hidden_dim, regression_use_classification, num_output_classes, schema, **kwargs): super(OutputDecoding, self).__init__(**kwargs) @@ -691,145 +469,6 @@ def call(self, X_input, X_encoded_id, X_encoded_reg, msk_input): return ret -#Simple message passing based on a matrix multiplication -#LEGACY, for Delphes paper compatibility! 
-class PFNet(tf.keras.Model): - def __init__(self, - multi_output=False, - num_input_classes=8, - num_output_classes=3, - num_momentum_outputs=3, - activation=tf.nn.selu, - hidden_dim_id=256, - hidden_dim_reg=256, - distance_dim=256, - convlayer="ghconv", - dropout=0.1, - bin_size=10, - num_node_messagess_id=1, - num_node_messagess_reg=1, - num_hidden_id_enc=1, - num_hidden_id_dec=1, - num_hidden_reg_enc=1, - num_hidden_reg_dec=1, - num_neighbors=5, - dist_mult=0.1, - skip_connection=False, - return_matrix=False): - - super(PFNet, self).__init__() - self.activation = activation - self.num_dists = 1 - self.num_momentum_outputs = num_momentum_outputs - self.skip_connection = skip_connection - self.multi_output = multi_output - self.return_matrix = return_matrix - - encoding_id = [] - decoding_id = [] - encoding_reg = [] - decoding_reg = [] - - #the encoder outputs and decoder inputs have to have the hidden dim (convlayer size) - for ihidden in range(num_hidden_id_enc): - encoding_id.append(hidden_dim_id) - - for ihidden in range(num_hidden_id_dec): - decoding_id.append(hidden_dim_id) - - for ihidden in range(num_hidden_reg_enc): - encoding_reg.append(hidden_dim_reg) - - for ihidden in range(num_hidden_reg_dec): - decoding_reg.append(hidden_dim_reg) - - self.enc = InputEncoding(num_input_classes) - #self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6) - - self.dist = SparseHashedNNDistance(distance_dim=distance_dim, bin_size=bin_size, num_neighbors=num_neighbors, dist_mult=dist_mult) - - convs_id = [] - convs_reg = [] - if convlayer == "sgconv": - for iconv in range(num_node_messagess_id): - convs_id.append(SGConv(k=1, activation=activation, name="conv_id{}".format(iconv))) - for iconv in range(num_node_messagess_reg): - convs_reg.append(SGConv(k=1, activation=activation, name="conv_reg{}".format(iconv))) - elif convlayer == "ghconv": - for iconv in range(num_node_messagess_id): - convs_id.append(GHConv(activation=activation, name="conv_id{}".format(iconv))) - for iconv in range(num_node_messagess_reg): - convs_reg.append(GHConv(activation=activation, name="conv_reg{}".format(iconv))) - - self.gnn_id = EncoderDecoderGNN(encoding_id, decoding_id, dropout, activation, convs_id, name="gnn_id") - self.layer_id = point_wise_feed_forward_network(num_output_classes, hidden_dim_id, "id", num_layers=3, activation=activation) - self.layer_charge = point_wise_feed_forward_network(1, hidden_dim_id, "charge", num_layers=3, activation=activation) - - self.gnn_reg = EncoderDecoderGNN(encoding_reg, decoding_reg, dropout, activation, convs_reg, name="gnn_reg") - self.layer_momentum = point_wise_feed_forward_network(num_momentum_outputs, hidden_dim_reg, "momentum", num_layers=3, activation=activation) - - def call(self, inputs, training=True): - X = inputs - msk_input = tf.expand_dims(tf.cast(X[:, :, 0] != 0, tf.dtypes.float32), -1) - - enc = self.enc(inputs) - - #create a graph structure from the encoded nodes - dm, bins = self.dist(enc, training) - - #run graph net for multiclass id prediction - x_id = self.gnn_id(enc, dm, training) - - if self.skip_connection: - to_decode = tf.concat([enc, x_id], axis=-1) - else: - to_decode = tf.concat([x_id], axis=-1) - - out_id_logits = self.layer_id(to_decode)*msk_input - out_charge = self.layer_charge(to_decode)*msk_input - - #run graph net for regression output prediction, taking as an additonal input the ID predictions - x_reg = self.gnn_reg(tf.concat([enc, tf.cast(out_id_logits, X.dtype)], axis=-1), dm, training) - - if self.skip_connection: - to_decode 
= tf.concat([enc, tf.cast(out_id_logits, X.dtype), x_reg], axis=-1) - else: - to_decode = tf.concat([tf.cast(out_id_logits, X.dtype), x_reg], axis=-1) - - pred_momentum = self.layer_momentum(to_decode)*msk_input - - out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) - out_charge = tf.clip_by_value(out_charge, -2, 2) - - if self.multi_output: - ret = { - "cls": out_id_softmax, - "charge": out_charge, - "pt": tf.exp(tf.clip_by_value(pred_momentum[:, :, 0:1], -4, 4)), - "eta": pred_momentum[:, :, 1:2], - "sin_phi": pred_momentum[:, :, 2:3], - "cos_phi": pred_momentum[:, :, 3:4], - "energy": tf.exp(tf.clip_by_value(pred_momentum[:, :, 4:5], -5, 6)) - } - if self.return_matrix: - ret["dm"] = dm - ret["bins"] = bins - return ret - else: - return tf.concat([out_id_softmax, out_charge, pred_momentum], axis=-1) - - def set_trainable_classification(self): - for layer in self.layers: - layer.trainable = False - self.gnn_id.trainable = True - self.layer_id.trainable = True - - def set_trainable_regression(self): - for layer in self.layers: - layer.trainable = False - self.gnn_reg.trainable = True - self.layer_momentum.trainable = True - class CombinedGraphLayer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): diff --git a/parameters/test-delphes.yaml b/parameters/test-delphes.yaml index 7b93c2ff2..5bacba3c8 100644 --- a/parameters/test-delphes.yaml +++ b/parameters/test-delphes.yaml @@ -50,7 +50,7 @@ sample_weights: parameters: model: gnn_dense - input_encoding: cms + input_encoding: default activation: elu layernorm: no hidden_dim: 128 diff --git a/scripts/test_load_tfmodel.py b/scripts/test_load_tfmodel.py index de854e422..2e30b6346 100644 --- a/scripts/test_load_tfmodel.py +++ b/scripts/test_load_tfmodel.py @@ -2,7 +2,7 @@ import sys import numpy as np -bin_size = 128 +bin_size = 640 num_features = 15 def load_graph(frozen_graph_filename): @@ -28,5 +28,5 @@ def load_graph(frozen_graph_filename): graph = load_graph(sys.argv[1]) with tf.compat.v1.Session(graph=graph) as sess: - out = sess.run("Identity:0", feed_dict={"x:0": np.random.randn(1, 39*bin_size, num_features)}) + out = sess.run("Identity:0", feed_dict={"x:0": np.random.randn(1, 10*bin_size, num_features)}) print(out) From a2223edcf277ce68eca67f03c50f2c097bd49d7e Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sat, 21 Aug 2021 16:32:00 +0300 Subject: [PATCH 063/157] purge PFNet --- mlpf/tfmodel/model_setup.py | 40 ++++--------------------------------- 1 file changed, 4 insertions(+), 36 deletions(-) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 9f65f0b9d..171024b8e 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -1,4 +1,4 @@ -from .model import DummyNet, PFNet, PFNetDense +from .model import DummyNet, PFNetDense import tensorflow as tf import tensorflow_probability @@ -321,45 +321,13 @@ def scale_outputs(X,y,w): def make_model(config, dtype): model = config['parameters']['model'] - if model == 'gnn': - return make_gnn(config, dtype) - elif model == 'dense': + + if model == 'dense': return make_dense(config, dtype) elif model == 'gnn_dense': return make_gnn_dense(config, dtype) - raise KeyError("Unknown model type {}".format(model)) - -def make_gnn(config, dtype): - activation = getattr(tf.nn, config['parameters']['activation']) - - parameters = [ - 'bin_size', - 'num_node_messagess_id', - 'num_node_messagess_reg', - 'num_hidden_id_enc', - 'num_hidden_id_dec', - 'num_hidden_reg_enc', - 'num_hidden_reg_dec', - 'num_neighbors', - 'hidden_dim_id', - 
'hidden_dim_reg', - 'dist_mult', - 'distance_dim', - 'dropout', - 'skip_connection' - ] - kwargs = {par: config['parameters'][par] for par in parameters} - model = PFNet( - multi_output=config["setup"]["multi_output"], - num_input_classes=config["dataset"]["num_input_classes"], - num_output_classes=config["dataset"]["num_output_classes"], - num_momentum_outputs=config["dataset"]["num_momentum_outputs"], - activation=activation, - **kwargs - ) - - return model + raise KeyError("Unknown model type {}".format(model)) def make_gnn_dense(config, dtype): From e0adb69811586ce4f43bc1f4cba13ac872d94844 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sat, 21 Aug 2021 17:06:16 +0300 Subject: [PATCH 064/157] update --- notebooks/delphes-tf-mlpf-quickstart.ipynb | 179 ++++++++++++++------- 1 file changed, 122 insertions(+), 57 deletions(-) diff --git a/notebooks/delphes-tf-mlpf-quickstart.ipynb b/notebooks/delphes-tf-mlpf-quickstart.ipynb index beed8fa92..39988fbfe 100644 --- a/notebooks/delphes-tf-mlpf-quickstart.ipynb +++ b/notebooks/delphes-tf-mlpf-quickstart.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "incredible-pressing", "metadata": {}, "source": [ "This quickstart notebook allows to test and mess around with the MLPF GNN model in a standalone way. For actual training, we don't use a notebook, please refer to `README.md`.\n", @@ -18,7 +17,6 @@ { "cell_type": "code", "execution_count": null, - "id": "comparative-stockholm", "metadata": {}, "outputs": [], "source": [ @@ -26,13 +24,13 @@ "import numpy as np\n", "import tensorflow as tf\n", "import sklearn\n", + "import sklearn.metrics\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, - "id": "limited-prisoner", "metadata": {}, "outputs": [], "source": [ @@ -43,19 +41,15 @@ { "cell_type": "code", "execution_count": null, - "id": "dominant-thumb", "metadata": {}, "outputs": [], "source": [ - "import tfmodel\n", - "import tfmodel.model as mlpf_model\n", - "from tfmodel.model_setup import PFNetLoss" + "import tfmodel" ] }, { "cell_type": "code", "execution_count": null, - "id": "billion-rental", "metadata": {}, "outputs": [], "source": [ @@ -65,7 +59,6 @@ { "cell_type": "code", "execution_count": null, - "id": "fatal-residence", "metadata": {}, "outputs": [], "source": [ @@ -75,7 +68,6 @@ { "cell_type": "code", "execution_count": null, - "id": "facial-screening", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +78,6 @@ { "cell_type": "code", "execution_count": null, - "id": "charged-defense", "metadata": {}, "outputs": [], "source": [ @@ -112,19 +103,18 @@ { "cell_type": "code", "execution_count": null, - "id": "intellectual-trout", "metadata": {}, "outputs": [], "source": [ "#Get the first event\n", "input_classes = np.unique(X[:, :, 0].flatten())\n", - "output_classes = np.unique(y[:, :, 0].flatten())" + "output_classes = np.unique(y[:, :, 0].flatten())\n", + "num_output_classes = len(output_classes)" ] }, { "cell_type": "code", "execution_count": null, - "id": "optimum-automation", "metadata": {}, "outputs": [], "source": [ @@ -134,7 +124,6 @@ { "cell_type": "code", "execution_count": null, - "id": "metropolitan-burton", "metadata": {}, "outputs": [], "source": [ @@ -144,67 +133,154 @@ { "cell_type": "code", "execution_count": null, - "id": "systematic-aquarium", "metadata": {}, "outputs": [], "source": [ - "#ygen = (pid, charge, momentum values)\n", - "num_momentum_outputs = data[\"ygen\"][0].shape[1] - 2" + "def transform_target(y):\n", + " return {\n", + " \"cls\": 
tf.one_hot(tf.cast(y[:, :, 0], tf.int32), num_output_classes),\n", + " \"charge\": y[:, :, 1:2],\n", + " \"pt\": y[:, :, 2:3],\n", + " \"eta\": y[:, :, 3:4],\n", + " \"sin_phi\": y[:, :, 4:5],\n", + " \"cos_phi\": y[:, :, 5:6],\n", + " \"energy\": y[:, :, 6:7],\n", + " }\n", + "yt = transform_target(y)" ] }, { "cell_type": "code", "execution_count": null, - "id": "plain-flooring", "metadata": {}, "outputs": [], "source": [ - "model = mlpf_model.PFNet(\n", + "from tfmodel.model import PFNetDense" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "msk_true_particle = y[:, :, 0]!=0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.unique(y[msk_true_particle][:, 0], return_counts=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(yt[\"pt\"][msk_true_particle].flatten(), bins=100);\n", + "plt.xlabel(\"pt\")\n", + "plt.yscale(\"log\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(yt[\"eta\"][msk_true_particle].flatten(), bins=100);\n", + "plt.xlabel(\"eta\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(yt[\"sin_phi\"][msk_true_particle].flatten(), bins=100);\n", + "plt.xlabel(\"sin phi\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(yt[\"cos_phi\"][msk_true_particle].flatten(), bins=100);\n", + "plt.xlabel(\"cos phi\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(yt[\"energy\"][msk_true_particle].flatten(), bins=100);\n", + "plt.xlabel(\"energy\")\n", + "plt.yscale(\"log\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = PFNetDense(\n", " num_input_classes=len(input_classes),\n", " num_output_classes=len(output_classes),\n", - " num_momentum_outputs=num_momentum_outputs,\n", " activation=tf.nn.elu,\n", + " hidden_dim=128,\n", " bin_size=128,\n", - " num_neighbors=16\n", + " input_encoding=\"default\",\n", + " multi_output=True\n", ")\n", "\n", - "#combined multiclass + regression loss\n", - "loss = PFNetLoss(\n", - " num_input_classes=len(input_classes),\n", - " num_output_classes=len(output_classes),\n", - " \n", - " #(pt, eta, sin phi, cos phi, E)\n", - " momentum_loss_coefs=[0.001, 1.0, 1.0, 1.0, 0.001]\n", - ")\n", - "\n", - "#temporal weight mode means each input element in the event can get a separate weight\n", - "model.compile(loss=loss.my_loss_full, optimizer=\"adam\", sample_weight_mode=\"temporal\")" + "# #temporal weight mode means each input element in the event can get a separate weight\n", + "model.compile(\n", + " loss={\n", + " \"cls\": tf.keras.losses.CategoricalCrossentropy(from_logits=False),\n", + " \"charge\": tf.keras.losses.MeanSquaredError(),\n", + " \"pt\": tf.keras.losses.MeanSquaredError(),\n", + " \"energy\": tf.keras.losses.MeanSquaredError(),\n", + " \"eta\": tf.keras.losses.MeanSquaredError(),\n", + " \"sin_phi\": tf.keras.losses.MeanSquaredError(),\n", + " \"cos_phi\": tf.keras.losses.MeanSquaredError()\n", + " },\n", + " optimizer=\"adam\",\n", + " sample_weight_mode=\"temporal\"\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": 
"patient-rating", "metadata": {}, "outputs": [], "source": [ - "X.shape, y.shape" + "model(X[:1])" ] }, { "cell_type": "code", "execution_count": null, - "id": "egyptian-working", "metadata": {}, "outputs": [], "source": [ - "history = model.fit(X[:80], y[:80], validation_data=(X[80:], y[80:]), batch_size=5, epochs=10)" + "model.fit(X, yt, epochs=2, batch_size=5)" ] }, { "cell_type": "code", "execution_count": null, - "id": "shaped-bryan", "metadata": {}, "outputs": [], "source": [ @@ -214,55 +290,44 @@ { "cell_type": "code", "execution_count": null, - "id": "passive-sitting", "metadata": {}, "outputs": [], "source": [ "#index of the class prediction output values\n", "pred_id_offset = len(output_classes)\n", - "ypred_ids_raw = ypred[:, :, :pred_id_offset]\n", - "ypred_charge = ypred[:, :, pred_id_offset:pred_id_offset+1]\n", - "ypred_momentum = ypred[:, :, pred_id_offset+1:]" + "ypred_ids_raw = ypred[\"cls\"]" ] }, { "cell_type": "code", "execution_count": null, - "id": "virtual-reflection", "metadata": {}, "outputs": [], "source": [ "sklearn.metrics.confusion_matrix(\n", " np.argmax(ypred_ids_raw, axis=-1).flatten(),\n", - " y[:, :, 0].flatten(), labels=output_classes\n", + " np.argmax(yt[\"cls\"], axis=-1).flatten(), labels=output_classes\n", ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "distinct-sierra", "metadata": {}, "outputs": [], "source": [ - "#compare the predicted and true charge\n", - "np.stack([ypred_charge[:, :, 0].flatten(), y[:, :, 1].flatten()]).T" + "msk_particles = (X[:, :, 0]!=0)\n", + "plt.scatter(\n", + " ypred[\"eta\"][msk_particles].flatten(),\n", + " yt[\"eta\"][msk_particles].flatten(), marker=\".\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "split-detail", "metadata": {}, "outputs": [], - "source": [ - "#first two values in the y array are ID anc charge\n", - "momentum_offset = 2\n", - "\n", - "#momentum eta component\n", - "imomentum = 1\n", - "plt.scatter(ypred_momentum[:, :, imomentum].flatten(), y[:, :, imomentum+momentum_offset].flatten(), marker=\".\")" - ] + "source": [] } ], "metadata": { @@ -281,7 +346,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.5" } }, "nbformat": 4, From be7e2b065df8ad7ae502f5fa04ce1ba063a9f8b2 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sat, 21 Aug 2021 17:07:01 +0300 Subject: [PATCH 065/157] many preds --- notebooks/delphes_model_analysis.ipynb | 38 ++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/notebooks/delphes_model_analysis.ipynb b/notebooks/delphes_model_analysis.ipynb index 8eb936c72..bf0649cf4 100644 --- a/notebooks/delphes_model_analysis.ipynb +++ b/notebooks/delphes_model_analysis.ipynb @@ -420,6 +420,44 @@ "```" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# def load_many_preds(path):\n", + "# Xs = []\n", + "# ygens = []\n", + "# ycands = []\n", + "# ypreds = []\n", + "\n", + "# for fi in glob.glob(path):\n", + "# dd = np.load(fi)\n", + "# Xs.append(dd[\"X\"])\n", + "# ygens.append(dd[\"ygen\"])\n", + "# ycands.append(dd[\"ycand\"])\n", + "# ypreds.append(dd[\"ypred\"])\n", + " \n", + "# X = np.concatenate(Xs)\n", + "# msk_X = X[:, :, 0]!=0\n", + "\n", + "# ygen = np.concatenate(ygens)\n", + "# ycand = np.concatenate(ycands)\n", + "# ypred = np.concatenate(ypreds)\n", + "\n", + "# return X, ygen, ycand, ypred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "# X, ygen, ycand, ypred = load_many_preds(\"../experiments/delphes_20210821_160504.joosep-desktop/evaluation/*.npz\")" + ] + }, { "cell_type": "code", "execution_count": null, From 59dbb6406a3d2375fdfa6ee56fc1bf3c848b183a Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sat, 21 Aug 2021 17:09:14 +0300 Subject: [PATCH 066/157] cleanup scripts --- mlpf/tallinn/{cms-gnn-dense.sh => cms-dev.sh} | 4 +-- mlpf/tallinn/cms-gen-gnn-skipconn-v2.sh | 10 ------- mlpf/tallinn/cms-gnn-dense-focal-retrain.sh | 10 ------- mlpf/tallinn/cms-gnn-dense-focal.sh | 10 ------- mlpf/tallinn/cms-gnn-dense-transfer.sh | 10 ------- mlpf/tallinn/cms-gnn-skipconn-v2.sh | 10 ------- mlpf/tallinn/cms-gnn-skipconn.sh | 11 ------- mlpf/tallinn/cms-transformer-skipconn-gun.sh | 10 ------- mlpf/tallinn/cms-transformer-skipconn.sh | 10 ------- mlpf/tallinn/{cms-gnn-dense-dev.sh => cms.sh} | 4 +-- mlpf/tallinn/delphes-dense.sh | 10 ------- mlpf/tallinn/delphes-gnn-skipconn.sh | 10 ------- mlpf/tallinn/delphes-gnn.sh | 10 ------- mlpf/tallinn/delphes-transformer-skipconn.sh | 10 ------- mlpf/tallinn/delphes-transformer.sh | 10 ------- mlpf/tallinn/opt_master.sh | 11 ------- mlpf/tallinn/opt_tuner.sh | 12 -------- mlpf/tallinn/train.sh | 30 ------------------- 18 files changed, 4 insertions(+), 188 deletions(-) rename mlpf/tallinn/{cms-gnn-dense.sh => cms-dev.sh} (81%) delete mode 100755 mlpf/tallinn/cms-gen-gnn-skipconn-v2.sh delete mode 100755 mlpf/tallinn/cms-gnn-dense-focal-retrain.sh delete mode 100755 mlpf/tallinn/cms-gnn-dense-focal.sh delete mode 100755 mlpf/tallinn/cms-gnn-dense-transfer.sh delete mode 100755 mlpf/tallinn/cms-gnn-skipconn-v2.sh delete mode 100755 mlpf/tallinn/cms-gnn-skipconn.sh delete mode 100755 mlpf/tallinn/cms-transformer-skipconn-gun.sh delete mode 100755 mlpf/tallinn/cms-transformer-skipconn.sh rename mlpf/tallinn/{cms-gnn-dense-dev.sh => cms.sh} (80%) delete mode 100755 mlpf/tallinn/delphes-dense.sh delete mode 100755 mlpf/tallinn/delphes-gnn-skipconn.sh delete mode 100755 mlpf/tallinn/delphes-gnn.sh delete mode 100755 mlpf/tallinn/delphes-transformer-skipconn.sh delete mode 100755 mlpf/tallinn/delphes-transformer.sh delete mode 100755 mlpf/tallinn/opt_master.sh delete mode 100755 mlpf/tallinn/opt_tuner.sh delete mode 100755 mlpf/tallinn/train.sh diff --git a/mlpf/tallinn/cms-gnn-dense.sh b/mlpf/tallinn/cms-dev.sh similarity index 81% rename from mlpf/tallinn/cms-gnn-dense.sh rename to mlpf/tallinn/cms-dev.sh index 423ccf8e2..4061d4b85 100755 --- a/mlpf/tallinn/cms-gnn-dense.sh +++ b/mlpf/tallinn/cms-dev.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH -p gpu -#SBATCH --gpus 5 +#SBATCH --gpus 4 #SBATCH --mem-per-gpu=8G IMG=/home/software/singularity/base.simg:latest cd ~/particleflow #TF training -singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/cms-gnn-dense.yaml +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/cms-dev.yaml diff --git a/mlpf/tallinn/cms-gen-gnn-skipconn-v2.sh b/mlpf/tallinn/cms-gen-gnn-skipconn-v2.sh deleted file mode 100755 index 4d01fc3be..000000000 --- a/mlpf/tallinn/cms-gen-gnn-skipconn-v2.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec -B /home -B /scratch-persistent --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gen-gnn-skipconn-v2.yaml --action train diff --git a/mlpf/tallinn/cms-gnn-dense-focal-retrain.sh 
b/mlpf/tallinn/cms-gnn-dense-focal-retrain.sh deleted file mode 100755 index dfc675c1a..000000000 --- a/mlpf/tallinn/cms-gnn-dense-focal-retrain.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gnn-dense-focal.yaml --action train --modifier retrain_energy --recreate --weights experiments/cms-gnn-dense-focal-285ae825.gpu0.local/weights-300-1.175282.hdf5 diff --git a/mlpf/tallinn/cms-gnn-dense-focal.sh b/mlpf/tallinn/cms-gnn-dense-focal.sh deleted file mode 100755 index 765fcec67..000000000 --- a/mlpf/tallinn/cms-gnn-dense-focal.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gnn-dense-focal.yaml --action train diff --git a/mlpf/tallinn/cms-gnn-dense-transfer.sh b/mlpf/tallinn/cms-gnn-dense-transfer.sh deleted file mode 100755 index 203a79635..000000000 --- a/mlpf/tallinn/cms-gnn-dense-transfer.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 4 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gnn-dense-transfer.yaml --action train --recreate diff --git a/mlpf/tallinn/cms-gnn-skipconn-v2.sh b/mlpf/tallinn/cms-gnn-skipconn-v2.sh deleted file mode 100755 index 5b3a5563f..000000000 --- a/mlpf/tallinn/cms-gnn-skipconn-v2.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gnn-skipconn-v2.yaml --action train diff --git a/mlpf/tallinn/cms-gnn-skipconn.sh b/mlpf/tallinn/cms-gnn-skipconn.sh deleted file mode 100755 index 15205e9ce..000000000 --- a/mlpf/tallinn/cms-gnn-skipconn.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gnn-skipconn.yaml --action train --weights experiments/cms-gnn-skipconn-9f17890f/weights-500-0.994515.hdf5 -#CUDA_VISIBLE_DEVICES=0 singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-gnn-skipconn.yaml --action eval --weights experiments/cms-gnn-skipconn-6cfe8834/weights-328-1.010852.hdf5 diff --git a/mlpf/tallinn/cms-transformer-skipconn-gun.sh b/mlpf/tallinn/cms-transformer-skipconn-gun.sh deleted file mode 100755 index 43a8bab6f..000000000 --- a/mlpf/tallinn/cms-transformer-skipconn-gun.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 2 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-transformer-skipconn-gun.yaml --action train diff --git a/mlpf/tallinn/cms-transformer-skipconn.sh b/mlpf/tallinn/cms-transformer-skipconn.sh deleted file mode 100755 index 854905880..000000000 --- a/mlpf/tallinn/cms-transformer-skipconn.sh +++ 
/dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/cms-transformer-skipconn.yaml --action train diff --git a/mlpf/tallinn/cms-gnn-dense-dev.sh b/mlpf/tallinn/cms.sh similarity index 80% rename from mlpf/tallinn/cms-gnn-dense-dev.sh rename to mlpf/tallinn/cms.sh index 7d1eda4a8..9e614cc5d 100755 --- a/mlpf/tallinn/cms-gnn-dense-dev.sh +++ b/mlpf/tallinn/cms.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH -p gpu -#SBATCH --gpus 2 +#SBATCH --gpus 4 #SBATCH --mem-per-gpu=8G IMG=/home/software/singularity/base.simg:latest cd ~/particleflow #TF training -singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/cms-gnn-dense-dev.yaml +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/cms.yaml diff --git a/mlpf/tallinn/delphes-dense.sh b/mlpf/tallinn/delphes-dense.sh deleted file mode 100755 index 9e4a497de..000000000 --- a/mlpf/tallinn/delphes-dense.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow/delphes - -#TF training -singularity exec --nv $IMG python3 ../mlpf/tensorflow/delphes_model.py --model-spec parameters/delphes-dense.yaml --action train diff --git a/mlpf/tallinn/delphes-gnn-skipconn.sh b/mlpf/tallinn/delphes-gnn-skipconn.sh deleted file mode 100755 index fa1857855..000000000 --- a/mlpf/tallinn/delphes-gnn-skipconn.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/delphes-gnn-skipconn.yaml --action train diff --git a/mlpf/tallinn/delphes-gnn.sh b/mlpf/tallinn/delphes-gnn.sh deleted file mode 100755 index db2f3739e..000000000 --- a/mlpf/tallinn/delphes-gnn.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 2 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow/delphes - -#TF training -singularity exec --nv $IMG python3 ../mlpf/tensorflow/delphes_model.py parameters/delphes-gnn.yaml diff --git a/mlpf/tallinn/delphes-transformer-skipconn.sh b/mlpf/tallinn/delphes-transformer-skipconn.sh deleted file mode 100755 index cd096c986..000000000 --- a/mlpf/tallinn/delphes-transformer-skipconn.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 5 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/launcher.py --model-spec parameters/delphes-transformer-skipconn.yaml --action train diff --git a/mlpf/tallinn/delphes-transformer.sh b/mlpf/tallinn/delphes-transformer.sh deleted file mode 100755 index 6ffbb9675..000000000 --- a/mlpf/tallinn/delphes-transformer.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 4 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow/delphes - -#TF training -singularity exec --nv $IMG python3 ../mlpf/tensorflow/delphes_model.py parameters/delphes-transformer.yaml diff --git a/mlpf/tallinn/opt_master.sh b/mlpf/tallinn/opt_master.sh deleted file mode 100755 index 383de9ad6..000000000 --- a/mlpf/tallinn/opt_master.sh +++ /dev/null 
@@ -1,11 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 0 - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -export SINGULARITYENV_KERASTUNER_TUNER_ID="chief" -export SINGULARITYENV_KERASTUNER_ORACLE_IP="127.0.0.1" -export SINGULARITYENV_KERASTUNER_ORACLE_PORT="8000" -singularity exec -B /scratch $IMG python3 mlpf/tensorflow/opt.py diff --git a/mlpf/tallinn/opt_tuner.sh b/mlpf/tallinn/opt_tuner.sh deleted file mode 100755 index 80aedae22..000000000 --- a/mlpf/tallinn/opt_tuner.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 1 - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#export SINGULARITYENV_KERASTUNER_TUNER_ID="tuner-${SLURM_JOB_ID}" -#export SINGULARITYENV_KERASTUNER_ORACLE_IP="127.0.0.1" -#export SINGULARITYENV_KERASTUNER_ORACLE_PORT="8000" - -singularity exec -B /scratch --nv $IMG python3 mlpf/tensorflow/opt.py diff --git a/mlpf/tallinn/train.sh b/mlpf/tallinn/train.sh deleted file mode 100755 index bb0f5cddf..000000000 --- a/mlpf/tallinn/train.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gpus 1 -#SBATCH --mem-per-gpu=8G - -IMG=/home/software/singularity/base.simg:latest -cd ~/particleflow - -#TF training -singularity exec --nv $IMG python3 mlpf/tensorflow/tf_model.py \ - --datapath ./data/TTbar_14TeV_TuneCUETP8M1_cfi \ - --target cand --ntrain 70000 --ntest 20000 --convlayer ghconv \ - --lr 1e-5 --nepochs 100 --num-neighbors 10 \ - --num-hidden-id-enc 1 --num-hidden-id-dec 2 \ - --num-hidden-reg-enc 1 --num-hidden-reg-dec 2 \ - --bin-size 100 --hidden-dim-id 256 --hidden-dim-reg 256 \ - --batch-size 5 --distance-dim 256 \ - --dropout 0.0 \ - --num-convs-id 3 --num-convs-reg 3 --load experiments/run_13/weights.27-*.hdf5 - -#Pytorch training -#singularity exec -B /home --nv $IMG \ -# python3 test/train_end2end.py \ -# --dataset /home/joosep/particleflow/data/TTbar_14TeV_TuneCUETP8M1_cfi \ -# --n_train 400 --n_val 100 \ -# --model PFNet7 --convlayer gravnet-radius --lr 0.005 \ -# --hidden_dim 32 --n_epochs 100 \ -# --l1 1000.0 --l2 100.0 --l3 1000.0 --space_dim 2 --nearest 5 --convlayer2 sgconv \ -# --target cand --batch_size 1 --activation leaky_relu \ -# --dropout 0.0 --encoding_dim 256 --optimizer adamw --radius 0.01 --input-encoding 0 From 6c14d4924ce16802e21c87b52569583c7d727de4 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sat, 21 Aug 2021 17:09:53 +0300 Subject: [PATCH 067/157] remove timing --- scripts/local_test_delphes_pipeline.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh index d01b4e04b..58c550a77 100755 --- a/scripts/local_test_delphes_pipeline.sh +++ b/scripts/local_test_delphes_pipeline.sh @@ -24,7 +24,3 @@ python3 mlpf/pipeline.py train -c parameters/test-delphes.yaml -p test-delphes- #Generate the pred.npz file of predictions python3 mlpf/pipeline.py evaluate -c parameters/test-delphes.yaml -t ./experiments/test-delphes-* - -#Generate the timing file -python3 mlpf/launcher.py --model-spec parameters/test-delphes.yaml --action time --weights ./experiments/test-delphes-*/weights/weights-01-*.hdf5 - From a0d2189bb1c9bfd7bc1c7f82d3b7faf791a797df Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sat, 21 Aug 2021 17:37:42 +0300 Subject: [PATCH 068/157] run evaluation with a possibly changed path --- mlpf/pipeline.py | 54 ++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 
988b592ca..00af55c24 100644
--- a/mlpf/pipeline.py
+++ b/mlpf/pipeline.py
@@ -196,16 +196,6 @@ def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq):
     print("Training done.")
 
-    print("Starting evaluation...")
-    eval_dir = Path(outdir) / "evaluation"
-    eval_dir.mkdir()
-    eval_dir = str(eval_dir)
-    # TODO: change to use the evaluate() function below instead of eval_model()
-    eval_model(X_val, ygen_val, ycand_val, model, config, eval_dir, global_batch_size)
-    print("Evaluation done.")
-
-    freeze_model(model, config, outdir)
-
 
 @main.command()
 @click.help_option("-h", "--help")
@@ -213,7 +203,8 @@ def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq):
 @click.option("-c", "--config", help="configuration file", type=click.Path())
 @click.option("-w", "--weights", default=None, help="trained weights to load", type=click.Path())
 @click.option("-e", "--evaluation_dir", help="optionally specify evaluation output dir", type=click.Path())
-def evaluate(config, train_dir, weights, evaluation_dir):
+@click.option("-v", "--validation_files", help="optionally override validation file path", type=click.Path(), default=None)
+def evaluate(config, train_dir, weights, evaluation_dir, validation_files):
     """Evaluate the trained model in train_dir"""
     if config is None:
         config = Path(train_dir) / "config.yaml"
@@ -237,32 +228,31 @@ def evaluate(config, train_dir, weights, evaluation_dir):
     model_dtype = tf.dtypes.float32
 
     dataset_def = get_dataset_def(config)
+
+    if not (validation_files is None):
+        dataset_def.val_filelist = glob.glob(str(validation_files))
+
     X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=False)
 
-    strategy, maybe_global_batch_size = get_strategy(global_batch_size)
-    if maybe_global_batch_size is not None:
-        global_batch_size = maybe_global_batch_size
+    model = make_model(config, model_dtype)
 
-    with strategy.scope():
-        model = make_model(config, model_dtype)
+    # Evaluate model once to build the layers
+    print(X_val.shape)
+    model(tf.cast(X_val[:1], model_dtype))
 
-        # Evaluate model once to build the layers
-        print(X_val.shape)
-        model(tf.cast(X_val[:1], model_dtype))
-
-        # need to load the weights in the same trainable configuration as the model was set up
-        configure_model_weights(model, config["setup"].get("weights_config", "all"))
-        if weights:
-            model.load_weights(weights, by_name=True)
-        else:
-            weights = get_best_checkpoint(train_dir)
-            print("Loading best weights that could be found from {}".format(weights))
-            model.load_weights(weights, by_name=True)
-        model(tf.cast(X_val[:1], model_dtype))
+    # need to load the weights in the same trainable configuration as the model was set up
+    configure_model_weights(model, config["setup"].get("weights_config", "all"))
+    if weights:
+        model.load_weights(weights, by_name=True)
+    else:
+        weights = get_best_checkpoint(train_dir)
+        print("Loading best weights that could be found from {}".format(weights))
+        model.load_weights(weights, by_name=True)
+    model(tf.cast(X_val[:1], model_dtype))
 
-        model.compile()
-        eval_model(X_val, ygen_val, ycand_val, model, config, eval_dir, global_batch_size)
-        freeze_model(model, config, train_dir)
+    model.compile()
+    eval_model(X_val, ygen_val, ycand_val, model, config, eval_dir, global_batch_size)
+    freeze_model(model, config, train_dir)
 
 
 @main.command()
 @click.help_option("-h", "--help")
 @click.option("-c", "--config", help="configuration file", type=click.Path())

From afdf6a0609333d2388ef9b5f3a4de3a382665302 Mon Sep 17 00:00:00 2001
From: Joosep Pata
Date: Sat, 21 Aug 2021 17:40:08 +0300
Subject: [PATCH 069/157] update

---
 notebooks/delphes_model_analysis.ipynb | 239 
+++++++++++++------------ 1 file changed, 120 insertions(+), 119 deletions(-) diff --git a/notebooks/delphes_model_analysis.ipynb b/notebooks/delphes_model_analysis.ipynb index bf0649cf4..bb5c406a7 100644 --- a/notebooks/delphes_model_analysis.ipynb +++ b/notebooks/delphes_model_analysis.ipynb @@ -415,8 +415,11 @@ "metadata": {}, "source": [ "Once the training is done, we can generate the pred.npz file using the following:\n", + "\n", "```bash\n", - "singularity exec --nv ~/HEP-KBFI/singularity/base.simg python3 ../mlpf/tensorflow/delphes_model.py --action validate --weights weights-300-*.hdf5\n", + "singularity exec --nv ~/HEP-KBFI/singularity/base.simg python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t experiments/delphes_20210821_160504.joosep-desktop -e experiments/delphes_20210821_160504.joosep-desktop/evaluation_ttbar -v \"data/pythia8_ttbar/val/tev14_pythia8_ttbar_*.pkl.bz2\"\n", + "\n", + "singularity exec --nv ~/HEP-KBFI/singularity/base.simg python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t experiments/delphes_20210821_160504.joosep-desktop -e experiments/delphes_20210821_160504.joosep-desktop/evaluation_qcd -v \"data/pythia8_qcd/val/tev14_pythia8_qcd_*.pkl.bz2\"\n", "```" ] }, @@ -426,36 +429,31 @@ "metadata": {}, "outputs": [], "source": [ - "# def load_many_preds(path):\n", - "# Xs = []\n", - "# ygens = []\n", - "# ycands = []\n", - "# ypreds = []\n", + "def load_many_preds(path):\n", + " Xs = []\n", + " ygens = []\n", + " ycands = []\n", + " ypreds = []\n", "\n", - "# for fi in glob.glob(path):\n", - "# dd = np.load(fi)\n", - "# Xs.append(dd[\"X\"])\n", - "# ygens.append(dd[\"ygen\"])\n", - "# ycands.append(dd[\"ycand\"])\n", - "# ypreds.append(dd[\"ypred\"])\n", + " for fi in glob.glob(path):\n", + " dd = np.load(fi)\n", + " Xs.append(dd[\"X\"])\n", + " ygens.append(dd[\"ygen\"])\n", + " ycands.append(dd[\"ycand\"])\n", + " ypreds.append(dd[\"ypred\"])\n", " \n", - "# X = np.concatenate(Xs)\n", - "# msk_X = X[:, :, 0]!=0\n", + " X = np.concatenate(Xs)\n", + " msk_X = X[:, :, 0]!=0\n", "\n", - "# ygen = np.concatenate(ygens)\n", - "# ycand = np.concatenate(ycands)\n", - "# ypred = np.concatenate(ypreds)\n", + " ygen = np.concatenate(ygens)\n", + " ycand = np.concatenate(ycands)\n", + " ypred = np.concatenate(ypreds)\n", "\n", - "# return X, ygen, ycand, ypred" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# X, ygen, ycand, ypred = load_many_preds(\"../experiments/delphes_20210821_160504.joosep-desktop/evaluation/*.npz\")" + " return X, ygen, ycand, ypred\n", + "\n", + "# For current model\n", + "# X_ttbar, ygen_ttbar, ycand_ttbar, ypred_ttbar = load_many_preds(\"../experiments/delphes_20210821_160504.joosep-desktop/evaluation_ttbar/*.npz\")\n", + "# X, ygen, ycand, ypred = load_many_preds(\"../experiments/delphes_20210821_160504.joosep-desktop/evaluation_qcd/*.npz\")" ] }, { @@ -464,6 +462,7 @@ "metadata": {}, "outputs": [], "source": [ + "# For the model from the paper\n", "#Load the predictions file from the model (this can take a while, as the file is compressed and pretty large)\n", "fi_qcd = np.load(open(\"pred_qcd.npz\", \"rb\"))\n", "fi_ttbar = np.load(open(\"pred_ttbar.npz\", \"rb\"))\n", @@ -496,18 +495,18 @@ "outputs": [], "source": [ "#Flatten the events\n", - "ygen = flatten(ygen)\n", - "ycand = flatten(ycand)\n", - "ypred = flatten(ypred)\n", - "X = flatten(X)\n", - "msk_X = X[:, 0] != 0\n", + "ygen_f = flatten(ygen)\n", + "ycand_f = flatten(ycand)\n", + "ypred_f = 
flatten(ypred)\n", + "X_f = flatten(X)\n", + "msk_X_f = X_f[:, 0] != 0\n", "\n", "#Flatten the events\n", - "ygen_ttbar = flatten(ygen_ttbar)\n", - "ycand_ttbar = flatten(ycand_ttbar)\n", - "ypred_ttbar = flatten(ypred_ttbar)\n", - "X_ttbar = flatten(X_ttbar)\n", - "msk_X_ttbar = X[:, 0] != 0" + "ygen_ttbar_f = flatten(ygen_ttbar)\n", + "ycand_ttbar_f = flatten(ycand_ttbar)\n", + "ypred_ttbar_f = flatten(ypred_ttbar)\n", + "X_ttbar_f = flatten(X_ttbar)\n", + "msk_X_ttbar_f = X_ttbar_f[:, 0] != 0" ] }, { @@ -516,9 +515,13 @@ "metadata": {}, "outputs": [], "source": [ - "print(ygen.shape)\n", - "print(ycand.shape)\n", - "print(ypred.shape)" + "print(ygen_f.shape)\n", + "print(ycand_f.shape)\n", + "print(ypred_f.shape)\n", + "\n", + "print(ygen_ttbar_f.shape)\n", + "print(ycand_ttbar_f.shape)\n", + "print(ypred_ttbar_f.shape)" ] }, { @@ -530,17 +533,17 @@ "def plot_pt_eta(ygen, legend_title=\"\"):\n", " b = np.linspace(0, 100, 41)\n", "\n", - " msk_pid1 = (ygen[:, 0]==1)\n", - " msk_pid2 = (ygen[:, 0]==2)\n", - " msk_pid3 = (ygen[:, 0]==3)\n", - " msk_pid4 = (ygen[:, 0]==4)\n", - " msk_pid5 = (ygen[:, 0]==5)\n", + " msk_pid1 = (ygen_f[:, 0]==1)\n", + " msk_pid2 = (ygen_f[:, 0]==2)\n", + " msk_pid3 = (ygen_f[:, 0]==3)\n", + " msk_pid4 = (ygen_f[:, 0]==4)\n", + " msk_pid5 = (ygen_f[:, 0]==5)\n", "\n", - " h1 = np.histogram(ygen[msk_pid1, 2], bins=b)\n", - " h2 = np.histogram(ygen[msk_pid2, 2], bins=b)\n", - " h3 = np.histogram(ygen[msk_pid3, 2], bins=b)\n", - " h4 = np.histogram(ygen[msk_pid4, 2], bins=b)\n", - " h5 = np.histogram(ygen[msk_pid5, 2], bins=b)\n", + " h1 = np.histogram(ygen_f[msk_pid1, 2], bins=b)\n", + " h2 = np.histogram(ygen_f[msk_pid2, 2], bins=b)\n", + " h3 = np.histogram(ygen_f[msk_pid3, 2], bins=b)\n", + " h4 = np.histogram(ygen_f[msk_pid4, 2], bins=b)\n", + " h5 = np.histogram(ygen_f[msk_pid5, 2], bins=b)\n", "\n", " fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8))\n", "\n", @@ -557,11 +560,11 @@ " ax1.set_ylabel(\"Truth particles\")\n", "\n", " b = np.linspace(-8, 8, 41)\n", - " h1 = np.histogram(ygen[msk_pid1, 3], bins=b)\n", - " h2 = np.histogram(ygen[msk_pid2, 3], bins=b)\n", - " h3 = np.histogram(ygen[msk_pid3, 3], bins=b)\n", - " h4 = np.histogram(ygen[msk_pid4, 3], bins=b)\n", - " h5 = np.histogram(ygen[msk_pid5, 3], bins=b)\n", + " h1 = np.histogram(ygen_f[msk_pid1, 3], bins=b)\n", + " h2 = np.histogram(ygen_f[msk_pid2, 3], bins=b)\n", + " h3 = np.histogram(ygen_f[msk_pid3, 3], bins=b)\n", + " h4 = np.histogram(ygen_f[msk_pid4, 3], bins=b)\n", + " h5 = np.histogram(ygen_f[msk_pid5, 3], bins=b)\n", " xs = midpoints(h1[1])\n", " width = np.diff(h1[1])\n", "\n", @@ -666,20 +669,20 @@ "metadata": {}, "outputs": [], "source": [ - "def plot_num_particles_pid(fi, pid=0, ax=None, legend_title=\"\"):\n", + "def plot_num_particles_pid(ygen, ycand, ypred, pid=0, ax=None, legend_title=\"\"):\n", " if not ax:\n", " plt.figure(figsize=(4,4))\n", " ax = plt.axes()\n", " \n", " #compute the number of particles per event\n", " if pid == 0:\n", - " x1 = np.sum(fi[\"ygen\"][:, :, 0]!=pid, axis=1)\n", - " x2 = np.sum(fi[\"ypred\"][:, :, 0]!=pid, axis=1)\n", - " x3 = np.sum(fi[\"ycand\"][:, :, 0]!=pid, axis=1)\n", + " x1 = np.sum(ygen[:, :, 0]!=pid, axis=1)\n", + " x2 = np.sum(ypred[:, :, 0]!=pid, axis=1)\n", + " x3 = np.sum(ycand[:, :, 0]!=pid, axis=1)\n", " else:\n", - " x1 = np.sum(fi[\"ygen\"][:, :, 0]==pid, axis=1)\n", - " x2 = np.sum(fi[\"ypred\"][:, :, 0]==pid, axis=1)\n", - " x3 = np.sum(fi[\"ycand\"][:, :, 0]==pid, axis=1)\n", + " x1 = np.sum(ygen[:, :, 0]==pid, 
axis=1)\n", + " x2 = np.sum(ypred[:, :, 0]==pid, axis=1)\n", + " x3 = np.sum(ycand[:, :, 0]==pid, axis=1)\n", " \n", " v0 = np.min([np.min(x1), np.min(x2), np.min(x3)])\n", " v1 = np.max([np.max(x1), np.max(x2), np.max(x3)])\n", @@ -731,8 +734,8 @@ " \"x1\": x1, \"x2\": x2, \"x3\": x3}\n", "\n", "fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8))\n", - "ret_num_particles_ch_had = plot_num_particles_pid(fi_qcd, 1, ax1, legend_title=sample_title_qcd+\"\\n\")\n", - "ret_num_particles_n_had = plot_num_particles_pid(fi_qcd, 2, ax2, legend_title=sample_title_qcd+\"\\n\")\n", + "ret_num_particles_ch_had = plot_num_particles_pid(ygen, ycand, ypred, 1, ax1, legend_title=sample_title_qcd+\"\\n\")\n", + "ret_num_particles_n_had = plot_num_particles_pid(ygen, ycand, ypred, 2, ax2, legend_title=sample_title_qcd+\"\\n\")\n", "#sample_string_qcd(ax1)\n", "plt.tight_layout()\n", "plt.savefig(\"plots/num_particles.pdf\", bbox_inches=\"tight\")\n", @@ -748,8 +751,8 @@ "outputs": [], "source": [ "fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8))\n", - "ret_num_particles_ch_had_ttbar = plot_num_particles_pid(fi_ttbar, 1, ax1)\n", - "ret_num_particles_n_had_ttbar = plot_num_particles_pid(fi_ttbar, 2, ax2)\n", + "ret_num_particles_ch_had_ttbar = plot_num_particles_pid(ygen_ttbar, ycand_ttbar, ypred_ttbar, 1, ax1)\n", + "ret_num_particles_n_had_ttbar = plot_num_particles_pid(ygen_ttbar, ycand_ttbar, ypred_ttbar, 2, ax2)\n", "sample_string_ttbar(ax1)\n", "plt.tight_layout()\n", "plt.savefig(\"plots/num_particles_ttbar.pdf\", bbox_inches=\"tight\")\n", @@ -793,13 +796,13 @@ "def draw_efficiency_fakerate(ygen, ypred, ycand, pid, var, bins, both=True, legend_title=\"\"):\n", " var_idx = var_indices[var]\n", "\n", - " msk_gen = ygen[:, 0]==pid\n", - " msk_pred = ypred[:, 0]==pid\n", - " msk_cand = ycand[:, 0]==pid\n", + " msk_gen = ygen_f[:, 0]==pid\n", + " msk_pred = ypred_f[:, 0]==pid\n", + " msk_cand = ycand_f[:, 0]==pid\n", "\n", - " hist_gen = np.histogram(ygen[msk_gen, var_idx], bins=bins);\n", - " hist_cand = np.histogram(ygen[msk_gen & msk_cand, var_idx], bins=bins);\n", - " hist_pred = np.histogram(ygen[msk_gen & msk_pred, var_idx], bins=bins);\n", + " hist_gen = np.histogram(ygen_f[msk_gen, var_idx], bins=bins);\n", + " hist_cand = np.histogram(ygen_f[msk_gen & msk_cand, var_idx], bins=bins);\n", + " hist_pred = np.histogram(ygen_f[msk_gen & msk_pred, var_idx], bins=bins);\n", " \n", " hist_gen = mask_empty(hist_gen)\n", " hist_cand = mask_empty(hist_cand)\n", @@ -828,10 +831,10 @@ " ax1.set_xlabel(var_names[var])\n", " ax1.set_ylabel(\"Efficiency\")\n", "\n", - " hist_cand2 = np.histogram(ygen[msk_cand & (ygen[:, 0]!=0), var_idx], bins=bins);\n", - " hist_pred2 = np.histogram(ygen[msk_pred & (ygen[:, 0]!=0), var_idx], bins=bins);\n", - " hist_cand_gen2 = np.histogram(ygen[msk_cand & ~msk_gen & (ygen[:, 0]!=0), var_idx], bins=bins);\n", - " hist_pred_gen2 = np.histogram(ygen[msk_pred & ~msk_gen & (ygen[:, 0]!=0), var_idx], bins=bins);\n", + " hist_cand2 = np.histogram(ygen_f[msk_cand & (ygen_f[:, 0]!=0), var_idx], bins=bins);\n", + " hist_pred2 = np.histogram(ygen_f[msk_pred & (ygen_f[:, 0]!=0), var_idx], bins=bins);\n", + " hist_cand_gen2 = np.histogram(ygen_f[msk_cand & ~msk_gen & (ygen_f[:, 0]!=0), var_idx], bins=bins);\n", + " hist_pred_gen2 = np.histogram(ygen_f[msk_pred & ~msk_gen & (ygen_f[:, 0]!=0), var_idx], bins=bins);\n", "\n", " hist_cand2 = mask_empty(hist_cand2)\n", " hist_cand_gen2 = mask_empty(hist_cand_gen2)\n", @@ -916,7 +919,7 @@ "metadata": {}, "outputs": [], "source": [ - "ax, 
_ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, \"pt\", np.linspace(0, 3, 61), both=False, legend_title=sample_title_qcd+\"\\n\")\n", + "ax, _ = draw_efficiency_fakerate(ygen_f, ypred_f, ycand_f, 1, \"pt\", np.linspace(0, 3, 61), both=False, legend_title=sample_title_qcd+\"\\n\")\n", "#sample_string_qcd(ax)\n", "plt.savefig(\"plots/eff_fake_pid1_pt.pdf\", bbox_inches=\"tight\")\n", "PDF(\"plots/eff_fake_pid1_pt.pdf\", size=(300,300))" @@ -928,7 +931,7 @@ "metadata": {}, "outputs": [], "source": [ - "ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, \"eta\", np.linspace(-3, 3, 61), both=False, legend_title=sample_title_qcd+\"\\n\")\n", + "ax, _ = draw_efficiency_fakerate(ygen_f, ypred_f, ycand_f, 1, \"eta\", np.linspace(-3, 3, 61), both=False, legend_title=sample_title_qcd+\"\\n\")\n", "#sample_string_qcd(ax)\n", "plt.savefig(\"plots/eff_fake_pid1_eta.pdf\", bbox_inches=\"tight\")\n", "PDF(\"plots/eff_fake_pid1_eta.pdf\", size=(300,300))" @@ -941,7 +944,7 @@ "outputs": [], "source": [ "ax, _ = draw_efficiency_fakerate(\n", - " ygen, ypred, ycand,\n", + " ygen_f, ypred_f, ycand_f,\n", " 2, \"energy\", np.linspace(5, 205, 61), legend_title=sample_title_qcd+\"\\n\")\n", "#sample_string_qcd(ax)\n", "plt.savefig(\"plots/eff_fake_pid2_energy.pdf\", bbox_inches=\"tight\")\n", @@ -955,7 +958,7 @@ "outputs": [], "source": [ "ax, _ = draw_efficiency_fakerate(\n", - " ygen_ttbar, ypred_ttbar, ycand_ttbar,\n", + " ygen_ttbar_f, ypred_ttbar_f, ycand_ttbar_f,\n", " 2, \"energy\", np.linspace(5, 205, 61), legend_title=sample_title_ttbar+\"\\n\")\n", "#sample_string_ttbar(ax)\n", "plt.savefig(\"plots/eff_fake_pid2_energy_ttbar.pdf\", bbox_inches=\"tight\")\n", @@ -969,7 +972,7 @@ "outputs": [], "source": [ "ax, _ = draw_efficiency_fakerate(\n", - " ygen, ypred, ycand,\n", + " ygen_f, ypred_f, ycand_f,\n", " 2, \"eta\", np.linspace(-6, 6, 61), legend_title=sample_title_qcd+\"\\n\"\n", ")\n", "#sample_string_qcd(ax)\n", @@ -984,7 +987,7 @@ "outputs": [], "source": [ "ax, _ = draw_efficiency_fakerate(\n", - " ygen, ypred, ycand,\n", + " ygen_f, ypred_f, ycand_f,\n", " 3, \"eta\", np.linspace(-6, 6, 61), legend_title=sample_title_qcd+\"\\n\"\n", ")\n", "#sample_string_qcd(ax)\n", @@ -999,7 +1002,7 @@ "outputs": [], "source": [ "ax, _ = draw_efficiency_fakerate(\n", - " ygen, ypred, ycand,\n", + " ygen_f, ypred_f, ycand_f,\n", " 4, \"eta\", np.linspace(-6, 6, 61), legend_title=sample_title_qcd+\"\\n\"\n", ")\n", "#sample_string_qcd(ax)\n", @@ -1014,7 +1017,7 @@ "outputs": [], "source": [ "ax, _ = draw_efficiency_fakerate(\n", - " ygen, ypred, ycand,\n", + " ygen_f, ypred_f, ycand_f,\n", " 5, \"eta\", np.linspace(-6, 6, 61), legend_title=sample_title_qcd+\"\\n\"\n", ")\n", "#sample_string_qcd(ax)\n", @@ -1080,8 +1083,8 @@ "source": [ "fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8))\n", "\n", - "res_ch_had_pt = plot_reso(ygen, ypred, ycand, 1, \"pt\", 2, ax=ax1, legend_title=sample_title_qcd+\"\\n\")\n", - "res_ch_had_eta = plot_reso(ygen, ypred, ycand, 1, \"eta\", 0.2, ax=ax2, legend_title=sample_title_qcd+\"\\n\")\n", + "res_ch_had_pt = plot_reso(ygen_f, ypred_f, ycand_f, 1, \"pt\", 2, ax=ax1, legend_title=sample_title_qcd+\"\\n\")\n", + "res_ch_had_eta = plot_reso(ygen_f, ypred_f, ycand_f, 1, \"eta\", 0.2, ax=ax2, legend_title=sample_title_qcd+\"\\n\")\n", "\n", "ax1.set_ylim(100, 10**11)\n", "ax2.set_ylim(100, 10**11)\n", @@ -1099,8 +1102,8 @@ "source": [ "fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8))\n", "\n", - "res_n_had_e = plot_reso(ygen, ypred, ycand, 2, \"energy\", 5, ax=ax1, 
legend_title=sample_title_qcd+\"\\n\")\n", - "res_n_had_eta = plot_reso(ygen, ypred, ycand, 2, \"eta\", 0.5, ax=ax2, legend_title=sample_title_qcd+\"\\n\")\n", + "res_n_had_e = plot_reso(ygen_f, ypred_f, ycand_f, 2, \"energy\", 5, ax=ax1, legend_title=sample_title_qcd+\"\\n\")\n", + "res_n_had_eta = plot_reso(ygen_f, ypred_f, ycand_f, 2, \"eta\", 0.5, ax=ax2, legend_title=sample_title_qcd+\"\\n\")\n", "\n", "#ax1.set_title(\"Neutral hadrons\")\n", "#sample_string_qcd(ax1)\n", @@ -1119,8 +1122,8 @@ "source": [ "fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 2*8))\n", "\n", - "plot_reso(ygen_ttbar, ypred_ttbar, ycand_ttbar, 2, \"energy\", 5, ax=ax1, legend_title=sample_title_ttbar+\"\\n\")\n", - "plot_reso(ygen_ttbar, ypred_ttbar, ycand_ttbar, 2, \"eta\", 0.5, ax=ax2, legend_title=sample_title_ttbar+\"\\n\")\n", + "plot_reso(ygen_ttbar_f, ypred_ttbar_f, ycand_ttbar_f, 2, \"energy\", 5, ax=ax1, legend_title=sample_title_ttbar+\"\\n\")\n", + "plot_reso(ygen_ttbar_f, ypred_ttbar_f, ycand_ttbar_f, 2, \"eta\", 0.5, ax=ax2, legend_title=sample_title_ttbar+\"\\n\")\n", "\n", "#ax1.set_title(\"Neutral hadrons\")\n", "#sample_string_ttbar(ax1)\n", @@ -1144,20 +1147,20 @@ "outputs": [], "source": [ "confusion = sklearn.metrics.confusion_matrix(\n", - " ygen[msk_X, 0], ycand[msk_X, 0], normalize=\"true\"\n", + " ygen_f[msk_X, 0], ycand_f[msk_X, 0], normalize=\"true\"\n", ")\n", "\n", "confusion2 = sklearn.metrics.confusion_matrix(\n", - " ygen[msk_X, 0], ypred[msk_X, 0], normalize=\"true\"\n", + " ygen_f[msk_X, 0], ypred_f[msk_X, 0], normalize=\"true\"\n", ")\n", "\n", "\n", "confusion_unnorm = sklearn.metrics.confusion_matrix(\n", - " ygen[msk_X, 0], ycand[msk_X, 0],\n", + " ygen_f[msk_X, 0], ycand_f[msk_X, 0],\n", ")\n", "\n", "confusion2_unnorm = sklearn.metrics.confusion_matrix(\n", - " ygen[msk_X, 0], ypred[msk_X, 0],\n", + " ygen_f[msk_X, 0], ypred_f[msk_X, 0],\n", ")" ] }, @@ -1185,7 +1188,7 @@ "metadata": {}, "outputs": [], "source": [ - "sklearn.metrics.accuracy_score(ygen[msk_X, 0], ycand[msk_X, 0])" + "sklearn.metrics.accuracy_score(ygen_f[msk_X, 0], ycand_f[msk_X, 0])" ] }, { @@ -1194,7 +1197,7 @@ "metadata": {}, "outputs": [], "source": [ - "sklearn.metrics.accuracy_score(ygen[msk_X, 0], ypred[msk_X, 0])" + "sklearn.metrics.accuracy_score(ygen_f[msk_X, 0], ypred_f[msk_X, 0])" ] }, { @@ -1297,11 +1300,11 @@ "\n", "axes = axes.flatten()\n", "for iax, i in enumerate([1,2,3,4,5]):\n", - " axes[iax].hist(ypred[ypred[:, 0]==i, 2], bins=b, histtype=\"step\", lw=2, color=\"red\", label=\"QCD MLPF\");\n", - " axes[iax].hist(ygen[ygen[:, 0]==i, 2], bins=b, histtype=\"step\", lw=1, color=\"red\", ls=\"--\", label=\"QCD truth\");\n", + " axes[iax].hist(ypred_f[ypred_f[:, 0]==i, 2], bins=b, histtype=\"step\", lw=2, color=\"red\", label=\"QCD MLPF\");\n", + " axes[iax].hist(ygen_f[ygen_f[:, 0]==i, 2], bins=b, histtype=\"step\", lw=1, color=\"red\", ls=\"--\", label=\"QCD truth\");\n", " #axes[iax].hist(ycand[ycand[:, 0]==i, 2], bins=b, histtype=\"step\", lw=1, color=\"pink\", ls=\"-\", label=\"QCD PF\");\n", - " axes[iax].hist(ypred_ttbar[ypred_ttbar[:, 0]==i, 2], bins=b, histtype=\"step\", lw=2, color=\"blue\", label=r\"$t\\bar{t}$ MLPF\");\n", - " axes[iax].hist(ygen_ttbar[ygen_ttbar[:, 0]==i, 2], bins=b, histtype=\"step\", lw=1, color=\"blue\", ls=\"--\", label=r\"$t\\bar{t}$ truth\");\n", + " axes[iax].hist(ypred_ttbar_f[ypred_ttbar_f[:, 0]==i, 2], bins=b, histtype=\"step\", lw=2, color=\"blue\", label=r\"$t\\bar{t}$ MLPF\");\n", + " axes[iax].hist(ygen_ttbar_f[ygen_ttbar_f[:, 0]==i, 2], bins=b, 
histtype=\"step\", lw=1, color=\"blue\", ls=\"--\", label=r\"$t\\bar{t}$ truth\");\n", " #axes[iax].hist(ycand_ttbar[ycand_ttbar[:, 0]==i, 2], bins=b, histtype=\"step\", lw=1, color=\"cyan\", ls=\"-\", label=r\"$t\\bar{t}$ PF\");\n", " axes[iax].set_yscale(\"log\")\n", " axes[iax].legend(ncol=2)\n", @@ -1326,11 +1329,11 @@ "\n", "axes = axes.flatten()\n", "for iax, i in enumerate([1,2,3,4,5]):\n", - " axes[iax].hist(ypred[ypred[:, 0]==i, 6], bins=b, histtype=\"step\", lw=2, color=\"red\", label=\"QCD MLPF\");\n", - " axes[iax].hist(ygen[ygen[:, 0]==i, 6], bins=b, histtype=\"step\", lw=1, color=\"red\", ls=\"--\", label=\"QCD truth\");\n", + " axes[iax].hist(ypred_f[ypred_f[:, 0]==i, 6], bins=b, histtype=\"step\", lw=2, color=\"red\", label=\"QCD MLPF\");\n", + " axes[iax].hist(ygen_f[ygen_f[:, 0]==i, 6], bins=b, histtype=\"step\", lw=1, color=\"red\", ls=\"--\", label=\"QCD truth\");\n", " #axes[iax].hist(ycand[ycand[:, 0]==i, 6], bins=b, histtype=\"step\", lw=1, color=\"pink\", ls=\"-\", label=\"QCD PF\");\n", - " axes[iax].hist(ypred_ttbar[ypred_ttbar[:, 0]==i, 6], bins=b, histtype=\"step\", lw=2, color=\"blue\", label=r\"$t\\bar{t}$ MLPF\");\n", - " axes[iax].hist(ygen_ttbar[ygen_ttbar[:, 0]==i, 6], bins=b, histtype=\"step\", lw=1, color=\"blue\", ls=\"--\", label=r\"$t\\bar{t}$ truth\");\n", + " axes[iax].hist(ypred_ttbar_f[ypred_ttbar_f[:, 0]==i, 6], bins=b, histtype=\"step\", lw=2, color=\"blue\", label=r\"$t\\bar{t}$ MLPF\");\n", + " axes[iax].hist(ygen_ttbar_f[ygen_ttbar_f[:, 0]==i, 6], bins=b, histtype=\"step\", lw=1, color=\"blue\", ls=\"--\", label=r\"$t\\bar{t}$ truth\");\n", " #axes[iax].hist(ycand_ttbar[ycand_ttbar[:, 0]==i, 6], bins=b, histtype=\"step\", lw=1, color=\"cyan\", ls=\"-\", label=r\"$t\\bar{t}$ PF\");\n", " axes[iax].set_yscale(\"log\")\n", " axes[iax].legend(ncol=2)\n", @@ -1443,9 +1446,9 @@ "metadata": {}, "outputs": [], "source": [ - "msk_pid_gen = ygen[:, 0]==1\n", - "msk_pid_cand = ycand[:, 0]==1\n", - "msk_pid_pred = ypred[:, 0]==1" + "msk_pid_gen = ygen_f[:, 0]==1\n", + "msk_pid_cand = ycand_f[:, 0]==1\n", + "msk_pid_pred = ypred_f[:, 0]==1" ] }, { @@ -1454,7 +1457,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.unique(ycand[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 0], return_counts=True)" + "np.unique(ycand_f[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 0], return_counts=True)" ] }, { @@ -1481,7 +1484,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.unique(X[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 0], return_counts=True)" + "np.unique(X_f[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 0], return_counts=True)" ] }, { @@ -1490,8 +1493,8 @@ "metadata": {}, "outputs": [], "source": [ - "plt.hist(X[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 1], bins=np.linspace(0,5,100), density=True, histtype=\"step\", label=\"MLPF charged hadron, RBPF no charged hadron\");\n", - "plt.hist(X[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 1], bins=np.linspace(0,5,100), density=True, histtype=\"step\", label=\"MLPF & RBPF charged hadron\");\n", + "plt.hist(X_f[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 1], bins=np.linspace(0,5,100), density=True, histtype=\"step\", label=\"MLPF charged hadron, RBPF no charged hadron\");\n", + "plt.hist(X_f[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 1], bins=np.linspace(0,5,100), density=True, histtype=\"step\", label=\"MLPF & RBPF charged hadron\");\n", "plt.legend()\n", "plt.xlabel(\"track pT\")" ] @@ -1502,9 +1505,8 @@ "metadata": {}, "outputs": [], "source": [ - "plt.hist(X[(msk_pid_gen) & 
(~msk_pid_cand) & msk_pid_pred, 2], bins=np.linspace(-3,3,100), density=True, histtype=\"step\");\n", - "plt.hist(X[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 2], bins=np.linspace(-3,3,100), density=True, histtype=\"step\");\n", - "plt.legend()\n", + "plt.hist(X_f[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 2], bins=np.linspace(-3,3,100), density=True, histtype=\"step\");\n", + "plt.hist(X_f[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 2], bins=np.linspace(-3,3,100), density=True, histtype=\"step\");\n", "plt.xlabel(\"track eta\")" ] }, @@ -1514,9 +1516,8 @@ "metadata": {}, "outputs": [], "source": [ - "plt.hist(X[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 5], bins=np.linspace(0, 10, 100), density=True, histtype=\"step\");\n", - "plt.hist(X[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 5], bins=np.linspace(0, 10, 100), density=True, histtype=\"step\");\n", - "plt.legend()\n", + "plt.hist(X_f[(msk_pid_gen) & (~msk_pid_cand) & msk_pid_pred, 5], bins=np.linspace(0, 10, 100), density=True, histtype=\"step\");\n", + "plt.hist(X_f[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 5], bins=np.linspace(0, 10, 100), density=True, histtype=\"step\");\n", "plt.xlabel(\"track energy\")" ] }, @@ -1526,8 +1527,8 @@ "metadata": {}, "outputs": [], "source": [ - "a = X[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 2]\n", - "b = ycand[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 3]" + "a = X_f[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 2]\n", + "b = ycand_f[(msk_pid_gen) & (msk_pid_cand) & msk_pid_pred, 3]" ] }, { From 15aa6e4b26c0103d2d67051de13da2bdb334aa36 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sat, 21 Aug 2021 17:48:02 +0300 Subject: [PATCH 070/157] unify configs --- mlpf/pipeline.py | 5 ++--- parameters/cms-dev.yaml | 2 +- parameters/delphes.yaml | 46 +++++++++++++++++++---------------------- 3 files changed, 24 insertions(+), 29 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 00af55c24..bc040bed9 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -89,7 +89,7 @@ def data(config): @click.option("--ntest", default=None, help="override the number of testing events", type=int) @click.option("-r", "--recreate", help="force creation of new experiment dir", is_flag=True) @click.option("-p", "--prefix", default="", help="prefix to put at beginning of training dir name", type=str) -@click.option("--plot-freq", default=1, help="Plot detailed validation every N epochs", type=str) +@click.option("--plot-freq", default=1, help="Plot detailed validation every N epochs", type=int) def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq): """Train a model defined by config""" config_file_path = config @@ -228,7 +228,7 @@ def evaluate(config, train_dir, weights, evaluation_dir, validation_files): model_dtype = tf.dtypes.float32 dataset_def = get_dataset_def(config) - + if not (validation_files is None): dataset_def.val_filelist = glob.glob(str(validation_files)) @@ -254,7 +254,6 @@ def evaluate(config, train_dir, weights, evaluation_dir, validation_files): eval_model(X_val, ygen_val, ycand_val, model, config, eval_dir, global_batch_size) freeze_model(model, config, train_dir) - @main.command() @click.help_option("-h", "--help") @click.option("-c", "--config", help="configuration file", type=click.Path()) diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 5c14c0f94..00e517bf9 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -62,7 +62,7 @@ setup: num_events_train: 80000 num_events_test: 10000 num_epochs: 10 - 
num_val_files: 1 + num_val_files: 10 dtype: float32 trainable: classification_loss_type: categorical_cross_entropy diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index 10e9ac131..e9c27d05f 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -7,15 +7,16 @@ dataset: num_output_features: 7 #(none=0, track=1, cluster=2) num_input_classes: 3 + #(none=0, charged hadron=1, neutral hadron=2, photon=3, electron=4, muon=5) num_output_classes: 6 num_momentum_outputs: 5 padded_num_elem_size: 6400 classification_loss_coef: 1.0 - charge_loss_coef: 0.1 + charge_loss_coef: 1.0 pt_loss_coef: 1.0 - eta_loss_coef: 0.1 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 100.0 + cos_phi_loss_coef: 100.0 energy_loss_coef: 1.0 raw_path: data/pythia8_ttbar/raw/*.pkl.bz2 processed_path: data/pythia8_ttbar/tfr/*.tfrecords @@ -36,22 +37,22 @@ dataset: eta_loss: type: Huber delta: 0.1 - + tensorflow: eager: no setup: train: yes - weights: + weights: + weights_config: lr: 1e-4 batch_size: 5 - num_events_train: 4000 - num_events_test: 10000 + num_events_train: 45000 + num_events_test: 5000 num_epochs: 10 - num_val_files: 2 + num_val_files: 5 dtype: float32 - trainable: all - multi_output: yes + trainable: classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle @@ -70,24 +71,19 @@ parameters: activation: elu layernorm: no hidden_dim: 256 - bin_size: 32 - distance_dim: 16 + bin_size: 640 + distance_dim: 128 dropout: 0.0 graph_kernel: - type: NodePairTrainableKernel - output_dim: 32 - hidden_dim: 32 - num_layers: 2 - activation: elu - num_graph_layers: 3 + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.1 + num_graph_layers: 2 node_message: - type: NodeMessageLearnable - output_dim: 256 - hidden_dim: 128 - num_layers: 3 + type: GHConvDense + output_dim: 128 activation: elu - aggregation_direction: dst - num_node_messages: 1 + num_node_messages: 2 skip_connection: yes regression_use_classification: yes debug: no From e3c10a5228a9c8046dfff8eb0cb61d8a1e8a2db6 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sat, 21 Aug 2021 18:11:19 +0300 Subject: [PATCH 071/157] update docs --- README.md | 2 +- README_cms.md | 2 +- README_delphes.md | 63 ++++++++++++++++-------------- parameters/cms-dev.yaml | 2 +- parameters/cms.yaml | 2 +- scripts/local_test_cms_pipeline.sh | 2 +- 6 files changed, 39 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 66fc7e6aa..d29529727 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Short instructions with a single test file in [notebooks/delphes-tf-mlpf-quickstart.ipynb](notebooks/delphes-tf-mlpf-quickstart.ipynb). Long instructions for reproducing the full training from scratch in [README_delphes.md](README_delphes.md). -The plots can be generated using the notebook [delphes/resolution_checks.ipynb](delphes/resolution_checks.ipynb). +The plots can be generated using the notebook [delphes/delphes_model_analysis.ipynb](delphes/delphes_model_analysis.ipynb). ### Delphes dataset The dataset is available from zenodo: https://doi.org/10.5281/zenodo.4559324. 
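Individual files from that zenodo record can also be fetched directly over HTTPS; a minimal sketch, where FILE is a placeholder for one of the `*.pkl.bz2` names listed on the record page (this helper line is not part of the repository scripts):

```bash
# FILE is a placeholder; see the zenodo record page for the actual *.pkl.bz2 file names
wget "https://zenodo.org/record/4559324/files/${FILE}"
```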
diff --git a/README_cms.md b/README_cms.md index 83a7574ff..f3bb90f90 100644 --- a/README_cms.md +++ b/README_cms.md @@ -33,5 +33,5 @@ git clone https://github.com/jpata/particleflow.git cd particleflow #run a small local test including data prep and training -./scripts/local_test_cms_tf.sh +./scripts/local_test_cms_pipeline.sh ``` diff --git a/README_delphes.md b/README_delphes.md index e6a6ea282..859237b9b 100644 --- a/README_delphes.md +++ b/README_delphes.md @@ -2,28 +2,55 @@ The following instructions use singularity, but you may have a different local setup. +```bash +#Download all pkl.bz2 files from https://zenodo.org/record/4559324 + +#now move the data into the right place +mv *pythia8_qcd*.pkl.bz2 data/pythia8_qcd/val +mv *pythia8_ttbar*.pkl.bz2 data/pythia8_qcd/raw +mv data/pythia8_qcd/raw/*pythia8_ttbar_9_*.pkl.bz2 data/pythia8_qcd/val + +# Generate the TFRecord datasets needed for larger-than-RAM training +python3 mlpf/pipeline.py data -c parameters/delphes.yaml + +# Run the training of the base GNN model using e.g. 5 GPUs in a data-parallel mode +CUDA_VISIBLE_DEVICES=0,1,2,3,4 python3 mlpf/pipeline.py train -c parameters/delphes.yaml + +#Run the validation to produce the predictions file +python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t experiments/delphes-* -v "data/pythia8_qcd/val/*.pkl.bz2" -e evaluate_qcd +python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t experiments/delphes-* -v "data/pythia8_ttbar/val/*.pkl.bz2" -e evaluate_ttbar +``` + +## Recipe for generation +The Delphes AngularSmearing module has been modified to correctly take into account the smearing for tracks, see [delphes/install.sh](delphes/install.sh). + +```bash +wget http://atlaswww.hep.anl.gov/hepsim/soft/centos7hepsim.img +sudo singularity build --sandbox centos7hepsim.sandbox centos7hepsim.img +sudo singularity exec -B /home --writable centos7hepsim.sandbox ./install.sh +sudo singularity build centos7hepsim.sif centos7hepsim.sandbox +sudo rm -Rf centos7hepsim.sandbox +``` + ```bash cd delphes # Run the simulation step # Generate events with pythia, mix them with PU and run a detector simulation using Delphes -singularity exec http://jpata.web.cern.ch/jpata/centos7hepsim.sif ./run_sim.sh +singularity exec centos7hepsim.sif ./run_sim.sh # Run the ntuplization step # generate X,y input matrices for NN training in out/pythia8_ttbar/*.pkl.bz2 -singularity exec http://jpata.web.cern.ch/jpata/centos7hepsim.sif ./run_ntuple.sh -singularity exec http://jpata.web.cern.ch/jpata/centos7hepsim.sif ./run_ntuple_qcd.sh +singularity exec centos7hepsim.sif ./run_ntuple.sh +singularity exec centos7hepsim.sif ./run_ntuple_qcd.sh -#Alternatively, to skip run_sim.sh and run_ntuple.sh, download everything from https://doi.org/10.5281/zenodo.4452283 and put into out/pythia8_ttbar - -#now move the data into the right place mv out/pythia8_ttbar ../data/ cd ../data/pythia8_ttbar mkdir raw mkdir val mkdir root mv *.root root/ -mb *.promc root/ +mv *.promc root/ mv *.pkl.bz2 raw/ cd ../.. @@ -35,26 +62,4 @@ mv *.root root/ mv *.promc root/ mv *.pkl.bz2 val/ cd ../.. - -# Generate the TFRecord datasets needed for larger-than-RAM training -singularity exec --nv http://jpata.web.cern.ch/jpata/base.simg python3 mlpf/launcher.py --action data --model-spec parameters/delphes-gnn-skipconn.yaml - -# Run the training of the base GNN model using e.g. 
5 GPUs in a data-parallel mode -CUDA_VISIBLE_DEVICES=0,1,2,3,4 singularity exec --nv http://jpata.web.cern.ch/jpata/base.simg python3 mlpf/launcher.py --action train --model-spec parameters/delphes-gnn-skipconn.yaml - -#Run the validation to produce the predictions file -singularity exec --nv http://jpata.web.cern.ch/jpata/base.simg python3 mlpf/launcher.py --action eval --model-spec parameters/delphes-gnn-skipconn.yaml --weights ./experiments/delphes-gnn-skipconn-*/weights-300-*.hdf5 - -singularity exec --nv http://jpata.web.cern.ch/jpata/base.simg python3 mlpf/launcher.py --action time --model-spec parameters/delphes-gnn-skipconn.yaml --weights ./experiments/delphes-gnn-skipconn-*/weights-300-*.hdf5 -``` - -## Recipe to prepare Delphes singularity image -NB: The Delphes AngularSmearing module has been modified to correctly take into account the smearing for tracks, see [delphes/install.sh](delphes/install.sh) - -```bash -wget http://atlaswww.hep.anl.gov/hepsim/soft/centos7hepsim.img -sudo singularity build --sandbox centos7hepsim.sandbox centos7hepsim.img -sudo singularity exec -B /home --writable centos7hepsim.sandbox ./install.sh -sudo singularity build centos7hepsim.sif centos7hepsim.sandbox -sudo rm -Rf centos7hepsim.sandbox ``` diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 00e517bf9..20e2ac0de 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -61,7 +61,7 @@ setup: batch_size: 5 num_events_train: 80000 num_events_test: 10000 - num_epochs: 10 + num_epochs: 50 num_val_files: 10 dtype: float32 trainable: diff --git a/parameters/cms.yaml b/parameters/cms.yaml index a3ab880f1..d64a9a19f 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -61,7 +61,7 @@ setup: batch_size: 5 num_events_train: 80000 num_events_test: 10000 - num_epochs: 10 + num_epochs: 50 num_val_files: 10 dtype: float32 trainable: diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index 870517f79..20ab50376 100755 --- a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -6,7 +6,7 @@ rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/root cd data/TTbar_14TeV_TuneCUETP8M1_cfi/root -#Only CMS-internal use is permitted by CMS rules +#Only CMS-internal use is permitted by CMS rules! Do not use these MC simulation files otherwise! 
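#the three explicit downloads below could equally be written as a loop; a minimal
#sketch using the same URLs (illustrative only, not part of the script):
#  for i in 1 2 3; do
#    wget -q --no-check-certificate -nc "https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_${i}.root"
#  done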
wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_1.root wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_2.root wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_3.root From d9b5dbe6f10463fa93da7cd8a5078519f66eea36 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sun, 22 Aug 2021 10:05:35 +0300 Subject: [PATCH 072/157] improve encoding --- mlpf/tfmodel/lr_finder.py | 2 +- mlpf/tfmodel/model.py | 48 +++++++++++++++++++++++++------------ mlpf/tfmodel/model_setup.py | 5 ++-- parameters/cms-dev.yaml | 12 ++++------ parameters/cms.yaml | 16 ++++++------- 5 files changed, 49 insertions(+), 34 deletions(-) diff --git a/mlpf/tfmodel/lr_finder.py b/mlpf/tfmodel/lr_finder.py index 4b2338581..cba366eb6 100644 --- a/mlpf/tfmodel/lr_finder.py +++ b/mlpf/tfmodel/lr_finder.py @@ -46,7 +46,7 @@ def on_train_batch_end(self, batch, logs=None): if step == 0 or loss < self.best_loss: self.best_loss = loss - if smooth_loss > 4 * self.best_loss or tf.math.is_nan(smooth_loss): + if smooth_loss > 100 * self.best_loss or tf.math.is_nan(smooth_loss): self.model.stop_training = True print("Loss reached predefined maximum... stopping") if step >= self.max_steps: diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 56eda3eca..764bd0fa3 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -371,11 +371,12 @@ def call(self, x_msg, x_node, msk): class OutputDecoding(tf.keras.layers.Layer): - def __init__(self, activation, hidden_dim, regression_use_classification, num_output_classes, schema, **kwargs): + def __init__(self, activation, hidden_dim, regression_use_classification, num_output_classes, schema, dropout, **kwargs): super(OutputDecoding, self).__init__(**kwargs) self.regression_use_classification = regression_use_classification self.schema = schema + self.dropout = dropout self.ffn_id = point_wise_feed_forward_network( num_output_classes, hidden_dim, @@ -383,7 +384,8 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou dtype=tf.dtypes.float32, num_layers=4, activation=activation, - dim_decrease=True + dim_decrease=True, + dropout=dropout ) self.ffn_charge = point_wise_feed_forward_network( 1, hidden_dim, @@ -391,24 +393,29 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou dtype=tf.dtypes.float32, num_layers=2, activation=activation, - dim_decrease=True + dim_decrease=True, + dropout=dropout ) self.ffn_pt = point_wise_feed_forward_network( - 2, hidden_dim, "ffn_pt", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True + 4, hidden_dim, "ffn_pt", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, + dropout=dropout ) self.ffn_eta = point_wise_feed_forward_network( 2, hidden_dim, "ffn_eta", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, + dropout=dropout ) self.ffn_phi = point_wise_feed_forward_network( 4, hidden_dim, "ffn_phi", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, + dropout=dropout ) self.ffn_energy = point_wise_feed_forward_network( - 2, hidden_dim, "ffn_energy", - dtype=tf.dtypes.float32, num_layers=4, 
activation=activation, dim_decrease=True + 4, hidden_dim, "ffn_energy", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, + dropout=dropout ) """ @@ -423,7 +430,7 @@ def call(self, X_input, X_encoded_id, X_encoded_reg, msk_input): out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) out_charge = self.ffn_charge(X_encoded_id)*msk_input - orig_pt = X_input[:, :, 1:2] + #orig_pt = X_input[:, :, 1:2] orig_eta = X_input[:, :, 2:3] #FIXME: schema @@ -437,7 +444,7 @@ def call(self, X_input, X_encoded_id, X_encoded_reg, msk_input): orig_energy = X_input[:, :, 5:6] if self.regression_use_classification: - X_encoded_reg = tf.concat([X_encoded_reg, out_id_logits], axis=-1) + X_encoded_reg = tf.concat([X_encoded_reg, tf.stop_gradient(out_id_logits)], axis=-1) pred_eta_corr = self.ffn_eta(X_encoded_reg) pred_phi_corr = self.ffn_phi(X_encoded_reg) @@ -453,8 +460,13 @@ def call(self, X_input, X_encoded_id, X_encoded_reg, msk_input): pred_eta = orig_eta*eta_sigmoid + (1.0 - eta_sigmoid)*pred_eta_corr[:, :, 1:2] pred_sin_phi = orig_sin_phi*sin_phi_sigmoid + (1.0 - sin_phi_sigmoid)*pred_phi_corr[:, :, 1:2] pred_cos_phi = orig_cos_phi*cos_phi_sigmoid + (1.0 - cos_phi_sigmoid)*pred_phi_corr[:, :, 3:4] - pred_energy = orig_energy*energy_sigmoid + tf.exp(tf.clip_by_value((1.0 - energy_sigmoid)*pred_energy_corr[:, :, 1:2], -8, 8)) - pred_pt = orig_pt*pt_sigmoid + tf.exp(tf.clip_by_value((1.0 - pt_sigmoid)*pred_pt_corr[:, :, 1:2], -8, 8)) + + pred_energy = (orig_energy*energy_sigmoid + (1.0 - energy_sigmoid)*( + pred_energy_corr[:, :, 1:2]*orig_energy*orig_energy + pred_energy_corr[:, :, 2:3]*orig_energy + pred_energy_corr[:, :, 3:4])) + + orig_pt = tf.stop_gradient(pred_energy / tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8))) + pred_pt = (orig_pt*pt_sigmoid + (1.0 - pt_sigmoid)*( + pred_pt_corr[:, :, 1:2]*orig_pt*orig_pt + pred_pt_corr[:, :, 2:3]*orig_pt + pred_pt_corr[:, :, 3:4])) ret = { "cls": out_id_softmax, @@ -485,7 +497,13 @@ def __init__(self, *args, **kwargs): if self.do_layernorm: self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6) - self.ffn_dist = point_wise_feed_forward_network(self.distance_dim, self.hidden_dim, kwargs.get("name") + "_ffn_dist", num_layers=2, activation="elu", dropout=self.dropout) + self.ffn_dist = point_wise_feed_forward_network( + self.distance_dim, + self.hidden_dim, + kwargs.get("name") + "_ffn_dist", + num_layers=2, activation="elu", + dropout=self.dropout + ) self.message_building_layer = MessageBuildingLayerLSH( distance_dim=self.distance_dim, max_num_bins=self.max_num_bins, @@ -579,7 +597,7 @@ def __init__(self, self.cg_id = [CombinedGraphLayer(name="cg_id_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] self.cg_reg = [CombinedGraphLayer(name="cg_reg_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] - self.output_dec = OutputDecoding(self.activation, hidden_dim, regression_use_classification, num_output_classes, schema) + self.output_dec = OutputDecoding(self.activation, hidden_dim, regression_use_classification, num_output_classes, schema, dropout) def call(self, inputs, training=False): X = inputs diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 171024b8e..5d1236721 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -214,12 +214,12 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False vals_true = np.log(vals_true) s = "_log" - plt.scatter(vals_pred, vals_true, marker=".", alpha=0.8, 
s=(1.0+hub_loss)) + plt.scatter(vals_pred, vals_true, marker=".", alpha=0.8, s=(2.0+hub_loss)) if len(vals_true) > 0: minval = np.min(vals_true) maxval = np.max(vals_true) if not (math.isnan(minval) or math.isnan(maxval) or math.isinf(minval) or math.isinf(maxval)): - plt.plot([minval, maxval], [minval, maxval], color="black", ls="--") + plt.plot([minval, maxval], [minval, maxval], color="black", ls="--", lw=0.5) plt.xlim(minval, maxval) plt.ylim(minval, maxval) @@ -262,6 +262,7 @@ def on_epoch_end(self, epoch, logs=None): self.plot_reg_distribution(cp_dir_cls, ypred, ypred_id, msk, icls, variable) self.plot_corr(cp_dir_cls, ypred, ypred_id, msk, icls, variable) self.plot_corr(cp_dir_cls, ypred, ypred_id, msk, icls, "energy", log=True) + self.plot_corr(cp_dir_cls, ypred, ypred_id, msk, icls, "pt", log=True) np.savez(str(cp_dir/"pred.npz"), X=self.X, ytrue=self.y, **ypred) diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 20e2ac0de..1fb476418 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -35,11 +35,9 @@ dataset: num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: - type: Huber - delta: 1.0 + type: MeanSquaredLogarithmicError pt_loss: - type: Huber - delta: 1.0 + type: MeanSquaredLogarithmicError sin_phi_loss: type: Huber delta: 0.1 @@ -57,7 +55,7 @@ setup: train: yes weights: weights_config: - lr: 1e-4 + lr: 1e-3 batch_size: 5 num_events_train: 80000 num_events_test: 10000 @@ -81,14 +79,14 @@ parameters: model: gnn_dense input_encoding: cms activation: elu - layernorm: no + layernorm: yes hidden_dim: 256 bin_size: 32 distance_dim: 16 dropout: 0.0 graph_kernel: type: NodePairTrainableKernel - output_dim: 32 + output_dim: 8 hidden_dim: 32 num_layers: 2 activation: elu diff --git a/parameters/cms.yaml b/parameters/cms.yaml index d64a9a19f..c3d71027d 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -35,11 +35,9 @@ dataset: num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: - type: Huber - delta: 1.0 + type: MeanSquaredLogarithmicError pt_loss: - type: Huber - delta: 1.0 + type: MeanSquaredLogarithmicError sin_phi_loss: type: Huber delta: 0.1 @@ -57,7 +55,7 @@ setup: train: yes weights: weights_config: - lr: 1e-4 + lr: 1e-3 batch_size: 5 num_events_train: 80000 num_events_test: 10000 @@ -81,16 +79,16 @@ parameters: model: gnn_dense input_encoding: cms activation: elu - layernorm: no + layernorm: yes hidden_dim: 256 - bin_size: 640 + bin_size: 320 distance_dim: 128 dropout: 0.0 graph_kernel: type: NodePairGaussianKernel dist_mult: 0.1 - clip_value_low: 0.1 - num_graph_layers: 2 + clip_value_low: 0.0 + num_graph_layers: 3 node_message: type: GHConvDense output_dim: 128 From 1abdef5b1afe29ae6c0bc92d3da099771a192542 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sun, 22 Aug 2021 19:40:22 +0300 Subject: [PATCH 073/157] up --- mlpf/tfmodel/model.py | 8 ++++---- mlpf/tfmodel/model_setup.py | 14 ++++++++------ parameters/cms-dev.yaml | 8 +++++--- parameters/cms.yaml | 8 +++++--- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 764bd0fa3..62082caba 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -399,22 +399,22 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou self.ffn_pt = point_wise_feed_forward_network( 4, hidden_dim, "ffn_pt", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, + 
dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=False, dropout=dropout ) self.ffn_eta = point_wise_feed_forward_network( 2, hidden_dim, "ffn_eta", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, dropout=dropout ) self.ffn_phi = point_wise_feed_forward_network( 4, hidden_dim, "ffn_phi", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, dropout=dropout ) self.ffn_energy = point_wise_feed_forward_network( 4, hidden_dim, "ffn_energy", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, + dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=False, dropout=dropout ) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 5d1236721..78b26a82a 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -201,12 +201,14 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False vals_pred = ypred[reg_variable][msk][sel].flatten() vals_true = self.ytrue[reg_variable][msk][sel].flatten() - #manually as in configuration, later can propagate - delta = 0.1 + #FIXME: propagate from configuration if reg_variable == "energy" or reg_variable == "pt": delta = 1.0 - hub = tf.keras.losses.Huber(delta=delta, reduction=tf.keras.losses.Reduction.NONE) - hub_loss = hub(np.expand_dims(vals_true, -1), np.expand_dims(vals_pred, axis=-1)).numpy() + else: + delta = 0.1 + + loss = tf.keras.losses.Huber(delta=delta, reduction=tf.keras.losses.Reduction.NONE) + loss_vals = loss(np.expand_dims(vals_true, -1), np.expand_dims(vals_pred, axis=-1)).numpy() s = "" if log: @@ -214,7 +216,7 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False vals_true = np.log(vals_true) s = "_log" - plt.scatter(vals_pred, vals_true, marker=".", alpha=0.8, s=(2.0+hub_loss)) + plt.scatter(vals_pred, vals_true, marker=".", alpha=0.8, s=loss_vals) if len(vals_true) > 0: minval = np.min(vals_true) maxval = np.max(vals_true) @@ -225,7 +227,7 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False plt.xlabel("predicted") plt.ylabel("true") - plt.title("{}, HL={:.4f}".format(reg_variable, np.sum(hub_loss))) + plt.title("{}, L={:.4f}".format(reg_variable, np.sum(loss_vals))) plt.savefig(str(outpath / "{}_cls{}_corr{}.png".format(reg_variable, icls, s)), bbox_inches="tight") plt.close("all") diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 1fb476418..9981dd964 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -35,9 +35,11 @@ dataset: num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: - type: MeanSquaredLogarithmicError + type: Huber + delta: 1.0 pt_loss: - type: MeanSquaredLogarithmicError + type: Huber + delta: 1.0 sin_phi_loss: type: Huber delta: 0.1 @@ -55,7 +57,7 @@ setup: train: yes weights: weights_config: - lr: 1e-3 + lr: 1e-4 batch_size: 5 num_events_train: 80000 num_events_test: 10000 diff --git a/parameters/cms.yaml b/parameters/cms.yaml index c3d71027d..b14193a83 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -35,9 +35,11 @@ dataset: num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: - type: MeanSquaredLogarithmicError + type: Huber + delta: 1.0 pt_loss: 
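  # Huber behaves like MSE for small residuals and like MAE for large ones, while
  # MeanSquaredLogarithmicError penalizes the relative (log-scale) error instead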
- type: MeanSquaredLogarithmicError + type: Huber + delta: 1.0 sin_phi_loss: type: Huber delta: 0.1 @@ -55,7 +57,7 @@ setup: train: yes weights: weights_config: - lr: 1e-3 + lr: 1e-4 batch_size: 5 num_events_train: 80000 num_events_test: 10000 From 06cacb0bd7a2bf173eac936e396d5c1916866619 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sun, 22 Aug 2021 20:13:04 +0300 Subject: [PATCH 074/157] add residual plots --- mlpf/tfmodel/model_setup.py | 29 ++++++++++++++++++++++------- parameters/cms-dev.yaml | 6 ++---- parameters/cms.yaml | 6 ++---- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 78b26a82a..58610f887 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -191,7 +191,7 @@ def plot_reg_distribution(self, outpath, ypred, ypred_id, msk, icls, reg_variabl plt.savefig(str(outpath / "{}_cls{}.png".format(reg_variable, icls)), bbox_inches="tight") plt.close("all") - def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False): + def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False): if icls==0: sel = (self.ytrue_id[msk]!=0) & (ypred_id[msk]!=0) @@ -203,13 +203,13 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False #FIXME: propagate from configuration if reg_variable == "energy" or reg_variable == "pt": - delta = 1.0 + loss = tf.keras.losses.MeanSquaredLogarithmicError(reduction=tf.keras.losses.Reduction.NONE) else: delta = 0.1 - - loss = tf.keras.losses.Huber(delta=delta, reduction=tf.keras.losses.Reduction.NONE) + loss = tf.keras.losses.Huber(delta=delta, reduction=tf.keras.losses.Reduction.NONE) loss_vals = loss(np.expand_dims(vals_true, -1), np.expand_dims(vals_pred, axis=-1)).numpy() + #suffix for log-transformed variable s = "" if log: vals_pred = np.log(vals_pred) @@ -231,6 +231,21 @@ def plot_corr(self, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False plt.savefig(str(outpath / "{}_cls{}_corr{}.png".format(reg_variable, icls, s)), bbox_inches="tight") plt.close("all") + #Also plot the residuals, as we have the true and predicted values already available here + plt.figure() + residual = vals_true - vals_pred + residual[np.isnan(residual)] = 0 + residual[np.isinf(residual)] = 0 + plt.hist(residual, bins=100) + plt.xlabel("true - pred") + plt.title("{} residual, m={:.2f} s={:.2f}".format(reg_variable, np.mean(residual), np.std(residual))) + plt.savefig(str(outpath / "{}_residual{}.png".format(reg_variable, s)), bbox_inches="tight") + plt.close("all") + + # FIXME: for some reason, these don't end up on the tensorboard + # tf.summary.scalar('residual_{}{}_mean'.format(reg_variable, s), data=np.mean(residual), step=epoch) + # tf.summary.scalar('residual_{}{}_std'.format(reg_variable, s), data=np.std(residual), step=epoch) + def on_epoch_end(self, epoch, logs=None): if epoch%self.plot_freq!=0: @@ -262,9 +277,9 @@ def on_epoch_end(self, epoch, logs=None): cp_dir_cls.mkdir(parents=True, exist_ok=True) for variable in ["pt", "eta", "sin_phi", "cos_phi", "energy"]: self.plot_reg_distribution(cp_dir_cls, ypred, ypred_id, msk, icls, variable) - self.plot_corr(cp_dir_cls, ypred, ypred_id, msk, icls, variable) - self.plot_corr(cp_dir_cls, ypred, ypred_id, msk, icls, "energy", log=True) - self.plot_corr(cp_dir_cls, ypred, ypred_id, msk, icls, "pt", log=True) + self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, msk, icls, variable) + self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, msk, icls, 
"energy", log=True) + self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, msk, icls, "pt", log=True) np.savez(str(cp_dir/"pred.npz"), X=self.X, ytrue=self.y, **ypred) diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 9981dd964..93bacb32c 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -35,11 +35,9 @@ dataset: num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: - type: Huber - delta: 1.0 + type: MeanSquaredLogarithmicError pt_loss: - type: Huber - delta: 1.0 + type: MeanSquaredLogarithmicError sin_phi_loss: type: Huber delta: 0.1 diff --git a/parameters/cms.yaml b/parameters/cms.yaml index b14193a83..8d9b63307 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -35,11 +35,9 @@ dataset: num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: - type: Huber - delta: 1.0 + type: MeanSquaredLogarithmicError pt_loss: - type: Huber - delta: 1.0 + type: MeanSquaredLogarithmicError sin_phi_loss: type: Huber delta: 0.1 From f7d0cb416ecc424152fa4d742a53d7c68dd92699 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Mon, 23 Aug 2021 13:02:24 +0300 Subject: [PATCH 075/157] separate energy graph layer --- mlpf/pipeline.py | 7 ++++- mlpf/tfmodel/model.py | 55 +++++++++++++++++++++++++---------- mlpf/tfmodel/model_setup.py | 2 +- mlpf/tfmodel/utils.py | 2 +- notebooks/pfnet-debug.ipynb | 58 ++++++++++++++++++++++++++++++------- parameters/cms-dev.yaml | 8 ++--- parameters/cms.yaml | 8 ++--- parameters/delphes.yaml | 6 ++-- 8 files changed, 105 insertions(+), 41 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index bc040bed9..086142873 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -166,11 +166,16 @@ def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq): ) model.summary() + validation_particles = None + if config["dataset"]["target_particles"] == "cand": + validation_particles = ycand_val + elif config["dataset"]["target_particles"] == "gen": + validation_particles = ycand_val callbacks = prepare_callbacks( model, outdir, X_val, - ycand_val, + validation_particles, dataset_transform, config["dataset"]["num_output_classes"], dataset_def, diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 62082caba..00d2ac687 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -413,8 +413,13 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou dropout=dropout ) self.ffn_energy = point_wise_feed_forward_network( - 4, hidden_dim, "ffn_energy", - dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=False, + 1, hidden_dim, "ffn_energy", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, + dropout=dropout + ) + self.ffn_energy_sigmoid = point_wise_feed_forward_network( + 1, hidden_dim, "ffn_energy_sigmoid", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, dropout=dropout ) @@ -424,7 +429,7 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou X_encoded_reg: (n_batch, n_elements, n_encoded_features) msk_input: (n_batch, n_elements) boolean mask """ - def call(self, X_input, X_encoded_id, X_encoded_reg, msk_input): + def call(self, X_input, X_encoded_id, X_encoded_reg, X_encoded_energy, msk_input): out_id_logits = self.ffn_id(X_encoded_id)*msk_input out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) @@ -444,29 +449,27 @@ def call(self, X_input, 
X_encoded_id, X_encoded_reg, msk_input): orig_energy = X_input[:, :, 5:6] if self.regression_use_classification: - X_encoded_reg = tf.concat([X_encoded_reg, tf.stop_gradient(out_id_logits)], axis=-1) + X_encoded_reg = tf.concat([X_encoded_reg, out_id_logits], axis=-1) pred_eta_corr = self.ffn_eta(X_encoded_reg) pred_phi_corr = self.ffn_phi(X_encoded_reg) - pred_energy_corr = self.ffn_energy(X_encoded_reg) - pred_pt_corr = self.ffn_pt(X_encoded_reg) eta_sigmoid = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) + pred_eta = orig_eta*eta_sigmoid + (1.0 - eta_sigmoid)*pred_eta_corr[:, :, 1:2] + sin_phi_sigmoid = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 0:1]) cos_phi_sigmoid = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 2:3]) - energy_sigmoid = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) - pt_sigmoid = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) - - pred_eta = orig_eta*eta_sigmoid + (1.0 - eta_sigmoid)*pred_eta_corr[:, :, 1:2] pred_sin_phi = orig_sin_phi*sin_phi_sigmoid + (1.0 - sin_phi_sigmoid)*pred_phi_corr[:, :, 1:2] pred_cos_phi = orig_cos_phi*cos_phi_sigmoid + (1.0 - cos_phi_sigmoid)*pred_phi_corr[:, :, 3:4] + + pred_energy_corr = self.ffn_energy(X_encoded_energy) + energy_sigmoid = tf.keras.activations.sigmoid(self.ffn_energy_sigmoid(X_encoded_energy)) + pred_energy = orig_energy*energy_sigmoid + (1.0 - energy_sigmoid)*pred_energy_corr[:, :, 0:1] - pred_energy = (orig_energy*energy_sigmoid + (1.0 - energy_sigmoid)*( - pred_energy_corr[:, :, 1:2]*orig_energy*orig_energy + pred_energy_corr[:, :, 2:3]*orig_energy + pred_energy_corr[:, :, 3:4])) - + pred_pt_corr = self.ffn_pt(X_encoded_energy) orig_pt = tf.stop_gradient(pred_energy / tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8))) - pred_pt = (orig_pt*pt_sigmoid + (1.0 - pt_sigmoid)*( - pred_pt_corr[:, :, 1:2]*orig_pt*orig_pt + pred_pt_corr[:, :, 2:3]*orig_pt + pred_pt_corr[:, :, 3:4])) + pt_sigmoid = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) + pred_pt = orig_pt*pt_sigmoid + (1.0 - pt_sigmoid)*pred_pt_corr[:, :, 1:2] ret = { "cls": out_id_softmax, @@ -582,6 +585,7 @@ def __init__(self, self.ffn_enc_id = point_wise_feed_forward_network(hidden_dim, hidden_dim, "ffn_enc_id", activation=activation) self.ffn_enc_reg = point_wise_feed_forward_network(hidden_dim, hidden_dim, "ffn_enc_reg", activation=activation) + self.ffn_enc_energy = point_wise_feed_forward_network(hidden_dim, hidden_dim, "ffn_enc_energy", activation=activation) kwargs_cg = { "max_num_bins": max_num_bins, @@ -596,6 +600,7 @@ def __init__(self, } self.cg_id = [CombinedGraphLayer(name="cg_id_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] self.cg_reg = [CombinedGraphLayer(name="cg_reg_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] + self.cg_energy = [CombinedGraphLayer(name="cg_energy_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] self.output_dec = OutputDecoding(self.activation, hidden_dim, regression_use_classification, num_output_classes, schema, dropout) @@ -630,6 +635,16 @@ def call(self, inputs, training=False): debugging_data[cg.name] = enc_reg_all encs_reg.append(enc_reg) + #encode the elements for energy regression + enc_energy = self.activation(self.ffn_enc_energy(enc)) + encs_energy = [] + for cg in self.cg_energy: + enc_energy_all = cg(enc_energy, msk, training) + enc_energy = enc_energy_all["enc"] + if self.debug: + debugging_data[cg.name] = enc_energy_all + encs_energy.append(enc_energy) + dec_input_cls = [] if self.skip_connection: dec_input_cls.append(enc) @@ -646,7 
+661,15 @@ def call(self, inputs, training=False): if self.debug: debugging_data["dec_output_reg"] = dec_output_reg - ret = self.output_dec(X, dec_output_id, dec_output_reg, msk_input) + dec_input_energy = [] + if self.skip_connection: + dec_input_energy.append(enc) + dec_input_energy += encs_energy + dec_output_energy = tf.concat(dec_input_energy, axis=-1)*msk_input + if self.debug: + debugging_data["dec_output_energy"] = dec_output_energy + + ret = self.output_dec(X, dec_output_id, dec_output_reg, dec_output_energy, msk_input) if self.debug: for k in debugging_data.keys(): diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 58610f887..ae8eb89c1 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -238,7 +238,7 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, lo residual[np.isinf(residual)] = 0 plt.hist(residual, bins=100) plt.xlabel("true - pred") - plt.title("{} residual, m={:.2f} s={:.2f}".format(reg_variable, np.mean(residual), np.std(residual))) + plt.title("{} residual, m={:.4f} s={:.4f}".format(reg_variable, np.mean(residual), np.std(residual))) plt.savefig(str(outpath / "{}_residual{}.png".format(reg_variable, s)), bbox_inches="tight") plt.close("all") diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 65892dd13..5b0b65331 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -153,7 +153,7 @@ def compute_weights_none(X, y, w): def make_weight_function(config): def weight_func(X,y,w): - w_signal_only = tf.where(y[:, 0]==0, 0.0, tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)) + w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0) w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32) w_none = tf.ones_like(w) diff --git a/notebooks/pfnet-debug.ipynb b/notebooks/pfnet-debug.ipynb index 41d0a9f19..54426da53 100644 --- a/notebooks/pfnet-debug.ipynb +++ b/notebooks/pfnet-debug.ipynb @@ -33,7 +33,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open(\"/home/joosep/particleflow/parameters/cms-gnn-dense-dev.yaml\") as f:\n", + "with open(\"/home/joosep/particleflow/parameters/cms.yaml\") as f:\n", " config = yaml.load(f)\n", "config[\"setup\"][\"multi_output\"] = True\n", "config[\"parameters\"][\"debug\"] = True" @@ -80,7 +80,7 @@ "ygens = []\n", "ycands = []\n", "\n", - "for fi in dataset_def.val_filelist[:10]:\n", + "for fi in dataset_def.val_filelist[:2]:\n", " print(fi)\n", " X, ygen, ycand = dataset_def.prepare_data(fi)\n", "\n", @@ -92,7 +92,8 @@ "ygen_val = np.concatenate(ygens)\n", "ycand_val = np.concatenate(ycands)\n", "\n", - "X_val, ycand_val, _ = dataset_transform(X_val, ycand_val, None)\n" + "X_val, ycand_val, _ = dataset_transform(X_val, ycand_val, None)\n", + "X_val, ygen_val, _ = dataset_transform(X_val, ygen_val, None)\n" ] }, { @@ -101,7 +102,7 @@ "metadata": {}, "outputs": [], "source": [ - "vals_a = X_val[:, :, 4].flatten()" + "vals_a = X_val[:, :, 2].flatten()" ] }, { @@ -110,7 +111,11 @@ "metadata": {}, "outputs": [], "source": [ - "vals_b = ycand_val[\"energy\"][:, :, 0].flatten()" + "vals_b = ycand_val[\"eta\"][:, :, 0].flatten()\n", + "vals_c = ygen_val[\"eta\"][:, :, 0].flatten()\n", + "\n", + "cls_cand = np.argmax(ycand_val[\"cls\"], axis=-1).flatten()\n", + "cls_gen = np.argmax(ygen_val[\"cls\"], axis=-1).flatten()" ] }, { @@ -119,8 +124,8 @@ "metadata": {}, "outputs": [], "source": [ - "msk =np.argmax(ycand_val[\"cls\"], axis=-1).flatten()==4\n", - "plt.scatter(vals_a[msk], vals_b[msk], marker=\".\", alpha=0.2)" + "msk = (np.argmax(ycand_val[\"cls\"], 
axis=-1).flatten()==6) & (np.argmax(ygen_val[\"cls\"], axis=-1).flatten()==6)\n", + "plt.scatter(vals_a[msk], vals_c[msk], marker=\".\", alpha=0.2)" ] }, { @@ -149,8 +154,8 @@ "source": [ "ret = model(X_val[:1])\n", "#model.set_trainable_classification()\n", - "model.load_weights(\"/home/joosep/particleflow/experiments/cms-gnn-dense-dev_20210819_101049.joosep-desktop/weights/weights-10-125.094849.hdf5\")\n", - "ret = model.predict(X_val, batch_size=1)" + "model.load_weights(\"/home/joosep/particleflow/experiments/cms_20210823_110858.joosep-desktop/weights/weights-02-95.751160.hdf5\")\n", + "ret = model.predict(X_val, batch_size=1, verbose=1)" ] }, { @@ -169,7 +174,40 @@ "metadata": {}, "outputs": [], "source": [ - "preds = model(X_val[:1], training=False)" + "cls = np.argmax(ret[\"cls\"], axis=-1)\n", + "cls_true = np.argmax(ycand_val[\"cls\"], axis=-1)\n", + "energy = ret[\"energy\"]\n", + "eta = ret[\"eta\"]\n", + "energy_true = ycand_val[\"energy\"]\n", + "\n", + "msk = (cls==4) & (cls_true==4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(energy[msk].flatten()-energy_true[msk].flatten(), bins=100);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_val[msk][:, 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.scatter(eta[msk], energy[msk].flatten(), marker=\".\")" ] }, { diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 93bacb32c..70653616e 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -56,7 +56,7 @@ setup: weights: weights_config: lr: 1e-4 - batch_size: 5 + batch_size: 4 num_events_train: 80000 num_events_test: 10000 num_epochs: 50 @@ -90,12 +90,12 @@ parameters: hidden_dim: 32 num_layers: 2 activation: elu - num_graph_layers: 3 + num_graph_layers: 2 node_message: type: NodeMessageLearnable - output_dim: 256 + output_dim: 512 hidden_dim: 128 - num_layers: 3 + num_layers: 2 activation: elu aggregation_direction: dst num_node_messages: 1 diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 8d9b63307..ad60a998c 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -56,7 +56,7 @@ setup: weights: weights_config: lr: 1e-4 - batch_size: 5 + batch_size: 4 num_events_train: 80000 num_events_test: 10000 num_epochs: 50 @@ -88,13 +88,13 @@ parameters: type: NodePairGaussianKernel dist_mult: 0.1 clip_value_low: 0.0 - num_graph_layers: 3 + num_graph_layers: 2 node_message: type: GHConvDense - output_dim: 128 + output_dim: 512 activation: elu normalize_degrees: yes - num_node_messages: 2 + num_node_messages: 1 skip_connection: yes regression_use_classification: yes debug: no diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index e9c27d05f..a679ce39e 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -23,11 +23,9 @@ dataset: num_files_per_chunk: 5 validation_file_path: data/pythia8_qcd/val/*.pkl.bz2 energy_loss: - type: Huber - delta: 1.0 + type: MeanSquaredLogarithmicError pt_loss: - type: Huber - delta: 1.0 + type: MeanSquaredLogarithmicError sin_phi_loss: type: Huber delta: 0.1 From d07745cbb5057b2917f3e41bcb78d24b8474e43d Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Mon, 23 Aug 2021 15:50:17 +0300 Subject: [PATCH 076/157] separate energy graph layer --- mlpf/tfmodel/model.py | 103 +++++++++++++----------------------------- parameters/cms.yaml | 8 ++-- 2 files changed, 36 insertions(+), 
75 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 00d2ac687..0dfe90ea8 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -399,22 +399,22 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou self.ffn_pt = point_wise_feed_forward_network( 4, hidden_dim, "ffn_pt", - dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=False, + dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True, dropout=dropout ) self.ffn_eta = point_wise_feed_forward_network( 2, hidden_dim, "ffn_eta", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, dropout=dropout ) self.ffn_phi = point_wise_feed_forward_network( 4, hidden_dim, "ffn_phi", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, dropout=dropout ) self.ffn_energy = point_wise_feed_forward_network( 1, hidden_dim, "ffn_energy", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, dropout=dropout ) self.ffn_energy_sigmoid = point_wise_feed_forward_network( @@ -429,11 +429,11 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou X_encoded_reg: (n_batch, n_elements, n_encoded_features) msk_input: (n_batch, n_elements) boolean mask """ - def call(self, X_input, X_encoded_id, X_encoded_reg, X_encoded_energy, msk_input): + def call(self, X_input, X_encoded, msk_input): - out_id_logits = self.ffn_id(X_encoded_id)*msk_input + out_id_logits = self.ffn_id(X_encoded)*msk_input out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) - out_charge = self.ffn_charge(X_encoded_id)*msk_input + out_charge = self.ffn_charge(X_encoded)*msk_input #orig_pt = X_input[:, :, 1:2] orig_eta = X_input[:, :, 2:3] @@ -449,10 +449,10 @@ def call(self, X_input, X_encoded_id, X_encoded_reg, X_encoded_energy, msk_input orig_energy = X_input[:, :, 5:6] if self.regression_use_classification: - X_encoded_reg = tf.concat([X_encoded_reg, out_id_logits], axis=-1) + X_encoded = tf.concat([X_encoded, out_id_logits], axis=-1) - pred_eta_corr = self.ffn_eta(X_encoded_reg) - pred_phi_corr = self.ffn_phi(X_encoded_reg) + pred_eta_corr = self.ffn_eta(X_encoded) + pred_phi_corr = self.ffn_phi(X_encoded) eta_sigmoid = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) pred_eta = orig_eta*eta_sigmoid + (1.0 - eta_sigmoid)*pred_eta_corr[:, :, 1:2] @@ -462,11 +462,11 @@ def call(self, X_input, X_encoded_id, X_encoded_reg, X_encoded_energy, msk_input pred_sin_phi = orig_sin_phi*sin_phi_sigmoid + (1.0 - sin_phi_sigmoid)*pred_phi_corr[:, :, 1:2] pred_cos_phi = orig_cos_phi*cos_phi_sigmoid + (1.0 - cos_phi_sigmoid)*pred_phi_corr[:, :, 3:4] - pred_energy_corr = self.ffn_energy(X_encoded_energy) - energy_sigmoid = tf.keras.activations.sigmoid(self.ffn_energy_sigmoid(X_encoded_energy)) + pred_energy_corr = self.ffn_energy(X_encoded) + energy_sigmoid = tf.keras.activations.sigmoid(self.ffn_energy_sigmoid(X_encoded)) pred_energy = orig_energy*energy_sigmoid + (1.0 - energy_sigmoid)*pred_energy_corr[:, :, 0:1] - pred_pt_corr = self.ffn_pt(X_encoded_energy) + pred_pt_corr = self.ffn_pt(X_encoded) orig_pt = tf.stop_gradient(pred_energy / tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8))) pt_sigmoid = 
tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) pred_pt = orig_pt*pt_sigmoid + (1.0 - pt_sigmoid)*pred_pt_corr[:, :, 1:2] @@ -572,6 +572,7 @@ def __init__(self, self.activation = activation self.focal_loss_from_logits = focal_loss_from_logits self.debug = debug + self.separate_graph_layers = False self.skip_connection = skip_connection @@ -583,9 +584,6 @@ def __init__(self, elif input_encoding == "default": self.enc = InputEncoding(num_input_classes) - self.ffn_enc_id = point_wise_feed_forward_network(hidden_dim, hidden_dim, "ffn_enc_id", activation=activation) - self.ffn_enc_reg = point_wise_feed_forward_network(hidden_dim, hidden_dim, "ffn_enc_reg", activation=activation) - self.ffn_enc_energy = point_wise_feed_forward_network(hidden_dim, hidden_dim, "ffn_enc_energy", activation=activation) kwargs_cg = { "max_num_bins": max_num_bins, @@ -598,78 +596,41 @@ def __init__(self, "node_message": node_message, "hidden_dim": hidden_dim } - self.cg_id = [CombinedGraphLayer(name="cg_id_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] - self.cg_reg = [CombinedGraphLayer(name="cg_reg_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] - self.cg_energy = [CombinedGraphLayer(name="cg_energy_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] + + self.ffn_enc = point_wise_feed_forward_network(hidden_dim, hidden_dim, "ffn_enc", activation=activation) + self.cg = [CombinedGraphLayer(name="cg_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] self.output_dec = OutputDecoding(self.activation, hidden_dim, regression_use_classification, num_output_classes, schema, dropout) def call(self, inputs, training=False): X = inputs + debugging_data = {} #mask padded elements msk = X[:, :, 0] != 0 msk_input = tf.expand_dims(tf.cast(msk, tf.float32), -1) - enc = self.enc(X) - enc_id = self.activation(self.ffn_enc_id(enc)) - encs_id = [] - - debugging_data = {} - #encode the elements for classification (id) - for cg in self.cg_id: - enc_id_all = cg(enc_id, msk, training) - enc_id = enc_id_all["enc"] - if self.debug: - debugging_data[cg.name] = enc_id_all - encs_id.append(enc_id) + enc = self.enc(X) - #encode the elements for regression - enc_reg = self.activation(self.ffn_enc_reg(enc)) - encs_reg = [] - for cg in self.cg_reg: - enc_reg_all = cg(enc_reg, msk, training) - enc_reg = enc_reg_all["enc"] + enc_cg = self.activation(self.ffn_enc(enc)) + encs = [] + for cg in self.cg: + enc_all = cg(enc_cg, msk, training) + enc_cg = enc_all["enc"] if self.debug: - debugging_data[cg.name] = enc_reg_all - encs_reg.append(enc_reg) - - #encode the elements for energy regression - enc_energy = self.activation(self.ffn_enc_energy(enc)) - encs_energy = [] - for cg in self.cg_energy: - enc_energy_all = cg(enc_energy, msk, training) - enc_energy = enc_energy_all["enc"] - if self.debug: - debugging_data[cg.name] = enc_energy_all - encs_energy.append(enc_energy) - - dec_input_cls = [] - if self.skip_connection: - dec_input_cls.append(enc) - dec_input_cls += encs_id - dec_output_id = tf.concat(dec_input_cls, axis=-1)*msk_input - if self.debug: - debugging_data["dec_output_id"] = dec_output_id - - dec_input_reg = [] - if self.skip_connection: - dec_input_reg.append(enc) - dec_input_reg += encs_reg - dec_output_reg = tf.concat(dec_input_reg, axis=-1)*msk_input - if self.debug: - debugging_data["dec_output_reg"] = dec_output_reg + debugging_data[cg.name] = enc_all + encs.append(enc_cg) - dec_input_energy = [] + dec_input = [] if self.skip_connection: - dec_input_energy.append(enc) - 
dec_input_energy += encs_energy - dec_output_energy = tf.concat(dec_input_energy, axis=-1)*msk_input + dec_input.append(enc) + dec_input += encs + dec_output = tf.concat(dec_input, axis=-1)*msk_input if self.debug: - debugging_data["dec_output_energy"] = dec_output_energy + debugging_data["dec_output"] = dec_output - ret = self.output_dec(X, dec_output_id, dec_output_reg, dec_output_energy, msk_input) + ret = self.output_dec(X, dec_output, msk_input) if self.debug: for k in debugging_data.keys(): diff --git a/parameters/cms.yaml b/parameters/cms.yaml index ad60a998c..9d0c26262 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -35,9 +35,9 @@ dataset: num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: - type: MeanSquaredLogarithmicError + type: Huber pt_loss: - type: MeanSquaredLogarithmicError + type: Huber sin_phi_loss: type: Huber delta: 0.1 @@ -88,10 +88,10 @@ parameters: type: NodePairGaussianKernel dist_mult: 0.1 clip_value_low: 0.0 - num_graph_layers: 2 + num_graph_layers: 5 node_message: type: GHConvDense - output_dim: 512 + output_dim: 256 activation: elu normalize_degrees: yes num_node_messages: 1 From c21735adf7cef1bb54f371dc18c257e8a492c4c7 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Mon, 23 Aug 2021 16:40:16 +0300 Subject: [PATCH 077/157] just use the same cg layers --- mlpf/tfmodel/model.py | 94 +++++++++---------------------------- mlpf/tfmodel/model_setup.py | 5 +- parameters/cms-dev.yaml | 8 ++-- parameters/cms.yaml | 2 +- 4 files changed, 29 insertions(+), 80 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 0dfe90ea8..96954438e 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -398,27 +398,25 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou ) self.ffn_pt = point_wise_feed_forward_network( - 4, hidden_dim, "ffn_pt", - dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True, + 2, hidden_dim, "ffn_pt", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, dropout=dropout ) + self.ffn_eta = point_wise_feed_forward_network( 2, hidden_dim, "ffn_eta", dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, dropout=dropout ) + self.ffn_phi = point_wise_feed_forward_network( 4, hidden_dim, "ffn_phi", dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, dropout=dropout ) + self.ffn_energy = point_wise_feed_forward_network( - 1, hidden_dim, "ffn_energy", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, - dropout=dropout - ) - self.ffn_energy_sigmoid = point_wise_feed_forward_network( - 1, hidden_dim, "ffn_energy_sigmoid", + 2, hidden_dim, "ffn_energy", dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, dropout=dropout ) @@ -453,6 +451,8 @@ def call(self, X_input, X_encoded, msk_input): pred_eta_corr = self.ffn_eta(X_encoded) pred_phi_corr = self.ffn_phi(X_encoded) + pred_energy_corr = self.ffn_energy(X_encoded) + pred_pt_corr = self.ffn_pt(X_encoded) eta_sigmoid = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) pred_eta = orig_eta*eta_sigmoid + (1.0 - eta_sigmoid)*pred_eta_corr[:, :, 1:2] @@ -462,14 +462,12 @@ def call(self, X_input, X_encoded, msk_input): pred_sin_phi = orig_sin_phi*sin_phi_sigmoid + (1.0 - sin_phi_sigmoid)*pred_phi_corr[:, :, 1:2] pred_cos_phi = orig_cos_phi*cos_phi_sigmoid + (1.0 - cos_phi_sigmoid)*pred_phi_corr[:, :, 3:4] - pred_energy_corr 
= self.ffn_energy(X_encoded) - energy_sigmoid = tf.keras.activations.sigmoid(self.ffn_energy_sigmoid(X_encoded)) - pred_energy = orig_energy*energy_sigmoid + (1.0 - energy_sigmoid)*pred_energy_corr[:, :, 0:1] + energy_sigmoid = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) + pred_energy = orig_energy*energy_sigmoid + (1.0 - energy_sigmoid)*tf.exp(tf.clip_by_value(pred_energy_corr[:, :, 1:2], -6, 6)) - pred_pt_corr = self.ffn_pt(X_encoded) orig_pt = tf.stop_gradient(pred_energy / tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8))) pt_sigmoid = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) - pred_pt = orig_pt*pt_sigmoid + (1.0 - pt_sigmoid)*pred_pt_corr[:, :, 1:2] + pred_pt = orig_pt*pt_sigmoid + (1.0 - pt_sigmoid)*tf.exp(tf.clip_by_value(pred_pt_corr[:, :, 1:2], -6, 6)) ret = { "cls": out_id_softmax, @@ -483,6 +481,14 @@ def call(self, X_input, X_encoded, msk_input): return ret + def set_trainable_named(self, layer_names): + self.trainable = True + + for layer in self.layers: + layer.trainable = False + + for layer in layer_names: + self.get_layer(layer).trainable = True class CombinedGraphLayer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): @@ -584,7 +590,6 @@ def __init__(self, elif input_encoding == "default": self.enc = InputEncoding(num_input_classes) - kwargs_cg = { "max_num_bins": max_num_bins, "bin_size": bin_size, @@ -597,7 +602,6 @@ def __init__(self, "hidden_dim": hidden_dim } - self.ffn_enc = point_wise_feed_forward_network(hidden_dim, hidden_dim, "ffn_enc", activation=activation) self.cg = [CombinedGraphLayer(name="cg_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] self.output_dec = OutputDecoding(self.activation, hidden_dim, regression_use_classification, num_output_classes, schema, dropout) @@ -613,7 +617,7 @@ def call(self, inputs, training=False): #encode the elements for classification (id) enc = self.enc(X) - enc_cg = self.activation(self.ffn_enc(enc)) + enc_cg = enc encs = [] for cg in self.cg: enc_all = cg(enc_cg, msk, training) @@ -641,69 +645,13 @@ def call(self, inputs, training=False): else: return tf.concat([ret["cls"], ret["charge"], ret["pt"], ret["eta"], ret["sin_phi"], ret["cos_phi"], ret["energy"]], axis=-1) - def set_trainable_classification(self): - self.trainable = True - for layer in self.layers: - layer.trainable = True - - self.ffn_enc_reg.trainable = False - for cg in self.cg_reg: - cg.trainable = False - self.ffn_pt.trainable = False - self.ffn_eta.trainable = False - self.ffn_phi.trainable = False - self.ffn_energy.trainable = False - - def set_trainable_regression(self): - self.trainable = True - for layer in self.layers: - layer.trainable = True - - self.ffn_enc_id.trainable = False - for cg in self.cg_id: - cg.trainable = False - self.ffn_id.trainable = False - self.ffn_charge.trainable = False - def set_trainable_named(self, layer_names): self.trainable = True for layer in self.layers: layer.trainable = False - for layer in layer_names: - self.get_layer(layer).trainable = True - - # def train_step(self, data): - # # Unpack the data. Its structure depends on your model and - # # on what you pass to `fit()`. 
- # x, y, sample_weights = data - - # with tf.GradientTape() as tape: - # y_pred = self(x, training=True) # Forward pass - # # Compute the loss value - # # (the loss function is configured in `compile()`) - # loss = self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) - - # ya = {k: v.numpy() for k, v in y.items()} - # yb = {k: v.numpy() for k, v in y_pred.items()} - # sw = {k: v.numpy() for k, v in sample_weights.items()} - - # np.savez("ytrue.npz", **ya) - # np.savez("ypred.npz", **yb) - # np.savez("x.npz", x=x) - # np.savez("sample_weights.npz", **sample_weights) - - # # Compute gradients - # trainable_vars = self.trainable_variables - # gradients = tape.gradient(loss, trainable_vars) - # # Update weights - # self.optimizer.apply_gradients(zip(gradients, trainable_vars)) - # # Update metrics (includes the metric that tracks the loss) - # self.compiled_metrics.update_state(y, y_pred) - # # Return a dict mapping metric names to current value - # return {m.name: m.result() for m in self.metrics} - + self.output_dec.set_trainable_named(layer_names) class DummyNet(tf.keras.Model): def __init__(self, diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index ae8eb89c1..17f1a723b 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -203,10 +203,11 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, lo #FIXME: propagate from configuration if reg_variable == "energy" or reg_variable == "pt": - loss = tf.keras.losses.MeanSquaredLogarithmicError(reduction=tf.keras.losses.Reduction.NONE) + delta = 1.0 else: delta = 0.1 - loss = tf.keras.losses.Huber(delta=delta, reduction=tf.keras.losses.Reduction.NONE) + + loss = tf.keras.losses.Huber(delta=delta, reduction=tf.keras.losses.Reduction.NONE) loss_vals = loss(np.expand_dims(vals_true, -1), np.expand_dims(vals_pred, axis=-1)).numpy() #suffix for log-transformed variable diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 70653616e..35331e008 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -35,9 +35,9 @@ dataset: num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: - type: MeanSquaredLogarithmicError + type: Huber pt_loss: - type: MeanSquaredLogarithmicError + type: Huber sin_phi_loss: type: Huber delta: 0.1 @@ -90,10 +90,10 @@ parameters: hidden_dim: 32 num_layers: 2 activation: elu - num_graph_layers: 2 + num_graph_layers: 6 node_message: type: NodeMessageLearnable - output_dim: 512 + output_dim: 256 hidden_dim: 128 num_layers: 2 activation: elu diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 9d0c26262..6d4e1bb4d 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -88,7 +88,7 @@ parameters: type: NodePairGaussianKernel dist_mult: 0.1 clip_value_low: 0.0 - num_graph_layers: 5 + num_graph_layers: 6 node_message: type: GHConvDense output_dim: 256 From 96dd6474b0e9cd50d5e6ffb6e334746faddfe81e Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Aug 2021 09:39:50 -0700 Subject: [PATCH 078/157] adding code for the optimized knn --- mlpf/pytorch_delphes/gravnet_optimized.py | 122 +++++ mlpf/pytorch_delphes/model_optimized.py | 87 ++++ mlpf/pytorch_delphes/pipeline_optimized.py | 508 +++++++++++++++++++++ 3 files changed, 717 insertions(+) create mode 100644 mlpf/pytorch_delphes/gravnet_optimized.py create mode 100644 mlpf/pytorch_delphes/model_optimized.py create mode 100644 mlpf/pytorch_delphes/pipeline_optimized.py diff --git 
a/mlpf/pytorch_delphes/gravnet_optimized.py b/mlpf/pytorch_delphes/gravnet_optimized.py new file mode 100644 index 000000000..f9df1c354 --- /dev/null +++ b/mlpf/pytorch_delphes/gravnet_optimized.py @@ -0,0 +1,122 @@ +#BEFORE OPTIMIZATION +from typing import Optional, Union +from torch_geometric.typing import OptTensor, PairTensor, PairOptTensor +import time + +import torch +from torch import Tensor +from torch.nn import Linear +from torch_scatter import scatter +from torch_geometric.nn.conv import MessagePassing + +try: + from torch_cluster import knn ###############USES OLD KNN######################### +except ImportError: + knn = None + +# copied it from pytorch_geometric source code +# ADDED: retrieve edge_index, retrieve edge_weight +# CHANGED: self.lin -> self.lin_p +# CHANGED: used reduce='sum' instead of reduce='mean' in the message passing +# REMOVED: skip connection + +class GravNetConv(MessagePassing): + r"""The GravNet operator from the `"Learning Representations of Irregular + Particle-detector Geometry with Distance-weighted Graph + Networks" `_ paper, where the graph is + dynamically constructed using nearest neighbors. + The neighbors are constructed in a learnable low-dimensional projection of + the feature space. + A second projection of the input feature space is then propagated from the + neighbors to each vertex using distance weights that are derived by + applying a Gaussian function to the distances. + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + space_dimensions (int): The dimensionality of the space used to + construct the neighbors; referred to as :math:`S` in the paper. + propagate_dimensions (int): The number of features to be propagated + between the vertices; referred to as :math:`F_{\textrm{LR}}` in the + paper. + k (int): The number of nearest neighbors. + num_workers (int): Number of workers to use for k-NN computation. + Has no effect in case :obj:`batch` is not :obj:`None`, or the input + lies on the GPU. (default: :obj:`1`) + **kwargs (optional): Additional arguments of + :class:`torch_geometric.nn.conv.MessagePassing`. + """ + def __init__(self, in_channels: int, out_channels: int, + space_dimensions: int, propagate_dimensions: int, k: int, + num_workers: int = 1, **kwargs): + super(GravNetConv, self).__init__(flow='target_to_source', **kwargs) + + if knn is None: + raise ImportError('`GravNetConv` requires `torch-cluster`.') + + self.in_channels = in_channels + self.out_channels = out_channels + self.k = k + self.num_workers = num_workers + + self.lin_s = Linear(in_channels, space_dimensions) + self.lin_h = Linear(in_channels, propagate_dimensions) + self.lin_p = Linear(propagate_dimensions, out_channels) + + self.reset_parameters() + + def reset_parameters(self): + self.lin_s.reset_parameters() + self.lin_h.reset_parameters() + self.lin_p.reset_parameters() + + + def forward( + self, x: Union[Tensor, PairTensor], + batch: Union[OptTensor, Optional[PairTensor]] = None) -> Tensor: + """""" + + is_bipartite: bool = True + if isinstance(x, Tensor): + x: PairTensor = (x, x) + is_bipartite = False + assert x[0].dim() == 2, 'Static graphs not supported in `GravNetConv`.' 
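To make the docstring above concrete, here is a hypothetical standalone usage of this layer (argument values mirror the PFNet7 defaults that appear later in this patch; the input tensor is illustrative):

    import torch
    from gravnet_optimized import GravNetConv

    conv = GravNetConv(in_channels=12, out_channels=64,
                       space_dimensions=4, propagate_dimensions=22, k=16)
    x = torch.randn(5000, 12)               # one event: 5000 elements, 12 features each
    out, edge_index, edge_weight = conv(x)  # this variant also returns the k-NN graph
    # out: (5000, 64); edge_index: (2, 5000*16); edge_weight: one Gaussian weight per edge

Unlike the upstream torch_geometric layer, this copy returns edge_index and edge_weight alongside the transformed features, which is what lets the calling code inspect the learned adjacency.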
+ + b: PairOptTensor = (None, None) + if isinstance(batch, Tensor): + b = (batch, batch) + elif isinstance(batch, tuple): + assert batch is not None + b = (batch[0], batch[1]) + + h_l: Tensor = self.lin_h(x[0]) + + s_l: Tensor = self.lin_s(x[0]) + s_r: Tensor = self.lin_s(x[1]) if is_bipartite else s_l + + edge_index = knn(s_l, s_r, self.k, b[0], b[1], + num_workers=self.num_workers) + + edge_weight = (s_l[edge_index[1]] - s_r[edge_index[0]]).pow(2).sum(-1) + edge_weight = torch.exp(-10. * edge_weight) # 10 gives a better spread + + # propagate_type: (x: OptPairTensor, edge_weight: OptTensor) + out = self.propagate(edge_index, x=(h_l, None), + edge_weight=edge_weight, + size=(s_l.size(0), s_r.size(0))) + + return self.lin_p(out), edge_index, edge_weight + + + def message(self, x_j: Tensor, edge_weight: Tensor) -> Tensor: + return x_j * edge_weight.unsqueeze(1) + + def aggregate(self, inputs: Tensor, index: Tensor, + dim_size: Optional[int] = None) -> Tensor: + out_mean = scatter(inputs, index, dim=self.node_dim, dim_size=dim_size, + reduce='sum') + return out_mean + + def __repr__(self): + return '{}({}, {}, k={})'.format(self.__class__.__name__, + self.in_channels, self.out_channels, + self.k) diff --git a/mlpf/pytorch_delphes/model_optimized.py b/mlpf/pytorch_delphes/model_optimized.py new file mode 100644 index 000000000..d20320a2a --- /dev/null +++ b/mlpf/pytorch_delphes/model_optimized.py @@ -0,0 +1,87 @@ +import numpy as np +import mplhep, time, os + +import torch +import torch_geometric + +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv +from torch_geometric.nn import TopKPooling, SAGPooling, SGConv +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch.utils.data import random_split + +#from torch_geometric.nn import GravNetConv # if you want to get it from source code (won't be able to retrieve the adjacency matrix) +from gravnet_optimized import GravNetConv +from torch_geometric.nn import GraphConv + +#Model with gravnet clustering +class PFNet7(nn.Module): + def __init__(self, + input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, + output_dim_id=6, + output_dim_p4=6, + space_dim=4, propagate_dimensions=22, nearest=16, + target="gen"): + + super(PFNet7, self).__init__() + + self.elu = nn.ELU + self.act_f = torch.nn.functional.leaky_relu + + # (1) DNN: encoding/decoding of all tracks and clusters + self.nn1 = nn.Sequential( + nn.Linear(input_dim, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, input_encoding), + ) + + # (2) CNN: Gravnet layer + self.conv1 = GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) + + # (3) DNN layer: classifying PID + self.nn2 = nn.Sequential( + nn.Linear(encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_id), + ) + + # (4) DNN layer: regressing p4 + self.nn3 = nn.Sequential( + nn.Linear(encoding_dim + output_dim_id + input_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, 
hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_p4), + ) + + def forward(self, data): + x0 = data.x + + # Encoder/Decoder step + x = self.nn1(x0) + + # Gravnet step + x, edge_index, edge_weight = self.conv1(x) + x = self.act_f(x) # act by nonlinearity + + # DNN to predict PID + pred_ids = self.nn2(x) + + # DNN to predict p4 + nn3_input = torch.cat([x, pred_ids, x0], axis=-1) + pred_p4 = self.nn3(nn3_input) + + return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand diff --git a/mlpf/pytorch_delphes/pipeline_optimized.py b/mlpf/pytorch_delphes/pipeline_optimized.py new file mode 100644 index 000000000..674345366 --- /dev/null +++ b/mlpf/pytorch_delphes/pipeline_optimized.py @@ -0,0 +1,508 @@ +from glob import glob +import sys, os +sys.path.insert(1, '../../plotting/') +sys.path.insert(1, '../../mlpf/plotting/') + +import os.path as osp +import pickle as pkl +import math, time, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import mplhep as hep + +#Check if the GPU configuration has been provided +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + import setGPU + if multi_gpu: + print('Will use multi_gpu..') + print("Let's use", torch.cuda.device_count(), "GPUs!") + else: + print('Will use single_gpu..') +except Exception as e: + print("Could not import setGPU, running CPU-only") + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') + print("GPU model:", torch.cuda.get_device_name(0)) +else: + device = torch.device('cpu') + +import torch_geometric +import torch.nn as nn +import torch.nn.functional as F +import torch_geometric.transforms as T +from torch.nn import Sequential as Seq, Linear as Lin, ReLU +from torch_scatter import scatter_mean +from torch_geometric.nn.inits import reset +from torch_geometric.data import Data, DataLoader, DataListLoader, Batch +from torch_geometric.nn import GravNetConv +from torch.utils.data import random_split +import torch_cluster + +import args +from args import parse_args +from graph_data_delphes import PFGraphDataset, one_hot_embedding +from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd +from plot_utils import plot_confusion_matrix + +import evaluate +from evaluate import make_plots, make_predictions +from model_optimized import PFNet7 + +#Ignore divide by 0 errors +np.seterr(divide='ignore', invalid='ignore') + +#Get a unique directory name for the model +def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, alpha, task, title): + model_name = type(model).__name__ + model_params = sum(p.numel() for p in model.parameters()) + import hashlib + model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] + model_user = os.environ['USER'] + + model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_alpha_{}_{}_{}'.format( + model_name, + target_type, + n_train, + n_epochs, + batch_size, + lr, + alpha, + task, + title) + return model_fname + +def compute_weights(gen_ids_one_hot, device): + vs, cs = torch.unique(gen_ids_one_hot, return_counts=True) + weights = torch.zeros(output_dim_id).to(device=device) + for k, v in zip(vs, cs): + weights[k] = 1.0/math.sqrt(float(v)) + return weights + +def make_plot_from_list(l, label, xlabel, ylabel, outpath, save_as): + 
plt.style.use(hep.style.ROOT) + + if not os.path.exists(outpath + '/training_plots/'): + os.makedirs(outpath + '/training_plots/') + + fig, ax = plt.subplots() + ax.plot(range(len(l)), l, label=label) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + ax.legend(loc='best') + plt.savefig(outpath + '/training_plots/' + save_as + '.png') + plt.close(fig) + + with open(outpath + '/training_plots/' + save_as + '.pkl', 'wb') as f: + pkl.dump(l, f) + +@torch.no_grad() +def test(model, loader, epoch, alpha, target_type, device): + with torch.no_grad(): + ret = train(model, loader, epoch, None, alpha, target_type, device) + return ret + +def train(model, loader, epoch, optimizer, alpha, target_type, device): + + is_train = not (optimizer is None) + + if is_train: + model.train() + else: + model.eval() + + #loss values for each batch: classification, regression, total + losses_1, losses_2, losses_tot = [], [], [] + + #accuracy values for each batch (monitor classification performance) + accuracies_batch, accuracies_batch_msk = [], [] + + #setup confusion matrix + conf_matrix = np.zeros((output_dim_id, output_dim_id)) + + # to compute average inference time + t=[] + + for i, batch in enumerate(loader): + t0 = time.time() + + if multi_gpu: + X = batch + else: + X = batch.to(device) + + ## make like tensorflow model, 0-padding events to 6k elements + # if X.x.shape[0]<6000: + # new_X = torch.cat([X.x,torch.zeros_like(X.x)[:6000-X.x.shape[0],:]]) + # new_ygen_id = torch.cat([X.ygen_id,torch.zeros_like(X.ygen_id)[:6000-X.x.shape[0],:]]) + # new_ygen_id[X.x.shape[0]:,0]=new_ygen_id[X.x.shape[0]:,0]+1 + # + # X.x = new_X + # X.ygen_id=new_ygen_id + + # Forwardprop + if i<10: + ti = time.time() + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + tf = time.time() + t.append(round((tf-ti),2)) + else: + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + + _, gen_ids = torch.max(gen_ids_one_hot, -1) + _, pred_ids = torch.max(pred_ids_one_hot, -1) + _, cand_ids = torch.max(cand_ids_one_hot, -1) # rule-based result + + # masking + msk = ((pred_ids != 0) & (gen_ids != 0)) + msk2 = ((pred_ids != 0) & (pred_ids == gen_ids)) + + # computing loss + weights = compute_weights(torch.max(gen_ids_one_hot,-1)[1], device) + l1 = torch.nn.functional.cross_entropy(pred_ids_one_hot, gen_ids, weight=weights) # for classifying PID + l2 = alpha * torch.nn.functional.mse_loss(pred_p4[msk2], gen_p4[msk2]) # for regressing p4 + + if args.classification_only: + loss = l1 + else: + loss = l1+l2 + + if is_train: + # BACKPROP + #print(list(model.parameters())[1].grad) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + losses_1.append(l1.detach().cpu().item()) + losses_2.append(l2.detach().cpu().item()) + losses_tot.append(loss.detach().cpu().item()) + + t1 = time.time() + + accuracies_batch.append(accuracy_score(gen_ids.detach().cpu().numpy(), pred_ids.detach().cpu().numpy())) + accuracies_batch_msk.append(accuracy_score(gen_ids[msk].detach().cpu().numpy(), pred_ids[msk].detach().cpu().numpy())) + + conf_matrix += sklearn.metrics.confusion_matrix(gen_ids.detach().cpu().numpy(), + np.argmax(pred_ids_one_hot.detach().cpu().numpy(),axis=1), labels=range(6)) + + print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), loss.detach().cpu().item(), t1-t0), end='\r', flush=True) + + print("Average Inference time is: ", round((sum(t) / len(t)),2), 'min') + + losses_1 = np.mean(losses_1) + losses_2 = np.mean(losses_2) + losses_tot = 
np.mean(losses_tot) + + acc = np.mean(accuracies_batch) + acc_msk = np.mean(accuracies_batch_msk) + + conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis] + + return losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm + + +def train_loop(): + t0_initial = time.time() + + losses_1_train, losses_2_train, losses_tot_train = [], [], [] + losses_1_valid, losses_2_valid, losses_tot_valid = [], [], [] + + accuracies_train, accuracies_msk_train = [], [] + accuracies_valid, accuracies_msk_valid = [], [] + + best_val_loss = 99999.9 + stale_epochs = 0 + + print("Training over {} epochs".format(args.n_epochs)) + for epoch in range(args.n_epochs): + t0 = time.time() + + if stale_epochs > patience: + print("breaking due to stale epochs") + break + + # training epoch + model.train() + losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm = train(model, train_loader, epoch, optimizer, args.alpha, args.target, device) + + losses_tot_train.append(losses_tot) + losses_1_train.append(losses_1) + losses_2_train.append(losses_2) + + accuracies_train.append(acc) + accuracies_msk_train.append(acc_msk) + + # validation step + model.eval() + losses_tot_v, losses_1_v, losses_2_v, acc_v, acc_msk_v, conf_matrix_v, conf_matrix_norm_v = test(model, valid_loader, epoch, args.alpha, args.target, device) + + losses_tot_valid.append(losses_tot_v) + losses_1_valid.append(losses_1_v) + losses_2_valid.append(losses_2_v) + + accuracies_valid.append(acc_v) + accuracies_msk_valid.append(acc_msk_v) + + # early-stopping + if losses_tot_v < best_val_loss: + best_val_loss = losses_tot_v + stale_epochs = 0 + else: + stale_epochs += 1 + + t1 = time.time() + + epochs_remaining = args.n_epochs - (epoch+1) + time_per_epoch = (t1 - t0_initial)/(epoch + 1) + eta = epochs_remaining*time_per_epoch/60 + + print("epoch={}/{} dt={:.2f}min train_loss={:.5f} valid_loss={:.5f} train_acc={:.5f} valid_acc={:.5f} train_acc_msk={:.5f} valid_acc_msk={:.5f} stale={} eta={:.1f}m".format( + epoch+1, args.n_epochs, + (t1-t0)/60, losses_tot_train[epoch], losses_tot_valid[epoch], accuracies_train[epoch], accuracies_valid[epoch], + accuracies_msk_train[epoch], accuracies_msk_valid[epoch], stale_epochs, eta)) + + torch.save(model.state_dict(), "{0}/epoch_{1}_weights.pth".format(outpath, epoch)) + + plot_confusion_matrix(conf_matrix_norm, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch), epoch=epoch) + plot_confusion_matrix(conf_matrix_norm_v, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch), epoch=epoch) + + torch.save(conf_matrix_norm, outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch) + '.pt') + torch.save(conf_matrix_norm_v, outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch) + '.pkl') + + make_plot_from_list(losses_tot_train, 'train loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_train') + make_plot_from_list(losses_1_train, 'train loss_1', 'Epochs', 'Loss', outpath, 'losses_1_train') + make_plot_from_list(losses_2_train, 'train loss_2', 'Epochs', 'Loss', outpath, 'losses_2_train') + + make_plot_from_list(losses_tot_valid, 'valid loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_valid') + make_plot_from_list(losses_1_valid, 'valid loss_1', 'Epochs', 'Loss', outpath, 'losses_1_valid') + make_plot_from_list(losses_2_valid, 'valid loss_2', 'Epochs', 'Loss', outpath, 'losses_2_valid') + + 
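For readers skimming train_loop above: the stopping criterion is plain patience-based early stopping on the total validation loss (best_val_loss / stale_epochs / patience). A condensed, self-contained sketch of just that bookkeeping, using illustrative names rather than the pipeline's own API:

    import math

    class EarlyStopper:
        """Stop once `patience` consecutive epochs bring no new best validation loss."""
        def __init__(self, patience):
            self.patience = patience
            self.best = math.inf
            self.stale = 0

        def step(self, val_loss):
            if val_loss < self.best:
                self.best, self.stale = val_loss, 0
            else:
                self.stale += 1
            return self.stale > self.patience  # True -> break out of the epoch loop

In the loop above this corresponds to the `stale_epochs > patience` check at the top of each epoch.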
make_plot_from_list(accuracies_train, 'train accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_train') + make_plot_from_list(accuracies_msk_train, 'train accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_train') + + make_plot_from_list(accuracies_valid, 'valid accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_valid') + make_plot_from_list(accuracies_msk_valid, 'valid accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_valid') + + print('Done with training.') + + return + +if __name__ == "__main__": + + args = parse_args() + + # # the next part initializes some args values (to run the script not from terminal) + # class objectview(object): + # def __init__(self, d): + # self.__dict__ = d + # + # args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 15, 'patience': 100, 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, + # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../test_tmp_delphes/data/pythia8_qcd', + # 'outpath': '../../prp/models/yee/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, + # 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, 'overwrite': True, + # 'load': True, 'load_epoch': 9, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + # 'classification_only': False, 'nn1': True, 'nn3': True, 'encoding_of_clusters': False, 'embedding_dim': 0, 'nn0track': False, 'nn0cluster': False, 'title': 'noembeddings', + # 'make_predictions_train': True, 'make_plots_train': True, 'make_predictions_valid': False, 'make_plots_valid': False, 'make_predictions_test': True, 'make_plots_test': True}) + + # define the dataset (assumes the data exists as .pt files in "processed") + print('Processing the data..') + full_dataset_ttbar = PFGraphDataset(args.dataset) + full_dataset_qcd = PFGraphDataset(args.dataset_qcd) + + # constructs a loader from the data to iterate over batches + print('Constructing data loaders..') + train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) + test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) + + # element parameters + input_dim = 12 + + #one-hot particle ID and momentum + output_dim_id = 6 + output_dim_p4 = 6 + + patience = args.patience + + model_classes = {"PFNet7": PFNet7} + + model_class = model_classes[args.model] + model_kwargs = {'input_dim': input_dim, + 'hidden_dim': args.hidden_dim, + 'hidden_dim_nn1': args.hidden_dim_nn1, + 'input_encoding': args.input_encoding, + 'encoding_dim': args.encoding_dim, + 'output_dim_id': output_dim_id, + 'output_dim_p4': output_dim_p4, + 'space_dim': args.space_dim, + 'propagate_dimensions': args.propagate_dimensions, + 'nearest': args.nearest, + 'target': args.target, + 'nn1': args.nn1, + 'nn3': args.nn3} + + if args.load: + print('Loading a previously trained model..') + model = model_class(**model_kwargs) + outpath = args.outpath + args.load_model + PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' + + state_dict = torch.load(PATH, map_location=device) + + if "DataParallel" in args.load_model: # if the model was trained using DataParallel then we do this + state_dict = torch.load(PATH, map_location=device) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove module. 
+ new_state_dict[name] = v + # print('name is:', name) + state_dict=new_state_dict + + model.load_state_dict(state_dict) + + if multi_gpu: + model = torch_geometric.nn.DataParallel(model) + #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP + + model.to(device) + + if args.train: + print("Training a previously trained model..") + + elif args.train: + #instantiate the model + print('Instantiating a model..') + model = model_class(**model_kwargs) + + if multi_gpu: + print("Parallelizing the training..") + model = torch_geometric.nn.DataParallel(model) + #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP + + model.to(device) + + if args.train: + args.title=args.title+'noskip' + if args.nn1: + args.title=args.title+'_nn1' + if args.nn3: + args.title=args.title+'_nn3' + + if args.classification_only: + model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "clf", args.title) + else: + model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "both", args.title) + + outpath = osp.join(args.outpath, model_fname) + if osp.isdir(outpath): + if args.overwrite: + print("model output {} already exists, deleting it".format(outpath)) + import shutil + shutil.rmtree(outpath) + else: + print("model output {} already exists, please delete it".format(outpath)) + sys.exit(0) + try: + os.makedirs(outpath) + except Exception as e: + pass + + with open('{}/model_kwargs.pkl'.format(outpath), 'wb') as f: + pkl.dump(model_kwargs, f, protocol=pkl.HIGHEST_PROTOCOL) + + if not os.path.exists(outpath + '/confusion_matrix_plots/'): + os.makedirs(outpath + '/confusion_matrix_plots/') + + if args.optimizer == "adam": + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + elif args.optimizer == "adamw": + optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr) + + print(model) + print(model_fname) + + model.train() + train_loop() + + model.eval() + + # evaluate on training data.. + if not osp.isdir(outpath+'/train_loader'): + os.makedirs(outpath+'/train_loader') + if not osp.isdir(outpath+'/train_loader/resolution_plots'): + os.makedirs(outpath+'/train_loader/resolution_plots') + if not osp.isdir(outpath+'/train_loader/distribution_plots'): + os.makedirs(outpath+'/train_loader/distribution_plots') + if not osp.isdir(outpath+'/train_loader/multiplicity_plots'): + os.makedirs(outpath+'/train_loader/multiplicity_plots') + if not osp.isdir(outpath+'/train_loader/efficiency_plots'): + os.makedirs(outpath+'/train_loader/efficiency_plots') + + if args.make_predictions_train: + make_predictions(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + if args.make_plots_train: + make_plots(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + + # evaluate on validation data.. 
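One easy-to-miss detail in the checkpoint loading further up in this file: weights saved from a torch.nn.DataParallel-wrapped model carry a "module." prefix on every state_dict key, so the keys are rewritten before loading into an unwrapped model. A self-contained sketch of that idiom (the file path is illustrative):

    from collections import OrderedDict
    import torch

    def strip_dataparallel_prefix(state_dict):
        # e.g. "module.nn1.0.weight" -> "nn1.0.weight"; unprefixed keys pass through.
        return OrderedDict(
            (k[len("module."):] if k.startswith("module.") else k, v)
            for k, v in state_dict.items()
        )

    # state_dict = torch.load("epoch_9_weights.pth", map_location="cpu")
    # model.load_state_dict(strip_dataparallel_prefix(state_dict))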
+ if not osp.isdir(outpath+'/valid_loader'): + os.makedirs(outpath+'/valid_loader') + if not osp.isdir(outpath+'/valid_loader/resolution_plots'): + os.makedirs(outpath+'/valid_loader/resolution_plots') + if not osp.isdir(outpath+'/valid_loader/distribution_plots'): + os.makedirs(outpath+'/valid_loader/distribution_plots') + if not osp.isdir(outpath+'/valid_loader/multiplicity_plots'): + os.makedirs(outpath+'/valid_loader/multiplicity_plots') + if not osp.isdir(outpath+'/valid_loader/efficiency_plots'): + os.makedirs(outpath+'/valid_loader/efficiency_plots') + + if args.make_predictions_valid: + make_predictions(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + if args.make_plots_valid: + make_plots(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + + # evaluate on testing data.. + if not osp.isdir(outpath+'/test_loader'): + os.makedirs(outpath+'/test_loader') + if not osp.isdir(outpath+'/test_loader/resolution_plots'): + os.makedirs(outpath+'/test_loader/resolution_plots') + if not osp.isdir(outpath+'/test_loader/distribution_plots'): + os.makedirs(outpath+'/test_loader/distribution_plots') + if not osp.isdir(outpath+'/test_loader/multiplicity_plots'): + os.makedirs(outpath+'/test_loader/multiplicity_plots') + if not osp.isdir(outpath+'/test_loader/efficiency_plots'): + os.makedirs(outpath+'/test_loader/efficiency_plots') + + if args.make_predictions_test: + if args.load: + make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + else: + make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + if args.make_plots_test: + if args.load: + make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + else: + make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + + +## ----------------------------------------------------------- +# to retrieve a stored variable in pkl file +# import pickle as pkl +# with open('../../test_tmp_delphes/experiments/PFNet7_gen_ntrain_2_nepochs_3_batch_size_3_lr_0.0001/confusion_matrix_plots/cmT_normed_epoch_0.pkl', 'rb') as f: # Python 3: open(..., 'rb') +# a = pkl.load(f) +# +# with open('../../data/pythia8_qcd/raw/tev14_pythia8_qcd_10_0.pkl', 'rb') as pickle_file: +# data = pkl.load(pickle_file) +# +# data.keys() From 38e6f8d25fc0641e3da342ce0f2422d6c2b056ad Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Aug 2021 11:10:45 -0700 Subject: [PATCH 079/157] fixing optimized model definition --- mlpf/pytorch_delphes/model_optimized.py | 56 +++++++++++++++---------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/mlpf/pytorch_delphes/model_optimized.py b/mlpf/pytorch_delphes/model_optimized.py index d20320a2a..c4dc9c06c 100644 --- a/mlpf/pytorch_delphes/model_optimized.py +++ b/mlpf/pytorch_delphes/model_optimized.py @@ -26,21 +26,28 @@ def __init__(self, output_dim_id=6, output_dim_p4=6, space_dim=4, propagate_dimensions=22, nearest=16, - target="gen"): + target="gen", nn1=True, nn3=True): super(PFNet7, self).__init__() - self.elu = nn.ELU + self.target = target + self.nn1 = nn1 + self.nn3 = nn3 + + self.act = nn.LeakyReLU self.act_f = torch.nn.functional.leaky_relu + self.act_tanh = torch.nn.Tanh + self.elu = nn.ELU # (1) DNN: encoding/decoding of all tracks and 
clusters - self.nn1 = nn.Sequential( - nn.Linear(input_dim, hidden_dim_nn1), - self.elu(), - nn.Linear(hidden_dim_nn1, hidden_dim_nn1), - self.elu(), - nn.Linear(hidden_dim_nn1, input_encoding), - ) + if self.nn1: + self.nn1 = nn.Sequential( + nn.Linear(input_dim, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, input_encoding), + ) # (2) CNN: Gravnet layer self.conv1 = GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) @@ -57,21 +64,25 @@ def __init__(self, ) # (4) DNN layer: regressing p4 - self.nn3 = nn.Sequential( - nn.Linear(encoding_dim + output_dim_id + input_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, output_dim_p4), - ) + if self.nn3: + self.nn3 = nn.Sequential( + nn.Linear(input_dim + output_dim_id + encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_p4), + ) def forward(self, data): x0 = data.x # Encoder/Decoder step - x = self.nn1(x0) + if self.nn1: + x = self.nn1(x0) + else: + x=x0 # Gravnet step x, edge_index, edge_weight = self.conv1(x) @@ -81,7 +92,10 @@ def forward(self, data): pred_ids = self.nn2(x) # DNN to predict p4 - nn3_input = torch.cat([x, pred_ids, x0], axis=-1) - pred_p4 = self.nn3(nn3_input) + if self.nn3: + nn3_input = torch.cat([x0, pred_ids, x], axis=-1) + pred_p4 = self.nn3(nn3_input) + else: + pred_p4 = torch.zeros_like(data.ycand) return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand From 0ac649c8a3baf94c7c6183bd738e79d7bc4217b9 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Aug 2021 12:20:27 -0700 Subject: [PATCH 080/157] testing average inference time more precisely --- mlpf/pytorch_delphes/pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mlpf/pytorch_delphes/pipeline.py b/mlpf/pytorch_delphes/pipeline.py index e7d81b78b..a876b39df 100644 --- a/mlpf/pytorch_delphes/pipeline.py +++ b/mlpf/pytorch_delphes/pipeline.py @@ -152,11 +152,12 @@ def train(model, loader, epoch, optimizer, alpha, target_type, device): # X.ygen_id=new_ygen_id # Forwardprop - if i<10: + if i<100: ti = time.time() pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) tf = time.time() - t.append(round((tf-ti),2)) + if i!=0: + t.append(round((tf-ti),2)) else: pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) @@ -199,7 +200,7 @@ def train(model, loader, epoch, optimizer, alpha, target_type, device): print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), loss.detach().cpu().item(), t1-t0), end='\r', flush=True) - print("Average Inference time is: ", round((sum(t) / len(t)),2), 'min') + print("Average Inference time per event is: ", round((sum(t) / len(t)),2), 's') losses_1 = np.mean(losses_1) losses_2 = np.mean(losses_2) From 44c104372ee98021c492589fb3007da2efc03dff Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Aug 2021 12:22:37 -0700 Subject: [PATCH 081/157] test optimized knn inference time --- mlpf/pytorch_delphes/pipeline_optimized.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mlpf/pytorch_delphes/pipeline_optimized.py b/mlpf/pytorch_delphes/pipeline_optimized.py index 674345366..f1d5ffbb4 100644 --- a/mlpf/pytorch_delphes/pipeline_optimized.py +++ 
b/mlpf/pytorch_delphes/pipeline_optimized.py @@ -152,11 +152,12 @@ def train(model, loader, epoch, optimizer, alpha, target_type, device): # X.ygen_id=new_ygen_id # Forwardprop - if i<10: + if i<100: ti = time.time() pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) tf = time.time() - t.append(round((tf-ti),2)) + if i!=0: + t.append(round((tf-ti),2)) else: pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) @@ -199,7 +200,7 @@ def train(model, loader, epoch, optimizer, alpha, target_type, device): print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), loss.detach().cpu().item(), t1-t0), end='\r', flush=True) - print("Average Inference time is: ", round((sum(t) / len(t)),2), 'min') + print("Average Inference time per event is: ", round((sum(t) / len(t)),2), 's') losses_1 = np.mean(losses_1) losses_2 = np.mean(losses_2) From 44a9c645c6f666e3a7752d963226596068b2da9c Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Aug 2021 14:51:47 -0700 Subject: [PATCH 082/157] organized the pipeline --- mlpf/plotting/__init__.py | 2 + mlpf/{pytorch_delphes => plotting}/plots.py | 206 +++++++- mlpf/plotting/plots_delphes.py | 0 mlpf/pytorch_delphes/LRP/main_reg.py | 34 +- mlpf/pytorch_delphes/__init__.py | 11 + mlpf/pytorch_delphes/args.py | 1 + mlpf/pytorch_delphes/data_preprocessing.py | 2 - mlpf/pytorch_delphes/evaluate.py | 244 +--------- mlpf/pytorch_delphes/graph_data_delphes.py | 5 - mlpf/pytorch_delphes/gravnet_optimized.py | 18 +- mlpf/pytorch_delphes/model.py | 85 +++- mlpf/pytorch_delphes/model_optimized.py | 101 ---- mlpf/pytorch_delphes/pipeline.py | 509 -------------------- mlpf/pytorch_delphes/pipeline_optimized.py | 509 -------------------- mlpf/pytorch_delphes/training.py | 231 +++++++++ mlpf/pytorch_pipeline.py | 279 +++++++++++ scripts/local_test_cms.sh | 39 -- scripts/local_test_cms_pytorch.sh | 32 ++ 18 files changed, 870 insertions(+), 1438 deletions(-) create mode 100644 mlpf/plotting/__init__.py rename mlpf/{pytorch_delphes => plotting}/plots.py (64%) mode change 100755 => 100644 mlpf/plotting/plots_delphes.py create mode 100644 mlpf/pytorch_delphes/__init__.py delete mode 100644 mlpf/pytorch_delphes/model_optimized.py delete mode 100644 mlpf/pytorch_delphes/pipeline.py delete mode 100644 mlpf/pytorch_delphes/pipeline_optimized.py create mode 100644 mlpf/pytorch_delphes/training.py create mode 100644 mlpf/pytorch_pipeline.py delete mode 100755 scripts/local_test_cms.sh create mode 100755 scripts/local_test_cms_pytorch.sh diff --git a/mlpf/plotting/__init__.py b/mlpf/plotting/__init__.py new file mode 100644 index 000000000..61a447ad4 --- /dev/null +++ b/mlpf/plotting/__init__.py @@ -0,0 +1,2 @@ +from plotting.plot_utils import plot_confusion_matrix +from plotting.plots import make_plots, plot_regression, plot_distributions_pid, plot_distributions_all, plot_pt_eta, plot_num_particles_pid, draw_efficiency_fakerate, get_eff, get_fake, plot_reso diff --git a/mlpf/pytorch_delphes/plots.py b/mlpf/plotting/plots.py similarity index 64% rename from mlpf/pytorch_delphes/plots.py rename to mlpf/plotting/plots.py index 56e4c55e3..09488d247 100644 --- a/mlpf/pytorch_delphes/plots.py +++ b/mlpf/plotting/plots.py @@ -1,5 +1,3 @@ -import args -from args import parse_args import sklearn import sklearn.metrics import numpy as np @@ -9,8 +7,6 @@ import sys import os.path as osp -sys.path.insert(1, '../../plotting/') -sys.path.insert(1, '../../mlpf/plotting/') import torch import torch_geometric @@ -30,6 +26,8 @@ 
import mpl_toolkits import mplhep as hep +import plotting + plt.style.use(hep.style.ROOT) elem_labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] @@ -530,3 +528,203 @@ def plot_reso(ygen, ypred, ycand, pid, var, rng, ax=None, legend_title=""): ax.set_yscale("log") return {"dpf": res_dpf, "mlpf": res_mlpf} + +def make_plots(model, test_loader, outpath, target, device, epoch, which_data): + + print('Making plots on ' + which_data) + t0=time.time() + + # load the necessary predictions to make the plots + gen_ids = torch.load(outpath + f'/gen_ids.pt', map_location=device) + gen_p4 = torch.load(outpath + f'/gen_p4.pt', map_location=device) + pred_ids = torch.load(outpath + f'/pred_ids.pt', map_location=device) + pred_p4 = torch.load(outpath + f'/pred_p4.pt', map_location=device) + cand_ids = torch.load(outpath + f'/cand_ids.pt', map_location=device) + cand_p4 = torch.load(outpath + f'/cand_p4.pt', map_location=device) + + list_for_multiplicities = torch.load(outpath + f'/list_for_multiplicities.pt', map_location=device) + + predictions = torch.load(outpath + f'/predictions.pt', map_location=device) + + # reformat a bit + ygen = predictions["ygen"].reshape(-1,7) + ypred = predictions["ypred"].reshape(-1,7) + ycand = predictions["ycand"].reshape(-1,7) + + # make confusion matrix for MLPF + conf_matrix_mlpf = sklearn.metrics.confusion_matrix(gen_ids.cpu(), + pred_ids.cpu(), labels=range(6), normalize="true") + + plotting.plot_confusion_matrix(conf_matrix_mlpf, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/conf_matrix_mlpf' + str(epoch), epoch=epoch) + torch.save(conf_matrix_mlpf, outpath + '/conf_matrix_mlpf' + str(epoch) + '.pt') + + # make confusion matrix for rule based PF + conf_matrix_cand = sklearn.metrics.confusion_matrix(gen_ids.cpu(), + cand_ids.cpu(), labels=range(6), normalize="true") + + plotting.plot_confusion_matrix(conf_matrix_cand, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/conf_matrix_cand' + str(epoch), epoch=epoch) + torch.save(conf_matrix_cand, outpath + '/conf_matrix_cand' + str(epoch) + '.pt') + + # making all the other plots + if 'test' in which_data: + sample = "QCD, 14 TeV, PU200" + else: + sample = "$t\\bar{t}$, 14 TeV, PU200" + + # make distribution plots + plot_distributions_pid(1, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for chhadrons + target, epoch, outpath, legend_title=sample+"\n") + plot_distributions_pid(2, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for nhadrons + target, epoch, outpath, legend_title=sample+"\n") + plot_distributions_pid(3, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for photons + target, epoch, outpath, legend_title=sample+"\n") + plot_distributions_pid(4, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for electrons + target, epoch, outpath, legend_title=sample+"\n") + plot_distributions_pid(5, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for muons + target, epoch, outpath, legend_title=sample+"\n") + + plot_distributions_all(gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for all together + target, epoch, outpath, legend_title=sample+"\n") + + # make pt, eta plots to visualize dataset + ax, _ = plot_pt_eta(ygen) + plt.savefig(outpath+"/gen_pt_eta.png", bbox_inches="tight") + + # plot particle multiplicity plots + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_null = 
plot_num_particles_pid(list_for_multiplicities, "null", ax) + plt.savefig(outpath+"/multiplicity_plots/num_null.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_chhad = plot_num_particles_pid(list_for_multiplicities, "chhadron", ax) + plt.savefig(outpath+"/multiplicity_plots/num_chhadron.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_nhad = plot_num_particles_pid(list_for_multiplicities, "nhadron", ax) + plt.savefig(outpath+"/multiplicity_plots/num_nhadron.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_photon = plot_num_particles_pid(list_for_multiplicities, "photon", ax) + plt.savefig(outpath+"/multiplicity_plots/num_photon.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_electron = plot_num_particles_pid(list_for_multiplicities, "electron", ax) + plt.savefig(outpath+"/multiplicity_plots/num_electron.png", bbox_inches="tight") + plt.close(fig) + + fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) + ret_num_particles_muon = plot_num_particles_pid(list_for_multiplicities, "muon", ax) + plt.savefig(outpath+"/multiplicity_plots/num_muon.png", bbox_inches="tight") + plt.close(fig) + + # make efficiency and fake rate plots for charged hadrons + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "pt", np.linspace(0, 3, 61), outpath+"/efficiency_plots/eff_fake_pid1_pt.png", both=True, legend_title=sample+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "eta", np.linspace(-3, 3, 61), outpath+"/efficiency_plots/eff_fake_pid1_eta.png", both=True, legend_title=sample+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "energy", np.linspace(0, 50, 75), outpath+"/efficiency_plots/eff_fake_pid1_energy.png", both=True, legend_title=sample+"\n") + + # make efficiency and fake rate plots for neutral hadrons + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "pt", np.linspace(0, 3, 61), outpath+"/efficiency_plots/eff_fake_pid2_pt.png", both=True, legend_title=sample+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "eta", np.linspace(-3, 3, 61), outpath+"/efficiency_plots/eff_fake_pid2_eta.png", both=True, legend_title=sample+"\n") + ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "energy", np.linspace(0, 50, 75), outpath+"/efficiency_plots/eff_fake_pid2_energy.png", both=True, legend_title=sample+"\n") + + # make resolution plots for chhadrons: pid=1 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_chhad_pt = plot_reso(ygen, ypred, ycand, 1, "pt", 2, ax=ax1, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid1_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_chhad_eta = plot_reso(ygen, ypred, ycand, 1, "eta", 0.2, ax=ax2, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid1_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_chhad_E = plot_reso(ygen, ypred, ycand, 1, "energy", 0.2, ax=ax3, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid1_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + # make resolution plots for nhadrons: pid=2 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_nhad_pt = plot_reso(ygen, ypred, ycand, 2, "pt", 
2, ax=ax1, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid2_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_nhad_eta = plot_reso(ygen, ypred, ycand, 2, "eta", 0.2, ax=ax2, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid2_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_nhad_E = plotting.plot_reso(ygen, ypred, ycand, 2, "energy", 0.2, ax=ax3, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid2_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + # make resolution plots for photons: pid=3 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_photon_pt = plot_reso(ygen, ypred, ycand, 3, "pt", 2, ax=ax1, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid3_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_photon_eta = plot_reso(ygen, ypred, ycand, 3, "eta", 0.2, ax=ax2, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid3_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_photon_E = plot_reso(ygen, ypred, ycand, 3, "energy", 0.2, ax=ax3, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid3_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + # make resolution plots for electrons: pid=4 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_electron_pt = plot_reso(ygen, ypred, ycand, 4, "pt", 2, ax=ax1, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid4_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_electron_eta = plot_reso(ygen, ypred, ycand, 4, "eta", 0.2, ax=ax2, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid4_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_electron_E = plot_reso(ygen, ypred, ycand, 4, "energy", 0.2, ax=ax3, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid4_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + # make resolution plots for muons: pid=5 + fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) + res_muon_pt = plot_reso(ygen, ypred, ycand, 5, "pt", 2, ax=ax1, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid5_pt.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) + res_muon_eta = plot_reso(ygen, ypred, ycand, 5, "eta", 0.2, ax=ax2, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid5_eta.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) + res_muon_E = plot_reso(ygen, ypred, ycand, 5, "energy", 0.2, ax=ax3, legend_title=sample+"\n") + plt.savefig(outpath+"/resolution_plots/res_pid5_energy.png", bbox_inches="tight") + plt.tight_layout() + plt.close(fig) + + t1=time.time() + print('Time taken to make plots is:', round(((t1-t0)/60),2), 'min') diff --git a/mlpf/plotting/plots_delphes.py b/mlpf/plotting/plots_delphes.py old mode 100755 new mode 100644 diff --git a/mlpf/pytorch_delphes/LRP/main_reg.py b/mlpf/pytorch_delphes/LRP/main_reg.py index 
b1816ca39..06ae33e82 100644 --- a/mlpf/pytorch_delphes/LRP/main_reg.py +++ b/mlpf/pytorch_delphes/LRP/main_reg.py @@ -65,9 +65,6 @@ from model_io import model_io -import networkx as nx -from torch_geometric.utils.convert import to_networkx - # NOTE: this script works by loading an already trained model #Get a unique directory name for the model @@ -176,6 +173,8 @@ def make_heatmaps(big_list, to_explain, task): output_dim = output_dim_id for pid in range(output_dim_id): + if pid!=1: + continue for node_i in range(len(list[pid])): # iterate over the nodes in each list print('- making heatmap for', map_index_to_pid(pid), 'node #:', node_i+1, '/', len(list[pid])) for output_neuron in range(output_dim): @@ -238,22 +237,23 @@ def get_type(t): plt.savefig(outpath + f'/LRP/class{str(pid)}'+f'/pid{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') plt.close(fig) + if __name__ == "__main__": - args = parse_args() - - # # the next part initializes some args values (to run the script not from terminal) - # class objectview(object): - # def __init__(self, d): - # self.__dict__ = d - # - # args = objectview({'n_train': 1, 'n_valid': 1, 'n_test': 2, 'n_epochs': 2, 'patience': 100, 'hidden_dim':256, 'input_encoding': 12, 'encoding_dim': 64, - # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'LRP_dataset': '../../../test_tmp_delphes/data/pythia8_ttbar', 'LRP_dataset_qcd': '../../../test_tmp_delphes/data/pythia8_qcd', - # 'LRP_outpath': '../../../prp/models/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 1, 'dropout': 0, - # 'space_dim': 4, 'propagate_dimensions': 22,'nearest': 16, 'overwrite': True, - # 'LRP_load_epoch': 9, 'LRP_load_model': 'LRP_reg_PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', - # 'explain': False, 'make_heatmaps_clf': True,'make_heatmaps_reg': True, - # 'clf': True, 'reg': True}) + # args = parse_args() + + # the next part initializes some args values (to run the script not from terminal) + class objectview(object): + def __init__(self, d): + self.__dict__ = d + + args = objectview({'n_train': 1, 'n_valid': 1, 'n_test': 2, 'n_epochs': 2, 'patience': 100, 'hidden_dim':256, 'input_encoding': 12, 'encoding_dim': 64, + 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'LRP_dataset': '../../../test_tmp_delphes/data/pythia8_ttbar', 'LRP_dataset_qcd': '../../../test_tmp_delphes/data/pythia8_qcd', + 'LRP_outpath': '../../../prp/models/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 1, 'dropout': 0, + 'space_dim': 4, 'propagate_dimensions': 22,'nearest': 16, 'overwrite': True, + 'LRP_load_epoch': 9, 'LRP_load_model': 'LRP_reg_PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + 'explain': False, 'make_heatmaps_clf': True,'make_heatmaps_reg': False, + 'clf': True, 'reg': False}) # define the dataset (assumes the data exists as .pt files in "processed") print('Processing the data..') diff --git a/mlpf/pytorch_delphes/__init__.py b/mlpf/pytorch_delphes/__init__.py new file mode 100644 index 000000000..99f5565a9 --- /dev/null +++ b/mlpf/pytorch_delphes/__init__.py @@ -0,0 +1,11 @@ +from pytorch_delphes.args import parse_args +from pytorch_delphes.graph_data_delphes import PFGraphDataset, one_hot_embedding +from pytorch_delphes.data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd + +from pytorch_delphes.model import PFNet7, PFNet7_opt +from pytorch_delphes.gravnet import GravNetConv + +from pytorch_delphes.gravnet_optimized import GravNetConv_optimized + +from 
pytorch_delphes.training import train_loop +from pytorch_delphes.evaluate import make_predictions diff --git a/mlpf/pytorch_delphes/args.py b/mlpf/pytorch_delphes/args.py index ecfadd151..12ec28be4 100644 --- a/mlpf/pytorch_delphes/args.py +++ b/mlpf/pytorch_delphes/args.py @@ -47,6 +47,7 @@ def parse_args(): parser.add_argument("--nn0track", action=BoolArg, default=False, help="Adds an initial network that encode the tracks..") parser.add_argument("--nn0cluster", action=BoolArg, default=False, help="Adds an initial network that encode the clusters..") parser.add_argument("--title", type=str, default='', help="Appends this title to the model's name") + parser.add_argument("--optimized", action=BoolArg, default=False, help="Uses the optimized version of knn") # for evaluation: making predictions & making plots parser.add_argument("--make_predictions_train", action=BoolArg, default=False, help="make predictions on training data..") diff --git a/mlpf/pytorch_delphes/data_preprocessing.py b/mlpf/pytorch_delphes/data_preprocessing.py index 6e414f690..342456cd7 100644 --- a/mlpf/pytorch_delphes/data_preprocessing.py +++ b/mlpf/pytorch_delphes/data_preprocessing.py @@ -50,5 +50,3 @@ def data_to_loader_qcd(full_dataset, n_test, batch_size): test_loader = DataListLoader(test_data, batch_size=batch_size, shuffle=True) return test_loader - -#---------------------------------------------------------------------------------------- diff --git a/mlpf/pytorch_delphes/evaluate.py b/mlpf/pytorch_delphes/evaluate.py index 2c6ced57c..c5273c733 100644 --- a/mlpf/pytorch_delphes/evaluate.py +++ b/mlpf/pytorch_delphes/evaluate.py @@ -1,44 +1,19 @@ -import args -from args import parse_args -import sklearn -import sklearn.metrics -import numpy as np -import pandas, mplhep import pickle as pkl -import time, math - -import sys -import os.path as osp -sys.path.insert(1, '../../plotting/') -sys.path.insert(1, '../../mlpf/plotting/') - -import torch -import torch_geometric - -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv -from torch_geometric.nn import TopKPooling, SAGPooling, SGConv -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch.utils.data import random_split +import math, time, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix import matplotlib +matplotlib.use("Agg") import matplotlib.pyplot as plt -import mpl_toolkits import mplhep as hep -plt.style.use(hep.style.ROOT) - -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 -from plot_utils import plot_confusion_matrix -from plots import plot_regression, plot_distributions_pid, plot_distributions_all, plot_pt_eta, plot_num_particles_pid, draw_efficiency_fakerate, get_eff, get_fake, plot_reso +import torch +import pytorch_delphes -def make_predictions(model, test_loader, outpath, target, device, epoch, which_data): +def make_predictions(model, multi_gpu, test_loader, outpath, target, device, epoch, which_data): print('Making predictions on ' + which_data) t0=time.time() @@ -136,204 +111,3 @@ def make_predictions(model, test_loader, outpath, target, device, epoch, which_d predictions = 
{"ygen":ygen.reshape(1,-1,7).detach().cpu().numpy(), "ycand":ycand.reshape(1,-1,7).detach().cpu().numpy(), "ypred":ypred.detach().reshape(1,-1,7).cpu().numpy()} torch.save(predictions, outpath + '/predictions.pt') - - -def make_plots(model, test_loader, outpath, target, device, epoch, which_data): - - print('Making plots on ' + which_data) - t0=time.time() - - # load the necessary predictions to make the plots - gen_ids = torch.load(outpath + f'/gen_ids.pt', map_location=device) - gen_p4 = torch.load(outpath + f'/gen_p4.pt', map_location=device) - pred_ids = torch.load(outpath + f'/pred_ids.pt', map_location=device) - pred_p4 = torch.load(outpath + f'/pred_p4.pt', map_location=device) - cand_ids = torch.load(outpath + f'/cand_ids.pt', map_location=device) - cand_p4 = torch.load(outpath + f'/cand_p4.pt', map_location=device) - - list_for_multiplicities = torch.load(outpath + f'/list_for_multiplicities.pt', map_location=device) - - predictions = torch.load(outpath + f'/predictions.pt', map_location=device) - - # reformat a bit - ygen = predictions["ygen"].reshape(-1,7) - ypred = predictions["ypred"].reshape(-1,7) - ycand = predictions["ycand"].reshape(-1,7) - - # make confusion matrix for MLPF - conf_matrix_mlpf = sklearn.metrics.confusion_matrix(gen_ids.cpu(), - pred_ids.cpu(), labels=range(6), normalize="true") - - plot_confusion_matrix(conf_matrix_mlpf, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/conf_matrix_mlpf' + str(epoch), epoch=epoch) - torch.save(conf_matrix_mlpf, outpath + '/conf_matrix_mlpf' + str(epoch) + '.pt') - - # make confusion matrix for rule based PF - conf_matrix_cand = sklearn.metrics.confusion_matrix(gen_ids.cpu(), - cand_ids.cpu(), labels=range(6), normalize="true") - - plot_confusion_matrix(conf_matrix_cand, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/conf_matrix_cand' + str(epoch), epoch=epoch) - torch.save(conf_matrix_cand, outpath + '/conf_matrix_cand' + str(epoch) + '.pt') - - # making all the other plots - if 'test' in which_data: - sample = "QCD, 14 TeV, PU200" - else: - sample = "$t\\bar{t}$, 14 TeV, PU200" - - # make distribution plots - plot_distributions_pid(1, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for chhadrons - target, epoch, outpath, legend_title=sample+"\n") - plot_distributions_pid(2, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for nhadrons - target, epoch, outpath, legend_title=sample+"\n") - plot_distributions_pid(3, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for photons - target, epoch, outpath, legend_title=sample+"\n") - plot_distributions_pid(4, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for electrons - target, epoch, outpath, legend_title=sample+"\n") - plot_distributions_pid(5, gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for muons - target, epoch, outpath, legend_title=sample+"\n") - - plot_distributions_all(gen_ids, gen_p4, pred_ids, pred_p4, cand_ids, cand_p4, # distribution plots for all together - target, epoch, outpath, legend_title=sample+"\n") - - # make pt, eta plots to visualize dataset - ax, _ = plot_pt_eta(ygen) - plt.savefig(outpath+"/gen_pt_eta.png", bbox_inches="tight") - - # plot particle multiplicity plots - fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) - ret_num_particles_null = plot_num_particles_pid(list_for_multiplicities, "null", ax) - plt.savefig(outpath+"/multiplicity_plots/num_null.png", bbox_inches="tight") - 
plt.close(fig) - - fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) - ret_num_particles_chhad = plot_num_particles_pid(list_for_multiplicities, "chhadron", ax) - plt.savefig(outpath+"/multiplicity_plots/num_chhadron.png", bbox_inches="tight") - plt.close(fig) - - fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) - ret_num_particles_nhad = plot_num_particles_pid(list_for_multiplicities, "nhadron", ax) - plt.savefig(outpath+"/multiplicity_plots/num_nhadron.png", bbox_inches="tight") - plt.close(fig) - - fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) - ret_num_particles_photon = plot_num_particles_pid(list_for_multiplicities, "photon", ax) - plt.savefig(outpath+"/multiplicity_plots/num_photon.png", bbox_inches="tight") - plt.close(fig) - - fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) - ret_num_particles_electron = plot_num_particles_pid(list_for_multiplicities, "electron", ax) - plt.savefig(outpath+"/multiplicity_plots/num_electron.png", bbox_inches="tight") - plt.close(fig) - - fig, ax = plt.subplots(1, 1, figsize=(8, 2*8)) - ret_num_particles_muon = plot_num_particles_pid(list_for_multiplicities, "muon", ax) - plt.savefig(outpath+"/multiplicity_plots/num_muon.png", bbox_inches="tight") - plt.close(fig) - - # make efficiency and fake rate plots for charged hadrons - ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "pt", np.linspace(0, 3, 61), outpath+"/efficiency_plots/eff_fake_pid1_pt.png", both=True, legend_title=sample+"\n") - ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "eta", np.linspace(-3, 3, 61), outpath+"/efficiency_plots/eff_fake_pid1_eta.png", both=True, legend_title=sample+"\n") - ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 1, "energy", np.linspace(0, 50, 75), outpath+"/efficiency_plots/eff_fake_pid1_energy.png", both=True, legend_title=sample+"\n") - - # make efficiency and fake rate plots for neutral hadrons - ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "pt", np.linspace(0, 3, 61), outpath+"/efficiency_plots/eff_fake_pid2_pt.png", both=True, legend_title=sample+"\n") - ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "eta", np.linspace(-3, 3, 61), outpath+"/efficiency_plots/eff_fake_pid2_eta.png", both=True, legend_title=sample+"\n") - ax, _ = draw_efficiency_fakerate(ygen, ypred, ycand, 2, "energy", np.linspace(0, 50, 75), outpath+"/efficiency_plots/eff_fake_pid2_energy.png", both=True, legend_title=sample+"\n") - - # make resolution plots for chhadrons: pid=1 - fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) - res_chhad_pt = plot_reso(ygen, ypred, ycand, 1, "pt", 2, ax=ax1, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid1_pt.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) - res_chhad_eta = plot_reso(ygen, ypred, ycand, 1, "eta", 0.2, ax=ax2, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid1_eta.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) - res_chhad_E = plot_reso(ygen, ypred, ycand, 1, "energy", 0.2, ax=ax3, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid1_energy.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - # make resolution plots for nhadrons: pid=2 - fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) - res_nhad_pt = plot_reso(ygen, ypred, ycand, 2, "pt", 2, ax=ax1, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid2_pt.png", bbox_inches="tight") - plt.tight_layout() - 
plt.close(fig) - - fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) - res_nhad_eta = plot_reso(ygen, ypred, ycand, 2, "eta", 0.2, ax=ax2, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid2_eta.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) - res_nhad_E = plot_reso(ygen, ypred, ycand, 2, "energy", 0.2, ax=ax3, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid2_energy.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - # make resolution plots for photons: pid=3 - fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) - res_photon_pt = plot_reso(ygen, ypred, ycand, 3, "pt", 2, ax=ax1, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid3_pt.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) - res_photon_eta = plot_reso(ygen, ypred, ycand, 3, "eta", 0.2, ax=ax2, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid3_eta.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) - res_photon_E = plot_reso(ygen, ypred, ycand, 3, "energy", 0.2, ax=ax3, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid3_energy.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - # make resolution plots for electrons: pid=4 - fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) - res_electron_pt = plot_reso(ygen, ypred, ycand, 4, "pt", 2, ax=ax1, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid4_pt.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) - res_electron_eta = plot_reso(ygen, ypred, ycand, 4, "eta", 0.2, ax=ax2, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid4_eta.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) - res_electron_E = plot_reso(ygen, ypred, ycand, 4, "energy", 0.2, ax=ax3, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid4_energy.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - # make resolution plots for muons: pid=5 - fig, (ax1) = plt.subplots(1, 1, figsize=(8, 8)) - res_muon_pt = plot_reso(ygen, ypred, ycand, 5, "pt", 2, ax=ax1, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid5_pt.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - fig, (ax2) = plt.subplots(1, 1, figsize=(8, 8)) - res_muon_eta = plot_reso(ygen, ypred, ycand, 5, "eta", 0.2, ax=ax2, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid5_eta.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - fig, (ax3) = plt.subplots(1, 1, figsize=(8, 8)) - res_muon_E = plot_reso(ygen, ypred, ycand, 5, "energy", 0.2, ax=ax3, legend_title=sample+"\n") - plt.savefig(outpath+"/resolution_plots/res_pid5_energy.png", bbox_inches="tight") - plt.tight_layout() - plt.close(fig) - - t1=time.time() - print('Time taken to make plots is:', round(((t1-t0)/60),2), 'min') diff --git a/mlpf/pytorch_delphes/graph_data_delphes.py b/mlpf/pytorch_delphes/graph_data_delphes.py index f47d54dbc..93a533b00 100644 --- a/mlpf/pytorch_delphes/graph_data_delphes.py +++ b/mlpf/pytorch_delphes/graph_data_delphes.py @@ -8,13 +8,8 @@ from glob import glob import pickle -import scipy -import scipy.sparse -import math import multiprocessing 
-import args -from args import parse_args # assumes pkl files exist in /test_tmp_delphes/data/pythia8_ttbar/raw # they are processed and saved as pt files in /test_tmp_delphes/data/pythia8_ttbar/processed # PFGraphDataset -> returns for 1 event: Data(x=[5139, 12], ycand=[5139, 6], ycand_id=[5139, 6], ygen=[5139, 6], ygen_id=[5139, 6]) diff --git a/mlpf/pytorch_delphes/gravnet_optimized.py b/mlpf/pytorch_delphes/gravnet_optimized.py index f9df1c354..d0720523c 100644 --- a/mlpf/pytorch_delphes/gravnet_optimized.py +++ b/mlpf/pytorch_delphes/gravnet_optimized.py @@ -1,4 +1,3 @@ -#BEFORE OPTIMIZATION from typing import Optional, Union from torch_geometric.typing import OptTensor, PairTensor, PairOptTensor import time @@ -10,9 +9,9 @@ from torch_geometric.nn.conv import MessagePassing try: - from torch_cluster import knn ###############USES OLD KNN######################### + from torch_cmspepr import knn_graph ###########remember to do pip install .############### except ImportError: - knn = None + knn_graph = None # copied it from pytorch_geometric source code # ADDED: retrieve edge_index, retrieve edge_weight @@ -20,7 +19,7 @@ # CHANGED: used reduce='sum' instead of reduce='mean' in the message passing # REMOVED: skip connection -class GravNetConv(MessagePassing): +class GravNetConv_optimized(MessagePassing): r"""The GravNet operator from the `"Learning Representations of Irregular Particle-detector Geometry with Distance-weighted Graph Networks" `_ paper, where the graph is @@ -30,6 +29,7 @@ class GravNetConv(MessagePassing): A second projection of the input feature space is then propagated from the neighbors to each vertex using distance weights that are derived by applying a Gaussian function to the distances. + Args: in_channels (int): The number of input channels. out_channels (int): The number of output channels. @@ -48,10 +48,10 @@ class GravNetConv(MessagePassing): def __init__(self, in_channels: int, out_channels: int, space_dimensions: int, propagate_dimensions: int, k: int, num_workers: int = 1, **kwargs): - super(GravNetConv, self).__init__(flow='target_to_source', **kwargs) + super(GravNetConv_optimized, self).__init__(flow='target_to_source', **kwargs) - if knn is None: - raise ImportError('`GravNetConv` requires `torch-cluster`.') + if knn_graph is None: + raise ImportError('`GravNetConv_optimized` requires `torch_cmspepr`.') self.in_channels = in_channels self.out_channels = out_channels @@ -92,9 +92,7 @@ def forward( s_l: Tensor = self.lin_s(x[0]) s_r: Tensor = self.lin_s(x[1]) if is_bipartite else s_l - - edge_index = knn(s_l, s_r, self.k, b[0], b[1], - num_workers=self.num_workers) + edge_index = knn_graph(s_l, self.k, b[0])##########################CHANGED################################### edge_weight = (s_l[edge_index[1]] - s_r[edge_index[0]]).pow(2).sum(-1) edge_weight = torch.exp(-10.
* edge_weight) # 10 gives a better spread diff --git a/mlpf/pytorch_delphes/model.py b/mlpf/pytorch_delphes/model.py index 44814f32b..608db6eae 100644 --- a/mlpf/pytorch_delphes/model.py +++ b/mlpf/pytorch_delphes/model.py @@ -15,9 +15,7 @@ from torch_geometric.data import Data, DataLoader, DataListLoader, Batch from torch.utils.data import random_split -#from torch_geometric.nn import GravNetConv # if you want to get it from source code (won't be able to retrieve the adjacency matrix) -from gravnet import GravNetConv -from torch_geometric.nn import GraphConv +import pytorch_delphes #Model with gravnet clustering class PFNet7(nn.Module): @@ -48,10 +46,8 @@ def __init__(self, self.elu(), nn.Linear(hidden_dim_nn1, input_encoding), ) - # (2) CNN: Gravnet layer - self.conv1 = GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) - + self.conv1 = pytorch_delphes.GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) # (3) DNN layer: classifying PID self.nn2 = nn.Sequential( nn.Linear(encoding_dim, hidden_dim), @@ -62,7 +58,6 @@ def __init__(self, self.elu(), nn.Linear(hidden_dim, output_dim_id), ) - # (4) DNN layer: regressing p4 if self.nn3: self.nn3 = nn.Sequential( @@ -74,7 +69,83 @@ def __init__(self, self.elu(), nn.Linear(hidden_dim, output_dim_p4), ) + def forward(self, data): + x0 = data.x + + # Encoder/Decoder step + if self.nn1: + x = self.nn1(x0) + else: + x=x0 + + # Gravnet step + x, edge_index, edge_weight = self.conv1(x) + x = self.act_f(x) # act by nonlinearity + + # DNN to predict PID + pred_ids = self.nn2(x) + + # DNN to predict p4 + if self.nn3: + nn3_input = torch.cat([x0, pred_ids, x], axis=-1) + pred_p4 = self.nn3(nn3_input) + else: + pred_p4 = torch.zeros_like(data.ycand) + + return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand + + +class PFNet7_opt(nn.Module): + def __init__(self, + input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, + output_dim_id=6, + output_dim_p4=6, + space_dim=4, propagate_dimensions=22, nearest=16, + target="gen", nn1=True, nn3=True): + + super(PFNet7_opt, self).__init__() + + self.target = target + self.nn1 = nn1 + self.nn3 = nn3 + + self.act = nn.LeakyReLU + self.act_f = torch.nn.functional.leaky_relu + self.act_tanh = torch.nn.Tanh + self.elu = nn.ELU + # (1) DNN: encoding/decoding of all tracks and clusters + if self.nn1: + self.nn1 = nn.Sequential( + nn.Linear(input_dim, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, hidden_dim_nn1), + self.elu(), + nn.Linear(hidden_dim_nn1, input_encoding), + ) + # (2) CNN: Gravnet layer + self.conv1 = pytorch_delphes.GravNetConv_optimized(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) + # (3) DNN layer: classifying PID + self.nn2 = nn.Sequential( + nn.Linear(encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_id), + ) + # (4) DNN layer: regressing p4 + if self.nn3: + self.nn3 = nn.Sequential( + nn.Linear(input_dim + output_dim_id + encoding_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, hidden_dim), + self.elu(), + nn.Linear(hidden_dim, output_dim_p4), + ) def forward(self, data): x0 = data.x diff --git a/mlpf/pytorch_delphes/model_optimized.py b/mlpf/pytorch_delphes/model_optimized.py deleted file mode 100644 index c4dc9c06c..000000000 --- 
a/mlpf/pytorch_delphes/model_optimized.py +++ /dev/null @@ -1,101 +0,0 @@ -import numpy as np -import mplhep, time, os - -import torch -import torch_geometric - -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv -from torch_geometric.nn import TopKPooling, SAGPooling, SGConv -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch.utils.data import random_split - -#from torch_geometric.nn import GravNetConv # if you want to get it from source code (won't be able to retrieve the adjacency matrix) -from gravnet_optimized import GravNetConv -from torch_geometric.nn import GraphConv - -#Model with gravnet clustering -class PFNet7(nn.Module): - def __init__(self, - input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, - output_dim_id=6, - output_dim_p4=6, - space_dim=4, propagate_dimensions=22, nearest=16, - target="gen", nn1=True, nn3=True): - - super(PFNet7, self).__init__() - - self.target = target - self.nn1 = nn1 - self.nn3 = nn3 - - self.act = nn.LeakyReLU - self.act_f = torch.nn.functional.leaky_relu - self.act_tanh = torch.nn.Tanh - self.elu = nn.ELU - - # (1) DNN: encoding/decoding of all tracks and clusters - if self.nn1: - self.nn1 = nn.Sequential( - nn.Linear(input_dim, hidden_dim_nn1), - self.elu(), - nn.Linear(hidden_dim_nn1, hidden_dim_nn1), - self.elu(), - nn.Linear(hidden_dim_nn1, input_encoding), - ) - - # (2) CNN: Gravnet layer - self.conv1 = GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) - - # (3) DNN layer: classifying PID - self.nn2 = nn.Sequential( - nn.Linear(encoding_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, output_dim_id), - ) - - # (4) DNN layer: regressing p4 - if self.nn3: - self.nn3 = nn.Sequential( - nn.Linear(input_dim + output_dim_id + encoding_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, hidden_dim), - self.elu(), - nn.Linear(hidden_dim, output_dim_p4), - ) - - def forward(self, data): - x0 = data.x - - # Encoder/Decoder step - if self.nn1: - x = self.nn1(x0) - else: - x=x0 - - # Gravnet step - x, edge_index, edge_weight = self.conv1(x) - x = self.act_f(x) # act by nonlinearity - - # DNN to predict PID - pred_ids = self.nn2(x) - - # DNN to predict p4 - if self.nn3: - nn3_input = torch.cat([x0, pred_ids, x], axis=-1) - pred_p4 = self.nn3(nn3_input) - else: - pred_p4 = torch.zeros_like(data.ycand) - - return pred_ids, pred_p4, data.ygen_id, data.ygen, data.ycand_id, data.ycand diff --git a/mlpf/pytorch_delphes/pipeline.py b/mlpf/pytorch_delphes/pipeline.py deleted file mode 100644 index a876b39df..000000000 --- a/mlpf/pytorch_delphes/pipeline.py +++ /dev/null @@ -1,509 +0,0 @@ -from glob import glob -import sys, os -sys.path.insert(1, '../../plotting/') -sys.path.insert(1, '../../mlpf/plotting/') - -import os.path as osp -import pickle as pkl -import math, time, tqdm -import numpy as np -import pandas as pd -import sklearn -from sklearn.metrics import accuracy_score, confusion_matrix -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt 
-import mplhep as hep - -#Check if the GPU configuration has been provided -import torch -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -try: - if not ("CUDA_VISIBLE_DEVICES" in os.environ): - import setGPU - if multi_gpu: - print('Will use multi_gpu..') - print("Let's use", torch.cuda.device_count(), "GPUs!") - else: - print('Will use single_gpu..') -except Exception as e: - print("Could not import setGPU, running CPU-only") - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') - print("GPU model:", torch.cuda.get_device_name(0)) -else: - device = torch.device('cpu') - -import torch_geometric -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch_geometric.nn import GravNetConv -from torch.utils.data import random_split -import torch_cluster - -import args -from args import parse_args -from graph_data_delphes import PFGraphDataset, one_hot_embedding -from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd -from plot_utils import plot_confusion_matrix - -import evaluate -from evaluate import make_plots, make_predictions -from model import PFNet7 - -#Ignore divide by 0 errors -np.seterr(divide='ignore', invalid='ignore') - -#Get a unique directory name for the model -def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, alpha, task, title): - model_name = type(model).__name__ - model_params = sum(p.numel() for p in model.parameters()) - import hashlib - model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] - model_user = os.environ['USER'] - - model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_alpha_{}_{}_{}'.format( - model_name, - target_type, - n_train, - n_epochs, - batch_size, - lr, - alpha, - task, - title) - return model_fname - -def compute_weights(gen_ids_one_hot, device): - vs, cs = torch.unique(gen_ids_one_hot, return_counts=True) - weights = torch.zeros(output_dim_id).to(device=device) - for k, v in zip(vs, cs): - weights[k] = 1.0/math.sqrt(float(v)) - return weights - -def make_plot_from_list(l, label, xlabel, ylabel, outpath, save_as): - plt.style.use(hep.style.ROOT) - - if not os.path.exists(outpath + '/training_plots/'): - os.makedirs(outpath + '/training_plots/') - - fig, ax = plt.subplots() - ax.plot(range(len(l)), l, label=label) - ax.set_xlabel(xlabel) - ax.set_ylabel(ylabel) - ax.legend(loc='best') - plt.savefig(outpath + '/training_plots/' + save_as + '.png') - plt.close(fig) - - with open(outpath + '/training_plots/' + save_as + '.pkl', 'wb') as f: - pkl.dump(l, f) - -@torch.no_grad() -def test(model, loader, epoch, alpha, target_type, device): - with torch.no_grad(): - ret = train(model, loader, epoch, None, alpha, target_type, device) - return ret - -def train(model, loader, epoch, optimizer, alpha, target_type, device): - - is_train = not (optimizer is None) - - if is_train: - model.train() - else: - model.eval() - - #loss values for each batch: classification, regression, total - losses_1, losses_2, losses_tot = [], [], [] - - #accuracy values for each batch (monitor classification performance) - accuracies_batch, accuracies_batch_msk = [], [] - - #setup confusion matrix - conf_matrix = np.zeros((output_dim_id, output_dim_id)) - - # to compute average inference 
time - t=[] - - for i, batch in enumerate(loader): - t0 = time.time() - - if multi_gpu: - X = batch - else: - X = batch.to(device) - - ## make like tensorflow model, 0-padding events to 6k elements - # if X.x.shape[0]<6000: - # new_X = torch.cat([X.x,torch.zeros_like(X.x)[:6000-X.x.shape[0],:]]) - # new_ygen_id = torch.cat([X.ygen_id,torch.zeros_like(X.ygen_id)[:6000-X.x.shape[0],:]]) - # new_ygen_id[X.x.shape[0]:,0]=new_ygen_id[X.x.shape[0]:,0]+1 - # - # X.x = new_X - # X.ygen_id=new_ygen_id - - # Forwardprop - if i<100: - ti = time.time() - pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) - tf = time.time() - if i!=0: - t.append(round((tf-ti),2)) - else: - pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) - - _, gen_ids = torch.max(gen_ids_one_hot, -1) - _, pred_ids = torch.max(pred_ids_one_hot, -1) - _, cand_ids = torch.max(cand_ids_one_hot, -1) # rule-based result - - # masking - msk = ((pred_ids != 0) & (gen_ids != 0)) - msk2 = ((pred_ids != 0) & (pred_ids == gen_ids)) - - # computing loss - weights = compute_weights(torch.max(gen_ids_one_hot,-1)[1], device) - l1 = torch.nn.functional.cross_entropy(pred_ids_one_hot, gen_ids, weight=weights) # for classifying PID - l2 = alpha * torch.nn.functional.mse_loss(pred_p4[msk2], gen_p4[msk2]) # for regressing p4 - - if args.classification_only: - loss = l1 - else: - loss = l1+l2 - - if is_train: - # BACKPROP - #print(list(model.parameters())[1].grad) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - losses_1.append(l1.detach().cpu().item()) - losses_2.append(l2.detach().cpu().item()) - losses_tot.append(loss.detach().cpu().item()) - - t1 = time.time() - - accuracies_batch.append(accuracy_score(gen_ids.detach().cpu().numpy(), pred_ids.detach().cpu().numpy())) - accuracies_batch_msk.append(accuracy_score(gen_ids[msk].detach().cpu().numpy(), pred_ids[msk].detach().cpu().numpy())) - - conf_matrix += sklearn.metrics.confusion_matrix(gen_ids.detach().cpu().numpy(), - np.argmax(pred_ids_one_hot.detach().cpu().numpy(),axis=1), labels=range(6)) - - print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), loss.detach().cpu().item(), t1-t0), end='\r', flush=True) - - print("Average Inference time per event is: ", round((sum(t) / len(t)),2), 's') - - losses_1 = np.mean(losses_1) - losses_2 = np.mean(losses_2) - losses_tot = np.mean(losses_tot) - - acc = np.mean(accuracies_batch) - acc_msk = np.mean(accuracies_batch_msk) - - conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis] - - return losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm - - -def train_loop(): - t0_initial = time.time() - - losses_1_train, losses_2_train, losses_tot_train = [], [], [] - losses_1_valid, losses_2_valid, losses_tot_valid = [], [], [] - - accuracies_train, accuracies_msk_train = [], [] - accuracies_valid, accuracies_msk_valid = [], [] - - best_val_loss = 99999.9 - stale_epochs = 0 - - print("Training over {} epochs".format(args.n_epochs)) - for epoch in range(args.n_epochs): - t0 = time.time() - - if stale_epochs > patience: - print("breaking due to stale epochs") - break - - # training epoch - model.train() - losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm = train(model, train_loader, epoch, optimizer, args.alpha, args.target, device) - - losses_tot_train.append(losses_tot) - losses_1_train.append(losses_1) - losses_2_train.append(losses_2) - - accuracies_train.append(acc) - 
accuracies_msk_train.append(acc_msk) - - # validation step - model.eval() - losses_tot_v, losses_1_v, losses_2_v, acc_v, acc_msk_v, conf_matrix_v, conf_matrix_norm_v = test(model, valid_loader, epoch, args.alpha, args.target, device) - - losses_tot_valid.append(losses_tot_v) - losses_1_valid.append(losses_1_v) - losses_2_valid.append(losses_2_v) - - accuracies_valid.append(acc_v) - accuracies_msk_valid.append(acc_msk_v) - - # early-stopping - if losses_tot_v < best_val_loss: - best_val_loss = losses_tot_v - stale_epochs = 0 - else: - stale_epochs += 1 - - t1 = time.time() - - epochs_remaining = args.n_epochs - (epoch+1) - time_per_epoch = (t1 - t0_initial)/(epoch + 1) - eta = epochs_remaining*time_per_epoch/60 - - print("epoch={}/{} dt={:.2f}min train_loss={:.5f} valid_loss={:.5f} train_acc={:.5f} valid_acc={:.5f} train_acc_msk={:.5f} valid_acc_msk={:.5f} stale={} eta={:.1f}m".format( - epoch+1, args.n_epochs, - (t1-t0)/60, losses_tot_train[epoch], losses_tot_valid[epoch], accuracies_train[epoch], accuracies_valid[epoch], - accuracies_msk_train[epoch], accuracies_msk_valid[epoch], stale_epochs, eta)) - - torch.save(model.state_dict(), "{0}/epoch_{1}_weights.pth".format(outpath, epoch)) - - plot_confusion_matrix(conf_matrix_norm, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch), epoch=epoch) - plot_confusion_matrix(conf_matrix_norm_v, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch), epoch=epoch) - - torch.save(conf_matrix_norm, outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch) + '.pt') - torch.save(conf_matrix_norm_v, outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch) + '.pkl') - - make_plot_from_list(losses_tot_train, 'train loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_train') - make_plot_from_list(losses_1_train, 'train loss_1', 'Epochs', 'Loss', outpath, 'losses_1_train') - make_plot_from_list(losses_2_train, 'train loss_2', 'Epochs', 'Loss', outpath, 'losses_2_train') - - make_plot_from_list(losses_tot_valid, 'valid loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_valid') - make_plot_from_list(losses_1_valid, 'valid loss_1', 'Epochs', 'Loss', outpath, 'losses_1_valid') - make_plot_from_list(losses_2_valid, 'valid loss_2', 'Epochs', 'Loss', outpath, 'losses_2_valid') - - make_plot_from_list(accuracies_train, 'train accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_train') - make_plot_from_list(accuracies_msk_train, 'train accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_train') - - make_plot_from_list(accuracies_valid, 'valid accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_valid') - make_plot_from_list(accuracies_msk_valid, 'valid accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_valid') - - print('Done with training.') - - return - -if __name__ == "__main__": - - args = parse_args() - - # # the next part initializes some args values (to run the script not from terminal) - # class objectview(object): - # def __init__(self, d): - # self.__dict__ = d - # - # args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 15, 'patience': 100, 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, - # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../test_tmp_delphes/data/pythia8_qcd', - # 'outpath': '../../prp/models/yee/', 'optimizer': 'adam', 
'lr': 0.001, 'alpha': 2e-4, - # 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, 'overwrite': True, - # 'load': True, 'load_epoch': 9, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', - # 'classification_only': False, 'nn1': True, 'nn3': True, 'encoding_of_clusters': False, 'embedding_dim': 0, 'nn0track': False, 'nn0cluster': False, 'title': 'noembeddings', - # 'make_predictions_train': True, 'make_plots_train': True, 'make_predictions_valid': False, 'make_plots_valid': False, 'make_predictions_test': True, 'make_plots_test': True}) - - # define the dataset (assumes the data exists as .pt files in "processed") - print('Processing the data..') - full_dataset_ttbar = PFGraphDataset(args.dataset) - full_dataset_qcd = PFGraphDataset(args.dataset_qcd) - - # constructs a loader from the data to iterate over batches - print('Constructing data loaders..') - train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) - test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) - - # element parameters - input_dim = 12 - - #one-hot particle ID and momentum - output_dim_id = 6 - output_dim_p4 = 6 - - patience = args.patience - - model_classes = {"PFNet7": PFNet7} - - model_class = model_classes[args.model] - model_kwargs = {'input_dim': input_dim, - 'hidden_dim': args.hidden_dim, - 'hidden_dim_nn1': args.hidden_dim_nn1, - 'input_encoding': args.input_encoding, - 'encoding_dim': args.encoding_dim, - 'output_dim_id': output_dim_id, - 'output_dim_p4': output_dim_p4, - 'space_dim': args.space_dim, - 'propagate_dimensions': args.propagate_dimensions, - 'nearest': args.nearest, - 'target': args.target, - 'nn1': args.nn1, - 'nn3': args.nn3} - - if args.load: - print('Loading a previously trained model..') - model = model_class(**model_kwargs) - outpath = args.outpath + args.load_model - PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' - - state_dict = torch.load(PATH, map_location=device) - - if "DataParallel" in args.load_model: # if the model was trained using DataParallel then we do this - state_dict = torch.load(PATH, map_location=device) - from collections import OrderedDict - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - name = k[7:] # remove module. 
- new_state_dict[name] = v - # print('name is:', name) - state_dict=new_state_dict - - model.load_state_dict(state_dict) - - if multi_gpu: - model = torch_geometric.nn.DataParallel(model) - #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP - - model.to(device) - - if args.train: - print("Training a previously trained model..") - - elif args.train: - #instantiate the model - print('Instantiating a model..') - model = model_class(**model_kwargs) - - if multi_gpu: - print("Parallelizing the training..") - model = torch_geometric.nn.DataParallel(model) - #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP - - model.to(device) - - if args.train: - args.title=args.title+'noskip' - if args.nn1: - args.title=args.title+'_nn1' - if args.nn3: - args.title=args.title+'_nn3' - - if args.classification_only: - model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "clf", args.title) - else: - model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "both", args.title) - - outpath = osp.join(args.outpath, model_fname) - if osp.isdir(outpath): - if args.overwrite: - print("model output {} already exists, deleting it".format(outpath)) - import shutil - shutil.rmtree(outpath) - else: - print("model output {} already exists, please delete it".format(outpath)) - sys.exit(0) - try: - os.makedirs(outpath) - except Exception as e: - pass - - with open('{}/model_kwargs.pkl'.format(outpath), 'wb') as f: - pkl.dump(model_kwargs, f, protocol=pkl.HIGHEST_PROTOCOL) - - if not os.path.exists(outpath + '/confusion_matrix_plots/'): - os.makedirs(outpath + '/confusion_matrix_plots/') - - if args.optimizer == "adam": - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) - elif args.optimizer == "adamw": - optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr) - - print(model) - print(model_fname) - - model.train() - train_loop() - - model.eval() - - # evaluate on training data.. - if not osp.isdir(outpath+'/train_loader'): - os.makedirs(outpath+'/train_loader') - if not osp.isdir(outpath+'/train_loader/resolution_plots'): - os.makedirs(outpath+'/train_loader/resolution_plots') - if not osp.isdir(outpath+'/train_loader/distribution_plots'): - os.makedirs(outpath+'/train_loader/distribution_plots') - if not osp.isdir(outpath+'/train_loader/multiplicity_plots'): - os.makedirs(outpath+'/train_loader/multiplicity_plots') - if not osp.isdir(outpath+'/train_loader/efficiency_plots'): - os.makedirs(outpath+'/train_loader/efficiency_plots') - - if args.make_predictions_train: - make_predictions(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") - if args.make_plots_train: - make_plots(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") - - # evaluate on validation data.. 
- if not osp.isdir(outpath+'/valid_loader'): - os.makedirs(outpath+'/valid_loader') - if not osp.isdir(outpath+'/valid_loader/resolution_plots'): - os.makedirs(outpath+'/valid_loader/resolution_plots') - if not osp.isdir(outpath+'/valid_loader/distribution_plots'): - os.makedirs(outpath+'/valid_loader/distribution_plots') - if not osp.isdir(outpath+'/valid_loader/multiplicity_plots'): - os.makedirs(outpath+'/valid_loader/multiplicity_plots') - if not osp.isdir(outpath+'/valid_loader/efficiency_plots'): - os.makedirs(outpath+'/valid_loader/efficiency_plots') - - if args.make_predictions_valid: - make_predictions(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") - if args.make_plots_valid: - make_plots(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") - - # evaluate on testing data.. - if not osp.isdir(outpath+'/test_loader'): - os.makedirs(outpath+'/test_loader') - if not osp.isdir(outpath+'/test_loader/resolution_plots'): - os.makedirs(outpath+'/test_loader/resolution_plots') - if not osp.isdir(outpath+'/test_loader/distribution_plots'): - os.makedirs(outpath+'/test_loader/distribution_plots') - if not osp.isdir(outpath+'/test_loader/multiplicity_plots'): - os.makedirs(outpath+'/test_loader/multiplicity_plots') - if not osp.isdir(outpath+'/test_loader/efficiency_plots'): - os.makedirs(outpath+'/test_loader/efficiency_plots') - - if args.make_predictions_test: - if args.load: - make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") - else: - make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") - if args.make_plots_test: - if args.load: - make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") - else: - make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") - - -## ----------------------------------------------------------- -# to retrieve a stored variable in pkl file -# import pickle as pkl -# with open('../../test_tmp_delphes/experiments/PFNet7_gen_ntrain_2_nepochs_3_batch_size_3_lr_0.0001/confusion_matrix_plots/cmT_normed_epoch_0.pkl', 'rb') as f: # Python 3: open(..., 'rb') -# a = pkl.load(f) -# -# with open('../../data/pythia8_qcd/raw/tev14_pythia8_qcd_10_0.pkl', 'rb') as pickle_file: -# data = pkl.load(pickle_file) -# -# data.keys() diff --git a/mlpf/pytorch_delphes/pipeline_optimized.py b/mlpf/pytorch_delphes/pipeline_optimized.py deleted file mode 100644 index f1d5ffbb4..000000000 --- a/mlpf/pytorch_delphes/pipeline_optimized.py +++ /dev/null @@ -1,509 +0,0 @@ -from glob import glob -import sys, os -sys.path.insert(1, '../../plotting/') -sys.path.insert(1, '../../mlpf/plotting/') - -import os.path as osp -import pickle as pkl -import math, time, tqdm -import numpy as np -import pandas as pd -import sklearn -from sklearn.metrics import accuracy_score, confusion_matrix -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt -import mplhep as hep - -#Check if the GPU configuration has been provided -import torch -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -try: - if not ("CUDA_VISIBLE_DEVICES" in os.environ): - import setGPU - if multi_gpu: - print('Will use multi_gpu..') - print("Let's use", torch.cuda.device_count(), "GPUs!") - else: - 
print('Will use single_gpu..') -except Exception as e: - print("Could not import setGPU, running CPU-only") - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') - print("GPU model:", torch.cuda.get_device_name(0)) -else: - device = torch.device('cpu') - -import torch_geometric -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch_geometric.nn import GravNetConv -from torch.utils.data import random_split -import torch_cluster - -import args -from args import parse_args -from graph_data_delphes import PFGraphDataset, one_hot_embedding -from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd -from plot_utils import plot_confusion_matrix - -import evaluate -from evaluate import make_plots, make_predictions -from model_optimized import PFNet7 - -#Ignore divide by 0 errors -np.seterr(divide='ignore', invalid='ignore') - -#Get a unique directory name for the model -def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, alpha, task, title): - model_name = type(model).__name__ - model_params = sum(p.numel() for p in model.parameters()) - import hashlib - model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] - model_user = os.environ['USER'] - - model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_alpha_{}_{}_{}'.format( - model_name, - target_type, - n_train, - n_epochs, - batch_size, - lr, - alpha, - task, - title) - return model_fname - -def compute_weights(gen_ids_one_hot, device): - vs, cs = torch.unique(gen_ids_one_hot, return_counts=True) - weights = torch.zeros(output_dim_id).to(device=device) - for k, v in zip(vs, cs): - weights[k] = 1.0/math.sqrt(float(v)) - return weights - -def make_plot_from_list(l, label, xlabel, ylabel, outpath, save_as): - plt.style.use(hep.style.ROOT) - - if not os.path.exists(outpath + '/training_plots/'): - os.makedirs(outpath + '/training_plots/') - - fig, ax = plt.subplots() - ax.plot(range(len(l)), l, label=label) - ax.set_xlabel(xlabel) - ax.set_ylabel(ylabel) - ax.legend(loc='best') - plt.savefig(outpath + '/training_plots/' + save_as + '.png') - plt.close(fig) - - with open(outpath + '/training_plots/' + save_as + '.pkl', 'wb') as f: - pkl.dump(l, f) - -@torch.no_grad() -def test(model, loader, epoch, alpha, target_type, device): - with torch.no_grad(): - ret = train(model, loader, epoch, None, alpha, target_type, device) - return ret - -def train(model, loader, epoch, optimizer, alpha, target_type, device): - - is_train = not (optimizer is None) - - if is_train: - model.train() - else: - model.eval() - - #loss values for each batch: classification, regression, total - losses_1, losses_2, losses_tot = [], [], [] - - #accuracy values for each batch (monitor classification performance) - accuracies_batch, accuracies_batch_msk = [], [] - - #setup confusion matrix - conf_matrix = np.zeros((output_dim_id, output_dim_id)) - - # to compute average inference time - t=[] - - for i, batch in enumerate(loader): - t0 = time.time() - - if multi_gpu: - X = batch - else: - X = batch.to(device) - - ## make like tensorflow model, 0-padding events to 6k elements - # if X.x.shape[0]<6000: - # new_X = torch.cat([X.x,torch.zeros_like(X.x)[:6000-X.x.shape[0],:]]) - # new_ygen_id = 
torch.cat([X.ygen_id,torch.zeros_like(X.ygen_id)[:6000-X.x.shape[0],:]]) - # new_ygen_id[X.x.shape[0]:,0]=new_ygen_id[X.x.shape[0]:,0]+1 - # - # X.x = new_X - # X.ygen_id=new_ygen_id - - # Forwardprop - if i<100: - ti = time.time() - pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) - tf = time.time() - if i!=0: - t.append(round((tf-ti),2)) - else: - pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) - - _, gen_ids = torch.max(gen_ids_one_hot, -1) - _, pred_ids = torch.max(pred_ids_one_hot, -1) - _, cand_ids = torch.max(cand_ids_one_hot, -1) # rule-based result - - # masking - msk = ((pred_ids != 0) & (gen_ids != 0)) - msk2 = ((pred_ids != 0) & (pred_ids == gen_ids)) - - # computing loss - weights = compute_weights(torch.max(gen_ids_one_hot,-1)[1], device) - l1 = torch.nn.functional.cross_entropy(pred_ids_one_hot, gen_ids, weight=weights) # for classifying PID - l2 = alpha * torch.nn.functional.mse_loss(pred_p4[msk2], gen_p4[msk2]) # for regressing p4 - - if args.classification_only: - loss = l1 - else: - loss = l1+l2 - - if is_train: - # BACKPROP - #print(list(model.parameters())[1].grad) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - losses_1.append(l1.detach().cpu().item()) - losses_2.append(l2.detach().cpu().item()) - losses_tot.append(loss.detach().cpu().item()) - - t1 = time.time() - - accuracies_batch.append(accuracy_score(gen_ids.detach().cpu().numpy(), pred_ids.detach().cpu().numpy())) - accuracies_batch_msk.append(accuracy_score(gen_ids[msk].detach().cpu().numpy(), pred_ids[msk].detach().cpu().numpy())) - - conf_matrix += sklearn.metrics.confusion_matrix(gen_ids.detach().cpu().numpy(), - np.argmax(pred_ids_one_hot.detach().cpu().numpy(),axis=1), labels=range(6)) - - print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), loss.detach().cpu().item(), t1-t0), end='\r', flush=True) - - print("Average Inference time per event is: ", round((sum(t) / len(t)),2), 's') - - losses_1 = np.mean(losses_1) - losses_2 = np.mean(losses_2) - losses_tot = np.mean(losses_tot) - - acc = np.mean(accuracies_batch) - acc_msk = np.mean(accuracies_batch_msk) - - conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis] - - return losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm - - -def train_loop(): - t0_initial = time.time() - - losses_1_train, losses_2_train, losses_tot_train = [], [], [] - losses_1_valid, losses_2_valid, losses_tot_valid = [], [], [] - - accuracies_train, accuracies_msk_train = [], [] - accuracies_valid, accuracies_msk_valid = [], [] - - best_val_loss = 99999.9 - stale_epochs = 0 - - print("Training over {} epochs".format(args.n_epochs)) - for epoch in range(args.n_epochs): - t0 = time.time() - - if stale_epochs > patience: - print("breaking due to stale epochs") - break - - # training epoch - model.train() - losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm = train(model, train_loader, epoch, optimizer, args.alpha, args.target, device) - - losses_tot_train.append(losses_tot) - losses_1_train.append(losses_1) - losses_2_train.append(losses_2) - - accuracies_train.append(acc) - accuracies_msk_train.append(acc_msk) - - # validation step - model.eval() - losses_tot_v, losses_1_v, losses_2_v, acc_v, acc_msk_v, conf_matrix_v, conf_matrix_norm_v = test(model, valid_loader, epoch, args.alpha, args.target, device) - - losses_tot_valid.append(losses_tot_v) - losses_1_valid.append(losses_1_v) - 
losses_2_valid.append(losses_2_v) - - accuracies_valid.append(acc_v) - accuracies_msk_valid.append(acc_msk_v) - - # early-stopping - if losses_tot_v < best_val_loss: - best_val_loss = losses_tot_v - stale_epochs = 0 - else: - stale_epochs += 1 - - t1 = time.time() - - epochs_remaining = args.n_epochs - (epoch+1) - time_per_epoch = (t1 - t0_initial)/(epoch + 1) - eta = epochs_remaining*time_per_epoch/60 - - print("epoch={}/{} dt={:.2f}min train_loss={:.5f} valid_loss={:.5f} train_acc={:.5f} valid_acc={:.5f} train_acc_msk={:.5f} valid_acc_msk={:.5f} stale={} eta={:.1f}m".format( - epoch+1, args.n_epochs, - (t1-t0)/60, losses_tot_train[epoch], losses_tot_valid[epoch], accuracies_train[epoch], accuracies_valid[epoch], - accuracies_msk_train[epoch], accuracies_msk_valid[epoch], stale_epochs, eta)) - - torch.save(model.state_dict(), "{0}/epoch_{1}_weights.pth".format(outpath, epoch)) - - plot_confusion_matrix(conf_matrix_norm, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch), epoch=epoch) - plot_confusion_matrix(conf_matrix_norm_v, ["none", "ch.had", "n.had", "g", "el", "mu"], fname = outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch), epoch=epoch) - - torch.save(conf_matrix_norm, outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch) + '.pt') - torch.save(conf_matrix_norm_v, outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch) + '.pkl') - - make_plot_from_list(losses_tot_train, 'train loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_train') - make_plot_from_list(losses_1_train, 'train loss_1', 'Epochs', 'Loss', outpath, 'losses_1_train') - make_plot_from_list(losses_2_train, 'train loss_2', 'Epochs', 'Loss', outpath, 'losses_2_train') - - make_plot_from_list(losses_tot_valid, 'valid loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_valid') - make_plot_from_list(losses_1_valid, 'valid loss_1', 'Epochs', 'Loss', outpath, 'losses_1_valid') - make_plot_from_list(losses_2_valid, 'valid loss_2', 'Epochs', 'Loss', outpath, 'losses_2_valid') - - make_plot_from_list(accuracies_train, 'train accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_train') - make_plot_from_list(accuracies_msk_train, 'train accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_train') - - make_plot_from_list(accuracies_valid, 'valid accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_valid') - make_plot_from_list(accuracies_msk_valid, 'valid accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_valid') - - print('Done with training.') - - return - -if __name__ == "__main__": - - args = parse_args() - - # # the next part initializes some args values (to run the script not from terminal) - # class objectview(object): - # def __init__(self, d): - # self.__dict__ = d - # - # args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 15, 'patience': 100, 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, - # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../../test_tmp_delphes/data/pythia8_qcd', - # 'outpath': '../../prp/models/yee/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, - # 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, 'overwrite': True, - # 'load': True, 'load_epoch': 9, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', - # 'classification_only': False, 'nn1': True, 'nn3': 
True, 'encoding_of_clusters': False, 'embedding_dim': 0, 'nn0track': False, 'nn0cluster': False, 'title': 'noembeddings', - # 'make_predictions_train': True, 'make_plots_train': True, 'make_predictions_valid': False, 'make_plots_valid': False, 'make_predictions_test': True, 'make_plots_test': True}) - - # define the dataset (assumes the data exists as .pt files in "processed") - print('Processing the data..') - full_dataset_ttbar = PFGraphDataset(args.dataset) - full_dataset_qcd = PFGraphDataset(args.dataset_qcd) - - # constructs a loader from the data to iterate over batches - print('Constructing data loaders..') - train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) - test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) - - # element parameters - input_dim = 12 - - #one-hot particle ID and momentum - output_dim_id = 6 - output_dim_p4 = 6 - - patience = args.patience - - model_classes = {"PFNet7": PFNet7} - - model_class = model_classes[args.model] - model_kwargs = {'input_dim': input_dim, - 'hidden_dim': args.hidden_dim, - 'hidden_dim_nn1': args.hidden_dim_nn1, - 'input_encoding': args.input_encoding, - 'encoding_dim': args.encoding_dim, - 'output_dim_id': output_dim_id, - 'output_dim_p4': output_dim_p4, - 'space_dim': args.space_dim, - 'propagate_dimensions': args.propagate_dimensions, - 'nearest': args.nearest, - 'target': args.target, - 'nn1': args.nn1, - 'nn3': args.nn3} - - if args.load: - print('Loading a previously trained model..') - model = model_class(**model_kwargs) - outpath = args.outpath + args.load_model - PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' - - state_dict = torch.load(PATH, map_location=device) - - if "DataParallel" in args.load_model: # if the model was trained using DataParallel then we do this - state_dict = torch.load(PATH, map_location=device) - from collections import OrderedDict - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - name = k[7:] # remove module. 
- new_state_dict[name] = v - # print('name is:', name) - state_dict=new_state_dict - - model.load_state_dict(state_dict) - - if multi_gpu: - model = torch_geometric.nn.DataParallel(model) - #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP - - model.to(device) - - if args.train: - print("Training a previously trained model..") - - elif args.train: - #instantiate the model - print('Instantiating a model..') - model = model_class(**model_kwargs) - - if multi_gpu: - print("Parallelizing the training..") - model = torch_geometric.nn.DataParallel(model) - #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP - - model.to(device) - - if args.train: - args.title=args.title+'noskip' - if args.nn1: - args.title=args.title+'_nn1' - if args.nn3: - args.title=args.title+'_nn3' - - if args.classification_only: - model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "clf", args.title) - else: - model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "both", args.title) - - outpath = osp.join(args.outpath, model_fname) - if osp.isdir(outpath): - if args.overwrite: - print("model output {} already exists, deleting it".format(outpath)) - import shutil - shutil.rmtree(outpath) - else: - print("model output {} already exists, please delete it".format(outpath)) - sys.exit(0) - try: - os.makedirs(outpath) - except Exception as e: - pass - - with open('{}/model_kwargs.pkl'.format(outpath), 'wb') as f: - pkl.dump(model_kwargs, f, protocol=pkl.HIGHEST_PROTOCOL) - - if not os.path.exists(outpath + '/confusion_matrix_plots/'): - os.makedirs(outpath + '/confusion_matrix_plots/') - - if args.optimizer == "adam": - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) - elif args.optimizer == "adamw": - optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr) - - print(model) - print(model_fname) - - model.train() - train_loop() - - model.eval() - - # evaluate on training data.. - if not osp.isdir(outpath+'/train_loader'): - os.makedirs(outpath+'/train_loader') - if not osp.isdir(outpath+'/train_loader/resolution_plots'): - os.makedirs(outpath+'/train_loader/resolution_plots') - if not osp.isdir(outpath+'/train_loader/distribution_plots'): - os.makedirs(outpath+'/train_loader/distribution_plots') - if not osp.isdir(outpath+'/train_loader/multiplicity_plots'): - os.makedirs(outpath+'/train_loader/multiplicity_plots') - if not osp.isdir(outpath+'/train_loader/efficiency_plots'): - os.makedirs(outpath+'/train_loader/efficiency_plots') - - if args.make_predictions_train: - make_predictions(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") - if args.make_plots_train: - make_plots(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") - - # evaluate on validation data.. 
- if not osp.isdir(outpath+'/valid_loader'): - os.makedirs(outpath+'/valid_loader') - if not osp.isdir(outpath+'/valid_loader/resolution_plots'): - os.makedirs(outpath+'/valid_loader/resolution_plots') - if not osp.isdir(outpath+'/valid_loader/distribution_plots'): - os.makedirs(outpath+'/valid_loader/distribution_plots') - if not osp.isdir(outpath+'/valid_loader/multiplicity_plots'): - os.makedirs(outpath+'/valid_loader/multiplicity_plots') - if not osp.isdir(outpath+'/valid_loader/efficiency_plots'): - os.makedirs(outpath+'/valid_loader/efficiency_plots') - - if args.make_predictions_valid: - make_predictions(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") - if args.make_plots_valid: - make_plots(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") - - # evaluate on testing data.. - if not osp.isdir(outpath+'/test_loader'): - os.makedirs(outpath+'/test_loader') - if not osp.isdir(outpath+'/test_loader/resolution_plots'): - os.makedirs(outpath+'/test_loader/resolution_plots') - if not osp.isdir(outpath+'/test_loader/distribution_plots'): - os.makedirs(outpath+'/test_loader/distribution_plots') - if not osp.isdir(outpath+'/test_loader/multiplicity_plots'): - os.makedirs(outpath+'/test_loader/multiplicity_plots') - if not osp.isdir(outpath+'/test_loader/efficiency_plots'): - os.makedirs(outpath+'/test_loader/efficiency_plots') - - if args.make_predictions_test: - if args.load: - make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") - else: - make_predictions(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") - if args.make_plots_test: - if args.load: - make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") - else: - make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") - - -## ----------------------------------------------------------- -# to retrieve a stored variable in pkl file -# import pickle as pkl -# with open('../../test_tmp_delphes/experiments/PFNet7_gen_ntrain_2_nepochs_3_batch_size_3_lr_0.0001/confusion_matrix_plots/cmT_normed_epoch_0.pkl', 'rb') as f: # Python 3: open(..., 'rb') -# a = pkl.load(f) -# -# with open('../../data/pythia8_qcd/raw/tev14_pythia8_qcd_10_0.pkl', 'rb') as pickle_file: -# data = pkl.load(pickle_file) -# -# data.keys() diff --git a/mlpf/pytorch_delphes/training.py b/mlpf/pytorch_delphes/training.py new file mode 100644 index 000000000..6127696f0 --- /dev/null +++ b/mlpf/pytorch_delphes/training.py @@ -0,0 +1,231 @@ +import os +import pickle as pkl +import math, time, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import mplhep as hep + +import torch + +import pytorch_delphes + +#Ignore divide by 0 errors +np.seterr(divide='ignore', invalid='ignore') + +def compute_weights(gen_ids_one_hot, device, output_dim_id): + vs, cs = torch.unique(gen_ids_one_hot, return_counts=True) + weights = torch.zeros(output_dim_id).to(device=device) + for k, v in zip(vs, cs): + weights[k] = 1.0/math.sqrt(float(v)) + return weights + +def make_plot_from_list(l, label, xlabel, ylabel, outpath, save_as): + plt.style.use(hep.style.ROOT) + + if not 
os.path.exists(outpath + '/training_plots/'): + os.makedirs(outpath + '/training_plots/') + + fig, ax = plt.subplots() + ax.plot(range(len(l)), l, label=label) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + ax.legend(loc='best') + plt.savefig(outpath + '/training_plots/' + save_as + '.png') + plt.close(fig) + + with open(outpath + '/training_plots/' + save_as + '.pkl', 'wb') as f: + pkl.dump(l, f) + +@torch.no_grad() +def test(model, multi_gpu, loader, epoch, alpha, target_type, device, output_dim_id, classification_only, outpath): + with torch.no_grad(): + ret = train(model, multi_gpu, loader, epoch, None, alpha, target_type, device, output_dim_id, classification_only, outpath) + return ret + +def train(model, multi_gpu, loader, epoch, optimizer, alpha, target_type, device, output_dim_id, classification_only, outpath): + + is_train = not (optimizer is None) + + if is_train: + model.train() + else: + model.eval() + + #loss values for each batch: classification, regression, total + losses_1, losses_2, losses_tot = [], [], [] + + #accuracy values for each batch (monitor classification performance) + accuracies_batch, accuracies_batch_msk = [], [] + + #setup confusion matrix + conf_matrix = np.zeros((output_dim_id, output_dim_id)) + + # to compute average inference time + t=[] + + for i, batch in enumerate(loader): + t0 = time.time() + + if multi_gpu: + X = batch + else: + X = batch.to(device) + + ## make like tensorflow model, 0-padding events to 6k elements + # if X.x.shape[0]<6000: + # new_X = torch.cat([X.x,torch.zeros_like(X.x)[:6000-X.x.shape[0],:]]) + # new_ygen_id = torch.cat([X.ygen_id,torch.zeros_like(X.ygen_id)[:6000-X.x.shape[0],:]]) + # new_ygen_id[X.x.shape[0]:,0]=new_ygen_id[X.x.shape[0]:,0]+1 + # + # X.x = new_X + # X.ygen_id=new_ygen_id + + # Forwardprop + if i<100: + ti = time.time() + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + tf = time.time() + if i!=0: + t.append(round((tf-ti),2)) + else: + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4 = model(X) + + _, gen_ids = torch.max(gen_ids_one_hot, -1) + _, pred_ids = torch.max(pred_ids_one_hot, -1) + _, cand_ids = torch.max(cand_ids_one_hot, -1) # rule-based result + + # masking + msk = ((pred_ids != 0) & (gen_ids != 0)) + msk2 = ((pred_ids != 0) & (pred_ids == gen_ids)) + + # computing loss + weights = compute_weights(torch.max(gen_ids_one_hot,-1)[1], device, output_dim_id) + l1 = torch.nn.functional.cross_entropy(pred_ids_one_hot, gen_ids, weight=weights) # for classifying PID + l2 = alpha * torch.nn.functional.mse_loss(pred_p4[msk2], gen_p4[msk2]) # for regressing p4 + + if classification_only: + loss = l1 + else: + loss = l1+l2 + + if is_train: + # BACKPROP + #print(list(model.parameters())[1].grad) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + losses_1.append(l1.detach().cpu().item()) + losses_2.append(l2.detach().cpu().item()) + losses_tot.append(loss.detach().cpu().item()) + + t1 = time.time() + + accuracies_batch.append(accuracy_score(gen_ids.detach().cpu().numpy(), pred_ids.detach().cpu().numpy())) + accuracies_batch_msk.append(accuracy_score(gen_ids[msk].detach().cpu().numpy(), pred_ids[msk].detach().cpu().numpy())) + + conf_matrix += sklearn.metrics.confusion_matrix(gen_ids.detach().cpu().numpy(), + np.argmax(pred_ids_one_hot.detach().cpu().numpy(),axis=1), labels=range(6)) + + print('{}/{} batch_loss={:.2f} dt={:.1f}s'.format(i, len(loader), loss.detach().cpu().item(), t1-t0), end='\r', flush=True) + + print("Average 
Inference time per event is: ", round((sum(t) / len(t)),2), 's') + + losses_1 = np.mean(losses_1) + losses_2 = np.mean(losses_2) + losses_tot = np.mean(losses_tot) + + acc = np.mean(accuracies_batch) + acc_msk = np.mean(accuracies_batch_msk) + + conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis] + + return losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm + + +def train_loop(model, device, multi_gpu, train_loader, valid_loader, test_loader, n_epochs, patience, optimizer, alpha, target, output_dim_id, classification_only, outpath): + t0_initial = time.time() + + losses_1_train, losses_2_train, losses_tot_train = [], [], [] + losses_1_valid, losses_2_valid, losses_tot_valid = [], [], [] + + accuracies_train, accuracies_msk_train = [], [] + accuracies_valid, accuracies_msk_valid = [], [] + + best_val_loss = 99999.9 + stale_epochs = 0 + + print("Training over {} epochs".format(n_epochs)) + for epoch in range(n_epochs): + t0 = time.time() + + if stale_epochs > patience: + print("breaking due to stale epochs") + break + + # training epoch + model.train() + losses_tot, losses_1, losses_2, acc, acc_msk, conf_matrix, conf_matrix_norm = train(model, multi_gpu, train_loader, epoch, optimizer, alpha, target, device, output_dim_id, classification_only, outpath) + + losses_tot_train.append(losses_tot) + losses_1_train.append(losses_1) + losses_2_train.append(losses_2) + + accuracies_train.append(acc) + accuracies_msk_train.append(acc_msk) + + # validation step + model.eval() + losses_tot_v, losses_1_v, losses_2_v, acc_v, acc_msk_v, conf_matrix_v, conf_matrix_norm_v = test(model, multi_gpu, valid_loader, epoch, alpha, target, device, output_dim_id, classification_only, outpath) + + losses_tot_valid.append(losses_tot_v) + losses_1_valid.append(losses_1_v) + losses_2_valid.append(losses_2_v) + + accuracies_valid.append(acc_v) + accuracies_msk_valid.append(acc_msk_v) + + # early-stopping + if losses_tot_v < best_val_loss: + best_val_loss = losses_tot_v + stale_epochs = 0 + else: + stale_epochs += 1 + + t1 = time.time() + + epochs_remaining = n_epochs - (epoch+1) + time_per_epoch = (t1 - t0_initial)/(epoch + 1) + eta = epochs_remaining*time_per_epoch/60 + + print("epoch={}/{} dt={:.2f}min train_loss={:.5f} valid_loss={:.5f} train_acc={:.5f} valid_acc={:.5f} train_acc_msk={:.5f} valid_acc_msk={:.5f} stale={} eta={:.1f}m".format( + epoch+1, n_epochs, + (t1-t0)/60, losses_tot_train[epoch], losses_tot_valid[epoch], accuracies_train[epoch], accuracies_valid[epoch], + accuracies_msk_train[epoch], accuracies_msk_valid[epoch], stale_epochs, eta)) + + torch.save(model.state_dict(), "{0}/epoch_{1}_weights.pth".format(outpath, epoch)) + + torch.save(conf_matrix_norm, outpath + '/confusion_matrix_plots/cmT_normed_epoch_' + str(epoch) + '.pt') + torch.save(conf_matrix_norm_v, outpath + '/confusion_matrix_plots/cmV_normed_epoch_' + str(epoch) + '.pkl') + + make_plot_from_list(losses_tot_train, 'train loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_train') + make_plot_from_list(losses_1_train, 'train loss_1', 'Epochs', 'Loss', outpath, 'losses_1_train') + make_plot_from_list(losses_2_train, 'train loss_2', 'Epochs', 'Loss', outpath, 'losses_2_train') + + make_plot_from_list(losses_tot_valid, 'valid loss_tot', 'Epochs', 'Loss', outpath, 'losses_tot_valid') + make_plot_from_list(losses_1_valid, 'valid loss_1', 'Epochs', 'Loss', outpath, 'losses_1_valid') + make_plot_from_list(losses_2_valid, 'valid loss_2', 'Epochs', 'Loss', outpath, 'losses_2_valid') + + 
make_plot_from_list(accuracies_train, 'train accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_train') + make_plot_from_list(accuracies_msk_train, 'train accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_train') + + make_plot_from_list(accuracies_valid, 'valid accuracy', 'Epochs', 'Accuracy', outpath, 'accuracies_valid') + make_plot_from_list(accuracies_msk_valid, 'valid accuracy_msk', 'Epochs', 'Accuracy', outpath, 'accuracies_msk_valid') + + print('Done with training.') + return diff --git a/mlpf/pytorch_pipeline.py b/mlpf/pytorch_pipeline.py new file mode 100644 index 000000000..6de5264a4 --- /dev/null +++ b/mlpf/pytorch_pipeline.py @@ -0,0 +1,279 @@ +from glob import glob +import sys, os + +import os.path as osp +import pickle as pkl +import math, time, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import mplhep as hep + +#Check if the GPU configuration has been provided +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + import setGPU + if multi_gpu: + print('Will use multi_gpu..') + print("Let's use", torch.cuda.device_count(), "GPUs!") + else: + print('Will use single_gpu..') +except Exception as e: + print("Could not import setGPU, running CPU-only") + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') + print("GPU model:", torch.cuda.get_device_name(0)) +else: + device = torch.device('cpu') + +import torch_geometric + +import pytorch_delphes +import plotting + +#Ignore divide by 0 errors +np.seterr(divide='ignore', invalid='ignore') + +#Get a unique directory name for the model +def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, alpha, task, title): + model_name = type(model).__name__ + model_params = sum(p.numel() for p in model.parameters()) + import hashlib + model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] + model_user = os.environ['USER'] + + model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_alpha_{}_{}_{}'.format( + model_name, + target_type, + n_train, + n_epochs, + batch_size, + lr, + alpha, + task, + title) + return model_fname + + +if __name__ == "__main__": + + # args = pytorch_delphes.parse_args() + + # the next part initializes some args values (to run the script not from terminal) + class objectview(object): + def __init__(self, d): + self.__dict__ = d + + args = objectview({'train': True, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 2, 'patience': 100, 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, + 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../test_tmp_delphes/data/pythia8_qcd', + 'outpath': '../prp/models/yee/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, + 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, 'overwrite': True, + 'load': False, 'load_epoch': 1, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_2_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + 'classification_only': False, 'nn1': True, 'nn3': True, 'encoding_of_clusters': False, 'embedding_dim': 0, 'nn0track': False, 'nn0cluster': False, 'title': 'noembeddings', + 'make_predictions_train': False, 'make_plots_train': False, 'make_predictions_valid': False, 'make_plots_valid': False, 
'make_predictions_test': True, 'make_plots_test': True, + 'optimized': False}) + + # define the dataset (assumes the data exists as .pt files in "processed") + print('Processing the data..') + full_dataset_ttbar = pytorch_delphes.PFGraphDataset(args.dataset) + full_dataset_qcd = pytorch_delphes.PFGraphDataset(args.dataset_qcd) + + # constructs a loader from the data to iterate over batches + print('Constructing data loaders..') + train_loader, valid_loader = pytorch_delphes.data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) + test_loader = pytorch_delphes.data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) + + # element parameters + input_dim = 12 + + #one-hot particle ID and momentum + output_dim_id = 6 + output_dim_p4 = 6 + + if args.optimized: + model_classes = {"PFNet7": pytorch_delphes.PFNet7_opt} + else: + model_classes = {"PFNet7": pytorch_delphes.PFNet7} + + model_class = model_classes[args.model] + model_kwargs = {'input_dim': input_dim, + 'hidden_dim': args.hidden_dim, + 'hidden_dim_nn1': args.hidden_dim_nn1, + 'input_encoding': args.input_encoding, + 'encoding_dim': args.encoding_dim, + 'output_dim_id': output_dim_id, + 'output_dim_p4': output_dim_p4, + 'space_dim': args.space_dim, + 'propagate_dimensions': args.propagate_dimensions, + 'nearest': args.nearest, + 'target': args.target, + 'nn1': args.nn1, + 'nn3': args.nn3} + + if args.load: + print('Loading a previously trained model..') + model = model_class(**model_kwargs) + outpath = args.outpath + args.load_model + PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' + + state_dict = torch.load(PATH, map_location=device) + + if "DataParallel" in args.load_model: # if the model was trained using DataParallel then we do this + state_dict = torch.load(PATH, map_location=device) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove module. 
+ new_state_dict[name] = v + # print('name is:', name) + state_dict=new_state_dict + + model.load_state_dict(state_dict) + + if multi_gpu: + model = torch_geometric.nn.DataParallel(model) + #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP + + model.to(device) + + if args.train: + print("Training a previously trained model..") + + elif args.train: + #instantiate the model + print('Instantiating a model..') + model = model_class(**model_kwargs) + + if multi_gpu: + print("Parallelizing the training..") + model = torch_geometric.nn.DataParallel(model) + #model = torch.nn.parallel.DistributedDataParallel(model) ### TODO: make it compatible with DDP + + model.to(device) + + if args.train: + args.title=args.title+'noskip' + if args.nn1: + args.title=args.title+'_nn1' + if args.nn3: + args.title=args.title+'_nn3' + + if args.classification_only: + model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "clf", args.title) + else: + model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "both", args.title) + + outpath = osp.join(args.outpath, model_fname) + if osp.isdir(outpath): + if args.overwrite: + print("model output {} already exists, deleting it".format(outpath)) + import shutil + shutil.rmtree(outpath) + else: + print("model output {} already exists, please delete it".format(outpath)) + sys.exit(0) + try: + os.makedirs(outpath) + except Exception as e: + pass + + with open('{}/model_kwargs.pkl'.format(outpath), 'wb') as f: + pkl.dump(model_kwargs, f, protocol=pkl.HIGHEST_PROTOCOL) + + if not os.path.exists(outpath + '/confusion_matrix_plots/'): + os.makedirs(outpath + '/confusion_matrix_plots/') + + if args.optimizer == "adam": + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + elif args.optimizer == "adamw": + optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr) + + print(model) + print(model_fname) + + model.train() + pytorch_delphes.train_loop(model, device, multi_gpu, + train_loader, valid_loader, test_loader, + args.n_epochs, args.patience, optimizer, args.alpha, args.target, + output_dim_id, args.classification_only, outpath) + + model.eval() + + # evaluate on training data.. + if not osp.isdir(outpath+'/train_loader'): + os.makedirs(outpath+'/train_loader') + if not osp.isdir(outpath+'/train_loader/resolution_plots'): + os.makedirs(outpath+'/train_loader/resolution_plots') + if not osp.isdir(outpath+'/train_loader/distribution_plots'): + os.makedirs(outpath+'/train_loader/distribution_plots') + if not osp.isdir(outpath+'/train_loader/multiplicity_plots'): + os.makedirs(outpath+'/train_loader/multiplicity_plots') + if not osp.isdir(outpath+'/train_loader/efficiency_plots'): + os.makedirs(outpath+'/train_loader/efficiency_plots') + + if args.make_predictions_train: + pytorch_delphes.make_predictions(model, multi_gpu, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + if args.make_plots_train: + plotting.make_plots(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + + # evaluate on validation data.. 
+ if not osp.isdir(outpath+'/valid_loader'): + os.makedirs(outpath+'/valid_loader') + if not osp.isdir(outpath+'/valid_loader/resolution_plots'): + os.makedirs(outpath+'/valid_loader/resolution_plots') + if not osp.isdir(outpath+'/valid_loader/distribution_plots'): + os.makedirs(outpath+'/valid_loader/distribution_plots') + if not osp.isdir(outpath+'/valid_loader/multiplicity_plots'): + os.makedirs(outpath+'/valid_loader/multiplicity_plots') + if not osp.isdir(outpath+'/valid_loader/efficiency_plots'): + os.makedirs(outpath+'/valid_loader/efficiency_plots') + + if args.make_predictions_valid: + pytorch_delphes.make_predictions(model, multi_gpu, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + if args.make_plots_valid: + plotting.make_plots(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + + # evaluate on testing data.. + if not osp.isdir(outpath+'/test_loader'): + os.makedirs(outpath+'/test_loader') + if not osp.isdir(outpath+'/test_loader/resolution_plots'): + os.makedirs(outpath+'/test_loader/resolution_plots') + if not osp.isdir(outpath+'/test_loader/distribution_plots'): + os.makedirs(outpath+'/test_loader/distribution_plots') + if not osp.isdir(outpath+'/test_loader/multiplicity_plots'): + os.makedirs(outpath+'/test_loader/multiplicity_plots') + if not osp.isdir(outpath+'/test_loader/efficiency_plots'): + os.makedirs(outpath+'/test_loader/efficiency_plots') + + if args.make_predictions_test: + if args.load: + pytorch_delphes.make_predictions(model, multi_gpu, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + else: + pytorch_delphes.make_predictions(model, multi_gpu, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + if args.make_plots_test: + if args.load: + plotting.make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + else: + plotting.make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + + +## ----------------------------------------------------------- +# to retrieve a stored variable in pkl file +# import pickle as pkl +# with open('../../test_tmp_delphes/experiments/PFNet7_gen_ntrain_2_nepochs_3_batch_size_3_lr_0.0001/confusion_matrix_plots/cmT_normed_epoch_0.pkl', 'rb') as f: # Python 3: open(..., 'rb') +# a = pkl.load(f) +# +# with open('../../data/pythia8_qcd/raw/tev14_pythia8_qcd_10_0.pkl', 'rb') as pickle_file: +# data = pkl.load(pickle_file) +# +# data.keys() diff --git a/scripts/local_test_cms.sh b/scripts/local_test_cms.sh deleted file mode 100755 index b95b0c9f9..000000000 --- a/scripts/local_test_cms.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -set -e - -rm -Rf test_tmp -mkdir test_tmp -cd test_tmp - -mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi -cd data/TTbar_14TeV_TuneCUETP8M1_cfi - -#download the root input file -wget --no-check-certificate https://login-1.hep.caltech.edu/~jpata/particleflow/2020-07/TTbar_14TeV_TuneCUETP8M1_cfi/pfntuple_1.root -cd ../.. 
- -mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/raw -mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/processed - -#generate pickle data files from root -python3 ../mlpf/data/postprocessing2.py --input data/TTbar_14TeV_TuneCUETP8M1_cfi/pfntuple_1.root \ - --events-per-file 1 --outpath data/TTbar_14TeV_TuneCUETP8M1_cfi/raw --save-normalized-table - -#generate pytorch data files -python3 ../mlpf/pytorch/graph_data_cms.py --dataset data/TTbar_14TeV_TuneCUETP8M1_cfi \ - --processed_dir data/TTbar_14TeV_TuneCUETP8M1_cfi/processed --num-files-merge 1 --num-proc 1 - -#run the pytorch training -COMET_API_KEY="bla" python3 ../mlpf/pytorch/train_end2end_cms.py \ - --dataset data/TTbar_14TeV_TuneCUETP8M1_cfi --space_dim 2 --n_train 3 \ - --n_val 2 --model PFNet7 --convlayer gravnet-radius --convlayer2 sgconv \ - --lr 0.0001 --hidden_dim 32 --n_epochs 2 --l1 1.0 --l2 0.001 --target cand \ - --batch_size 1 --dropout 0.2 --disable_comet - -# #generate dataframe with predictions from the pytorch model -python3 ../mlpf/pytorch/eval_end2end_cms.py --dataset data/TTbar_14TeV_TuneCUETP8M1_cfi \ - --path data/PFNet* --model PFNet7 --start 3 --stop 5 --epoch 1 - -export OUTFILE=`find data -name df.pkl.bz2 | head -n1` -du $OUTFILE -python3 ../mlpf/plotting/plots_cms.py --pkl $OUTFILE --target cand diff --git a/scripts/local_test_cms_pytorch.sh b/scripts/local_test_cms_pytorch.sh new file mode 100755 index 000000000..bbb6968ea --- /dev/null +++ b/scripts/local_test_cms_pytorch.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# set -e +# +# rm -Rf test_tmp +# mkdir test_tmp +cd test_tmp + +# mkdir -p experiments +# mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi +# cd data/TTbar_14TeV_TuneCUETP8M1_cfi +# +# #download the root input file +# wget --no-check-certificate https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_1.root +# cd ../.. +# +# mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/raw +# mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/processed +# +# #generate pickle data files from root +# python3 ../../mlpf/data/postprocessing2.py --input data/TTbar_14TeV_TuneCUETP8M1_cfi/pfntuple_1.root \ +# --events-per-file 1 --outpath data/TTbar_14TeV_TuneCUETP8M1_cfi/raw --save-normalized-table + +#generate pytorch data files +python3 ../../mlpf/pytorch_cms/graph_data_delphes.py --dataset data/TTbar_14TeV_TuneCUETP8M1_cfi \ + --processed_dir data/TTbar_14TeV_TuneCUETP8M1_cfi/processed --num-files-merge 1 --num-proc 1 +# +# #run the pytorch training +# echo Beginning the training.. 
+# python3 pipeline_cms.py \ +# --n_epochs=10 --n_train=1 --n_valid=1 --n_test=1 --batch_size=4 \ +# --dataset='../../test_tmp/data/TTbar_14TeV_TuneCUETP8M1_cfi' \ +# --outpath='../../test_tmp/experiments' From 56b970e40d66e432db03f66632557ccb0b33ae1e Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Aug 2021 14:52:29 -0700 Subject: [PATCH 083/157] uncomment args --- mlpf/pytorch_pipeline.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mlpf/pytorch_pipeline.py b/mlpf/pytorch_pipeline.py index 6de5264a4..6a28e6922 100644 --- a/mlpf/pytorch_pipeline.py +++ b/mlpf/pytorch_pipeline.py @@ -67,21 +67,21 @@ def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_si if __name__ == "__main__": - # args = pytorch_delphes.parse_args() - - # the next part initializes some args values (to run the script not from terminal) - class objectview(object): - def __init__(self, d): - self.__dict__ = d - - args = objectview({'train': True, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 2, 'patience': 100, 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, - 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../test_tmp_delphes/data/pythia8_qcd', - 'outpath': '../prp/models/yee/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, - 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, 'overwrite': True, - 'load': False, 'load_epoch': 1, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_2_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', - 'classification_only': False, 'nn1': True, 'nn3': True, 'encoding_of_clusters': False, 'embedding_dim': 0, 'nn0track': False, 'nn0cluster': False, 'title': 'noembeddings', - 'make_predictions_train': False, 'make_plots_train': False, 'make_predictions_valid': False, 'make_plots_valid': False, 'make_predictions_test': True, 'make_plots_test': True, - 'optimized': False}) + args = pytorch_delphes.parse_args() + + # # the next part initializes some args values (to run the script not from terminal) + # class objectview(object): + # def __init__(self, d): + # self.__dict__ = d + # + # args = objectview({'train': True, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 2, 'patience': 100, 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, + # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../test_tmp_delphes/data/pythia8_qcd', + # 'outpath': '../prp/models/yee/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, + # 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, 'overwrite': True, + # 'load': False, 'load_epoch': 1, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_2_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + # 'classification_only': False, 'nn1': True, 'nn3': True, 'encoding_of_clusters': False, 'embedding_dim': 0, 'nn0track': False, 'nn0cluster': False, 'title': 'noembeddings', + # 'make_predictions_train': False, 'make_plots_train': False, 'make_predictions_valid': False, 'make_plots_valid': False, 'make_predictions_test': True, 'make_plots_test': True, + # 'optimized': False}) # define the dataset (assumes the data exists as .pt files in "processed") print('Processing the data..') From 7b64bead3d1c58cfeffd28314a2728a6337efdb7 Mon Sep 17 00:00:00 2001 From: Farouk Date: Mon, 23 Aug 2021 15:07:35 -0700 Subject: [PATCH 084/157] fix github 
check/build --- mlpf/pytorch_delphes/graph_data_delphes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlpf/pytorch_delphes/graph_data_delphes.py b/mlpf/pytorch_delphes/graph_data_delphes.py index 93a533b00..b4e168465 100644 --- a/mlpf/pytorch_delphes/graph_data_delphes.py +++ b/mlpf/pytorch_delphes/graph_data_delphes.py @@ -10,6 +10,8 @@ import pickle import multiprocessing +import pytorch_delphes + # assumes pkl files exist in /test_tmp_delphes/data/pythia8_ttbar/raw # they are processed and saved as pt files in /test_tmp_delphes/data/pythia8_ttbar/processed # PFGraphDataset -> returns for 1 event: Data(x=[5139, 12], ycand=[5139, 6], ycand_id=[5139, 6], ygen=[5139, 6], ygen_id=[5139, 6]) @@ -142,7 +144,7 @@ def __getitem__(self, idx): if __name__ == "__main__": - args = parse_args() + args = pytorch_delphes.parse_args() pfgraphdataset = PFGraphDataset(root=args.dataset) From f13718f699a266d6cf4ad420a21867b412e860ab Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 24 Aug 2021 15:38:12 +0200 Subject: [PATCH 085/157] feat: log nvidia-smi info to csv file --- mlpf/flatiron/raytune.sh | 6 +++--- mlpf/flatiron/start-head.sh | 2 ++ mlpf/flatiron/start-worker.sh | 2 ++ mlpf/pipeline.py | 8 ++++++++ mlpf/tfmodel/utils.py | 22 ++++++++++++---------- parameters/cms-gnn-dense.yaml | 2 +- 6 files changed, 28 insertions(+), 14 deletions(-) diff --git a/mlpf/flatiron/raytune.sh b/mlpf/flatiron/raytune.sh index 06ab48580..e6abd033f 100755 --- a/mlpf/flatiron/raytune.sh +++ b/mlpf/flatiron/raytune.sh @@ -6,7 +6,7 @@ #SBATCH -p gpu #SBATCH --constraint=a100,sxm4 #SBATCH --gpus-per-task=4 -#SBATCH --cpus-per-task=16 +#SBATCH --cpus-per-task=64 # Job name #SBATCH -J raytune @@ -48,7 +48,7 @@ export ip_head echo "IP Head: $ip_head" echo "STARTING HEAD at $node_1" -srun --nodes=1 --ntasks=1 -w $node_1 mlpf/flatiron/start-head.sh $ip & +srun --nodes=1 --ntasks=1 -w $node_1 mlpf/flatiron/start-head.sh $ip $SLURM_JOB_ID $2 & sleep 30 worker_num=$(($SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node @@ -56,7 +56,7 @@ for (( i=1; i<=$worker_num; i++ )) do node_i=${nodes_array[$i]} echo "STARTING WORKER $i at $node_i" - srun --nodes=1 --ntasks=1 -w $node_i mlpf/flatiron/start-worker.sh $ip_head & + srun --nodes=1 --ntasks=1 -w $node_i mlpf/flatiron/start-worker.sh $ip_head $SLURM_JOB_ID $i $2 & sleep 5 done ############################################################################################## diff --git a/mlpf/flatiron/start-head.sh b/mlpf/flatiron/start-head.sh index 59f8cdc24..0af43132c 100755 --- a/mlpf/flatiron/start-head.sh +++ b/mlpf/flatiron/start-head.sh @@ -5,5 +5,7 @@ export LANG=C.UTF-8 echo "starting ray head node" # Launch the head node +mkdir -p "/mnt/ceph/users/ewulff/nvidia_smi_logs/$3_$2" +nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/$3_$2/head.csv" & ray start --head --node-ip-address=$1 --port=6379 sleep infinity diff --git a/mlpf/flatiron/start-worker.sh b/mlpf/flatiron/start-worker.sh index ce7a6d009..f90bdb2c0 100755 --- a/mlpf/flatiron/start-worker.sh +++ b/mlpf/flatiron/start-worker.sh @@ -4,5 +4,7 @@ export LC_ALL=C.UTF-8 export LANG=C.UTF-8 echo "starting ray worker node" +mkdir -p "/mnt/ceph/users/ewulff/nvidia_smi_logs/$4_$2" +nvidia-smi 
--query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/$4_$2/worker_$3.csv" & ray start --address $1 sleep infinity diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index ff0d94172..7a158a8b9 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -12,6 +12,8 @@ from tqdm import tqdm import shutil from functools import partial +import shlex +import subprocess import tensorflow as tf from tensorflow.keras import mixed_precision @@ -95,6 +97,9 @@ def train(config, weights, ntrain, ntest, recreate, prefix): # Decide tf.distribute.strategy depending on number of available GPUs strategy, maybe_global_batch_size = get_strategy(global_batch_size) + if "CPU" not in strategy.extended.worker_devices[0]: + nvidia_smi_call = "nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f {}/nvidia_smi_log.csv".format(outdir) + p = subprocess.Popen(shlex.split(nvidia_smi_call)) # If using more than 1 GPU, we scale the batch size by the number of GPUs if maybe_global_batch_size is not None: global_batch_size = maybe_global_batch_size @@ -182,6 +187,9 @@ def train(config, weights, ntrain, ntest, recreate, prefix): freeze_model(model, config, outdir) + if "CPU" not in strategy.extended.worker_devices[0]: + p.terminate() + @main.command() @click.help_option("-h", "--help") diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 5830cb374..baf00952b 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -89,17 +89,19 @@ def delete_all_but_best_checkpoint(train_dir, dry_run): def get_strategy(global_batch_size): - try: - gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")] + gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "-1").split(",")] + if gpus[0] == -1: + num_gpus = 0 + else: num_gpus = len(gpus) - print("num_gpus=", num_gpus) - if num_gpus > 1: - strategy = tf.distribute.MirroredStrategy() - global_batch_size = num_gpus * global_batch_size - else: - strategy = tf.distribute.OneDeviceStrategy("gpu:0") - except Exception as e: - print("fallback to CPU", e) + print("num_gpus=", num_gpus) + if num_gpus > 1: + strategy = tf.distribute.MirroredStrategy() + global_batch_size = num_gpus * global_batch_size + elif num_gpus == 1: + strategy = tf.distribute.OneDeviceStrategy("gpu:0") + elif num_gpus == 0: + print("fallback to CPU") strategy = tf.distribute.OneDeviceStrategy("cpu") num_gpus = 0 return strategy, global_batch_size diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index 7a9d53ad6..e691827dd 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -126,7 +126,7 @@ hypertune: raytune: local_dir: - sched: "asha" + sched: "asha" # asha, hyperband parameters: # optimizer parameters lr: [1e-4] From ce899c36c3220593c0d28207836cb4dd68aca986 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 24 Aug 2021 15:38:45 +0200 Subject: [PATCH 086/157] feat: plot GPU util from nvidia-smi csv-file --- scripts/plot_nvidiasmi_csv.py | 56 +++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 scripts/plot_nvidiasmi_csv.py diff --git a/scripts/plot_nvidiasmi_csv.py b/scripts/plot_nvidiasmi_csv.py new file mode 100644 index 
000000000..a6570f15b --- /dev/null +++ b/scripts/plot_nvidiasmi_csv.py @@ -0,0 +1,56 @@ +import matplotlib.pyplot as plt +import pandas as pd +from pathlib import Path +import numpy as np +from datetime import datetime +import time + +def parse_args(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--dir", type=str, default="parameters/delphes-gnn-skipconn.yaml", help="dir containing csv files") + args = parser.parse_args() + return args + + +def plot_gpu_util(df, cuda_device): + plt.figure(figsize=(12,9)) + plt.plot(df["time"], df["GPU{}_util".format(cuda_device)], alpha=0.8) + plt.xlabel("Time [s]") + plt.ylabel("GPU utilization [%]") + plt.title("GPU{}".format(cuda_device)) + plt.grid(alpha=0.3) + + +def plot_gpu_util(df, cuda_device, ax): + ax.plot(df["time"], df["GPU{}_util".format(cuda_device)], alpha=0.8) + ax.set_xlabel("Time [s]") + ax.set_ylabel("GPU utilization [%]") + ax.set_title("GPU{}".format(cuda_device)) + ax.grid(alpha=0.3) + + +if __name__ == "__main__": + args = parse_args() + csv_files = list(Path(args.dir).glob("*.csv")) + + for file in csv_files: + print(file) + df = pd.read_csv(str(file)) + start_time = df["timestamp"].iloc[0] + start_t = datetime.strptime(start_time, "%Y/%m/%d %H:%M:%S.%f").timestamp() + dfs = [] + for ii, gpu in enumerate(np.unique(df[" pci.bus_id"].values)): + dfs.append(pd.DataFrame({ + "GPU{}_util".format(ii): df[df[" pci.bus_id"] == " 00000000:31:00.0"][" utilization.gpu [%]"].map(lambda x: int(x.split(" ")[1])), + "time": df[df[" pci.bus_id"] == " 00000000:31:00.0"]["timestamp"].map(lambda x: datetime.strptime(x, "%Y/%m/%d %H:%M:%S.%f").timestamp() - start_t), + }).dropna()) + + fig, axs = plt.subplots(2, 2, figsize=(12,9), tight_layout=True) + for ax in axs.flat: + ax.label_outer() + + for cuda_device, (df, ax) in enumerate(zip(dfs, axs.flat)): + plot_gpu_util(df, cuda_device, ax) + plt.suptitle("{}".format(file.stem)) + plt.savefig(args.dir + "/{}_gpu_util.jpg".format(file.stem)) \ No newline at end of file From 4adbf3ac581518f3374abe45cf5dc67b22cf96f7 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 24 Aug 2021 16:45:36 +0300 Subject: [PATCH 087/157] readd comet --- mlpf/pipeline.py | 29 +++++++++++++++++--- mlpf/tfmodel/model.py | 31 +++++++++++++--------- mlpf/tfmodel/model_setup.py | 53 +++++++++++++++++++++++-------------- mlpf/tfmodel/utils.py | 6 ++--- parameters/cms.yaml | 4 ++- 5 files changed, 83 insertions(+), 40 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 086142873..1f6fcdd38 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -1,3 +1,4 @@ +import comet_ml import sys import os import yaml @@ -91,6 +92,22 @@ def data(config): @click.option("-p", "--prefix", default="", help="prefix to put at beginning of training dir name", type=str) @click.option("--plot-freq", default=1, help="Plot detailed validation every N epochs", type=int) def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq): + + try: + from comet_ml import Experiment + experiment = Experiment( + project_name="particleflow-tf", + auto_metric_logging=True, + auto_param_logging=True, + auto_histogram_weight_logging=True, + auto_histogram_gradient_logging=False, + auto_histogram_activation_logging=False, + ) + except Exception as e: + print("Failed to initialize comet-ml dashboard") + experiment = None + + """Train a model defined by config""" config_file_path = config config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = parse_config( @@ -111,6 +128,11 @@ def 
train(config, weights, ntrain, ntest, recreate, prefix, plot_freq): outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node()) else: outdir = str(Path(weights).parent) + if experiment: + experiment.set_name(outdir) + experiment.log_code("mlpf/tfmodel/model.py") + experiment.log_code("mlpf/tfmodel/utils.py") + shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference total_steps = n_epochs * n_train // global_batch_size @@ -142,7 +164,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq): initial_epoch = int(weights.split("/")[-1].split("-")[1]) model(tf.cast(X_val[:1], model_dtype)) - config = set_config_loss(config, config["setup"]["trainable"]) + #config = set_config_loss(config, config["setup"]["trainable"]) configure_model_weights(model, config["setup"]["trainable"]) model(tf.cast(X_val[:1], model_dtype)) @@ -179,7 +201,8 @@ def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq): dataset_transform, config["dataset"]["num_output_classes"], dataset_def, - plot_freq + plot_freq, + experiment ) callbacks.append(optim_callbacks) @@ -292,7 +315,7 @@ def find_lr(config, outdir, figname, logscale): model_dtype = tf.dtypes.float32 model = make_model(config, model_dtype) - config = set_config_loss(config, config["setup"]["trainable"]) + #config = set_config_loss(config, config["setup"]["trainable"]) # Run model once to build the layers model(tf.cast(X_val[:1], model_dtype)) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 96954438e..963de07d7 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -232,8 +232,13 @@ def call(self, inputs): return self.activation(self.ffn(x2)) def point_wise_feed_forward_network(d_model, dff, name, num_layers=1, activation='elu', dtype=tf.dtypes.float32, dim_decrease=False, dropout=0.0): - bias_regularizer = tf.keras.regularizers.L1(regularizer_weight) - kernel_regularizer = tf.keras.regularizers.L1(regularizer_weight) + + if regularizer_weight > 0: + bias_regularizer = tf.keras.regularizers.L1(regularizer_weight) + kernel_regularizer = tf.keras.regularizers.L1(regularizer_weight) + else: + bias_regularizer = None + kernel_regularizer = None layers = [] for ilayer in range(num_layers): @@ -370,7 +375,7 @@ def call(self, x_msg, x_node, msk): return bins_split, x_features_binned, dm, msk_f_binned -class OutputDecoding(tf.keras.layers.Layer): +class OutputDecoding(tf.keras.Model): def __init__(self, activation, hidden_dim, regression_use_classification, num_output_classes, schema, dropout, **kwargs): super(OutputDecoding, self).__init__(**kwargs) @@ -417,7 +422,7 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou self.ffn_energy = point_wise_feed_forward_network( 2, hidden_dim, "ffn_energy", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, + dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True, dropout=dropout ) @@ -447,12 +452,12 @@ def call(self, X_input, X_encoded, msk_input): orig_energy = X_input[:, :, 5:6] if self.regression_use_classification: - X_encoded = tf.concat([X_encoded, out_id_logits], axis=-1) + X_encoded = tf.concat([X_encoded, out_id_softmax], axis=-1) - pred_eta_corr = self.ffn_eta(X_encoded) - pred_phi_corr = self.ffn_phi(X_encoded) - pred_energy_corr = self.ffn_energy(X_encoded) - pred_pt_corr = self.ffn_pt(X_encoded) + pred_eta_corr = self.ffn_eta(X_encoded)*msk_input + pred_phi_corr = 
self.ffn_phi(X_encoded)*msk_input + pred_energy_corr = self.ffn_energy(X_encoded)*msk_input + pred_pt_corr = self.ffn_pt(X_encoded)*msk_input eta_sigmoid = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) pred_eta = orig_eta*eta_sigmoid + (1.0 - eta_sigmoid)*pred_eta_corr[:, :, 1:2] @@ -462,12 +467,12 @@ def call(self, X_input, X_encoded, msk_input): pred_sin_phi = orig_sin_phi*sin_phi_sigmoid + (1.0 - sin_phi_sigmoid)*pred_phi_corr[:, :, 1:2] pred_cos_phi = orig_cos_phi*cos_phi_sigmoid + (1.0 - cos_phi_sigmoid)*pred_phi_corr[:, :, 3:4] - energy_sigmoid = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) - pred_energy = orig_energy*energy_sigmoid + (1.0 - energy_sigmoid)*tf.exp(tf.clip_by_value(pred_energy_corr[:, :, 1:2], -6, 6)) + #energy_sigmoid = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) + pred_energy = pred_energy_corr[:, :, 0:1] + pred_energy_corr[:, :, 1:2]*orig_energy - orig_pt = tf.stop_gradient(pred_energy / tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8))) + orig_pt = tf.stop_gradient(pred_energy - tf.math.log(tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8)))) pt_sigmoid = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) - pred_pt = orig_pt*pt_sigmoid + (1.0 - pt_sigmoid)*tf.exp(tf.clip_by_value(pred_pt_corr[:, :, 1:2], -6, 6)) + pred_pt = orig_pt + pt_sigmoid*pred_pt_corr[:, :, 1:2] ret = { "cls": out_id_softmax, diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 17f1a723b..c3ccde269 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -59,16 +59,18 @@ def plot_to_image(figure): return image class CustomCallback(tf.keras.callbacks.Callback): - def __init__(self, dataset_def, outpath, X, y, dataset_transform, num_output_classes, plot_freq=1): + def __init__(self, dataset_def, outpath, X, y, dataset_transform, num_output_classes, plot_freq=1, comet_experiment=None): super(CustomCallback, self).__init__() self.X = X self.y = y self.plot_freq = plot_freq + self.comet_experiment = comet_experiment self.dataset_def = dataset_def #transform the prediction target from an array into a dictionary for easier access self.ytrue = dataset_transform(self.X, self.y, None)[1] + self.ytrue = {k: np.array(v) for k, v in self.ytrue.items()} self.ytrue_id = np.argmax(self.ytrue["cls"], axis=-1) self.outpath = outpath @@ -97,7 +99,7 @@ def __init__(self, dataset_def, outpath, X, y, dataset_transform, num_output_cla "energy": np.linspace(0,1000,100), } - def plot_cm(self, outpath, ypred_id, msk): + def plot_cm(self, epoch, outpath, ypred_id, msk): ytrue_id_flat = self.ytrue_id[msk].astype(np.int64).flatten() ypred_id_flat = ypred_id[msk].flatten() @@ -106,6 +108,11 @@ def plot_cm(self, outpath, ypred_id, msk): ytrue_id_flat, ypred_id_flat, labels=list(range(self.num_output_classes)), normalize="true" ) + if self.comet_experiment: + self.comet_experiment.log_confusion_matrix( + file_name="confusion-matrix-epoch{}.json".format(epoch), matrix=cm, epoch=epoch + ) + figure = plot_confusion_matrix(cm) acc = sklearn.metrics.accuracy_score( @@ -144,7 +151,7 @@ def plot_event_visualization(self, outpath, ypred, ypred_id, msk, ievent=0): sphi = ypred["sin_phi"][ievent][msk] cphi = ypred["cos_phi"][ievent][msk] phi = np.arctan2(sphi, cphi) - energy = ypred["energy"][ievent][msk] + energy = np.exp(np.clip(ypred["energy"][ievent][msk], -6, 6)) - 1.0 pdgid = ypred_id[ievent][msk] plt.scatter(eta, phi, marker="o", s=energy, c=[self.color_map[p] for p in pdgid], alpha=0.5, linewidths=0) plt.xlim(-8,8) @@ -158,7 +165,7 @@ def 
plot_event_visualization(self, outpath, ypred, ypred_id, msk, ievent=0): sphi = self.ytrue["sin_phi"][ievent][msk] cphi = self.ytrue["cos_phi"][ievent][msk] phi = np.arctan2(sphi, cphi) - energy = self.ytrue["energy"][ievent][msk] + energy = np.exp(np.clip(self.ytrue["energy"][ievent][msk], -6, 6)) - 1.0 pdgid = self.ytrue_id[ievent][msk] plt.scatter(eta, phi, marker="o", s=energy, c=[self.color_map[p] for p in pdgid], alpha=0.5, linewidths=0) plt.xlim(-8,8) @@ -203,7 +210,7 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, lo #FIXME: propagate from configuration if reg_variable == "energy" or reg_variable == "pt": - delta = 1.0 + delta = 0.1 else: delta = 0.1 @@ -217,7 +224,7 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, lo vals_true = np.log(vals_true) s = "_log" - plt.scatter(vals_pred, vals_true, marker=".", alpha=0.8, s=loss_vals) + plt.scatter(vals_pred, vals_true, marker=".", alpha=0.8, s=(2.0 +loss_vals)) if len(vals_true) > 0: minval = np.min(vals_true) maxval = np.max(vals_true) @@ -229,7 +236,11 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, lo plt.xlabel("predicted") plt.ylabel("true") plt.title("{}, L={:.4f}".format(reg_variable, np.sum(loss_vals))) - plt.savefig(str(outpath / "{}_cls{}_corr{}.png".format(reg_variable, icls, s)), bbox_inches="tight") + image_path = str(outpath / "{}_cls{}_corr{}.png".format(reg_variable, icls, s)) + plt.savefig(image_path, bbox_inches="tight") + + if self.comet_experiment: + self.comet_experiment.log_image(image_path, step=epoch) plt.close("all") #Also plot the residuals, as we have the true and predicted values already available here @@ -240,12 +251,16 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, lo plt.hist(residual, bins=100) plt.xlabel("true - pred") plt.title("{} residual, m={:.4f} s={:.4f}".format(reg_variable, np.mean(residual), np.std(residual))) - plt.savefig(str(outpath / "{}_residual{}.png".format(reg_variable, s)), bbox_inches="tight") + + image_path = str(outpath / "{}{}_cls{}_residual.png".format(reg_variable, s, icls)) + plt.savefig(image_path, bbox_inches="tight") + if self.comet_experiment: + self.comet_experiment.log_image(image_path, step=epoch) plt.close("all") - # FIXME: for some reason, these don't end up on the tensorboard - # tf.summary.scalar('residual_{}{}_mean'.format(reg_variable, s), data=np.mean(residual), step=epoch) - # tf.summary.scalar('residual_{}{}_std'.format(reg_variable, s), data=np.std(residual), step=epoch) + if self.comet_experiment: + self.comet_experiment.log_metric('residual_{}{}_cls{}_mean'.format(reg_variable, s, icls), np.mean(residual), step=epoch) + self.comet_experiment.log_metric('residual_{}{}_cls{}_std'.format(reg_variable, s, icls), np.std(residual), step=epoch) def on_epoch_end(self, epoch, logs=None): @@ -269,7 +284,7 @@ def on_epoch_end(self, epoch, logs=None): #exclude padded elements from the plotting msk = self.X[:, :, 0] != 0 - self.plot_cm(cp_dir, ypred_id, msk) + self.plot_cm(epoch, cp_dir, ypred_id, msk) for ievent in range(min(5, self.X.shape[0])): self.plot_event_visualization(cp_dir, ypred, ypred_id, msk, ievent=ievent) @@ -279,12 +294,12 @@ def on_epoch_end(self, epoch, logs=None): for variable in ["pt", "eta", "sin_phi", "cos_phi", "energy"]: self.plot_reg_distribution(cp_dir_cls, ypred, ypred_id, msk, icls, variable) self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, msk, icls, variable) - self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, 
msk, icls, "energy", log=True) - self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, msk, icls, "pt", log=True) + #self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, msk, icls, "energy", log=True) + #self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, msk, icls, "pt", log=True) np.savez(str(cp_dir/"pred.npz"), X=self.X, ytrue=self.y, **ypred) -def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output_classes, dataset_def, plot_freq=1): +def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output_classes, dataset_def, plot_freq=1, comet_experiment=None): callbacks = [] tb = CustomTensorBoard( log_dir=outdir + "/tensorboard_logs", histogram_freq=1, write_graph=False, write_images=False, @@ -311,7 +326,7 @@ def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output history_path = Path(outdir) / "history" history_path.mkdir(parents=True, exist_ok=True) history_path = str(history_path) - cb = CustomCallback(dataset_def, history_path, X_val, y_val, dataset_transform, num_output_classes, plot_freq=plot_freq) + cb = CustomCallback(dataset_def, history_path, X_val, y_val, dataset_transform, num_output_classes, plot_freq=plot_freq, comet_experiment=comet_experiment) cb.set_model(model) callbacks += [cb] @@ -497,14 +512,12 @@ def on_epoch_end(self, epoch, numpy_logs): def configure_model_weights(model, trainable_layers): print("setting trainable layers: {}".format(trainable_layers)) + if (trainable_layers is None): trainable_layers = "all" + if trainable_layers == "all": model.trainable = True - elif trainable_layers == "classification": - model.set_trainable_classification() - elif trainable_layers == "regression": - model.set_trainable_regression() else: if isinstance(trainable_layers, str): trainable_layers = [trainable_layers] diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 5b0b65331..b3733e537 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -153,7 +153,7 @@ def compute_weights_none(X, y, w): def make_weight_function(config): def weight_func(X,y,w): - w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0) + w_signal_only = tf.where(y[:, 0]==0, 0.0, tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)) w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32) w_none = tf.ones_like(w) @@ -183,11 +183,11 @@ def func(X, y, w): { "cls": tf.one_hot(tf.cast(y[:, :, 0], tf.int32), num_output_classes), "charge": y[:, :, 1:2], - "pt": y[:, :, 2:3], + "pt": tf.math.log(y[:, :, 2:3] + 1.0), "eta": y[:, :, 3:4], "sin_phi": y[:, :, 4:5], "cos_phi": y[:, :, 5:6], - "energy": y[:, :, 6:7], + "energy": tf.math.log(y[:, :, 6:7] + 1.0), }, w, ) diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 6d4e1bb4d..de6fa5f5f 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -36,8 +36,10 @@ dataset: validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: type: Huber + delta: 0.1 pt_loss: type: Huber + delta: 0.1 sin_phi_loss: type: Huber delta: 0.1 @@ -83,7 +85,7 @@ parameters: hidden_dim: 256 bin_size: 320 distance_dim: 128 - dropout: 0.0 + dropout: 0.2 graph_kernel: type: NodePairGaussianKernel dist_mult: 0.1 From bea6599ca50012f1738a867616a019619405491b Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 24 Aug 2021 16:45:49 +0300 Subject: [PATCH 088/157] updat epochs --- parameters/cms.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parameters/cms.yaml b/parameters/cms.yaml index de6fa5f5f..f4ee7407f 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -61,7 +61,7 
@@ setup: batch_size: 4 num_events_train: 80000 num_events_test: 10000 - num_epochs: 50 + num_epochs: 100 num_val_files: 10 dtype: float32 trainable: From 160db531c7d2f2772621451bfa99b1e456c02119 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 24 Aug 2021 17:06:37 +0300 Subject: [PATCH 089/157] comet optional --- mlpf/pipeline.py | 7 ++++++- parameters/cms-dev.yaml | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 1f6fcdd38..87f9f9b5e 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -1,4 +1,8 @@ -import comet_ml +try: + import comet_ml +except ModuleNotFoundError as e: + print("comet_ml not found, ignoring") + import sys import os import yaml @@ -132,6 +136,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq): experiment.set_name(outdir) experiment.log_code("mlpf/tfmodel/model.py") experiment.log_code("mlpf/tfmodel/utils.py") + experiment.log_code(config_file_path) shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 35331e008..579ba3282 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -36,8 +36,10 @@ dataset: validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: type: Huber + delta: 0.1 pt_loss: type: Huber + delta: 0.1 sin_phi_loss: type: Huber delta: 0.1 From 6671735d66d250b489be67cc88cbf330ee61e602 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 24 Aug 2021 17:22:16 +0300 Subject: [PATCH 090/157] fix model saving --- mlpf/tfmodel/model.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 963de07d7..1e8fea053 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -432,11 +432,13 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou X_encoded_reg: (n_batch, n_elements, n_encoded_features) msk_input: (n_batch, n_elements) boolean mask """ - def call(self, X_input, X_encoded, msk_input): + def call(self, args, training=False): - out_id_logits = self.ffn_id(X_encoded)*msk_input + X_input, X_encoded, msk_input = args + + out_id_logits = self.ffn_id(X_encoded, training)*msk_input out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) - out_charge = self.ffn_charge(X_encoded)*msk_input + out_charge = self.ffn_charge(X_encoded, training)*msk_input #orig_pt = X_input[:, :, 1:2] orig_eta = X_input[:, :, 2:3] @@ -454,10 +456,10 @@ def call(self, X_input, X_encoded, msk_input): if self.regression_use_classification: X_encoded = tf.concat([X_encoded, out_id_softmax], axis=-1) - pred_eta_corr = self.ffn_eta(X_encoded)*msk_input - pred_phi_corr = self.ffn_phi(X_encoded)*msk_input - pred_energy_corr = self.ffn_energy(X_encoded)*msk_input - pred_pt_corr = self.ffn_pt(X_encoded)*msk_input + pred_eta_corr = self.ffn_eta(X_encoded, training)*msk_input + pred_phi_corr = self.ffn_phi(X_encoded, training)*msk_input + pred_energy_corr = self.ffn_energy(X_encoded, training)*msk_input + pred_pt_corr = self.ffn_pt(X_encoded, training)*msk_input eta_sigmoid = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) pred_eta = orig_eta*eta_sigmoid + (1.0 - eta_sigmoid)*pred_eta_corr[:, :, 1:2] @@ -639,7 +641,7 @@ def call(self, inputs, training=False): if self.debug: debugging_data["dec_output"] = dec_output - ret = self.output_dec(X, dec_output, msk_input) + ret = self.output_dec([X, dec_output, 
msk_input], training) if self.debug: for k in debugging_data.keys(): From bedebfa488fe553d97b7805482b18897be509b8d Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 24 Aug 2021 16:48:05 +0200 Subject: [PATCH 091/157] fix: bug in plotting of GPU util --- scripts/plot_nvidiasmi_csv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/plot_nvidiasmi_csv.py b/scripts/plot_nvidiasmi_csv.py index a6570f15b..d539ffb75 100644 --- a/scripts/plot_nvidiasmi_csv.py +++ b/scripts/plot_nvidiasmi_csv.py @@ -42,8 +42,8 @@ def plot_gpu_util(df, cuda_device, ax): dfs = [] for ii, gpu in enumerate(np.unique(df[" pci.bus_id"].values)): dfs.append(pd.DataFrame({ - "GPU{}_util".format(ii): df[df[" pci.bus_id"] == " 00000000:31:00.0"][" utilization.gpu [%]"].map(lambda x: int(x.split(" ")[1])), - "time": df[df[" pci.bus_id"] == " 00000000:31:00.0"]["timestamp"].map(lambda x: datetime.strptime(x, "%Y/%m/%d %H:%M:%S.%f").timestamp() - start_t), + "GPU{}_util".format(ii): df[df[" pci.bus_id"] == gpu][" utilization.gpu [%]"].map(lambda x: int(x.split(" ")[1])), + "time": df[df[" pci.bus_id"] == gpu]["timestamp"].map(lambda x: datetime.strptime(x, "%Y/%m/%d %H:%M:%S.%f").timestamp() - start_t), }).dropna()) fig, axs = plt.subplots(2, 2, figsize=(12,9), tight_layout=True) From e2a9878dd5e19aadeee8c676adde46163572efe1 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 24 Aug 2021 22:34:09 +0300 Subject: [PATCH 092/157] update delphes --- parameters/delphes.yaml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index a679ce39e..a79800c95 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -23,9 +23,11 @@ dataset: num_files_per_chunk: 5 validation_file_path: data/pythia8_qcd/val/*.pkl.bz2 energy_loss: - type: MeanSquaredLogarithmicError + type: Huber + delta: 0.1 pt_loss: - type: MeanSquaredLogarithmicError + type: Huber + delta: 0.1 sin_phi_loss: type: Huber delta: 0.1 @@ -67,21 +69,21 @@ parameters: model: gnn_dense input_encoding: default activation: elu - layernorm: no + layernorm: yes hidden_dim: 256 - bin_size: 640 + bin_size: 320 distance_dim: 128 dropout: 0.0 graph_kernel: type: NodePairGaussianKernel dist_mult: 0.1 - clip_value_low: 0.1 - num_graph_layers: 2 + clip_value_low: 0.0 + num_graph_layers: 6 node_message: type: GHConvDense - output_dim: 128 + output_dim: 256 activation: elu - num_node_messages: 2 + num_node_messages: 1 skip_connection: yes regression_use_classification: yes debug: no From 14a0003d36e6d2c2558a3e2fda42396019b05e2d Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 24 Aug 2021 22:35:04 +0300 Subject: [PATCH 093/157] up --- parameters/cms.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parameters/cms.yaml b/parameters/cms.yaml index f4ee7407f..dab00e039 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -57,7 +57,7 @@ setup: train: yes weights: weights_config: - lr: 1e-4 + lr: 1e-3 batch_size: 4 num_events_train: 80000 num_events_test: 10000 @@ -85,7 +85,7 @@ parameters: hidden_dim: 256 bin_size: 320 distance_dim: 128 - dropout: 0.2 + dropout: 0.0 graph_kernel: type: NodePairGaussianKernel dist_mult: 0.1 From 2598800e87c25dbd0e6b4758e1cf5021a8067f51 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Wed, 25 Aug 2021 11:18:42 +0300 Subject: [PATCH 094/157] work on improving energy regression --- mlpf/tfmodel/model.py | 22 ++++++++------- mlpf/tfmodel/model_setup.py | 53 +++++++++++++++++++++++++++---------- 
parameters/cms-dev.yaml | 10 +++---- parameters/cms.yaml | 10 +++---- 4 files changed, 62 insertions(+), 33 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 1e8fea053..2ed546721 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -384,7 +384,7 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou self.dropout = dropout self.ffn_id = point_wise_feed_forward_network( - num_output_classes, hidden_dim, + num_output_classes, hidden_dim*4, "ffn_cls", dtype=tf.dtypes.float32, num_layers=4, @@ -403,7 +403,7 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou ) self.ffn_pt = point_wise_feed_forward_network( - 2, hidden_dim, "ffn_pt", + 4, hidden_dim, "ffn_pt", dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, dropout=dropout ) @@ -421,7 +421,7 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou ) self.ffn_energy = point_wise_feed_forward_network( - 2, hidden_dim, "ffn_energy", + 4, hidden_dim*4, "ffn_energy", dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True, dropout=dropout ) @@ -458,8 +458,6 @@ def call(self, args, training=False): pred_eta_corr = self.ffn_eta(X_encoded, training)*msk_input pred_phi_corr = self.ffn_phi(X_encoded, training)*msk_input - pred_energy_corr = self.ffn_energy(X_encoded, training)*msk_input - pred_pt_corr = self.ffn_pt(X_encoded, training)*msk_input eta_sigmoid = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) pred_eta = orig_eta*eta_sigmoid + (1.0 - eta_sigmoid)*pred_eta_corr[:, :, 1:2] @@ -469,12 +467,18 @@ def call(self, args, training=False): pred_sin_phi = orig_sin_phi*sin_phi_sigmoid + (1.0 - sin_phi_sigmoid)*pred_phi_corr[:, :, 1:2] pred_cos_phi = orig_cos_phi*cos_phi_sigmoid + (1.0 - cos_phi_sigmoid)*pred_phi_corr[:, :, 3:4] - #energy_sigmoid = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) - pred_energy = pred_energy_corr[:, :, 0:1] + pred_energy_corr[:, :, 1:2]*orig_energy + X_encoded = tf.concat([X_encoded, tf.stop_gradient(pred_eta)], axis=-1) + pred_energy_corr = self.ffn_energy(X_encoded, training)*msk_input + pred_pt_corr = self.ffn_pt(X_encoded, training)*msk_input + + energy_sigmoid1 = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) + energy_sigmoid2 = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 1:2]) + pred_energy = orig_energy*(1.0 + energy_sigmoid1*pred_energy_corr[:, :, 2:3]) + energy_sigmoid2*pred_energy_corr[:, :, 3:4] orig_pt = tf.stop_gradient(pred_energy - tf.math.log(tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8)))) - pt_sigmoid = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) - pred_pt = orig_pt + pt_sigmoid*pred_pt_corr[:, :, 1:2] + pt_sigmoid1 = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) + pt_sigmoid2 = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 1:2]) + pred_pt = orig_pt*(1.0 + pt_sigmoid1*pred_pt_corr[:, :, 2:3]) + pt_sigmoid2*pred_pt_corr[:, :, 3:4] ret = { "cls": out_id_softmax, diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index c3ccde269..0d39e0009 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -92,11 +92,11 @@ def __init__(self, dataset_def, outpath, X, y, dataset_transform, num_output_cla } self.reg_bins = { - "pt": np.linspace(0, 50, 100), - "eta": np.linspace(-5, 5, 100), + "pt": np.linspace(-4, 8, 100), + "eta": np.linspace(-8, 8, 100), "sin_phi": np.linspace(-1,1,100), "cos_phi": np.linspace(-1,1,100), - "energy": 
np.linspace(0,1000,100), + "energy": np.linspace(-1, 10,100), } def plot_cm(self, epoch, outpath, ypred_id, msk): @@ -124,10 +124,14 @@ def plot_cm(self, epoch, outpath, ypred_id, msk): ypred_id_flat ) plt.title("acc={:.3f} bacc={:.3f}".format(acc, balanced_acc)) - plt.savefig(str(outpath / "cm_normed.pdf"), bbox_inches="tight") + + image_path = str(outpath / "cm_normed.png") + plt.savefig(image_path, bbox_inches="tight") plt.close("all") + if self.comet_experiment: + self.comet_experiment.log_image(image_path, step=epoch) - def plot_event_visualization(self, outpath, ypred, ypred_id, msk, ievent=0): + def plot_event_visualization(self, epoch, outpath, ypred, ypred_id, msk, ievent=0): X_eta, X_phi, X_energy = self.dataset_def.get_X_eta_phi_energy(self.X) @@ -171,8 +175,11 @@ def plot_event_visualization(self, outpath, ypred, ypred_id, msk, ievent=0): plt.xlim(-8,8) plt.ylim(-4,4) - plt.savefig(str(outpath / "event_iev{}.png".format(ievent)), bbox_inches="tight") + image_path = str(outpath / "event_iev{}.png".format(ievent)) + plt.savefig(image_path, bbox_inches="tight") plt.close("all") + if self.comet_experiment: + self.comet_experiment.log_image(image_path, step=epoch) def plot_reg_distribution(self, outpath, ypred, ypred_id, msk, icls, reg_variable): @@ -210,7 +217,7 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, lo #FIXME: propagate from configuration if reg_variable == "energy" or reg_variable == "pt": - delta = 0.1 + delta = 1.0 else: delta = 0.1 @@ -224,25 +231,42 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, lo vals_true = np.log(vals_true) s = "_log" - plt.scatter(vals_pred, vals_true, marker=".", alpha=0.8, s=(2.0 +loss_vals)) + #save correlation histogram + plt.figure() + bins = self.reg_bins[reg_variable] + plt.hist2d(vals_pred, vals_true, bins=(bins, bins), cmap="Blues") + plt.colorbar() if len(vals_true) > 0: minval = np.min(vals_true) maxval = np.max(vals_true) if not (math.isnan(minval) or math.isnan(maxval) or math.isinf(minval) or math.isinf(maxval)): plt.plot([minval, maxval], [minval, maxval], color="black", ls="--", lw=0.5) - plt.xlim(minval, maxval) - plt.ylim(minval, maxval) - plt.xlabel("predicted") plt.ylabel("true") - plt.title("{}, L={:.4f}".format(reg_variable, np.sum(loss_vals))) + plt.title("{}, particle weighted, L={:.4f}".format(reg_variable, np.sum(loss_vals))) image_path = str(outpath / "{}_cls{}_corr{}.png".format(reg_variable, icls, s)) plt.savefig(image_path, bbox_inches="tight") - if self.comet_experiment: self.comet_experiment.log_image(image_path, step=epoch) plt.close("all") + #save loss-weighted correlation histogram + plt.figure() + plt.hist2d(vals_pred, vals_true, bins=(bins, bins), weights=loss_vals, cmap="Blues") + plt.colorbar() + if len(vals_true) > 0: + minval = np.min(vals_true) + maxval = np.max(vals_true) + if not (math.isnan(minval) or math.isnan(maxval) or math.isinf(minval) or math.isinf(maxval)): + plt.plot([minval, maxval], [minval, maxval], color="black", ls="--", lw=0.5) + plt.xlabel("predicted") + plt.ylabel("true") + plt.title("{}, loss weighted, L={:.4f}".format(reg_variable, np.sum(loss_vals))) + image_path = str(outpath / "{}_cls{}_corr{}_weighted.png".format(reg_variable, icls, s)) + plt.savefig(image_path, bbox_inches="tight") + if self.comet_experiment: + self.comet_experiment.log_image(image_path, step=epoch) + #Also plot the residuals, as we have the true and predicted values already available here plt.figure() residual = vals_true - vals_pred @@ -261,6 
+285,7 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, lo if self.comet_experiment: self.comet_experiment.log_metric('residual_{}{}_cls{}_mean'.format(reg_variable, s, icls), np.mean(residual), step=epoch) self.comet_experiment.log_metric('residual_{}{}_cls{}_std'.format(reg_variable, s, icls), np.std(residual), step=epoch) + self.comet_experiment.log_metric('val_loss_{}{}_cls{}'.format(reg_variable, s, icls), np.sum(loss_vals), step=epoch) def on_epoch_end(self, epoch, logs=None): @@ -286,7 +311,7 @@ def on_epoch_end(self, epoch, logs=None): self.plot_cm(epoch, cp_dir, ypred_id, msk) for ievent in range(min(5, self.X.shape[0])): - self.plot_event_visualization(cp_dir, ypred, ypred_id, msk, ievent=ievent) + self.plot_event_visualization(epoch, cp_dir, ypred, ypred_id, msk, ievent=ievent) for icls in range(self.num_output_classes): cp_dir_cls = cp_dir / "cls_{}".format(icls) diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 579ba3282..d3cb162ab 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -36,10 +36,10 @@ dataset: validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: type: Huber - delta: 0.1 + delta: 1.0 pt_loss: type: Huber - delta: 0.1 + delta: 1.0 sin_phi_loss: type: Huber delta: 0.1 @@ -57,11 +57,11 @@ setup: train: yes weights: weights_config: - lr: 1e-4 + lr: 1e-5 batch_size: 4 num_events_train: 80000 num_events_test: 10000 - num_epochs: 50 + num_epochs: 100 num_val_files: 10 dtype: float32 trainable: @@ -81,7 +81,7 @@ parameters: model: gnn_dense input_encoding: cms activation: elu - layernorm: yes + layernorm: no hidden_dim: 256 bin_size: 32 distance_dim: 16 diff --git a/parameters/cms.yaml b/parameters/cms.yaml index dab00e039..3d8689110 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -36,10 +36,10 @@ dataset: validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 energy_loss: type: Huber - delta: 0.1 + delta: 1.0 pt_loss: type: Huber - delta: 0.1 + delta: 1.0 sin_phi_loss: type: Huber delta: 0.1 @@ -57,7 +57,7 @@ setup: train: yes weights: weights_config: - lr: 1e-3 + lr: 1e-5 batch_size: 4 num_events_train: 80000 num_events_test: 10000 @@ -81,7 +81,7 @@ parameters: model: gnn_dense input_encoding: cms activation: elu - layernorm: yes + layernorm: no hidden_dim: 256 bin_size: 320 distance_dim: 128 @@ -94,7 +94,7 @@ parameters: node_message: type: GHConvDense output_dim: 256 - activation: elu + activation: gelu normalize_degrees: yes num_node_messages: 1 skip_connection: yes From 17b547f14e891eebc00f274fbfbad28e202a2dc8 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Wed, 25 Aug 2021 13:20:01 +0300 Subject: [PATCH 095/157] added customization function --- mlpf/pipeline.py | 35 +++++++++++++++++++++++++++++++++-- mlpf/tfmodel/model.py | 2 +- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 87f9f9b5e..c97904b8c 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -65,9 +65,14 @@ def main(): @main.command() @click.help_option("-h", "--help") @click.option("-c", "--config", help="configuration file", type=click.Path()) -def data(config): +@click.option("--customize", help="customization function", type=str, default=None) +def data(config, customize): config, _, _, _, _, _, _ = parse_config(config) + + if customize: + config = customization_functions[customize](config) + cds = config["dataset"] dataset_def = Dataset( @@ -95,7 +100,8 @@ def data(config): @click.option("-r", "--recreate", 
help="force creation of new experiment dir", is_flag=True) @click.option("-p", "--prefix", default="", help="prefix to put at beginning of training dir name", type=str) @click.option("--plot-freq", default=1, help="Plot detailed validation every N epochs", type=int) -def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq): +@click.option("--customize", help="customization function", type=str, default=None) +def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq, customize): try: from comet_ml import Experiment @@ -118,6 +124,10 @@ def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq): config, ntrain, ntest, weights ) + if customize: + prefix += customize + config = customization_functions[customize](config) + # Decide tf.distribute.strategy depending on number of available GPUs strategy, maybe_global_batch_size = get_strategy(global_batch_size) # If using more than 1 GPU, we scale the batch size by the number of GPUs before the dataset is loaded @@ -126,6 +136,10 @@ def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq): dataset_def = get_dataset_def(config) ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) + + #FIXME: split up training/test and validation dataset and parameters + dataset_def.padded_num_elem_size = 6400 + X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=False) if recreate or (weights is None): @@ -356,6 +370,23 @@ def find_lr(config, outdir, figname, logscale): lr_finder.plot(save_dir=outdir, figname=figname, log_scale=logscale) +def customize_gun_sample(config): + config["dataset"]["padded_num_elem_size"] = 640 + config["dataset"]["processed_path"] = "data/SinglePiFlatPt0p7To10_cfi/tfr_cand/*.tfrecords" + config["dataset"]["raw_path"] = "data/SinglePiFlatPt0p7To10_cfi/raw/*.pkl.bz2" + config["dataset"]["classification_loss_coef"] = 0.0 + config["dataset"]["charge_loss_coef"] = 0.0 + config["dataset"]["eta_loss_coef"] = 0.0 + config["dataset"]["sin_phi_loss_coef"] = 0.0 + config["dataset"]["cos_phi_loss_coef"] = 0.0 + config["setup"]["trainable"] = "ffn_energy" + config["setup"]["batch_size"] = 10*config["setup"]["batch_size"] + return config + +customization_functions = { + "gun_sample": customize_gun_sample +} + @main.command() @click.help_option("-h", "--help") @click.option("-t", "--train_dir", help="training directory", type=click.Path()) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 2ed546721..6c0153b70 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -454,7 +454,7 @@ def call(self, args, training=False): orig_energy = X_input[:, :, 5:6] if self.regression_use_classification: - X_encoded = tf.concat([X_encoded, out_id_softmax], axis=-1) + X_encoded = tf.concat([X_encoded, tf.stop_gradient(out_id_softmax)], axis=-1) pred_eta_corr = self.ffn_eta(X_encoded, training)*msk_input pred_phi_corr = self.ffn_phi(X_encoded, training)*msk_input From 071c1ff20048b50aaf567046a95d0def8b43de01 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Wed, 25 Aug 2021 13:21:57 +0300 Subject: [PATCH 096/157] better name --- mlpf/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index c97904b8c..a8d259090 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -125,7 +125,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq, customize ) if customize: - prefix += customize + prefix += customize + "_" config = 
customization_functions[customize](config) # Decide tf.distribute.strategy depending on number of available GPUs From a77e6ef71f3c83076bebd3da76f2a0706378aca1 Mon Sep 17 00:00:00 2001 From: Farouk Date: Wed, 25 Aug 2021 08:11:41 -0700 Subject: [PATCH 097/157] fix import --- mlpf/pytorch_delphes/graph_data_delphes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mlpf/pytorch_delphes/graph_data_delphes.py b/mlpf/pytorch_delphes/graph_data_delphes.py index b4e168465..58ba095f6 100644 --- a/mlpf/pytorch_delphes/graph_data_delphes.py +++ b/mlpf/pytorch_delphes/graph_data_delphes.py @@ -10,7 +10,8 @@ import pickle import multiprocessing -import pytorch_delphes +import args +from args import parse_args # assumes pkl files exist in /test_tmp_delphes/data/pythia8_ttbar/raw # they are processed and saved as pt files in /test_tmp_delphes/data/pythia8_ttbar/processed @@ -144,7 +145,7 @@ def __getitem__(self, idx): if __name__ == "__main__": - args = pytorch_delphes.parse_args() + args = parse_args() pfgraphdataset = PFGraphDataset(root=args.dataset) From 7d15a14852814196bc9ea3cc6ce41e55057a312c Mon Sep 17 00:00:00 2001 From: Farouk Date: Wed, 25 Aug 2021 08:14:25 -0700 Subject: [PATCH 098/157] commenting the optimized code --- mlpf/pytorch_delphes/gravnet_optimized.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mlpf/pytorch_delphes/gravnet_optimized.py b/mlpf/pytorch_delphes/gravnet_optimized.py index d0720523c..1d4e17bd2 100644 --- a/mlpf/pytorch_delphes/gravnet_optimized.py +++ b/mlpf/pytorch_delphes/gravnet_optimized.py @@ -8,8 +8,11 @@ from torch_scatter import scatter from torch_geometric.nn.conv import MessagePassing +########### (1) clone this repo: https://github.com/mandylee900125/pytorch_cmspepr.git ############### +########### (2) do from inside ############### + try: - from torch_cmspepr import knn_graph ###########remember to do pip intsall .############### + from torch_cmspepr import knn_graph except ImportError: knn_graph = None @@ -18,6 +21,7 @@ # CHANGED: self.lin -> self.lin_p # CHANGED: used reduce='sum' instead of reduce='mean' in the message passing # REMOVED: skip connection +# REPLACED: knn with knn_graph class GravNetConv_optimized(MessagePassing): r"""The GravNet operator from the `"Learning Representations of Irregular @@ -29,7 +33,7 @@ class GravNetConv_optimized(MessagePassing): A second projection of the input feature space is then propagated from the neighbors to each vertex using distance weights that are derived by applying a Gaussian function to the distances. - + Args: in_channels (int): The number of input channels. out_channels (int): The number of output channels. From ab80242da25812097a4988c652cb3c15edcb2393 Mon Sep 17 00:00:00 2001 From: Farouk Date: Wed, 25 Aug 2021 08:19:42 -0700 Subject: [PATCH 099/157] bug in the path --- scripts/local_test_delphes_pytorch.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_test_delphes_pytorch.sh b/scripts/local_test_delphes_pytorch.sh index d3ab69d47..8eda5c117 100755 --- a/scripts/local_test_delphes_pytorch.sh +++ b/scripts/local_test_delphes_pytorch.sh @@ -42,11 +42,11 @@ python3 ../mlpf/pytorch_delphes/graph_data_delphes.py --dataset data/pythia8_qcd # before training a model, first get rid of any previous models stored rm -Rf experiments/* -cd ../mlpf/pytorch_delphes/ +cd ../mlpf/ #run the pytorch training echo Beginning the training.. 
-python3 pipeline.py \ +python3 pytorch_pipeline.py \ --n_epochs=10 --n_train=1 --n_valid=1 --n_test=1 --batch_size=4 \ --dataset='../../test_tmp_delphes/data/pythia8_ttbar' \ --dataset_qcd='../../test_tmp_delphes/data/pythia8_qcd' \ From 315ddd3217673fdad1c89c753c40960a656b6101 Mon Sep 17 00:00:00 2001 From: Farouk Date: Wed, 25 Aug 2021 08:30:28 -0700 Subject: [PATCH 100/157] fixed module import in graph_data_delphes --- mlpf/pytorch_delphes/graph_data_delphes.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mlpf/pytorch_delphes/graph_data_delphes.py b/mlpf/pytorch_delphes/graph_data_delphes.py index 58ba095f6..d8b6662b7 100644 --- a/mlpf/pytorch_delphes/graph_data_delphes.py +++ b/mlpf/pytorch_delphes/graph_data_delphes.py @@ -10,9 +10,6 @@ import pickle import multiprocessing -import args -from args import parse_args - # assumes pkl files exist in /test_tmp_delphes/data/pythia8_ttbar/raw # they are processed and saved as pt files in /test_tmp_delphes/data/pythia8_ttbar/processed # PFGraphDataset -> returns for 1 event: Data(x=[5139, 12], ycand=[5139, 6], ycand_id=[5139, 6], ygen=[5139, 6], ygen_id=[5139, 6]) @@ -143,6 +140,17 @@ def get(self, idx): def __getitem__(self, idx): return self.get(idx) +def parse_args(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--dataset", type=str, default='../../../test_tmp_delphes/data/pythia8_ttbar', help="dataset path", required=False) + parser.add_argument("--processed_dir", type=str, help="processed", required=False, default=None) + parser.add_argument("--num-files-merge", type=int, default=10, help="number of files to merge") + parser.add_argument("--num-proc", type=int, default=24, help="number of processes") + args = parser.parse_args() + return args + + if __name__ == "__main__": args = parse_args() From a84a36dfeba5304639ae26ffa0aa6c5e4bf72c27 Mon Sep 17 00:00:00 2001 From: Farouk Date: Wed, 25 Aug 2021 08:34:51 -0700 Subject: [PATCH 101/157] small edit --- mlpf/pytorch_delphes/graph_data_delphes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/pytorch_delphes/graph_data_delphes.py b/mlpf/pytorch_delphes/graph_data_delphes.py index d8b6662b7..ae14461f9 100644 --- a/mlpf/pytorch_delphes/graph_data_delphes.py +++ b/mlpf/pytorch_delphes/graph_data_delphes.py @@ -143,7 +143,7 @@ def __getitem__(self, idx): def parse_args(): import argparse parser = argparse.ArgumentParser() - parser.add_argument("--dataset", type=str, default='../../../test_tmp_delphes/data/pythia8_ttbar', help="dataset path", required=False) + parser.add_argument("--dataset", type=str, required=True, help="Input data path") parser.add_argument("--processed_dir", type=str, help="processed", required=False, default=None) parser.add_argument("--num-files-merge", type=int, default=10, help="number of files to merge") parser.add_argument("--num-proc", type=int, default=24, help="number of processes") From b1a693bb2287fc61bff54ae46f73ab4f09fcb9bb Mon Sep 17 00:00:00 2001 From: Farouk Date: Wed, 25 Aug 2021 10:37:21 -0700 Subject: [PATCH 102/157] big cleanup --- mlpf/{pytorch_delphes => }/LRP/LRP_clf_gpu.py | 51 +-- mlpf/{pytorch_delphes => }/LRP/LRP_reg_gpu.py | 62 ++- mlpf/LRP/__init__.py | 8 + mlpf/LRP/args.py | 91 ++++ mlpf/{pytorch_delphes => }/LRP/gravnet_LRP.py | 0 .../LRP/model_LRP_reg.py => LRP/model_LRP.py} | 13 +- mlpf/{pytorch_delphes => }/LRP/model_io.py | 18 +- mlpf/LRP/plots.py | 166 +++++++ mlpf/LRP_pipeline.py | 206 +++++++++ mlpf/pytorch_delphes/LRP/main_reg.py | 413 
------------------ mlpf/pytorch_delphes/__init__.py | 1 - mlpf/pytorch_delphes/args.py | 60 +-- mlpf/pytorch_pipeline.py | 150 +++---- 13 files changed, 620 insertions(+), 619 deletions(-) rename mlpf/{pytorch_delphes => }/LRP/LRP_clf_gpu.py (85%) rename mlpf/{pytorch_delphes => }/LRP/LRP_reg_gpu.py (84%) create mode 100644 mlpf/LRP/__init__.py create mode 100644 mlpf/LRP/args.py rename mlpf/{pytorch_delphes => }/LRP/gravnet_LRP.py (100%) rename mlpf/{pytorch_delphes/LRP/model_LRP_reg.py => LRP/model_LRP.py} (80%) rename mlpf/{pytorch_delphes => }/LRP/model_io.py (91%) create mode 100644 mlpf/LRP/plots.py create mode 100644 mlpf/LRP_pipeline.py delete mode 100644 mlpf/pytorch_delphes/LRP/main_reg.py diff --git a/mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py b/mlpf/LRP/LRP_clf_gpu.py similarity index 85% rename from mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py rename to mlpf/LRP/LRP_clf_gpu.py index 68e63bd5e..a844e5b64 100644 --- a/mlpf/pytorch_delphes/LRP/LRP_clf_gpu.py +++ b/mlpf/LRP/LRP_clf_gpu.py @@ -4,7 +4,6 @@ from torch_scatter import scatter_mean import numpy as np import json -import model_io from torch_geometric.utils import to_scipy_sparse_matrix import scipy import pickle, math, time @@ -17,22 +16,16 @@ from torch_geometric.utils.convert import to_networkx from torch_geometric.utils import to_dense_adj -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') +import LRP class LRP_clf: EPSILON=1e-9 - def __init__(self,model:model_io): + def __init__(self, device, model:LRP.model_io): + self.device=device self.model=model - def register_model(model:model_io): + def register_model(model:LRP.model_io): self.model=model """ @@ -41,24 +34,24 @@ def register_model(model:model_io): # this rule is wrong.. 
it is just here because it is much quicker for experimentation and gives the correct dimensions needed for debugging (if you haven't hit the message passing step) @staticmethod - def easy_rule(layer,input,R,index,output_layer,activation_layer, print_statement): + def easy_rule(self, layer, input, R, index,output_layer, activation_layer, print_statement): EPSILON=1e-9 # input.retain_grad() # z = layer.forward(input) # basically layer.forward does this: output=(torch.matmul(input,torch.transpose(w,0,1))+b) , assuming the following w & b are retrieved if activation_layer: - w = torch.eye(input.shape[1]).to(device) + w = torch.eye(input.shape[1]).to(self.device) else: - w = layer.weight.detach().to(device) + w = layer.weight.detach().to(self.device) if output_layer: # for the output layer T, W, r = [], [], [] for i in range(R.shape[1]): - T.append(R[:,i].reshape(-1,1).to(device)) - W.append(w[i,:].reshape(1,-1).to(device)) - I = torch.ones_like(R[:,i]).reshape(-1,1).to(device) + T.append(R[:,i].reshape(-1,1).to(self.device)) + W.append(w[i,:].reshape(1,-1).to(self.device)) + I = torch.ones_like(R[:,i]).reshape(-1,1).to(self.device) Numerator = (input*torch.matmul(T[i],W[i])) Denominator = (input*torch.matmul(I,W[i])).sum(axis=1) @@ -83,14 +76,14 @@ def easy_rule(layer,input,R,index,output_layer,activation_layer, print_statement @staticmethod - def eps_rule(layer, input, R, index, output_layer, activation_layer, print_statement, adjacency_matrix=None, message_passing=False): + def eps_rule(self, layer, input, R, index, output_layer, activation_layer, print_statement, adjacency_matrix=None, message_passing=False): if activation_layer: - w = torch.eye(input.shape[1]).detach().to(device) + w = torch.eye(input.shape[1]).detach().to(self.device) elif message_passing: # message passing hack - w = adjacency_matrix.detach().to(device) + w = adjacency_matrix.detach().to(self.device) else: - w = layer.weight.detach().to(device) + w = layer.weight.detach().to(self.device) wt = torch.transpose(w,0,1) @@ -112,15 +105,15 @@ def eps_rule(layer, input, R, index, output_layer, activation_layer, print_state R_list[output_neuron] = torch.transpose(R_list[output_neuron],0,1) # rep stands for repeated/expanded - a_rep = input.reshape(input.shape[0],input.shape[1],1).expand(-1,-1,R_list[output_neuron].shape[1]).to(device) - wt_rep = Wt[output_neuron].reshape(1,Wt[output_neuron].shape[0],Wt[output_neuron].shape[1]).expand(input.shape[0],-1,-1).to(device) + a_rep = input.reshape(input.shape[0],input.shape[1],1).expand(-1,-1,R_list[output_neuron].shape[1]).to(self.device) + wt_rep = Wt[output_neuron].reshape(1,Wt[output_neuron].shape[0],Wt[output_neuron].shape[1]).expand(input.shape[0],-1,-1).to(self.device) H = a_rep*wt_rep deno = H.sum(axis=1).reshape(H.sum(axis=1).shape[0],1,H.sum(axis=1).shape[1]).expand(-1,input.shape[1],-1) G = H/deno - R_previous[output_neuron] = (torch.matmul(G, R_list[output_neuron].reshape(R_list[output_neuron].shape[0],R_list[output_neuron].shape[1],1).to(device))) + R_previous[output_neuron] = (torch.matmul(G, R_list[output_neuron].reshape(R_list[output_neuron].shape[0],R_list[output_neuron].shape[1],1).to(self.device))) R_previous[output_neuron] = R_previous[output_neuron].reshape(R_previous[output_neuron].shape[0], R_previous[output_neuron].shape[1]).to('cpu') if message_passing: # message passing hack @@ -177,7 +170,7 @@ def message_passing_rule(self, layer, input, R, big_list, edge_index, edge_weigh # modify the big tensor based on message passing rule for node_i in 
tqdm(range(len(big_list))): - big_list[node_i] = self.eps_rule(layer, torch.transpose(before_message,0,1), big_list[node_i], index, output_layer=False, activation_layer=False, print_statement=True, adjacency_matrix=A, message_passing=True) + big_list[node_i] = self.eps_rule(self, layer, torch.transpose(before_message,0,1), big_list[node_i], index, output_layer=False, activation_layer=False, print_statement=True, adjacency_matrix=A, message_passing=True) print(f'- Finished computing R-score for node {node_i+1}/{len(big_list)} for the message passing..') print('- Finished computing R-scores for the message passing layer') return big_list @@ -241,15 +234,15 @@ def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_in if len(big_list)==0: # if you haven't hit the message passing step yet if 'Linear' in str(layer): - R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) + R = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) elif 'LeakyReLU' or 'ELU' in str(layer): - R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=True, print_statement=True) + R = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=True, print_statement=True) else: for node_i in tqdm(range(len(big_list))): if 'Linear' in str(layer): - big_list[node_i] = self.eps_rule(layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) + big_list[node_i] = self.eps_rule(self, layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) elif 'LeakyReLU' or 'ELU' in str(layer): - big_list[node_i] = self.eps_rule(layer, input, big_list[node_i], index, output_layer_bool, activation_layer=True, print_statement=False) + big_list[node_i] = self.eps_rule(self, layer, input, big_list[node_i], index, output_layer_bool, activation_layer=True, print_statement=False) return R, big_list, output_layer_index ##----------------------------------------------------------------------------- diff --git a/mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py b/mlpf/LRP/LRP_reg_gpu.py similarity index 84% rename from mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py rename to mlpf/LRP/LRP_reg_gpu.py index 56f997298..a67a5c91a 100644 --- a/mlpf/pytorch_delphes/LRP/LRP_reg_gpu.py +++ b/mlpf/LRP/LRP_reg_gpu.py @@ -4,7 +4,6 @@ from torch_scatter import scatter_mean import numpy as np import json -import model_io from torch_geometric.utils import to_scipy_sparse_matrix import scipy import pickle, math, time @@ -17,22 +16,16 @@ from torch_geometric.utils.convert import to_networkx from torch_geometric.utils import to_dense_adj -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') +import LRP class LRP_reg: EPSILON=1e-9 - def __init__(self,model:model_io): + def __init__(self, device, model:LRP.model_io): + self.device=device self.model=model - def register_model(model:model_io): + def register_model(model:LRP.model_io): self.model=model """ @@ -41,24 +34,24 @@ def register_model(model:model_io): # this rule is wrong.. 
it is just here because it is much quicker for experimentation and gives the correct dimensions needed for debugging (if you haven't hit the message passing step) @staticmethod - def easy_rule(layer,input,R,index,output_layer,activation_layer, print_statement, skip_connection=False, adjacency_matrix=False, message_passing=False): + def easy_rule(self, layer, input, R, index, output_layer, activation_layer, print_statement, skip_connection=False, adjacency_matrix=False, message_passing=False): EPSILON=1e-9 # input.retain_grad() # z = layer.forward(input) # basically layer.forward does this: output=(torch.matmul(input,torch.transpose(w,0,1))+b) , assuming the following w & b are retrieved if activation_layer: - w = torch.eye(input.shape[1]).to(device) + w = torch.eye(input.shape[1]).to(self.device) else: - w = layer.weight.detach().to(device) + w = layer.weight.detach().to(self.device) if output_layer: # for the output layer T, W, r = [], [], [] for i in range(R.shape[1]): - T.append(R[:,i].reshape(-1,1).to(device)) - W.append(w[i,:].reshape(1,-1).to(device)) - I = torch.ones_like(R[:,i]).reshape(-1,1).to(device) + T.append(R[:,i].reshape(-1,1).to(self.device)) + W.append(w[i,:].reshape(1,-1).to(self.device)) + I = torch.ones_like(R[:,i]).reshape(-1,1).to(self.device) Numerator = (input*torch.matmul(T[i],W[i])) Denominator = (input*torch.matmul(I,W[i])).sum(axis=1) @@ -93,14 +86,14 @@ def easy_rule(layer,input,R,index,output_layer,activation_layer, print_statement @staticmethod - def eps_rule(layer, input, R, index, output_layer, activation_layer, print_statement, skip_connection=False, adjacency_matrix=None, message_passing=False): + def eps_rule(self, layer, input, R, index, output_layer, activation_layer, print_statement, skip_connection=False, adjacency_matrix=None, message_passing=False): if activation_layer: - w = torch.eye(input.shape[1]).detach().to(device) + w = torch.eye(input.shape[1]).detach().to(self.device) elif message_passing: # message passing hack - w = adjacency_matrix.detach().to(device) + w = adjacency_matrix.detach().to(self.device) else: - w = layer.weight.detach().to(device) + w = layer.weight.detach().to(self.device) wt = torch.transpose(w,0,1) @@ -122,15 +115,15 @@ def eps_rule(layer, input, R, index, output_layer, activation_layer, print_state R_list[output_neuron] = torch.transpose(R_list[output_neuron],0,1) # rep stands for repeated/expanded - a_rep = input.reshape(input.shape[0],input.shape[1],1).expand(-1,-1,R_list[output_neuron].shape[1]).to(device) - wt_rep = Wt[output_neuron].reshape(1,Wt[output_neuron].shape[0],Wt[output_neuron].shape[1]).expand(input.shape[0],-1,-1).to(device) + a_rep = input.reshape(input.shape[0],input.shape[1],1).expand(-1,-1,R_list[output_neuron].shape[1]).to(self.device) + wt_rep = Wt[output_neuron].reshape(1,Wt[output_neuron].shape[0],Wt[output_neuron].shape[1]).expand(input.shape[0],-1,-1).to(self.device) H = a_rep*wt_rep deno = H.sum(axis=1).reshape(H.sum(axis=1).shape[0],1,H.sum(axis=1).shape[1]).expand(-1,input.shape[1],-1) G = H/deno - R_previous[output_neuron] = (torch.matmul(G, R_list[output_neuron].reshape(R_list[output_neuron].shape[0],R_list[output_neuron].shape[1],1).to(device))) + R_previous[output_neuron] = (torch.matmul(G, R_list[output_neuron].reshape(R_list[output_neuron].shape[0],R_list[output_neuron].shape[1],1).to(self.device))) R_previous[output_neuron] = R_previous[output_neuron].reshape(R_previous[output_neuron].shape[0], R_previous[output_neuron].shape[1]).to('cpu') if message_passing: # message passing hack @@ 
-196,7 +189,7 @@ def message_passing_rule(self, layer, input, R, big_list, edge_index, edge_weigh # modify the big tensor based on message passing rule for node_i in tqdm(range(len(big_list))): - big_list[node_i] = self.eps_rule(layer, torch.transpose(before_message,0,1), big_list[node_i], index, output_layer=False, activation_layer=False, print_statement=True, skip_connection=False, adjacency_matrix=A, message_passing=True) + big_list[node_i] = self.eps_rule(self, layer, torch.transpose(before_message,0,1), big_list[node_i], index, output_layer=False, activation_layer=False, print_statement=True, skip_connection=False, adjacency_matrix=A, message_passing=True) print(f'- Finished computing R-score for node {node_i+1}/{len(big_list)} for the message passing..') print('- Finished computing R-scores for the message passing layer') return big_list @@ -211,7 +204,6 @@ def explain(self, save:bool=True, save_to:str="./relevance.pt", sort_nodes_by:int=0, - signal=torch.tensor([1,0,0,0,0,0],dtype=torch.float32).to(device), return_result:bool=False): start_index = self.model.n_layers ########################## @@ -252,7 +244,7 @@ def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_in # (1) for skip connection purposes if 'nn3.0' in str(name): print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer} - Skip connection") - input_relevance, pid_relevance, embedding_relevance = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True, skip_connection=True) + input_relevance, pid_relevance, embedding_relevance = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True, skip_connection=True) torch.save(input_relevance, outpath + f'/LRP/input_relevance.pt') torch.save(embedding_relevance, outpath + f'/LRP/embedding_relevance.pt') @@ -262,10 +254,10 @@ def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_in # (2) for skip connection purposes if 'nn2.0' in str(name): print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer}") - R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) + R = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) # add the embedding_relevance computed in the nn3.0 skip connection - embedding_relevance = torch.load(outpath + f'/LRP/embedding_relevance.pt', map_location=torch.device('cpu')) + embedding_relevance = torch.load(outpath + f'/LRP/embedding_relevance.pt', map_location=torch.self.device('cpu')) for i in range(len(R)): R[i] = R[i] + embedding_relevance[i] @@ -277,10 +269,10 @@ def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_in print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer}") # add the input_relevance computed in the nn3.0 skip connection - input_relevance = torch.load(outpath + f'/LRP/input_relevance.pt', map_location=torch.device('cpu')) + input_relevance = torch.load(outpath + f'/LRP/input_relevance.pt', map_location=torch.self.device('cpu')) for node_i in tqdm(range(len(big_list))): - big_list[node_i] = self.eps_rule(layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) + big_list[node_i] = self.eps_rule(self, layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) for i in range(len(R)): # for row in 
range(len(big_list[node_i][i])): # # check if row is nonzero @@ -301,16 +293,16 @@ def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_in if len(big_list)==0: # if you haven't hit the message passing step yet if 'Linear' in str(layer): - R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) + R = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) elif 'LeakyReLU' or 'ELU' in str(layer): - R = self.eps_rule(layer, input, R, index, output_layer_bool, activation_layer=True, print_statement=True) + R = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=True, print_statement=True) else: # in this way: big_list is a list of length 5k (nodes) that contains a list of length 6 (output_neurons) that contains tensors (5k,x) which are the heatmap of R-scores for node_i in tqdm(range(len(big_list))): if 'Linear' in str(layer): - big_list[node_i] = self.eps_rule(layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) + big_list[node_i] = self.eps_rule(self, layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) elif 'LeakyReLU' or 'ELU' in str(layer): - big_list[node_i] = self.eps_rule(layer, input, big_list[node_i], index, output_layer_bool, activation_layer=True, print_statement=False) + big_list[node_i] = self.eps_rule(self, layer, input, big_list[node_i], index, output_layer_bool, activation_layer=True, print_statement=False) return R, big_list ##----------------------------------------------------------------------------- diff --git a/mlpf/LRP/__init__.py b/mlpf/LRP/__init__.py new file mode 100644 index 000000000..f8e5e19ee --- /dev/null +++ b/mlpf/LRP/__init__.py @@ -0,0 +1,8 @@ +from LRP.args import parse_args +from LRP.plots import make_heatmaps +from LRP.model_io import model_io +from LRP.model_LRP import PFNet7 +from LRP.gravnet_LRP import GravNetConv + +from LRP.LRP_clf_gpu import LRP_clf +from LRP.LRP_reg_gpu import LRP_reg diff --git a/mlpf/LRP/args.py b/mlpf/LRP/args.py new file mode 100644 index 000000000..8840fec67 --- /dev/null +++ b/mlpf/LRP/args.py @@ -0,0 +1,91 @@ +import argparse +from math import inf + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--LRP_dataset_qcd", type=str, default='../test_tmp_delphes/data/pythia8_qcd', help="dataset path", required=True) + parser.add_argument("--LRP_outpath", type=str, default = '../test_tmp_delphes/experiments/LRP/', help="Output folder for the LRP relevance scores and heatmaps", required=True) + + # usual specs + parser.add_argument("--n_test", type=int, default=2, help="number of data files to use for testing LRP.. 
each file contains 100 events") + parser.add_argument("--batch_size", type=int, default=1, help="Number of .pt files to load in parallel") + + parser.add_argument("--hidden_dim", type=int, default=256, help="hidden dimension") + parser.add_argument("--hidden_dim_nn1", type=int, default=64, help="hidden dimension") + parser.add_argument("--input_encoding", type=int, default=12, help="use an input encoding layer") + parser.add_argument("--encoding_dim", type=int, default=64, help="encoded element dimension") + parser.add_argument("--space_dim", type=int, default=4, help="Spatial dimension for clustering in gravnet layer") + parser.add_argument("--propagate_dimensions", type=int, default=22, help="The number of features to be propagated between the vertices") + parser.add_argument("--nearest", type=int, default=16, help="k nearest neighbors in gravnet layer") + + # extras for LRP + parser.add_argument("--explain", action=BoolArg, default=True, help="General setup mode: if True then you want to explain.. if False then you will load an already explained model (already made R-scores)..") + parser.add_argument("--LRP_reg", action=BoolArg, default=True, help="Works only if --explain is True.. Runs LRP for interpreting the regression part..") + parser.add_argument("--LRP_clf", action=BoolArg, default=True, help="Works only if --explain is True.. Runs LRP for interpreting the classification part..") + + parser.add_argument("--LRP_load_model", type=str, default="/PFNet7_cand_ntrain_2", help="Loads the model to explain (or only make_heatmaps)", required=False) + parser.add_argument("--LRP_load_epoch", type=int, default=0, help="Loads the epoch after which to explain (or only make_heatmaps)") + + parser.add_argument("--make_heatmaps_reg", action=BoolArg, default=True, help="Constructs heatmaps for the regressed p4 (must run with explain=True or else you load a pre-explained model with explain=False)..") + parser.add_argument("--make_heatmaps_clf", action=BoolArg, default=True, help="Constructs heatmaps for the classified pid (must run with explain=True or else you load a pre-explained model with explain=False)..") + + args = parser.parse_args() + + return args + + +class BoolArg(argparse.Action): + """ + Take an argparse argument that is either a boolean or a string and return a boolean. + """ + def __init__(self, default=None, nargs=None, *args, **kwargs): + if nargs is not None: + raise ValueError("nargs not allowed") + + # Set default + if default is None: + raise ValueError("Default must be set!") + + default = _arg_to_bool(default) + + super().__init__(*args, default=default, nargs='?', **kwargs) + + def __call__(self, parser, namespace, argstring, option_string): + + if argstring is not None: + # If called with an argument, convert to bool + argval = _arg_to_bool(argstring) + else: + # BoolArg will invert default option + argval = True + + setattr(namespace, self.dest, argval) + +def _arg_to_bool(arg): + # Convert argument to boolean + + if type(arg) is bool: + # If argument is bool, just return it + return arg + + elif type(arg) is str: + # If string, convert to true/false + arg = arg.lower() + if arg in ['true', 't', '1']: + return True + elif arg in ['false', 'f', '0']: + return False + else: + return ValueError('Could not parse a True/False boolean') + else: + raise ValueError('Input must be boolean or string! 
{}'.format(type(arg))) + + +# From https://stackoverflow.com/questions/12116685/how-can-i-require-my-python-scripts-argument-to-be-a-float-between-0-0-1-0-usin +class Range(object): + def __init__(self, start, end): + self.start = start + self.end = end + def __eq__(self, other): + return self.start <= other <= self.end diff --git a/mlpf/pytorch_delphes/LRP/gravnet_LRP.py b/mlpf/LRP/gravnet_LRP.py similarity index 100% rename from mlpf/pytorch_delphes/LRP/gravnet_LRP.py rename to mlpf/LRP/gravnet_LRP.py diff --git a/mlpf/pytorch_delphes/LRP/model_LRP_reg.py b/mlpf/LRP/model_LRP.py similarity index 80% rename from mlpf/pytorch_delphes/LRP/model_LRP_reg.py rename to mlpf/LRP/model_LRP.py index b44b3255f..af6293f44 100644 --- a/mlpf/pytorch_delphes/LRP/model_LRP_reg.py +++ b/mlpf/LRP/model_LRP.py @@ -10,14 +10,8 @@ from torch_geometric.nn import EdgeConv, MessagePassing, EdgePooling, GATConv, GCNConv, JumpingKnowledge, GraphUNet, DynamicEdgeConv, DenseGCNConv from torch_geometric.nn import TopKPooling, SAGPooling, SGConv from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch.utils.data import random_split -#from torch_geometric.nn import GravNetConv # if you want to get it from source code (won't be able to retrieve the adjacency matrix) -from gravnet_LRP import GravNetConv -from torch_geometric.nn import GraphConv +import LRP #Model with gravnet clustering class PFNet7(nn.Module): @@ -25,7 +19,8 @@ def __init__(self, input_dim=12, hidden_dim=256, hidden_dim_nn1=64, input_encoding=12, encoding_dim=64, output_dim_id=6, output_dim_p4=6, - space_dim=4, propagate_dimensions=22, nearest=16): + space_dim=4, propagate_dimensions=22, nearest=16, + target="gen", nn1=True, nn3=True): super(PFNet7, self).__init__() @@ -43,7 +38,7 @@ def __init__(self, ) # (2) CNN: Gravnet layer - self.conv1 = GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) + self.conv1 = LRP.GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) # (3) DNN layer: classifying PID self.nn2 = nn.Sequential( diff --git a/mlpf/pytorch_delphes/LRP/model_io.py b/mlpf/LRP/model_io.py similarity index 91% rename from mlpf/pytorch_delphes/LRP/model_io.py rename to mlpf/LRP/model_io.py index c3b2cf15b..bdbe73123 100644 --- a/mlpf/pytorch_delphes/LRP/model_io.py +++ b/mlpf/LRP/model_io.py @@ -5,15 +5,6 @@ import numpy as np import json -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') -else: - device = torch.device('cpu') - class model_io: SPECIAL_LAYERS=[ ".nn2.0", @@ -21,10 +12,11 @@ class model_io: # ".conv1.lin_p" ] - def __init__(self,model, + def __init__(self,device,model, model_state_dict, activation_dest, dic): + self.device=device self.model=model self.model.load_state_dict(model_state_dict) self.dest=activation_dest @@ -81,7 +73,7 @@ def get_rule(self,index=None,layer_name=None): else: self._register_rules() return self._rules[layer_name] - + """ layer functions """ @@ -145,7 +137,7 @@ def copy_layer(layer): layer_cp=eval("nn."+layer.__repr__()) layer_cp.load_state_dict(layer.state_dict()) - return layer_cp.to(device) + return layer_cp.to(self.device) def copy_tensor(tensor,dtype=torch.float32): """ @@ -153,4 +145,4 @@ def copy_tensor(tensor,dtype=torch.float32): outputs the copy with specified 
dtype """ - return tensor.clone().detach().requires_grad_(True).to(device) + return tensor.clone().detach().requires_grad_(True).to(self.device) diff --git a/mlpf/LRP/plots.py b/mlpf/LRP/plots.py new file mode 100644 index 000000000..406db618b --- /dev/null +++ b/mlpf/LRP/plots.py @@ -0,0 +1,166 @@ +from glob import glob +import sys, os +import os.path as osp +import pickle as pkl +import _pickle as cPickle +import math, time, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib, mplhep +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +import torch + +def map_index_to_pid(id): + if id==0: + return 'null' + if id==1: + return 'charged hadron' + if id==2: + return 'neutral hadron' + if id==3: + return 'photon' + if id==4: + return 'electron' + if id==5: + return 'muon' + +def map_index_to_p4(index): + if index==0: + return 'charge' + if index==1: + return 'pt' + if index==2: + return 'eta' + if index==3: + return 'sin phi' + if index==4: + return 'cos phi' + if index==5: + return 'energy' + +def make_heatmaps(big_list, to_explain, device, outpath, output_dim_id, output_dim_p4, task): + + print(f'Making heatmaps for {task}..') + + X = to_explain["inputs"] + gen_ids_one_hot = to_explain["gen_id"] + pred_ids_one_hot = to_explain["pred_id"] + + gen_ids = gen_ids_one_hot.argmax(axis=1) + pred_ids = pred_ids_one_hot.argmax(axis=1) + + # make directories to hold the heatmaps + for i in range(6): + if not osp.isdir(outpath + '/LRP'): + os.makedirs(outpath + '/LRP') + if not osp.isdir(outpath + f'/LRP/class{str(i)}'): + os.makedirs(outpath + f'/LRP/class{str(i)}') + for j in range(6): + if task=='regression': + if not osp.isdir(outpath + f'/LRP/class{str(i)}'+f'/p4_elem{str(j)}'): + os.makedirs(outpath + f'/LRP/class{str(i)}'+f'/p4_elem{str(j)}') + elif task=='classification': + if not osp.isdir(outpath + f'/LRP/class{str(i)}'+f'/pid{str(j)}'): + os.makedirs(outpath + f'/LRP/class{str(i)}'+f'/pid{str(j)}') + + # attempt to break down big_list onto 6 smaller lists, 1 for each pid + list0, list1, list2, list3, list4, list5 = [], [], [], [], [], [] + dist0, dist1, dist2, dist3, dist4, dist5 = [], [], [], [], [], [] + + for node_i in range(len(big_list)): # iterate over the nodes + + if gen_ids[node_i]==0: # if it's a null then add it to the null list + list0.append(big_list[node_i]) + dist0.append(node_i) + if gen_ids[node_i]==1: # if it's a chhadron then add it to the chhadron list + list1.append(big_list[node_i]) + dist1.append(node_i) + if gen_ids[node_i]==2: # if it's a nhadron then add it to the nhadron list + list2.append(big_list[node_i]) + dist2.append(node_i) + if gen_ids[node_i]==3: # if it's a photon then add it to the photon list + list3.append(big_list[node_i]) + dist3.append(node_i) + if gen_ids[node_i]==4: # if it's a electron then add it to the electron list + list4.append(big_list[node_i]) + dist4.append(node_i) + if gen_ids[node_i]==5: # if it's a muon then add it to the muon list + list5.append(big_list[node_i]) + dist5.append(node_i) + + list = [list0,list1,list2,list3,list4,list5] + dist = [dist0,dist1,dist2,dist3,dist4,dist5] + + if task=='regression': + output_dim = output_dim_p4 + elif task=='classification': + output_dim = output_dim_id + + for pid in range(output_dim_id): + if pid!=1: + continue + for node_i in range(len(list[pid])): # iterate over the nodes in each list + print('- making heatmap for', map_index_to_pid(pid), 'node #:', node_i+1, '/', len(list[pid])) + for 
output_neuron in range(output_dim): + R_cat_feat = torch.cat([list[pid][node_i][output_neuron].to(device), X['x'].to(device), torch.arange(start=0, end=X['x'].shape[0], step=1).float().reshape(-1,1).to(device)], dim=1) + + non_empty_mask = R_cat_feat[:,:12].abs().sum(dim=1).bool() + R_cat_feat_msk = R_cat_feat[non_empty_mask,:] # R_cat_feat masked (non-zero) + pos = dist[pid][node_i] + probability = pred_ids_one_hot[pos] + + def get_type(t): + l = [] + for elem in t: + if elem==1: + l.append('cluster') + if elem==2: + l.append('track') + return l + + node_types = get_type(R_cat_feat_msk[:,12]) + + fig, ax = plt.subplots() + fig.tight_layout() + + if task=='regression': + if (torch.argmax(probability)==pid): + ax.set_title('Heatmap for the "'+map_index_to_p4(output_neuron)+'" prediction of a correctly classified ' + map_index_to_pid(pid)) + else: + ax.set_title('Heatmap for the "'+map_index_to_p4(output_neuron)+'" prediction of an incorrectly classified ' + map_index_to_pid(pid)) + + elif task=='classification': + if (torch.argmax(probability)==pid): + ax.set_title('Heatmap for the "'+map_index_to_pid(output_neuron)+'" prediction of a correctly classified ' + map_index_to_pid(pid)) + else: + ax.set_title('Heatmap for the "'+map_index_to_pid(output_neuron)+'" prediction of an incorrectly classified ' + map_index_to_pid(pid)) + + ### TODO: Not the best way to do it.. I am assuming here that only charged hadrons are connected to all tracks + if pid==1: + features = ["type", " pt", "eta", + "sphi", "cphi", "E", "eta_out", "sphi_out", "cphi_out", "charge", "is_gen_mu", "is_gen_el"] + else: + features = ["type", "Et", "eta", "sphi", "cphi", "E", "Eem", "Ehad", "pad", "pad", "pad", "pad"] + + ax.set_xticks(np.arange(len(features))) + ax.set_yticks(np.arange(len(node_types))) + for col in range(len(features)): + for row in range(len(node_types)): + text = ax.text(col, row, round(R_cat_feat_msk[row,12+col].item(),2), + ha="center", va="center", color="w") + # ... 
and label them with the respective list entries + ax.set_xticklabels(features) + ax.set_yticklabels(node_types) + plt.xlabel("\nposition of node is row # {pos} from the top \n class prediction: {R} \n where prob = [null, chhadron, nhadron, photon, electron, muon]".format(R=[round(num,2) for num in probability.detach().tolist()], pos=((R_cat_feat_msk[:,-1] == pos).nonzero(as_tuple=True)[0].item()+1))) + plt.imshow(torch.abs(R_cat_feat_msk[:,:12]).detach().cpu().numpy(), interpolation="nearest", cmap='copper', aspect='auto') + plt.colorbar() + fig.set_size_inches(12, 12) + if task=='regression': + plt.savefig(outpath + f'/LRP/class{str(pid)}'+f'/p4_elem{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') + elif task=='classification': + plt.savefig(outpath + f'/LRP/class{str(pid)}'+f'/pid{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') + plt.close(fig) diff --git a/mlpf/LRP_pipeline.py b/mlpf/LRP_pipeline.py new file mode 100644 index 000000000..73cf8abf1 --- /dev/null +++ b/mlpf/LRP_pipeline.py @@ -0,0 +1,206 @@ +from glob import glob +import sys, os +import os.path as osp +import pickle as pkl +import _pickle as cPickle +import math, time, tqdm +import numpy as np +import pandas as pd +import sklearn +from sklearn.metrics import accuracy_score, confusion_matrix +import matplotlib, mplhep +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +#Check if the GPU configuration has been provided +import torch +use_gpu = torch.cuda.device_count()>0 +multi_gpu = torch.cuda.device_count()>1 + +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + import setGPU + if multi_gpu: + print('Will use multi_gpu..') + print("Let's use", torch.cuda.device_count(), "GPUs!") + else: + print('Will use single_gpu..') +except Exception as e: + print("Could not import setGPU, running CPU-only") + +#define the global base device +if use_gpu: + device = torch.device('cuda:0') + print("GPU model:", torch.cuda.get_device_name(0)) +else: + device = torch.device('cpu') + +import torch_geometric +import torch.nn as nn + +from pytorch_delphes import PFGraphDataset, data_to_loader_ttbar, data_to_loader_qcd +from LRP import parse_args, make_heatmaps, model_io, PFNet7, LRP_clf, LRP_reg + +# NOTE: this script works by loading an already trained model with very specefic specs + +if __name__ == "__main__": + + args = parse_args() + + # # the next part initializes some args values (to run the script not from terminal) + # class objectview(object): + # def __init__(self, d): + # self.__dict__ = d + # + # args = objectview({'n_test': 2, 'batch_size': 1,' hidden_dim':256, 'hidden_dim_nn1': 64, + # 'input_encoding': 12, 'encoding_dim': 64, 'space_dim': 4, 'propagate_dimensions': 22,'nearest': 16, + # 'LRP_dataset': '../test_tmp_delphes/data/pythia8_ttbar', 'LRP_dataset_qcd': '../test_tmp_delphes/data/pythia8_qcd', + # 'LRP_outpath': '../test_tmp_delphes/experiments/LRP/', + # 'LRP_load_epoch': 9, 'LRP_load_model': 'LRP_reg_PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + # 'explain': False, 'LRP_clf': False, 'LRP_reg': False, + # 'make_heatmaps_clf': True,'make_heatmaps_reg': True}) + + # define the dataset (assumes the data exists as .pt files in "processed") + print('Processing the data..') + full_dataset_qcd = PFGraphDataset(args.LRP_dataset_qcd) + + # constructs a loader from the data to iterate over batches + print('Constructing data loader..') + test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) + + # element parameters + 
input_dim = 12 + + #one-hot particle ID and momentum + output_dim_id = 6 + output_dim_p4 = 6 + + outpath = args.LRP_outpath + args.LRP_load_model + PATH = outpath + '/epoch_' + str(args.LRP_load_epoch) + '_weights.pth' + + # loading the model + print('Loading a previously trained model..') + with open(outpath + '/model_kwargs.pkl', 'rb') as f: + model_kwargs = pkl.load(f) + + model = PFNet7(**model_kwargs) + + state_dict = torch.load(PATH, map_location=device) + + # if model was trained using DataParallel then we have to load it differently + if "DataParallel" in args.LRP_load_model: + state_dict = torch.load(PATH, map_location=device) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove module. + new_state_dict[name] = v + # print('name is:', name) + state_dict=new_state_dict + + model.load_state_dict(state_dict) + model.to(device) + + if args.explain: + model.eval() + print(model) + + # create some hooks to retrieve intermediate activations + activation = {} + hooks={} + + def get_activation(name): + def hook(model, input, output): + activation[name] = input[0] + return hook + + for name, module in model.named_modules(): + if (type(module)==nn.Linear) or (type(module)==nn.LeakyReLU) or (type(module)==nn.ELU): + hooks[name] = module.register_forward_hook(get_activation("." + name)) + + for i, batch in enumerate(test_loader): + + if multi_gpu: + X = batch + else: + X = batch.to(device) + + if i==0: + # code can be written better + # basically i run at least one forward pass to get the activations to use their shape in defining the LRP layers + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model(X) + model = model_io(device, model, state_dict, dict(), activation) + explainer_reg = LRP_reg(device, model) + explainer_clf = LRP_clf(device, model) + + else: + pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model.model(X) + + if not osp.isdir(outpath + '/LRP'): + os.makedirs(outpath + '/LRP') + + if args.LRP_reg: + print('Explaining the p4 predictions:') + to_explain_reg = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), + "gen_p4": gen_p4.detach(), "gen_id": gen_ids_one_hot.detach(), + "pred_p4": pred_p4.detach(), "pred_id": pred_ids_one_hot.detach(), + "edge_index": edge_index.detach(), "edge_weight": edge_weight.detach(), "after_message": after_message.detach(), "before_message": before_message.detach(), + "outpath": args.LRP_outpath, "load_model": args.LRP_load_model} + + model.set_dest(to_explain_reg["A"]) + + big_list_reg = explainer_reg.explain(to_explain_reg) + torch.save(big_list_reg, outpath + '/LRP/big_list_reg.pt') + torch.save(to_explain_reg, outpath + '/LRP/to_explain_reg.pt') + + if args.LRP_clf: + print('Explaining the pid predictions:') + to_explain_clf = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), + "gen_p4": gen_p4.detach(), "gen_id": gen_ids_one_hot.detach(), + "pred_p4": pred_p4.detach(), "pred_id": pred_ids_one_hot.detach(), + "edge_index": edge_index.detach(), "edge_weight": edge_weight.detach(), "after_message": after_message.detach(), "before_message": before_message.detach(), + "outpath": args.LRP_outpath, "load_model": args.LRP_load_model} + + model.set_dest(to_explain_clf["A"]) + + big_list_clf = explainer_clf.explain(to_explain_clf) + + torch.save(big_list_clf, outpath + '/LRP/big_list_clf.pt') + 
torch.save(to_explain_clf, outpath + '/LRP/to_explain_clf.pt') + + break # explain only one single event + + if args.make_heatmaps_reg: + # load the necessary R-scores + big_list_reg = torch.load(outpath + '/LRP/big_list_reg.pt', map_location=device) + to_explain_reg = torch.load(outpath + '/LRP/to_explain_reg.pt', map_location=device) + + make_heatmaps(big_list_reg, to_explain_reg, device, outpath, output_dim_id, output_dim_p4, 'regression') + + if args.make_heatmaps_clf: + # load the necessary R-scores + big_list_clf = torch.load(outpath + '/LRP/big_list_clf.pt', map_location=device) + to_explain_clf = torch.load(outpath + '/LRP/to_explain_clf.pt', map_location=device) + + make_heatmaps(big_list_clf, to_explain_clf, device, outpath, output_dim_id, output_dim_p4, 'classification') + +# # ------------------------------------------------------------------------------------------------ +# # if you got all the intermediate R-score heatmaps stored then you can check if these are equal as a check of conservation across all layers: +# print(R16[0].sum(axis=1)[0]) +# print(R15[0].sum(axis=1)[0]) +# print(R14[0].sum(axis=1)[0]) +# print(R13[0].sum(axis=1)[0]) +# print(R13[0].sum(axis=1)[0]) +# print(R12[0].sum(axis=1)[0]) +# print(R11[0].sum(axis=1)[0]) +# print(R10[0].sum(axis=1)[0]) +# print(R9[0].sum(axis=1)[0]) +# print(R8[0].sum(axis=1)[0]) +# print(R_score_layer_before_msg_passing[0][0].sum(axis=0).sum()) +# print(R7[0][0].sum(axis=0).sum()) +# print(R6[0][0].sum(axis=1).sum()) +# print(R5[0][0].sum(axis=1).sum()) +# print(R4[0][0].sum(axis=1).sum()) +# print(R3[0][0].sum(axis=1).sum()) +# print(R2[0][0].sum(axis=1).sum()) +# print(R1[0][0].sum(axis=1).sum()) diff --git a/mlpf/pytorch_delphes/LRP/main_reg.py b/mlpf/pytorch_delphes/LRP/main_reg.py deleted file mode 100644 index 06ae33e82..000000000 --- a/mlpf/pytorch_delphes/LRP/main_reg.py +++ /dev/null @@ -1,413 +0,0 @@ -from glob import glob -import sys, os -import os.path as osp -import pickle as pkl -import _pickle as cPickle -import math, time, tqdm -import numpy as np -import pandas as pd -import sklearn -from sklearn.metrics import accuracy_score, confusion_matrix -import matplotlib, mplhep -matplotlib.use("Agg") -import matplotlib.pyplot as plt - -#Check if the GPU configuration has been provided -import torch -use_gpu = torch.cuda.device_count()>0 -multi_gpu = torch.cuda.device_count()>1 - -try: - if not ("CUDA_VISIBLE_DEVICES" in os.environ): - import setGPU - if multi_gpu: - print('Will use multi_gpu..') - print("Let's use", torch.cuda.device_count(), "GPUs!") - else: - print('Will use single_gpu..') -except Exception as e: - print("Could not import setGPU, running CPU-only") - -#define the global base device -if use_gpu: - device = torch.device('cuda:0') - print("GPU model:", torch.cuda.get_device_name(0)) -else: - device = torch.device('cpu') - -import torch_geometric -import torch.nn as nn -import torch.nn.functional as F -import torch_geometric.transforms as T -from torch.nn import Sequential as Seq, Linear as Lin, ReLU -from torch_scatter import scatter_mean -from torch_geometric.nn.inits import reset -from torch_geometric.data import Data, DataLoader, DataListLoader, Batch -from torch_geometric.nn import GravNetConv -from torch.utils.data import random_split -import torch_cluster -import networkx as nx -from torch_geometric.utils.convert import to_networkx -from torch_geometric.utils import to_dense_adj - -sys.path.insert(1, '../') -sys.path.insert(1, '../../../plotting/') -sys.path.insert(1, '../../../mlpf/plotting/') - 
-import args -from args import parse_args -from graph_data_delphes import PFGraphDataset, one_hot_embedding -from data_preprocessing import data_to_loader_ttbar, data_to_loader_qcd - -from model_LRP_reg import PFNet7 -from LRP_clf_gpu import LRP_clf -from LRP_reg_gpu import LRP_reg - -from model_io import model_io - -# NOTE: this script works by loading an already trained model - -#Get a unique directory name for the model -def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_size, task, title): - model_name = type(model).__name__ - model_params = sum(p.numel() for p in model.parameters()) - import hashlib - model_cfghash = hashlib.blake2b(repr(model).encode()).hexdigest()[:10] - model_user = os.environ['USER'] - - model_fname = '{}_{}_ntrain_{}_nepochs_{}_batch_size_{}_lr_{}_{}'.format( - model_name, - target_type, - n_train, - n_epochs, - batch_size, - lr, - task, - title) - return model_fname - -def map_index_to_pid(id): - if id==0: - return 'null' - if id==1: - return 'charged hadron' - if id==2: - return 'neutral hadron' - if id==3: - return 'photon' - if id==4: - return 'electron' - if id==5: - return 'muon' - -def map_index_to_p4(index): - if index==0: - return 'charge' - if index==1: - return 'pt' - if index==2: - return 'eta' - if index==3: - return 'sin phi' - if index==4: - return 'cos phi' - if index==5: - return 'energy' - -def make_heatmaps(big_list, to_explain, task): - - print(f'Making heatmaps for {task}..') - - X = to_explain["inputs"] - gen_ids_one_hot = to_explain["gen_id"] - pred_ids_one_hot = to_explain["pred_id"] - - gen_ids = gen_ids_one_hot.argmax(axis=1) - pred_ids = pred_ids_one_hot.argmax(axis=1) - - # make directories to hold the heatmaps - for i in range(6): - if not osp.isdir(outpath + '/LRP'): - os.makedirs(outpath + '/LRP') - if not osp.isdir(outpath + f'/LRP/class{str(i)}'): - os.makedirs(outpath + f'/LRP/class{str(i)}') - for j in range(6): - if task=='regression': - if not osp.isdir(outpath + f'/LRP/class{str(i)}'+f'/p4_elem{str(j)}'): - os.makedirs(outpath + f'/LRP/class{str(i)}'+f'/p4_elem{str(j)}') - elif task=='classification': - if not osp.isdir(outpath + f'/LRP/class{str(i)}'+f'/pid{str(j)}'): - os.makedirs(outpath + f'/LRP/class{str(i)}'+f'/pid{str(j)}') - - # attempt to break down big_list onto 6 smaller lists, 1 for each pid - list0, list1, list2, list3, list4, list5 = [], [], [], [], [], [] - dist0, dist1, dist2, dist3, dist4, dist5 = [], [], [], [], [], [] - - for node_i in range(len(big_list)): # iterate over the nodes - - if gen_ids[node_i]==0: # if it's a null then add it to the null list - list0.append(big_list[node_i]) - dist0.append(node_i) - if gen_ids[node_i]==1: # if it's a chhadron then add it to the chhadron list - list1.append(big_list[node_i]) - dist1.append(node_i) - if gen_ids[node_i]==2: # if it's a nhadron then add it to the nhadron list - list2.append(big_list[node_i]) - dist2.append(node_i) - if gen_ids[node_i]==3: # if it's a photon then add it to the photon list - list3.append(big_list[node_i]) - dist3.append(node_i) - if gen_ids[node_i]==4: # if it's a electron then add it to the electron list - list4.append(big_list[node_i]) - dist4.append(node_i) - if gen_ids[node_i]==5: # if it's a muon then add it to the muon list - list5.append(big_list[node_i]) - dist5.append(node_i) - - list = [list0,list1,list2,list3,list4,list5] - dist = [dist0,dist1,dist2,dist3,dist4,dist5] - - if task=='regression': - output_dim = output_dim_p4 - elif task=='classification': - output_dim = output_dim_id - - for pid in 
range(output_dim_id): - if pid!=1: - continue - for node_i in range(len(list[pid])): # iterate over the nodes in each list - print('- making heatmap for', map_index_to_pid(pid), 'node #:', node_i+1, '/', len(list[pid])) - for output_neuron in range(output_dim): - R_cat_feat = torch.cat([list[pid][node_i][output_neuron].to(device), X['x'].to(device), torch.arange(start=0, end=X['x'].shape[0], step=1).float().reshape(-1,1).to(device)], dim=1) - - non_empty_mask = R_cat_feat[:,:12].abs().sum(dim=1).bool() - R_cat_feat_msk = R_cat_feat[non_empty_mask,:] # R_cat_feat masked (non-zero) - pos = dist[pid][node_i] - probability = pred_ids_one_hot[pos] - - def get_type(t): - l = [] - for elem in t: - if elem==1: - l.append('cluster') - if elem==2: - l.append('track') - return l - - node_types = get_type(R_cat_feat_msk[:,12]) - - fig, ax = plt.subplots() - fig.tight_layout() - - if task=='regression': - if (torch.argmax(probability)==pid): - ax.set_title('Heatmap for the "'+map_index_to_p4(output_neuron)+'" prediction of a correctly classified ' + map_index_to_pid(pid)) - else: - ax.set_title('Heatmap for the "'+map_index_to_p4(output_neuron)+'" prediction of an incorrectly classified ' + map_index_to_pid(pid)) - - elif task=='classification': - if (torch.argmax(probability)==pid): - ax.set_title('Heatmap for the "'+map_index_to_pid(output_neuron)+'" prediction of a correctly classified ' + map_index_to_pid(pid)) - else: - ax.set_title('Heatmap for the "'+map_index_to_pid(output_neuron)+'" prediction of an incorrectly classified ' + map_index_to_pid(pid)) - - ### TODO: Not the best way to do it.. I am assuming here that only charged hadrons are connected to all tracks - if pid==1: - features = ["type", " pt", "eta", - "sphi", "cphi", "E", "eta_out", "sphi_out", "cphi_out", "charge", "is_gen_mu", "is_gen_el"] - else: - features = ["type", "Et", "eta", "sphi", "cphi", "E", "Eem", "Ehad", "pad", "pad", "pad", "pad"] - - ax.set_xticks(np.arange(len(features))) - ax.set_yticks(np.arange(len(node_types))) - for col in range(len(features)): - for row in range(len(node_types)): - text = ax.text(col, row, round(R_cat_feat_msk[row,12+col].item(),2), - ha="center", va="center", color="w") - # ... 
and label them with the respective list entries - ax.set_xticklabels(features) - ax.set_yticklabels(node_types) - plt.xlabel("\nposition of node is row # {pos} from the top \n class prediction: {R} \n where prob = [null, chhadron, nhadron, photon, electron, muon]".format(R=[round(num,2) for num in probability.detach().tolist()], pos=((R_cat_feat_msk[:,-1] == pos).nonzero(as_tuple=True)[0].item()+1))) - plt.imshow(torch.abs(R_cat_feat_msk[:,:12]).detach().cpu().numpy(), interpolation="nearest", cmap='copper', aspect='auto') - plt.colorbar() - fig.set_size_inches(12, 12) - if task=='regression': - plt.savefig(outpath + f'/LRP/class{str(pid)}'+f'/p4_elem{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') - elif task=='classification': - plt.savefig(outpath + f'/LRP/class{str(pid)}'+f'/pid{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') - plt.close(fig) - - -if __name__ == "__main__": - - # args = parse_args() - - # the next part initializes some args values (to run the script not from terminal) - class objectview(object): - def __init__(self, d): - self.__dict__ = d - - args = objectview({'n_train': 1, 'n_valid': 1, 'n_test': 2, 'n_epochs': 2, 'patience': 100, 'hidden_dim':256, 'input_encoding': 12, 'encoding_dim': 64, - 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'LRP_dataset': '../../../test_tmp_delphes/data/pythia8_ttbar', 'LRP_dataset_qcd': '../../../test_tmp_delphes/data/pythia8_qcd', - 'LRP_outpath': '../../../prp/models/LRP/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 1, 'dropout': 0, - 'space_dim': 4, 'propagate_dimensions': 22,'nearest': 16, 'overwrite': True, - 'LRP_load_epoch': 9, 'LRP_load_model': 'LRP_reg_PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', - 'explain': False, 'make_heatmaps_clf': True,'make_heatmaps_reg': False, - 'clf': True, 'reg': False}) - - # define the dataset (assumes the data exists as .pt files in "processed") - print('Processing the data..') - full_dataset_ttbar = PFGraphDataset(args.LRP_dataset) - full_dataset_qcd = PFGraphDataset(args.LRP_dataset_qcd) - - # constructs a loader from the data to iterate over batches - print('Constructing data loaders..') - train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) - test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) - - # element parameters - input_dim = 12 - - #one-hot particle ID and momentum - output_dim_id = 6 - output_dim_p4 = 6 - - patience = args.patience - - model_classes = {"PFNet7": PFNet7} - - model_class = model_classes[args.model] - model_kwargs = {'input_dim': input_dim, - 'hidden_dim': args.hidden_dim, - 'input_encoding': args.input_encoding, - 'encoding_dim': args.encoding_dim, - 'output_dim_id': output_dim_id, - 'output_dim_p4': output_dim_p4, - 'space_dim': args.space_dim, - 'propagate_dimensions': args.propagate_dimensions, - 'nearest': args.nearest} - - print('Loading a previously trained model..') - model = model_class(**model_kwargs) - outpath = args.LRP_outpath + args.LRP_load_model - PATH = outpath + '/epoch_' + str(args.LRP_load_epoch) + '_weights.pth' - - state_dict = torch.load(PATH, map_location=device) - - # if model was trained using DataParallel then we have to load it differently - if "DataParallel" in args.LRP_load_model: - state_dict = torch.load(PATH, map_location=device) - from collections import OrderedDict - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - name = k[7:] # remove 
module. - new_state_dict[name] = v - # print('name is:', name) - state_dict=new_state_dict - - model.load_state_dict(state_dict) - model.to(device) - - if args.explain: - model.eval() - print(model) - - # create some hooks to retrieve intermediate activations - activation = {} - hooks={} - - def get_activation(name): - def hook(model, input, output): - activation[name] = input[0] - return hook - - for name, module in model.named_modules(): - if (type(module)==nn.Linear) or (type(module)==nn.LeakyReLU) or (type(module)==nn.ELU): - hooks[name] = module.register_forward_hook(get_activation("." + name)) - - for i, batch in enumerate(train_loader): - - if multi_gpu: - X = batch - else: - X = batch.to(device) - - if i==0: - # code can be written better - # basically i run at least one forward pass to get the activations to use their shape in defining the LRP layers - pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model(X) - model = model_io(model,state_dict,dict(),activation) - explainer_reg = LRP_reg(model) - explainer_clf = LRP_clf(model) - - else: - pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model.model(X) - - if not osp.isdir(outpath + '/LRP'): - os.makedirs(outpath + '/LRP') - - if args.LRP_reg: - print('Explaining the p4 predictions:') - to_explain_reg = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), - "gen_p4": gen_p4.detach(), "gen_id": gen_ids_one_hot.detach(), - "pred_p4": pred_p4.detach(), "pred_id": pred_ids_one_hot.detach(), - "edge_index": edge_index.detach(), "edge_weight": edge_weight.detach(), "after_message": after_message.detach(), "before_message": before_message.detach(), - "outpath": args.LRP_outpath, "load_model": args.LRP_load_model} - - model.set_dest(to_explain_reg["A"]) - - big_list_reg = explainer_reg.explain(to_explain_reg) - torch.save(big_list_reg, outpath + '/LRP/big_list_reg.pt') - torch.save(to_explain_reg, outpath + '/LRP/to_explain_reg.pt') - - if args.LRP_clf: - print('Explaining the pid predictions:') - to_explain_clf = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), - "gen_p4": gen_p4.detach(), "gen_id": gen_ids_one_hot.detach(), - "pred_p4": pred_p4.detach(), "pred_id": pred_ids_one_hot.detach(), - "edge_index": edge_index.detach(), "edge_weight": edge_weight.detach(), "after_message": after_message.detach(), "before_message": before_message.detach(), - "outpath": args.LRP_outpath, "load_model": args.LRP_load_model} - - model.set_dest(to_explain_clf["A"]) - - big_list_clf = explainer_clf.explain(to_explain_clf) - - torch.save(big_list_clf, outpath + '/LRP/big_list_clf.pt') - torch.save(to_explain_clf, outpath + '/LRP/to_explain_clf.pt') - - break # explain only one single event - - if args.make_heatmaps_reg: - # load the necessary R-scores - big_list_reg = torch.load(outpath + '/LRP/big_list_reg.pt', map_location=device) - to_explain_reg = torch.load(outpath + '/LRP/to_explain_reg.pt', map_location=device) - - make_heatmaps(big_list_reg, to_explain_reg, 'regression') - - if args.make_heatmaps_clf: - # load the necessary R-scores - big_list_clf = torch.load(outpath + '/LRP/big_list_clf.pt', map_location=device) - to_explain_clf = torch.load(outpath + '/LRP/to_explain_clf.pt', map_location=device) - - make_heatmaps(big_list_clf, to_explain_clf, 'classification') - -# # ------------------------------------------------------------------------------------------------ 
-# # if you got all the intermediate R-score heatmaps stored then you can check if these are equal as a check of conservation across all layers: -# print(R16[0].sum(axis=1)[0]) -# print(R15[0].sum(axis=1)[0]) -# print(R14[0].sum(axis=1)[0]) -# print(R13[0].sum(axis=1)[0]) -# print(R13[0].sum(axis=1)[0]) -# print(R12[0].sum(axis=1)[0]) -# print(R11[0].sum(axis=1)[0]) -# print(R10[0].sum(axis=1)[0]) -# print(R9[0].sum(axis=1)[0]) -# print(R8[0].sum(axis=1)[0]) -# print(R_score_layer_before_msg_passing[0][0].sum(axis=0).sum()) -# print(R7[0][0].sum(axis=0).sum()) -# print(R6[0][0].sum(axis=1).sum()) -# print(R5[0][0].sum(axis=1).sum()) -# print(R4[0][0].sum(axis=1).sum()) -# print(R3[0][0].sum(axis=1).sum()) -# print(R2[0][0].sum(axis=1).sum()) -# print(R1[0][0].sum(axis=1).sum()) diff --git a/mlpf/pytorch_delphes/__init__.py b/mlpf/pytorch_delphes/__init__.py index 99f5565a9..48f208f03 100644 --- a/mlpf/pytorch_delphes/__init__.py +++ b/mlpf/pytorch_delphes/__init__.py @@ -4,7 +4,6 @@ from pytorch_delphes.model import PFNet7, PFNet7_opt from pytorch_delphes.gravnet import GravNetConv - from pytorch_delphes.gravnet_optimized import GravNetConv_optimized from pytorch_delphes.training import train_loop diff --git a/mlpf/pytorch_delphes/args.py b/mlpf/pytorch_delphes/args.py index 12ec28be4..9c44104fe 100644 --- a/mlpf/pytorch_delphes/args.py +++ b/mlpf/pytorch_delphes/args.py @@ -4,12 +4,13 @@ def parse_args(): parser = argparse.ArgumentParser() - # from raw -> processed - parser.add_argument("--dataset", type=str, default='../../../test_tmp_delphes/data/pythia8_ttbar', help="dataset path", required=False) - parser.add_argument("--dataset_qcd", type=str, default='../../../test_tmp_delphes/data/pythia8_qcd', help="dataset path", required=False) - parser.add_argument("--processed_dir", type=str, help="processed", required=False, default=None) - parser.add_argument("--num-files-merge", type=int, default=10, help="number of files to merge") - parser.add_argument("--num-proc", type=int, default=24, help="number of processes") + parser.add_argument("--dataset", type=str, default='../test_tmp_delphes/data/pythia8_ttbar', help="training dataset path", required=True) + parser.add_argument("--dataset_qcd", type=str, default='../test_tmp_delphes/data/pythia8_qcd', help="testing dataset path", required=True) + parser.add_argument("--outpath", type=str, default = '../test_tmp_delphes/experiments/', help="output folder", required=True) + parser.add_argument("--title", type=str, default='', help="Appends this title to the model's name") + + parser.add_argument("--overwrite", action=BoolArg, default=False, help="Overwrites the model if True") + parser.add_argument("--optimized", action=BoolArg, default=False, help="Uses the optimized version of knn") # for training parser.add_argument("--train", action=BoolArg, default=True, help="Trains the model") @@ -17,37 +18,32 @@ def parse_args(): parser.add_argument("--n_valid", type=int, default=1, help="number of data files to use for validation.. each file contains 100 events") parser.add_argument("--n_test", type=int, default=2, help="number of data files to use for testing.. 
each file contains 100 events") parser.add_argument("--n_epochs", type=int, default=1, help="number of training epochs") - parser.add_argument("--patience", type=int, default=100, help="patience before early stopping") + parser.add_argument("--batch_size", type=int, default=1, help="Number of .pt files to load in parallel") + parser.add_argument("--hidden_dim", type=int, default=256, help="hidden dimension") parser.add_argument("--hidden_dim_nn1", type=int, default=64, help="hidden dimension") parser.add_argument("--input_encoding", type=int, default=12, help="use an input encoding layer") parser.add_argument("--encoding_dim", type=int, default=64, help="encoded element dimension") - parser.add_argument("--embedding_dim", type=int, default=0, help="embedding dimension of the type feature (prefered equal to be 3)") - parser.add_argument("--encoding_of_clusters", action=BoolArg, default=False, help="Trains an MLP to encode clusters") - parser.add_argument("--batch_size", type=int, default=1, help="Number of .pt files to load in parallel") - parser.add_argument("--model", type=str, help="type of model to use", default="PFNet7") + parser.add_argument("--space_dim", type=int, default=4, help="Spatial dimension for clustering in gravnet layer") + parser.add_argument("--propagate_dimensions", type=int, default=22, help="The number of features to be propagated between the vertices") + parser.add_argument("--nearest", type=int, default=16, help="k nearest neighbors in gravnet layer") + parser.add_argument("--nn1", action=BoolArg, default=True, help="Adds an encoder/decoder step before gravnet..") + parser.add_argument("--nn3", action=BoolArg, default=True, help="Adds the network to regress p4..") + parser.add_argument("--nn4", action=BoolArg, default=True, help="Adds an extra network for the dnn model..") + + parser.add_argument("--patience", type=int, default=100, help="patience before early stopping") parser.add_argument("--target", type=str, choices=["cand", "gen"], help="Regress to PFCandidates or GenParticles", default="gen") - parser.add_argument("--outpath", type=str, default = '../../../test_tmp_delphes/experiments/', help="Output folder") parser.add_argument("--optimizer", type=str, default='adam', choices=["adam", "adamw"], help="optimizer to use") parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") parser.add_argument("--alpha", type=float, default=2e-4, help="Loss multiplier for pdg-id classification.. 
recall: loss = clf + alpha*reg") - parser.add_argument("--dropout", type=float, default=0.5, help="Dropout rate") - parser.add_argument("--space_dim", type=int, default=4, help="Spatial dimension for clustering in gravnet layer") - parser.add_argument("--propagate_dimensions", type=int, default=22, help="The number of features to be propagated between the vertices") - parser.add_argument("--nearest", type=int, default=16, help="k nearest neighbors in gravnet layer") - parser.add_argument("--overwrite", action=BoolArg, default=False, help="Overwrites the model if True") + + parser.add_argument("--classification_only", action=BoolArg, default=False, help="Check to train for classification only (no regression)") + parser.add_argument("--regression_only", action=BoolArg, default=False, help="Check to train for regression only (no classification)") + + # for loading a pre-trained model parser.add_argument("--load", action=BoolArg, default=False, help="Load the model (no training)") parser.add_argument("--load_model", type=str, help="Which model to load", default="/PFNet7_cand_ntrain_2") parser.add_argument("--load_epoch", type=int, default=0, help="Which epoch of the model to load for evaluation") - parser.add_argument("--classification_only", action=BoolArg, default=False, help="Check to train for classification only (no regression)") - parser.add_argument("--regression_only", action=BoolArg, default=False, help="Check to train for regression only (no classification)") - parser.add_argument("--nn1", action=BoolArg, default=True, help="Adds an encoder/decoder step before gravnet..") - parser.add_argument("--nn3", action=BoolArg, default=True, help="Adds the network to regress p4..") - parser.add_argument("--nn4", action=BoolArg, default=True, help="Adds an extra network for the dnn model..") - parser.add_argument("--nn0track", action=BoolArg, default=False, help="Adds an initial network that encode the tracks..") - parser.add_argument("--nn0cluster", action=BoolArg, default=False, help="Adds an initial network that encode the clusters..") - parser.add_argument("--title", type=str, default='', help="Appends this title to the model's name") - parser.add_argument("--optimized", action=BoolArg, default=False, help="Uses the optimized version of knn") # for evaluation: making predictions & making plots parser.add_argument("--make_predictions_train", action=BoolArg, default=False, help="make predictions on training data..") @@ -57,18 +53,6 @@ def parse_args(): parser.add_argument("--make_plots_valid", action=BoolArg, default=False, help="make plots on validation data..") parser.add_argument("--make_plots_test", action=BoolArg, default=True, help="make plots on testing data..") - # for LRP - parser.add_argument("--explain", action=BoolArg, default=True, help="General setup mode: if True then you want to explain.. 
if False then you will load an already explained model (already made R-scores)..") - parser.add_argument("--LRP_load_model", type=str, default="/PFNet7_cand_ntrain_2", help="Loads the model to explain", required=False) - parser.add_argument("--LRP_load_epoch", type=int, default=0, help="Loads the epoch after which to explain") - parser.add_argument("--LRP_reg", action=BoolArg, default=True, help="Runs LRP for interpreting the regression part..") - parser.add_argument("--make_heatmaps_reg", action=BoolArg, default=True, help="Constructs heatmaps for the regressed p4 (must run with explain=True or else you load a pre-explained model with explain=False)..") - parser.add_argument("--LRP_clf", action=BoolArg, default=True, help="Runs LRP for interpreting the classification part..") - parser.add_argument("--make_heatmaps_clf", action=BoolArg, default=True, help="Constructs heatmaps for the classified pid (must run with explain=True or else you load a pre-explained model with explain=False)..") - parser.add_argument("--LRP_outpath", type=str, default = '../../../../test_tmp_delphes/experiments/', help="Output folder") - parser.add_argument("--LRP_dataset", type=str, default='../../../../test_tmp_delphes/data/pythia8_ttbar', help="dataset path", required=False) - parser.add_argument("--LRP_dataset_qcd", type=str, default='../../../../test_tmp_delphes/data/pythia8_qcd', help="dataset path", required=False) - args = parser.parse_args() return args diff --git a/mlpf/pytorch_pipeline.py b/mlpf/pytorch_pipeline.py index 6a28e6922..476832fea 100644 --- a/mlpf/pytorch_pipeline.py +++ b/mlpf/pytorch_pipeline.py @@ -1,6 +1,5 @@ from glob import glob import sys, os - import os.path as osp import pickle as pkl import math, time, tqdm @@ -38,8 +37,8 @@ import torch_geometric -import pytorch_delphes -import plotting +from pytorch_delphes import parse_args, PFGraphDataset, data_to_loader_ttbar, data_to_loader_qcd, PFNet7, PFNet7_opt, train_loop, make_predictions +from plotting import make_plots #Ignore divide by 0 errors np.seterr(divide='ignore', invalid='ignore') @@ -64,34 +63,49 @@ def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_si title) return model_fname +def make_directories_for_plots(outpath, which_data): + if not osp.isdir(outpath+'/' + which_data + '_loader'): + os.makedirs(outpath+'/' + which_data + '_loader') + if not osp.isdir(outpath+'/' + which_data + '_loader/resolution_plots'): + os.makedirs(outpath+'/' + which_data + '_loader/resolution_plots') + if not osp.isdir(outpath+'/' + which_data + '_loader/distribution_plots'): + os.makedirs(outpath+'/' + which_data + '_loader/distribution_plots') + if not osp.isdir(outpath+'/' + which_data + '_loader/multiplicity_plots'): + os.makedirs(outpath+'/' + which_data + '_loader/multiplicity_plots') + if not osp.isdir(outpath+'/' + which_data + '_loader/efficiency_plots'): + os.makedirs(outpath+'/' + which_data + '_loader/efficiency_plots') + if __name__ == "__main__": - args = pytorch_delphes.parse_args() + args = parse_args() # # the next part initializes some args values (to run the script not from terminal) # class objectview(object): # def __init__(self, d): # self.__dict__ = d # - # args = objectview({'train': True, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 2, 'patience': 100, 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, - # 'batch_size': 1, 'model': 'PFNet7', 'target': 'gen', 'dataset': '../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': 
'../test_tmp_delphes/data/pythia8_qcd', - # 'outpath': '../prp/models/yee/', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, - # 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, 'overwrite': True, - # 'load': False, 'load_epoch': 1, 'load_model': 'PFNet7_gen_ntrain_1_nepochs_2_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', - # 'classification_only': False, 'nn1': True, 'nn3': True, 'encoding_of_clusters': False, 'embedding_dim': 0, 'nn0track': False, 'nn0cluster': False, 'title': 'noembeddings', - # 'make_predictions_train': False, 'make_plots_train': False, 'make_predictions_valid': False, 'make_plots_valid': False, 'make_predictions_test': True, 'make_plots_test': True, - # 'optimized': False}) + # args = objectview({'train': False, 'n_train': 1, 'n_valid': 1, 'n_test': 1, 'n_epochs': 5, 'batch_size': 1, + # 'hidden_dim': 256, 'hidden_dim_nn1': 64, 'input_encoding': 12, 'encoding_dim': 64, 'space_dim': 4, 'propagate_dimensions': 22, 'nearest': 16, + # 'patience': 100, 'target': 'gen', 'optimizer': 'adam', 'lr': 0.001, 'alpha': 2e-4, + # 'dataset': '../test_tmp_delphes/data/pythia8_ttbar', 'dataset_qcd': '../test_tmp_delphes/data/pythia8_qcd', + # 'outpath': '../test_tmp_delphes/experiments/yee/', 'title': 'noembeddings', + # 'classification_only': False, 'nn1': True, 'nn3': True, + # 'load': True, 'load_epoch': 14, 'load_model': 'PFNet7_opt_gen_ntrain_1_nepochs_15_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + # 'make_predictions_train': False, 'make_plots_train': False, + # 'make_predictions_valid': False, 'make_plots_valid': False, + # 'make_predictions_test': True, 'make_plots_test': True, + # 'optimized': False, 'overwrite': False}) # define the dataset (assumes the data exists as .pt files in "processed") print('Processing the data..') - full_dataset_ttbar = pytorch_delphes.PFGraphDataset(args.dataset) - full_dataset_qcd = pytorch_delphes.PFGraphDataset(args.dataset_qcd) + full_dataset_ttbar = PFGraphDataset(args.dataset) + full_dataset_qcd = PFGraphDataset(args.dataset_qcd) # constructs a loader from the data to iterate over batches print('Constructing data loaders..') - train_loader, valid_loader = pytorch_delphes.data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) - test_loader = pytorch_delphes.data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) + train_loader, valid_loader = data_to_loader_ttbar(full_dataset_ttbar, args.n_train, args.n_valid, batch_size=args.batch_size) + test_loader = data_to_loader_qcd(full_dataset_qcd, args.n_test, batch_size=args.batch_size) # element parameters input_dim = 12 @@ -101,31 +115,20 @@ def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_si output_dim_p4 = 6 if args.optimized: - model_classes = {"PFNet7": pytorch_delphes.PFNet7_opt} + model_class = PFNet7_opt else: - model_classes = {"PFNet7": pytorch_delphes.PFNet7} - - model_class = model_classes[args.model] - model_kwargs = {'input_dim': input_dim, - 'hidden_dim': args.hidden_dim, - 'hidden_dim_nn1': args.hidden_dim_nn1, - 'input_encoding': args.input_encoding, - 'encoding_dim': args.encoding_dim, - 'output_dim_id': output_dim_id, - 'output_dim_p4': output_dim_p4, - 'space_dim': args.space_dim, - 'propagate_dimensions': args.propagate_dimensions, - 'nearest': args.nearest, - 'target': args.target, - 'nn1': args.nn1, - 'nn3': args.nn3} + model_class = PFNet7 if args.load: - print('Loading a previously trained model..') - model = 
model_class(**model_kwargs) outpath = args.outpath + args.load_model PATH = outpath + '/epoch_' + str(args.load_epoch) + '_weights.pth' + print('Loading a previously trained model..') + with open(outpath + '/model_kwargs.pkl', 'rb') as f: + model_kwargs = pkl.load(f) + + model = model_class(**model_kwargs) + state_dict = torch.load(PATH, map_location=device) if "DataParallel" in args.load_model: # if the model was trained using DataParallel then we do this @@ -152,6 +155,20 @@ def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_si elif args.train: #instantiate the model print('Instantiating a model..') + model_kwargs = {'input_dim': input_dim, + 'hidden_dim': args.hidden_dim, + 'hidden_dim_nn1': args.hidden_dim_nn1, + 'input_encoding': args.input_encoding, + 'encoding_dim': args.encoding_dim, + 'output_dim_id': output_dim_id, + 'output_dim_p4': output_dim_p4, + 'space_dim': args.space_dim, + 'propagate_dimensions': args.propagate_dimensions, + 'nearest': args.nearest, + 'target': args.target, + 'nn1': args.nn1, + 'nn3': args.nn3} + model = model_class(**model_kwargs) if multi_gpu: @@ -162,11 +179,12 @@ def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_si model.to(device) if args.train: - args.title=args.title+'noskip' if args.nn1: args.title=args.title+'_nn1' if args.nn3: args.title=args.title+'_nn3' + if args.load: + args.title=args.title+'_retrain' if args.classification_only: model_fname = get_model_fname(args.dataset, model, args.n_train, args.n_epochs, args.lr, args.target, args.batch_size, args.alpha, "clf", args.title) @@ -202,69 +220,39 @@ def get_model_fname(dataset, model, n_train, n_epochs, lr, target_type, batch_si print(model_fname) model.train() - pytorch_delphes.train_loop(model, device, multi_gpu, - train_loader, valid_loader, test_loader, - args.n_epochs, args.patience, optimizer, args.alpha, args.target, - output_dim_id, args.classification_only, outpath) + train_loop(model, device, multi_gpu, + train_loader, valid_loader, test_loader, + args.n_epochs, args.patience, optimizer, args.alpha, args.target, + output_dim_id, args.classification_only, outpath) model.eval() # evaluate on training data.. - if not osp.isdir(outpath+'/train_loader'): - os.makedirs(outpath+'/train_loader') - if not osp.isdir(outpath+'/train_loader/resolution_plots'): - os.makedirs(outpath+'/train_loader/resolution_plots') - if not osp.isdir(outpath+'/train_loader/distribution_plots'): - os.makedirs(outpath+'/train_loader/distribution_plots') - if not osp.isdir(outpath+'/train_loader/multiplicity_plots'): - os.makedirs(outpath+'/train_loader/multiplicity_plots') - if not osp.isdir(outpath+'/train_loader/efficiency_plots'): - os.makedirs(outpath+'/train_loader/efficiency_plots') - + make_directories_for_plots(outpath, 'train') if args.make_predictions_train: - pytorch_delphes.make_predictions(model, multi_gpu, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + make_predictions(model, multi_gpu, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") if args.make_plots_train: - plotting.make_plots(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") + make_plots(model, train_loader, outpath+'/train_loader', args.target, device, args.n_epochs, which_data="training data") # evaluate on validation data.. 
- if not osp.isdir(outpath+'/valid_loader'): - os.makedirs(outpath+'/valid_loader') - if not osp.isdir(outpath+'/valid_loader/resolution_plots'): - os.makedirs(outpath+'/valid_loader/resolution_plots') - if not osp.isdir(outpath+'/valid_loader/distribution_plots'): - os.makedirs(outpath+'/valid_loader/distribution_plots') - if not osp.isdir(outpath+'/valid_loader/multiplicity_plots'): - os.makedirs(outpath+'/valid_loader/multiplicity_plots') - if not osp.isdir(outpath+'/valid_loader/efficiency_plots'): - os.makedirs(outpath+'/valid_loader/efficiency_plots') - + make_directories_for_plots(outpath, 'valid') if args.make_predictions_valid: - pytorch_delphes.make_predictions(model, multi_gpu, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + make_predictions(model, multi_gpu, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") if args.make_plots_valid: - plotting.make_plots(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") + make_plots(model, valid_loader, outpath+'/valid_loader', args.target, device, args.n_epochs, which_data="validation data") # evaluate on testing data.. - if not osp.isdir(outpath+'/test_loader'): - os.makedirs(outpath+'/test_loader') - if not osp.isdir(outpath+'/test_loader/resolution_plots'): - os.makedirs(outpath+'/test_loader/resolution_plots') - if not osp.isdir(outpath+'/test_loader/distribution_plots'): - os.makedirs(outpath+'/test_loader/distribution_plots') - if not osp.isdir(outpath+'/test_loader/multiplicity_plots'): - os.makedirs(outpath+'/test_loader/multiplicity_plots') - if not osp.isdir(outpath+'/test_loader/efficiency_plots'): - os.makedirs(outpath+'/test_loader/efficiency_plots') - + make_directories_for_plots(outpath, 'test') if args.make_predictions_test: if args.load: - pytorch_delphes.make_predictions(model, multi_gpu, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + make_predictions(model, multi_gpu, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") else: - pytorch_delphes.make_predictions(model, multi_gpu, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + make_predictions(model, multi_gpu, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") if args.make_plots_test: if args.load: - plotting.make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") + make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.load_epoch, which_data="testing data") else: - plotting.make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") + make_plots(model, test_loader, outpath+'/test_loader', args.target, device, args.n_epochs, which_data="testing data") ## ----------------------------------------------------------- From f199701fef858ba68149877a3477bf608ff5e976 Mon Sep 17 00:00:00 2001 From: Farouk Date: Wed, 25 Aug 2021 10:58:25 -0700 Subject: [PATCH 103/157] lower case lrp + fix local_test.sh --- mlpf/LRP/LRP_clf_gpu.py | 14 +++---- mlpf/LRP/LRP_reg_gpu.py | 24 ++++++------ mlpf/LRP/__init__.py | 14 +++---- mlpf/LRP/args.py | 16 ++++---- mlpf/LRP/model_LRP.py | 4 +- mlpf/LRP/plots.py | 20 +++++----- mlpf/LRP_pipeline.py | 56 ++++++++++++++------------- 
scripts/local_test_delphes_pytorch.sh | 6 +-- 8 files changed, 79 insertions(+), 75 deletions(-) diff --git a/mlpf/LRP/LRP_clf_gpu.py b/mlpf/LRP/LRP_clf_gpu.py index a844e5b64..831a8fba7 100644 --- a/mlpf/LRP/LRP_clf_gpu.py +++ b/mlpf/LRP/LRP_clf_gpu.py @@ -16,20 +16,20 @@ from torch_geometric.utils.convert import to_networkx from torch_geometric.utils import to_dense_adj -import LRP +import lrp -class LRP_clf: +class lrp_clf: EPSILON=1e-9 - def __init__(self, device, model:LRP.model_io): + def __init__(self, device, model:lrp.model_io): self.device=device self.model=model - def register_model(model:LRP.model_io): + def register_model(model:lrp.model_io): self.model=model """ - LRP rules + lrp rules """ # this rule is wrong.. it is just here because it is much quicker for experimentation and gives the correct dimensions needed for debugging (if you haven't hit the message passing step) @@ -166,7 +166,7 @@ def message_passing_rule(self, layer, input, R, big_list, edge_index, edge_weigh print("- Adjacency matrix is correctly computed") # # the following saves a version of the R-scores before the message passing - # torch.save(big_list, outpath + '/LRP/R_score_layer_before_msg_passing.pt') + # torch.save(big_list, outpath + '/lrp/R_score_layer_before_msg_passing.pt') # modify the big tensor based on message passing rule for node_i in tqdm(range(len(big_list))): @@ -188,7 +188,7 @@ def explain(self, to_explain): print('Total number of layers (including activation layers):', start_index) # store the R-scores for the output layer (they are basically the model predictions) - torch.save(to_explain["pred_id"].detach(), outpath + f'/LRP/R_score_layer{start_index+1}.pt') + torch.save(to_explain["pred_id"].detach(), outpath + f'/lrp/R_score_layer{start_index+1}.pt') ### loop over each single layer big_list = [] diff --git a/mlpf/LRP/LRP_reg_gpu.py b/mlpf/LRP/LRP_reg_gpu.py index a67a5c91a..9f3ca31ab 100644 --- a/mlpf/LRP/LRP_reg_gpu.py +++ b/mlpf/LRP/LRP_reg_gpu.py @@ -16,20 +16,20 @@ from torch_geometric.utils.convert import to_networkx from torch_geometric.utils import to_dense_adj -import LRP +import lrp -class LRP_reg: +class lrp_reg: EPSILON=1e-9 - def __init__(self, device, model:LRP.model_io): + def __init__(self, device, model:lrp.model_io): self.device=device self.model=model - def register_model(model:LRP.model_io): + def register_model(model:lrp.model_io): self.model=model """ - LRP rules + lrp rules """ # this rule is wrong.. 
it is just here because it is much quicker for experimentation and gives the correct dimensions needed for debugging (if you haven't hit the message passing step) @@ -185,7 +185,7 @@ def message_passing_rule(self, layer, input, R, big_list, edge_index, edge_weigh print("- Adjacency matrix is correctly computed") # # the following saves a version of the R-scores before the message passing - # torch.save(big_list, outpath + '/LRP/R_score_layer_before_msg_passing.pt') + # torch.save(big_list, outpath + '/lrp/R_score_layer_before_msg_passing.pt') # modify the big tensor based on message passing rule for node_i in tqdm(range(len(big_list))): @@ -212,7 +212,7 @@ def explain(self, print('Total number of layers (including activation layers):', start_index) # store the R-scores for the output layer (they are basically the model predictions) - torch.save(to_explain["pred_p4"].detach(), outpath + f'/LRP/R_score_layer{start_index+1}.pt') + torch.save(to_explain["pred_p4"].detach(), outpath + f'/lrp/R_score_layer{start_index+1}.pt') ### loop over each single layer big_list = [] @@ -225,7 +225,7 @@ def explain(self, return big_list # returns the heatmaps for layer0 (i.e. input features) def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_index, index=None, name=None): - # preparing variables required for computing LRP + # preparing variables required for computing lrp layer = self.model.get_layer(index=index,name=name) if name is None: @@ -246,8 +246,8 @@ def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_in print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer} - Skip connection") input_relevance, pid_relevance, embedding_relevance = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True, skip_connection=True) - torch.save(input_relevance, outpath + f'/LRP/input_relevance.pt') - torch.save(embedding_relevance, outpath + f'/LRP/embedding_relevance.pt') + torch.save(input_relevance, outpath + f'/lrp/input_relevance.pt') + torch.save(embedding_relevance, outpath + f'/lrp/embedding_relevance.pt') return pid_relevance, big_list @@ -257,7 +257,7 @@ def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_in R = self.eps_rule(self, layer, input, R, index, output_layer_bool, activation_layer=False, print_statement=True) # add the embedding_relevance computed in the nn3.0 skip connection - embedding_relevance = torch.load(outpath + f'/LRP/embedding_relevance.pt', map_location=torch.self.device('cpu')) + embedding_relevance = torch.load(outpath + f'/lrp/embedding_relevance.pt', map_location=torch.self.device('cpu')) for i in range(len(R)): R[i] = R[i] + embedding_relevance[i] @@ -269,7 +269,7 @@ def explain_single_layer(self, R, to_explain, big_list, outpath, output_layer_in print(f"Explaining layer {output_layer_index+1-index}/{output_layer_index-1}: {layer}") # add the input_relevance computed in the nn3.0 skip connection - input_relevance = torch.load(outpath + f'/LRP/input_relevance.pt', map_location=torch.self.device('cpu')) + input_relevance = torch.load(outpath + f'/lrp/input_relevance.pt', map_location=torch.self.device('cpu')) for node_i in tqdm(range(len(big_list))): big_list[node_i] = self.eps_rule(self, layer, input, big_list[node_i], index, output_layer_bool, activation_layer=False, print_statement=False) diff --git a/mlpf/LRP/__init__.py b/mlpf/LRP/__init__.py index f8e5e19ee..d59653ee0 100644 --- a/mlpf/LRP/__init__.py +++ 
b/mlpf/LRP/__init__.py @@ -1,8 +1,8 @@ -from LRP.args import parse_args -from LRP.plots import make_heatmaps -from LRP.model_io import model_io -from LRP.model_LRP import PFNet7 -from LRP.gravnet_LRP import GravNetConv +from lrp.args import parse_args +from lrp.plots import make_heatmaps +from lrp.model_io import model_io +from lrp.model_lrp import PFNet7 +from lrp.gravnet_lrp import GravNetConv -from LRP.LRP_clf_gpu import LRP_clf -from LRP.LRP_reg_gpu import LRP_reg +from lrp.lrp_clf_gpu import lrp_clf +from lrp.lrp_reg_gpu import lrp_reg diff --git a/mlpf/LRP/args.py b/mlpf/LRP/args.py index 8840fec67..61b0fed9e 100644 --- a/mlpf/LRP/args.py +++ b/mlpf/LRP/args.py @@ -4,11 +4,11 @@ def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--LRP_dataset_qcd", type=str, default='../test_tmp_delphes/data/pythia8_qcd', help="dataset path", required=True) - parser.add_argument("--LRP_outpath", type=str, default = '../test_tmp_delphes/experiments/LRP/', help="Output folder for the LRP relevance scores and heatmaps", required=True) + parser.add_argument("--lrp_dataset_qcd", type=str, default='../test_tmp_delphes/data/pythia8_qcd', help="dataset path", required=True) + parser.add_argument("--lrp_outpath", type=str, default = '../test_tmp_delphes/experiments/lrp/', help="Output folder for the lrp relevance scores and heatmaps", required=True) # usual specs - parser.add_argument("--n_test", type=int, default=2, help="number of data files to use for testing LRP.. each file contains 100 events") + parser.add_argument("--n_test", type=int, default=2, help="number of data files to use for testing lrp.. each file contains 100 events") parser.add_argument("--batch_size", type=int, default=1, help="Number of .pt files to load in parallel") parser.add_argument("--hidden_dim", type=int, default=256, help="hidden dimension") @@ -19,13 +19,13 @@ def parse_args(): parser.add_argument("--propagate_dimensions", type=int, default=22, help="The number of features to be propagated between the vertices") parser.add_argument("--nearest", type=int, default=16, help="k nearest neighbors in gravnet layer") - # extras for LRP + # extras for lrp parser.add_argument("--explain", action=BoolArg, default=True, help="General setup mode: if True then you want to explain.. if False then you will load an already explained model (already made R-scores)..") - parser.add_argument("--LRP_reg", action=BoolArg, default=True, help="Works only if --explain is True.. Runs LRP for interpreting the regression part..") - parser.add_argument("--LRP_clf", action=BoolArg, default=True, help="Works only if --explain is True.. Runs LRP for interpreting the classification part..") + parser.add_argument("--lrp_reg", action=BoolArg, default=True, help="Works only if --explain is True.. Runs lrp for interpreting the regression part..") + parser.add_argument("--lrp_clf", action=BoolArg, default=True, help="Works only if --explain is True.. 
Runs lrp for interpreting the classification part..") - parser.add_argument("--LRP_load_model", type=str, default="/PFNet7_cand_ntrain_2", help="Loads the model to explain (or only make_heatmaps)", required=False) - parser.add_argument("--LRP_load_epoch", type=int, default=0, help="Loads the epoch after which to explain (or only make_heatmaps)") + parser.add_argument("--lrp_load_model", type=str, default="/PFNet7_cand_ntrain_2", help="Loads the model to explain (or only make_heatmaps)", required=False) + parser.add_argument("--lrp_load_epoch", type=int, default=0, help="Loads the epoch after which to explain (or only make_heatmaps)") parser.add_argument("--make_heatmaps_reg", action=BoolArg, default=True, help="Constructs heatmaps for the regressed p4 (must run with explain=True or else you load a pre-explained model with explain=False)..") parser.add_argument("--make_heatmaps_clf", action=BoolArg, default=True, help="Constructs heatmaps for the classified pid (must run with explain=True or else you load a pre-explained model with explain=False)..") diff --git a/mlpf/LRP/model_LRP.py b/mlpf/LRP/model_LRP.py index af6293f44..502e7b93a 100644 --- a/mlpf/LRP/model_LRP.py +++ b/mlpf/LRP/model_LRP.py @@ -11,7 +11,7 @@ from torch_geometric.nn import TopKPooling, SAGPooling, SGConv from torch.nn import Sequential as Seq, Linear as Lin, ReLU -import LRP +import lrp #Model with gravnet clustering class PFNet7(nn.Module): @@ -38,7 +38,7 @@ def __init__(self, ) # (2) CNN: Gravnet layer - self.conv1 = LRP.GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) + self.conv1 = lrp.GravNetConv(input_encoding, encoding_dim, space_dim, propagate_dimensions, nearest) # (3) DNN layer: classifying PID self.nn2 = nn.Sequential( diff --git a/mlpf/LRP/plots.py b/mlpf/LRP/plots.py index 406db618b..d9927e1d3 100644 --- a/mlpf/LRP/plots.py +++ b/mlpf/LRP/plots.py @@ -55,17 +55,17 @@ def make_heatmaps(big_list, to_explain, device, outpath, output_dim_id, output_d # make directories to hold the heatmaps for i in range(6): - if not osp.isdir(outpath + '/LRP'): - os.makedirs(outpath + '/LRP') - if not osp.isdir(outpath + f'/LRP/class{str(i)}'): - os.makedirs(outpath + f'/LRP/class{str(i)}') + if not osp.isdir(outpath + '/lrp'): + os.makedirs(outpath + '/lrp') + if not osp.isdir(outpath + f'/lrp/class{str(i)}'): + os.makedirs(outpath + f'/lrp/class{str(i)}') for j in range(6): if task=='regression': - if not osp.isdir(outpath + f'/LRP/class{str(i)}'+f'/p4_elem{str(j)}'): - os.makedirs(outpath + f'/LRP/class{str(i)}'+f'/p4_elem{str(j)}') + if not osp.isdir(outpath + f'/lrp/class{str(i)}'+f'/p4_elem{str(j)}'): + os.makedirs(outpath + f'/lrp/class{str(i)}'+f'/p4_elem{str(j)}') elif task=='classification': - if not osp.isdir(outpath + f'/LRP/class{str(i)}'+f'/pid{str(j)}'): - os.makedirs(outpath + f'/LRP/class{str(i)}'+f'/pid{str(j)}') + if not osp.isdir(outpath + f'/lrp/class{str(i)}'+f'/pid{str(j)}'): + os.makedirs(outpath + f'/lrp/class{str(i)}'+f'/pid{str(j)}') # attempt to break down big_list onto 6 smaller lists, 1 for each pid list0, list1, list2, list3, list4, list5 = [], [], [], [], [], [] @@ -160,7 +160,7 @@ def get_type(t): plt.colorbar() fig.set_size_inches(12, 12) if task=='regression': - plt.savefig(outpath + f'/LRP/class{str(pid)}'+f'/p4_elem{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') + plt.savefig(outpath + f'/lrp/class{str(pid)}'+f'/p4_elem{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') elif task=='classification': - plt.savefig(outpath + 
f'/LRP/class{str(pid)}'+f'/pid{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') + plt.savefig(outpath + f'/lrp/class{str(pid)}'+f'/pid{str(output_neuron)}'+f'/sample{str(node_i)}.jpg') plt.close(fig) diff --git a/mlpf/LRP_pipeline.py b/mlpf/LRP_pipeline.py index 73cf8abf1..e57d9c8e7 100644 --- a/mlpf/LRP_pipeline.py +++ b/mlpf/LRP_pipeline.py @@ -39,7 +39,7 @@ import torch.nn as nn from pytorch_delphes import PFGraphDataset, data_to_loader_ttbar, data_to_loader_qcd -from LRP import parse_args, make_heatmaps, model_io, PFNet7, LRP_clf, LRP_reg +from lrp import parse_args, make_heatmaps, model_io, PFNet7, lrp_clf, lrp_reg # NOTE: this script works by loading an already trained model with very specefic specs @@ -54,15 +54,15 @@ # # args = objectview({'n_test': 2, 'batch_size': 1,' hidden_dim':256, 'hidden_dim_nn1': 64, # 'input_encoding': 12, 'encoding_dim': 64, 'space_dim': 4, 'propagate_dimensions': 22,'nearest': 16, - # 'LRP_dataset': '../test_tmp_delphes/data/pythia8_ttbar', 'LRP_dataset_qcd': '../test_tmp_delphes/data/pythia8_qcd', - # 'LRP_outpath': '../test_tmp_delphes/experiments/LRP/', - # 'LRP_load_epoch': 9, 'LRP_load_model': 'LRP_reg_PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', - # 'explain': False, 'LRP_clf': False, 'LRP_reg': False, + # 'lrp_dataset': '../test_tmp_delphes/data/pythia8_ttbar', 'lrp_dataset_qcd': '../test_tmp_delphes/data/pythia8_qcd', + # 'lrp_outpath': '../test_tmp_delphes/experiments/lrp/', + # 'lrp_load_epoch': 9, 'lrp_load_model': 'lrp_reg_PFNet7_gen_ntrain_1_nepochs_10_batch_size_1_lr_0.001_alpha_0.0002_both_noembeddingsnoskip_nn1_nn3', + # 'explain': True, 'lrp_clf': False, 'lrp_reg': False, # 'make_heatmaps_clf': True,'make_heatmaps_reg': True}) # define the dataset (assumes the data exists as .pt files in "processed") print('Processing the data..') - full_dataset_qcd = PFGraphDataset(args.LRP_dataset_qcd) + full_dataset_qcd = PFGraphDataset(args.lrp_dataset_qcd) # constructs a loader from the data to iterate over batches print('Constructing data loader..') @@ -75,8 +75,8 @@ output_dim_id = 6 output_dim_p4 = 6 - outpath = args.LRP_outpath + args.LRP_load_model - PATH = outpath + '/epoch_' + str(args.LRP_load_epoch) + '_weights.pth' + outpath = args.lrp_outpath + args.lrp_load_model + PATH = outpath + '/epoch_' + str(args.lrp_load_epoch) + '_weights.pth' # loading the model print('Loading a previously trained model..') @@ -88,7 +88,7 @@ state_dict = torch.load(PATH, map_location=device) # if model was trained using DataParallel then we have to load it differently - if "DataParallel" in args.LRP_load_model: + if "DataParallel" in args.lrp_load_model: state_dict = torch.load(PATH, map_location=device) from collections import OrderedDict new_state_dict = OrderedDict() @@ -127,60 +127,64 @@ def hook(model, input, output): if i==0: # code can be written better - # basically i run at least one forward pass to get the activations to use their shape in defining the LRP layers + # basically i run at least one forward pass to get the activations to use their shape in defining the lrp layers pred_ids_one_hot, pred_p4, gen_ids_one_hot, gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model(X) model = model_io(device, model, state_dict, dict(), activation) - explainer_reg = LRP_reg(device, model) - explainer_clf = LRP_clf(device, model) + explainer_reg = lrp_reg(device, model) + explainer_clf = lrp_clf(device, model) else: pred_ids_one_hot, pred_p4, gen_ids_one_hot, 
gen_p4, cand_ids_one_hot, cand_p4, edge_index, edge_weight, after_message, before_message = model.model(X) - if not osp.isdir(outpath + '/LRP'): - os.makedirs(outpath + '/LRP') + if not osp.isdir(outpath + '/lrp'): + os.makedirs(outpath + '/lrp') - if args.LRP_reg: + if (not args.lrp_reg) & (not args.lrp_clf): + print('EXITING: Did not specefy wether to explain lrp_reg or lrp_clf') + sys.exit(0) + + if args.lrp_reg: print('Explaining the p4 predictions:') to_explain_reg = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), "gen_p4": gen_p4.detach(), "gen_id": gen_ids_one_hot.detach(), "pred_p4": pred_p4.detach(), "pred_id": pred_ids_one_hot.detach(), "edge_index": edge_index.detach(), "edge_weight": edge_weight.detach(), "after_message": after_message.detach(), "before_message": before_message.detach(), - "outpath": args.LRP_outpath, "load_model": args.LRP_load_model} + "outpath": args.lrp_outpath, "load_model": args.lrp_load_model} model.set_dest(to_explain_reg["A"]) big_list_reg = explainer_reg.explain(to_explain_reg) - torch.save(big_list_reg, outpath + '/LRP/big_list_reg.pt') - torch.save(to_explain_reg, outpath + '/LRP/to_explain_reg.pt') + torch.save(big_list_reg, outpath + '/lrp/big_list_reg.pt') + torch.save(to_explain_reg, outpath + '/lrp/to_explain_reg.pt') - if args.LRP_clf: + if args.lrp_clf: print('Explaining the pid predictions:') to_explain_clf = {"A": activation, "inputs": dict(x=X.x,batch=X.batch), "gen_p4": gen_p4.detach(), "gen_id": gen_ids_one_hot.detach(), "pred_p4": pred_p4.detach(), "pred_id": pred_ids_one_hot.detach(), "edge_index": edge_index.detach(), "edge_weight": edge_weight.detach(), "after_message": after_message.detach(), "before_message": before_message.detach(), - "outpath": args.LRP_outpath, "load_model": args.LRP_load_model} + "outpath": args.lrp_outpath, "load_model": args.lrp_load_model} model.set_dest(to_explain_clf["A"]) big_list_clf = explainer_clf.explain(to_explain_clf) - torch.save(big_list_clf, outpath + '/LRP/big_list_clf.pt') - torch.save(to_explain_clf, outpath + '/LRP/to_explain_clf.pt') + torch.save(big_list_clf, outpath + '/lrp/big_list_clf.pt') + torch.save(to_explain_clf, outpath + '/lrp/to_explain_clf.pt') break # explain only one single event if args.make_heatmaps_reg: # load the necessary R-scores - big_list_reg = torch.load(outpath + '/LRP/big_list_reg.pt', map_location=device) - to_explain_reg = torch.load(outpath + '/LRP/to_explain_reg.pt', map_location=device) + big_list_reg = torch.load(outpath + '/lrp/big_list_reg.pt', map_location=device) + to_explain_reg = torch.load(outpath + '/lrp/to_explain_reg.pt', map_location=device) make_heatmaps(big_list_reg, to_explain_reg, device, outpath, output_dim_id, output_dim_p4, 'regression') if args.make_heatmaps_clf: # load the necessary R-scores - big_list_clf = torch.load(outpath + '/LRP/big_list_clf.pt', map_location=device) - to_explain_clf = torch.load(outpath + '/LRP/to_explain_clf.pt', map_location=device) + big_list_clf = torch.load(outpath + '/lrp/big_list_clf.pt', map_location=device) + to_explain_clf = torch.load(outpath + '/lrp/to_explain_clf.pt', map_location=device) make_heatmaps(big_list_clf, to_explain_clf, device, outpath, output_dim_id, output_dim_p4, 'classification') diff --git a/scripts/local_test_delphes_pytorch.sh b/scripts/local_test_delphes_pytorch.sh index 8eda5c117..a8069430a 100755 --- a/scripts/local_test_delphes_pytorch.sh +++ b/scripts/local_test_delphes_pytorch.sh @@ -48,6 +48,6 @@ cd ../mlpf/ echo Beginning the training.. 
python3 pytorch_pipeline.py \ --n_epochs=10 --n_train=1 --n_valid=1 --n_test=1 --batch_size=4 \ - --dataset='../../test_tmp_delphes/data/pythia8_ttbar' \ - --dataset_qcd='../../test_tmp_delphes/data/pythia8_qcd' \ - --outpath='../../test_tmp_delphes/experiments' + --dataset='../test_tmp_delphes/data/pythia8_ttbar' \ + --dataset_qcd='../test_tmp_delphes/data/pythia8_qcd' \ + --outpath='../test_tmp_delphes/experiments' From 7bbe245db51ca30125536e62ea52152e2be9b939 Mon Sep 17 00:00:00 2001 From: Farouk Mokhtar Date: Wed, 25 Aug 2021 10:59:38 -0700 Subject: [PATCH 104/157] Rename LRP_clf_gpu.py to lrp_clf_gpu.py --- mlpf/LRP/{LRP_clf_gpu.py => lrp_clf_gpu.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename mlpf/LRP/{LRP_clf_gpu.py => lrp_clf_gpu.py} (100%) diff --git a/mlpf/LRP/LRP_clf_gpu.py b/mlpf/LRP/lrp_clf_gpu.py similarity index 100% rename from mlpf/LRP/LRP_clf_gpu.py rename to mlpf/LRP/lrp_clf_gpu.py From 73b3e54b979f46d04f9becec1bc2a661e3d11924 Mon Sep 17 00:00:00 2001 From: Farouk Mokhtar Date: Wed, 25 Aug 2021 10:59:53 -0700 Subject: [PATCH 105/157] Rename LRP_reg_gpu.py to lrp_reg_gpu.py --- mlpf/LRP/{LRP_reg_gpu.py => lrp_reg_gpu.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename mlpf/LRP/{LRP_reg_gpu.py => lrp_reg_gpu.py} (100%) diff --git a/mlpf/LRP/LRP_reg_gpu.py b/mlpf/LRP/lrp_reg_gpu.py similarity index 100% rename from mlpf/LRP/LRP_reg_gpu.py rename to mlpf/LRP/lrp_reg_gpu.py From 4f5dea75056b2a6bab39d8dc5048c628232952ae Mon Sep 17 00:00:00 2001 From: Farouk Mokhtar Date: Wed, 25 Aug 2021 11:01:38 -0700 Subject: [PATCH 106/157] Rename LRP_pipeline.py to lrp_pipeline.py --- mlpf/{LRP_pipeline.py => lrp_pipeline.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename mlpf/{LRP_pipeline.py => lrp_pipeline.py} (100%) diff --git a/mlpf/LRP_pipeline.py b/mlpf/lrp_pipeline.py similarity index 100% rename from mlpf/LRP_pipeline.py rename to mlpf/lrp_pipeline.py From 3f7880475a37da15b6329bd4745fbe01ca061a86 Mon Sep 17 00:00:00 2001 From: Farouk Date: Wed, 25 Aug 2021 11:04:21 -0700 Subject: [PATCH 107/157] attempt to rename LRP to lrp through a tmp --- mlpf/{LRP => tmp}/__init__.py | 0 mlpf/{LRP => tmp}/args.py | 0 mlpf/{LRP/gravnet_LRP.py => tmp/gravnet_lrp.py} | 0 mlpf/{LRP/LRP_clf_gpu.py => tmp/lrp_clf_gpu.py} | 0 mlpf/{LRP/LRP_reg_gpu.py => tmp/lrp_reg_gpu.py} | 0 mlpf/{LRP => tmp}/model_io.py | 0 mlpf/{LRP/model_LRP.py => tmp/model_lrp.py} | 0 mlpf/{LRP => tmp}/plots.py | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename mlpf/{LRP => tmp}/__init__.py (100%) rename mlpf/{LRP => tmp}/args.py (100%) rename mlpf/{LRP/gravnet_LRP.py => tmp/gravnet_lrp.py} (100%) rename mlpf/{LRP/LRP_clf_gpu.py => tmp/lrp_clf_gpu.py} (100%) rename mlpf/{LRP/LRP_reg_gpu.py => tmp/lrp_reg_gpu.py} (100%) rename mlpf/{LRP => tmp}/model_io.py (100%) rename mlpf/{LRP/model_LRP.py => tmp/model_lrp.py} (100%) rename mlpf/{LRP => tmp}/plots.py (100%) diff --git a/mlpf/LRP/__init__.py b/mlpf/tmp/__init__.py similarity index 100% rename from mlpf/LRP/__init__.py rename to mlpf/tmp/__init__.py diff --git a/mlpf/LRP/args.py b/mlpf/tmp/args.py similarity index 100% rename from mlpf/LRP/args.py rename to mlpf/tmp/args.py diff --git a/mlpf/LRP/gravnet_LRP.py b/mlpf/tmp/gravnet_lrp.py similarity index 100% rename from mlpf/LRP/gravnet_LRP.py rename to mlpf/tmp/gravnet_lrp.py diff --git a/mlpf/LRP/LRP_clf_gpu.py b/mlpf/tmp/lrp_clf_gpu.py similarity index 100% rename from mlpf/LRP/LRP_clf_gpu.py rename to mlpf/tmp/lrp_clf_gpu.py diff --git a/mlpf/LRP/LRP_reg_gpu.py 
b/mlpf/tmp/lrp_reg_gpu.py
similarity index 100%
rename from mlpf/LRP/LRP_reg_gpu.py
rename to mlpf/tmp/lrp_reg_gpu.py
diff --git a/mlpf/LRP/model_io.py b/mlpf/tmp/model_io.py
similarity index 100%
rename from mlpf/LRP/model_io.py
rename to mlpf/tmp/model_io.py
diff --git a/mlpf/LRP/model_LRP.py b/mlpf/tmp/model_lrp.py
similarity index 100%
rename from mlpf/LRP/model_LRP.py
rename to mlpf/tmp/model_lrp.py
diff --git a/mlpf/LRP/plots.py b/mlpf/tmp/plots.py
similarity index 100%
rename from mlpf/LRP/plots.py
rename to mlpf/tmp/plots.py

From ea2579813b3fc7a24500c452bb52453f9b2abd7f Mon Sep 17 00:00:00 2001
From: Farouk Mokhtar
Date: Wed, 25 Aug 2021 11:06:29 -0700
Subject: [PATCH 108/157] Rename LRP_pipeline.py to lrp_pipeline.py

---
 mlpf/{LRP_pipeline.py => lrp_pipeline.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename mlpf/{LRP_pipeline.py => lrp_pipeline.py} (100%)

diff --git a/mlpf/LRP_pipeline.py b/mlpf/lrp_pipeline.py
similarity index 100%
rename from mlpf/LRP_pipeline.py
rename to mlpf/lrp_pipeline.py

From fb29c20008d62f979b792b8b5d3729d29b75e840 Mon Sep 17 00:00:00 2001
From: Farouk
Date: Wed, 25 Aug 2021 11:12:36 -0700
Subject: [PATCH 109/157] adding lrp to the quick testing bash script

---
 scripts/local_test_delphes_pytorch.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/scripts/local_test_delphes_pytorch.sh b/scripts/local_test_delphes_pytorch.sh
index a8069430a..68690b987 100755
--- a/scripts/local_test_delphes_pytorch.sh
+++ b/scripts/local_test_delphes_pytorch.sh
@@ -51,3 +51,12 @@ python3 pytorch_pipeline.py \
   --dataset='../test_tmp_delphes/data/pythia8_ttbar' \
   --dataset_qcd='../test_tmp_delphes/data/pythia8_qcd' \
   --outpath='../test_tmp_delphes/experiments'
+echo Finished the training..
+
+echo Beginning the LRP machinery..
+python3 lrp_pipeline.py \
+  --n_test=1 --batch_size=4 \
+  --lrp_dataset_qcd='../test_tmp_delphes/data/pythia8_qcd' \
+  --lrp_outpath='../test_tmp_delphes/data/experiments/' \
+  --lrp_load_model='../test_tmp_delphes/data/experiments/*' \
+  --lrp_load_epoch=9

From 768db2f15bbe264f059c764c9e438f115448e66b Mon Sep 17 00:00:00 2001
From: Farouk
Date: Wed, 25 Aug 2021 11:33:29 -0700
Subject: [PATCH 110/157] fix name of model

---
 scripts/local_test_delphes_pytorch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/local_test_delphes_pytorch.sh b/scripts/local_test_delphes_pytorch.sh
index 68690b987..b99505be8 100755
--- a/scripts/local_test_delphes_pytorch.sh
+++ b/scripts/local_test_delphes_pytorch.sh
@@ -58,5 +58,5 @@ python3 lrp_pipeline.py \
   --n_test=1 --batch_size=4 \
   --lrp_dataset_qcd='../test_tmp_delphes/data/pythia8_qcd' \
   --lrp_outpath='../test_tmp_delphes/data/experiments/' \
-  --lrp_load_model='../test_tmp_delphes/data/experiments/*' \
+  --lrp_load_model='PFNet7_gen_ntrain_1_nepochs_10_batch_size_4_lr_0.0001_alpha_0.0002_both__nn1_nn3' \
   --lrp_load_epoch=9

From d26f264cfdb4a2dc1aa5e5eadcc50ca623a95963 Mon Sep 17 00:00:00 2001
From: Farouk
Date: Wed, 25 Aug 2021 11:41:15 -0700
Subject: [PATCH 111/157] oops

---
 scripts/local_test_delphes_pytorch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/local_test_delphes_pytorch.sh b/scripts/local_test_delphes_pytorch.sh
index b99505be8..134cd9fcd 100755
--- a/scripts/local_test_delphes_pytorch.sh
+++ b/scripts/local_test_delphes_pytorch.sh
@@ -57,6 +57,6 @@ echo Beginning the LRP machinery..
python3 lrp_pipeline.py \ --n_test=1 --batch_size=4 \ --lrp_dataset_qcd='../test_tmp_delphes/data/pythia8_qcd' \ - --lrp_outpath='../test_tmp_delphes/data/experiments/' \ + --lrp_outpath='../test_tmp_delphes/experiments/' \ --lrp_load_model='PFNet7_gen_ntrain_1_nepochs_10_batch_size_4_lr_0.0001_alpha_0.0002_both__nn1_nn3' --lrp_load_epoch=9 From 2d83dfa5506bad3da410a206cc164d3d5952c2d3 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Thu, 26 Aug 2021 12:47:18 +0200 Subject: [PATCH 112/157] feat: Add logging of GPU power In addition, the nvidia-smi plot script is modified to also plot the power. --- mlpf/flatiron/start-head.sh | 2 +- mlpf/flatiron/start-worker.sh | 2 +- mlpf/pipeline.py | 2 +- scripts/plot_nvidiasmi_csv.py | 39 +++++++++++++++++++---------------- 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/mlpf/flatiron/start-head.sh b/mlpf/flatiron/start-head.sh index 0af43132c..6849d1d08 100755 --- a/mlpf/flatiron/start-head.sh +++ b/mlpf/flatiron/start-head.sh @@ -6,6 +6,6 @@ export LANG=C.UTF-8 echo "starting ray head node" # Launch the head node mkdir -p "/mnt/ceph/users/ewulff/nvidia_smi_logs/$3_$2" -nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/$3_$2/head.csv" & +nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/$3_$2/head.csv" & ray start --head --node-ip-address=$1 --port=6379 sleep infinity diff --git a/mlpf/flatiron/start-worker.sh b/mlpf/flatiron/start-worker.sh index f90bdb2c0..14c8951b4 100755 --- a/mlpf/flatiron/start-worker.sh +++ b/mlpf/flatiron/start-worker.sh @@ -5,6 +5,6 @@ export LANG=C.UTF-8 echo "starting ray worker node" mkdir -p "/mnt/ceph/users/ewulff/nvidia_smi_logs/$4_$2" -nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/$4_$2/worker_$3.csv" & +nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/$4_$2/worker_$3.csv" & ray start --address $1 sleep infinity diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 7a158a8b9..c0b47b0b4 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -98,7 +98,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix): # Decide tf.distribute.strategy depending on number of available GPUs strategy, maybe_global_batch_size = get_strategy(global_batch_size) if "CPU" not in strategy.extended.worker_devices[0]: - nvidia_smi_call = "nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f {}/nvidia_smi_log.csv".format(outdir) + nvidia_smi_call = "nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f {}/nvidia_smi_log.csv".format(outdir) p = subprocess.Popen(shlex.split(nvidia_smi_call)) # 
If using more than 1 GPU, we scale the batch size by the number of GPUs if maybe_global_batch_size is not None: diff --git a/scripts/plot_nvidiasmi_csv.py b/scripts/plot_nvidiasmi_csv.py index d539ffb75..3562b1796 100644 --- a/scripts/plot_nvidiasmi_csv.py +++ b/scripts/plot_nvidiasmi_csv.py @@ -13,15 +13,6 @@ def parse_args(): return args -def plot_gpu_util(df, cuda_device): - plt.figure(figsize=(12,9)) - plt.plot(df["time"], df["GPU{}_util".format(cuda_device)], alpha=0.8) - plt.xlabel("Time [s]") - plt.ylabel("GPU utilization [%]") - plt.title("GPU{}".format(cuda_device)) - plt.grid(alpha=0.3) - - def plot_gpu_util(df, cuda_device, ax): ax.plot(df["time"], df["GPU{}_util".format(cuda_device)], alpha=0.8) ax.set_xlabel("Time [s]") @@ -29,6 +20,23 @@ def plot_gpu_util(df, cuda_device, ax): ax.set_title("GPU{}".format(cuda_device)) ax.grid(alpha=0.3) +def plot_gpu_power(df, cuda_device, ax): + ax.plot(df["time"], df["GPU{}_power".format(cuda_device)], alpha=0.8) + ax.set_xlabel("Time [s]") + ax.set_ylabel("Power consumption [W]") + ax.set_title("GPU{}".format(cuda_device)) + ax.grid(alpha=0.3) + +def plot_dfs(dfs, plot_func, suffix): + fig, axs = plt.subplots(2, 2, figsize=(12,9), tight_layout=True) + for ax in axs.flat: + ax.label_outer() + + for cuda_device, (df, ax) in enumerate(zip(dfs, axs.flat)): + plot_func(df, cuda_device, ax) + plt.suptitle("{}".format(file.stem)) + plt.savefig(args.dir + "/{}_{}.jpg".format(file.stem, suffix)) + if __name__ == "__main__": args = parse_args() @@ -43,14 +51,9 @@ def plot_gpu_util(df, cuda_device, ax): for ii, gpu in enumerate(np.unique(df[" pci.bus_id"].values)): dfs.append(pd.DataFrame({ "GPU{}_util".format(ii): df[df[" pci.bus_id"] == gpu][" utilization.gpu [%]"].map(lambda x: int(x.split(" ")[1])), + "GPU{}_power".format(ii): df[df[" pci.bus_id"] == gpu][" power.draw [W]"].map(lambda x: float(x.split(" ")[1])), "time": df[df[" pci.bus_id"] == gpu]["timestamp"].map(lambda x: datetime.strptime(x, "%Y/%m/%d %H:%M:%S.%f").timestamp() - start_t), }).dropna()) - - fig, axs = plt.subplots(2, 2, figsize=(12,9), tight_layout=True) - for ax in axs.flat: - ax.label_outer() - - for cuda_device, (df, ax) in enumerate(zip(dfs, axs.flat)): - plot_gpu_util(df, cuda_device, ax) - plt.suptitle("{}".format(file.stem)) - plt.savefig(args.dir + "/{}_gpu_util.jpg".format(file.stem)) \ No newline at end of file + + plot_dfs(dfs, plot_gpu_util, "gpu_util") + plot_dfs(dfs, plot_gpu_power, "gpu_power") From c443137987588d2f6c541b6142ecb66ff7c8b411 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 26 Aug 2021 14:46:30 +0300 Subject: [PATCH 113/157] fix energy regression --- mlpf/tfmodel/model.py | 128 +++++++++++++++++++++++++----------- mlpf/tfmodel/model_setup.py | 6 +- mlpf/tfmodel/utils.py | 16 +++-- parameters/cms-dev.yaml | 16 ++--- parameters/cms.yaml | 12 ++-- parameters/delphes.yaml | 12 ++-- 6 files changed, 123 insertions(+), 67 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 6c0153b70..02b908c70 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -139,16 +139,27 @@ def __init__(self, num_input_classes): @tf.function def call(self, X): + log_energy = tf.expand_dims(tf.math.log(X[:, :, 4]+1.0), axis=-1) + #X[:, :, 0] - categorical index of the element type Xid = tf.cast(tf.one_hot(tf.cast(X[:, :, 0], tf.int32), self.num_input_classes), dtype=X.dtype) #Xpt = tf.expand_dims(tf.math.log1p(X[:, :, 1]), axis=-1) Xpt = tf.expand_dims(tf.math.log(X[:, :, 1] + 1.0), axis=-1) + + Xpt_0p5 = tf.math.sqrt(Xpt) + Xpt_2 = 
tf.math.pow(Xpt, 2) + Xeta1 = tf.expand_dims(tf.sinh(X[:, :, 2]), axis=-1) Xeta2 = tf.expand_dims(tf.cosh(X[:, :, 2]), axis=-1) + Xabs_eta = tf.expand_dims(tf.math.abs(X[:, :, 2]), axis=-1) Xphi1 = tf.expand_dims(tf.sin(X[:, :, 3]), axis=-1) Xphi2 = tf.expand_dims(tf.cos(X[:, :, 3]), axis=-1) + #Xe = tf.expand_dims(tf.math.log1p(X[:, :, 4]), axis=-1) - Xe = tf.expand_dims(tf.math.log(X[:, :, 4]+1.0), axis=-1) + Xe = log_energy + Xe_0p5 = tf.math.sqrt(log_energy) + Xe_2 = tf.math.pow(log_energy, 2) + Xlayer = tf.expand_dims(X[:, :, 5]*10.0, axis=-1) Xdepth = tf.expand_dims(X[:, :, 6]*10.0, axis=-1) @@ -158,11 +169,15 @@ def call(self, X): Xphi_hcal2 = tf.expand_dims(tf.cos(X[:, :, 12]), axis=-1) return tf.concat([ - Xid, Xpt, + Xid, + Xpt, Xpt_0p5, Xpt_2, Xeta1, Xeta2, + Xabs_eta, Xphi1, Xphi2, - Xe, Xlayer, Xdepth, - Xphi_ecal1, Xphi_ecal2, Xphi_hcal1, Xphi_hcal2, + Xe, Xe_0p5, Xe_2, + Xlayer, Xdepth, + Xphi_ecal1, Xphi_ecal2, + Xphi_hcal1, Xphi_hcal2, X], axis=-1 ) @@ -374,7 +389,6 @@ def call(self, x_msg, x_node, msk): return bins_split, x_features_binned, dm, msk_f_binned - class OutputDecoding(tf.keras.Model): def __init__(self, activation, hidden_dim, regression_use_classification, num_output_classes, schema, dropout, **kwargs): super(OutputDecoding, self).__init__(**kwargs) @@ -403,26 +417,26 @@ def __init__(self, activation, hidden_dim, regression_use_classification, num_ou ) self.ffn_pt = point_wise_feed_forward_network( - 4, hidden_dim, "ffn_pt", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, + 2, hidden_dim, "ffn_pt", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, dropout=dropout ) self.ffn_eta = point_wise_feed_forward_network( 2, hidden_dim, "ffn_eta", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, dropout=dropout ) self.ffn_phi = point_wise_feed_forward_network( 4, hidden_dim, "ffn_phi", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=True, + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, dropout=dropout ) self.ffn_energy = point_wise_feed_forward_network( - 4, hidden_dim*4, "ffn_energy", - dtype=tf.dtypes.float32, num_layers=4, activation=activation, dim_decrease=True, + 2, hidden_dim*4, "ffn_energy", + dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, dropout=dropout ) @@ -443,15 +457,16 @@ def call(self, args, training=False): #orig_pt = X_input[:, :, 1:2] orig_eta = X_input[:, :, 2:3] - #FIXME: schema + #FIXME: better schema propagation + #skip connection from raw input values if self.schema == "cms": - orig_sin_phi = tf.math.sin(X_input[:, :, 3:4]) - orig_cos_phi = tf.math.cos(X_input[:, :, 3:4]) - orig_energy = X_input[:, :, 4:5] + orig_sin_phi = tf.math.sin(X_input[:, :, 3:4])*msk_input + orig_cos_phi = tf.math.cos(X_input[:, :, 3:4])*msk_input + orig_log_energy = tf.math.log(X_input[:, :, 4:5] + 1.0)*msk_input elif self.schema == "delphes": - orig_sin_phi = X_input[:, :, 3:4] - orig_cos_phi = X_input[:, :, 4:5] - orig_energy = X_input[:, :, 5:6] + orig_sin_phi = X_input[:, :, 3:4]*msk_input + orig_cos_phi = X_input[:, :, 4:5]*msk_input + orig_log_energy = tf.math.log(X_input[:, :, 5:6] + 1.0)*msk_input if self.regression_use_classification: X_encoded = tf.concat([X_encoded, tf.stop_gradient(out_id_softmax)], axis=-1) @@ -460,34 +475,41 @@ def call(self, args, training=False): 
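# A minimal standalone sketch (not part of this patch) of the gated skip connection
# used below for the eta/phi corrections, with made-up shapes: the second FFN output
# channel is an additive correction, the first is a gate logit, so the prediction
# stays close to the raw input value while the gate is near zero.
import tensorflow as tf
orig = tf.random.normal((2, 5, 1))                    # raw per-element value, e.g. eta from the input
corr = tf.random.normal((2, 5, 2))                    # FFN output: [gate logit, correction]
gate = tf.keras.activations.sigmoid(corr[:, :, 0:1])  # gate in (0, 1)
pred = orig + gate * corr[:, :, 1:2]                  # gated residual correction, same shape as orig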
pred_phi_corr = self.ffn_phi(X_encoded, training)*msk_input eta_sigmoid = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) - pred_eta = orig_eta*eta_sigmoid + (1.0 - eta_sigmoid)*pred_eta_corr[:, :, 1:2] + pred_eta = orig_eta + eta_sigmoid*pred_eta_corr[:, :, 1:2] sin_phi_sigmoid = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 0:1]) cos_phi_sigmoid = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 2:3]) - pred_sin_phi = orig_sin_phi*sin_phi_sigmoid + (1.0 - sin_phi_sigmoid)*pred_phi_corr[:, :, 1:2] - pred_cos_phi = orig_cos_phi*cos_phi_sigmoid + (1.0 - cos_phi_sigmoid)*pred_phi_corr[:, :, 3:4] + pred_sin_phi = orig_sin_phi + sin_phi_sigmoid*pred_phi_corr[:, :, 1:2] + pred_cos_phi = orig_cos_phi + cos_phi_sigmoid*pred_phi_corr[:, :, 3:4] X_encoded = tf.concat([X_encoded, tf.stop_gradient(pred_eta)], axis=-1) pred_energy_corr = self.ffn_energy(X_encoded, training)*msk_input pred_pt_corr = self.ffn_pt(X_encoded, training)*msk_input - energy_sigmoid1 = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) - energy_sigmoid2 = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 1:2]) - pred_energy = orig_energy*(1.0 + energy_sigmoid1*pred_energy_corr[:, :, 2:3]) + energy_sigmoid2*pred_energy_corr[:, :, 3:4] - - orig_pt = tf.stop_gradient(pred_energy - tf.math.log(tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8)))) - pt_sigmoid1 = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) - pt_sigmoid2 = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 1:2]) - pred_pt = orig_pt*(1.0 + pt_sigmoid1*pred_pt_corr[:, :, 2:3]) + pt_sigmoid2*pred_pt_corr[:, :, 3:4] + energy_sigmoid = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) + + #prediction is pred_log_energy=log(energy + 1.0), energy=exp(pred_log_energy) - 1.0 + #pred_log_energy = orig_log_energy*energy_sigmoid + (1.0 - energy_sigmoid)*pred_energy_corr[:, :, 1:2] + pred_log_energy = orig_log_energy+ energy_sigmoid*pred_energy_corr[:, :, 1:2] + pred_energy = tf.math.exp(tf.clip_by_value(pred_log_energy, -6, 6)) - 1.0 + + #compute pt=E/cosh(eta) + orig_pt = tf.stop_gradient(pred_energy/tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8))) + orig_log_pt = tf.math.log(orig_pt + 1.0) + + pt_sigmoid = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) + pred_log_pt = orig_log_pt + pt_sigmoid*pred_pt_corr[:, :, 1:2] + + msk_output = tf.expand_dims(tf.cast(tf.argmax(out_id_softmax, axis=-1)!=0, tf.float32), axis=-1) ret = { "cls": out_id_softmax, - "charge": out_charge*msk_input, - "pt": pred_pt*msk_input, - "eta": pred_eta*msk_input, - "sin_phi": pred_sin_phi*msk_input, - "cos_phi": pred_cos_phi*msk_input, - "energy": pred_energy*msk_input, + "charge": out_charge*msk_input*msk_output, + "pt": pred_log_pt*msk_input*msk_output, + "eta": pred_eta*msk_input*msk_output, + "sin_phi": pred_sin_phi*msk_input*msk_output, + "cos_phi": pred_cos_phi*msk_input*msk_output, + "energy": pred_log_energy*msk_input*msk_output, } return ret @@ -539,10 +561,10 @@ def __init__(self, *args, **kwargs): super(CombinedGraphLayer, self).__init__(*args, **kwargs) - def call(self, x, msk, training): + def call(self, x, msk, training=False): if self.do_layernorm: - x = self.layernorm(x) + x = self.layernorm(x, training=training) #compute node features for graph building x_dist = self.ffn_dist(x) @@ -664,6 +686,38 @@ def set_trainable_named(self, layer_names): self.output_dec.set_trainable_named(layer_names) + ##for eager mode debugging + # def train_step(self, data): + # # Unpack the data. Its structure depends on your model and + # # on what you pass to `fit()`. 
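# Illustrative only (not from the patch): the log-space energy target and the
# pt = E/cosh(eta) relation used in the energy/pt heads above, for one made-up value.
import math
energy = 25.0                              # example energy
eta = 1.2
log_energy = math.log(energy + 1.0)        # quantity the energy head regresses
energy_back = math.exp(log_energy) - 1.0   # inverse transform recovers ~25.0
pt = energy_back / math.cosh(eta)          # transverse momentum implied by E and eta
log_pt = math.log(pt + 1.0)                # the pt head predicts a correction on top of this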
+ # x, y, sample_weights = data + + # with tf.GradientTape() as tape: + # y_pred = self(x, training=True) # Forward pass + # # Compute the loss value + # # (the loss function is configured in `compile()`) + # loss = self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) + # import pdb;pdb.set_trace() + + # ya = {k: v.numpy() for k, v in y.items()} + # yb = {k: v.numpy() for k, v in y_pred.items()} + # sw = {k: v.numpy() for k, v in sample_weights.items()} + + # np.savez("ytrue.npz", **ya) + # np.savez("ypred.npz", **yb) + # np.savez("x.npz", x=x) + # np.savez("sample_weights.npz", **sample_weights) + + # # Compute gradients + # trainable_vars = self.trainable_variables + # gradients = tape.gradient(loss, trainable_vars) + # # Update weights + # self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + # # Update metrics (includes the metric that tracks the loss) + # self.compiled_metrics.update_state(y, y_pred) + # # Return a dict mapping metric names to current value + # return {m.name: m.result() for m in self.metrics} + class DummyNet(tf.keras.Model): def __init__(self, num_input_classes=8, diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 0d39e0009..469be3d59 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -92,11 +92,11 @@ def __init__(self, dataset_def, outpath, X, y, dataset_transform, num_output_cla } self.reg_bins = { - "pt": np.linspace(-4, 8, 100), - "eta": np.linspace(-8, 8, 100), + "pt": np.linspace(0, 5, 100), + "eta": np.linspace(-6, 6, 100), "sin_phi": np.linspace(-1,1,100), "cos_phi": np.linspace(-1,1,100), - "energy": np.linspace(-1, 10,100), + "energy": np.linspace(0, 7,100), } def plot_cm(self, epoch, outpath, ypred_id, msk): diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index b3733e537..1909f07b9 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -153,7 +153,7 @@ def compute_weights_none(X, y, w): def make_weight_function(config): def weight_func(X,y,w): - w_signal_only = tf.where(y[:, 0]==0, 0.0, tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)) + w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0) w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32) w_none = tf.ones_like(w) @@ -178,16 +178,18 @@ def weight_func(X,y,w): def targets_multi_output(num_output_classes): def func(X, y, w): + + msk = tf.expand_dims(tf.cast(y[:, :, 0]!=0, tf.float32), axis=-1) return ( X, { "cls": tf.one_hot(tf.cast(y[:, :, 0], tf.int32), num_output_classes), - "charge": y[:, :, 1:2], - "pt": tf.math.log(y[:, :, 2:3] + 1.0), - "eta": y[:, :, 3:4], - "sin_phi": y[:, :, 4:5], - "cos_phi": y[:, :, 5:6], - "energy": tf.math.log(y[:, :, 6:7] + 1.0), + "charge": y[:, :, 1:2]*msk, + "pt": tf.math.log(y[:, :, 2:3] + 1.0)*msk, + "eta": y[:, :, 3:4]*msk, + "sin_phi": y[:, :, 4:5]*msk, + "cos_phi": y[:, :, 5:6]*msk, + "energy": tf.math.log(y[:, :, 6:7] + 1.0)*msk, }, w, ) diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index d3cb162ab..9050efa2d 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -25,11 +25,11 @@ dataset: num_momentum_outputs: 5 classification_loss_coef: 1.0 charge_loss_coef: 1.0 - pt_loss_coef: 1.0 + pt_loss_coef: 100.0 eta_loss_coef: 100.0 sin_phi_loss_coef: 100.0 cos_phi_loss_coef: 100.0 - energy_loss_coef: 1.0 + energy_loss_coef: 100.0 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 @@ -57,7 +57,7 @@ setup: train: yes weights: weights_config: 
- lr: 1e-5 + lr: 1e-3 batch_size: 4 num_events_train: 80000 num_events_test: 10000 @@ -80,25 +80,25 @@ sample_weights: parameters: model: gnn_dense input_encoding: cms - activation: elu + activation: gelu layernorm: no hidden_dim: 256 bin_size: 32 - distance_dim: 16 + distance_dim: 8 dropout: 0.0 graph_kernel: type: NodePairTrainableKernel output_dim: 8 hidden_dim: 32 num_layers: 2 - activation: elu - num_graph_layers: 6 + activation: gelu + num_graph_layers: 3 node_message: type: NodeMessageLearnable output_dim: 256 hidden_dim: 128 num_layers: 2 - activation: elu + activation: gelu aggregation_direction: dst num_node_messages: 1 skip_connection: yes diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 3d8689110..2fdf498ff 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -25,11 +25,11 @@ dataset: num_momentum_outputs: 5 classification_loss_coef: 1.0 charge_loss_coef: 1.0 - pt_loss_coef: 1.0 + pt_loss_coef: 100.0 eta_loss_coef: 100.0 sin_phi_loss_coef: 100.0 cos_phi_loss_coef: 100.0 - energy_loss_coef: 1.0 + energy_loss_coef: 100.0 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 @@ -57,7 +57,7 @@ setup: train: yes weights: weights_config: - lr: 1e-5 + lr: 1e-3 batch_size: 4 num_events_train: 80000 num_events_test: 10000 @@ -80,8 +80,8 @@ sample_weights: parameters: model: gnn_dense input_encoding: cms - activation: elu - layernorm: no + activation: gelu + layernorm: yes hidden_dim: 256 bin_size: 320 distance_dim: 128 @@ -90,7 +90,7 @@ parameters: type: NodePairGaussianKernel dist_mult: 0.1 clip_value_low: 0.0 - num_graph_layers: 6 + num_graph_layers: 5 node_message: type: GHConvDense output_dim: 256 diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index a79800c95..3bf60990b 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -13,21 +13,21 @@ dataset: padded_num_elem_size: 6400 classification_loss_coef: 1.0 charge_loss_coef: 1.0 - pt_loss_coef: 1.0 + pt_loss_coef: 100.0 eta_loss_coef: 100.0 sin_phi_loss_coef: 100.0 cos_phi_loss_coef: 100.0 - energy_loss_coef: 1.0 + energy_loss_coef: 100.0 raw_path: data/pythia8_ttbar/raw/*.pkl.bz2 processed_path: data/pythia8_ttbar/tfr/*.tfrecords num_files_per_chunk: 5 validation_file_path: data/pythia8_qcd/val/*.pkl.bz2 energy_loss: type: Huber - delta: 0.1 + delta: 1.0 pt_loss: type: Huber - delta: 0.1 + delta: 1.0 sin_phi_loss: type: Huber delta: 0.1 @@ -46,7 +46,7 @@ setup: weights: weights_config: lr: 1e-4 - batch_size: 5 + batch_size: 4 num_events_train: 45000 num_events_test: 5000 num_epochs: 10 @@ -78,7 +78,7 @@ parameters: type: NodePairGaussianKernel dist_mult: 0.1 clip_value_low: 0.0 - num_graph_layers: 6 + num_graph_layers: 5 node_message: type: GHConvDense output_dim: 256 From e809965d564b35cb8c6f094a85388c52ba222c3c Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Thu, 26 Aug 2021 15:09:00 +0200 Subject: [PATCH 114/157] feat: Add plotting of GPU memory usage from nvidia-smi log --- .github/workflows/test.yml | 4 +-- scripts/plot_nvidiasmi_csv.py | 54 ++++++++++++++++++++++++++++++----- 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6c273c3e8..f32fe0c57 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,7 @@ jobs: sudo apt install python3 python3-pip wget sudo python3 -m pip install --upgrade pip sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install 
tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm + sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm 'ray[default]' 'ray[tune]' - name: Run delphes TF model run: ./scripts/local_test_delphes_tf.sh @@ -31,7 +31,7 @@ jobs: sudo apt install python3 python3-pip wget sudo python3 -m pip install --upgrade pip sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm + sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm 'ray[default]' 'ray[tune]' - name: Run CMS TF model run: ./scripts/local_test_cms_tf.sh diff --git a/scripts/plot_nvidiasmi_csv.py b/scripts/plot_nvidiasmi_csv.py index 3562b1796..ae48633d0 100644 --- a/scripts/plot_nvidiasmi_csv.py +++ b/scripts/plot_nvidiasmi_csv.py @@ -5,10 +5,14 @@ from datetime import datetime import time + def parse_args(): import argparse + parser = argparse.ArgumentParser() - parser.add_argument("-d", "--dir", type=str, default="parameters/delphes-gnn-skipconn.yaml", help="dir containing csv files") + parser.add_argument( + "-d", "--dir", type=str, default="parameters/delphes-gnn-skipconn.yaml", help="dir containing csv files" + ) args = parser.parse_args() return args @@ -20,6 +24,7 @@ def plot_gpu_util(df, cuda_device, ax): ax.set_title("GPU{}".format(cuda_device)) ax.grid(alpha=0.3) + def plot_gpu_power(df, cuda_device, ax): ax.plot(df["time"], df["GPU{}_power".format(cuda_device)], alpha=0.8) ax.set_xlabel("Time [s]") @@ -27,8 +32,25 @@ def plot_gpu_power(df, cuda_device, ax): ax.set_title("GPU{}".format(cuda_device)) ax.grid(alpha=0.3) + +def plot_gpu_mem_util(df, cuda_device, ax): + ax.plot(df["time"], df["GPU{}_mem_util".format(cuda_device)], alpha=0.8) + ax.set_xlabel("Time [s]") + ax.set_ylabel("GPU memory utilization [%]") + ax.set_title("GPU{}".format(cuda_device)) + ax.grid(alpha=0.3) + + +def plot_gpu_mem_used(df, cuda_device, ax): + ax.plot(df["time"], df["GPU{}_mem_used".format(cuda_device)], alpha=0.8) + ax.set_xlabel("Time [s]") + ax.set_ylabel("Used GPU memory [MiB]") + ax.set_title("GPU{}".format(cuda_device)) + ax.grid(alpha=0.3) + + def plot_dfs(dfs, plot_func, suffix): - fig, axs = plt.subplots(2, 2, figsize=(12,9), tight_layout=True) + fig, axs = plt.subplots(2, 2, figsize=(12, 9), tight_layout=True) for ax in axs.flat: ax.label_outer() @@ -49,11 +71,29 @@ def plot_dfs(dfs, plot_func, suffix): start_t = datetime.strptime(start_time, "%Y/%m/%d %H:%M:%S.%f").timestamp() dfs = [] for ii, gpu in enumerate(np.unique(df[" pci.bus_id"].values)): - dfs.append(pd.DataFrame({ - "GPU{}_util".format(ii): df[df[" pci.bus_id"] == gpu][" utilization.gpu [%]"].map(lambda x: int(x.split(" ")[1])), - "GPU{}_power".format(ii): df[df[" pci.bus_id"] == gpu][" power.draw [W]"].map(lambda x: float(x.split(" ")[1])), - "time": df[df[" pci.bus_id"] == gpu]["timestamp"].map(lambda x: datetime.strptime(x, "%Y/%m/%d %H:%M:%S.%f").timestamp() - start_t), - }).dropna()) + dfs.append( + pd.DataFrame( + { + "GPU{}_util".format(ii): df[df[" 
pci.bus_id"] == gpu][" utilization.gpu [%]"].map( + lambda x: int(x.split(" ")[1]) + ), + "GPU{}_power".format(ii): df[df[" pci.bus_id"] == gpu][" power.draw [W]"].map( + lambda x: float(x.split(" ")[1]) + ), + "GPU{}_mem_util".format(ii): df[df[" pci.bus_id"] == gpu][" utilization.memory [%]"].map( + lambda x: int(x.split(" ")[1]) + ), + "GPU{}_mem_used".format(ii): df[df[" pci.bus_id"] == gpu][" memory.used [MiB]"].map( + lambda x: int(x.split(" ")[1]) + ), + "time": df[df[" pci.bus_id"] == gpu]["timestamp"].map( + lambda x: datetime.strptime(x, "%Y/%m/%d %H:%M:%S.%f").timestamp() - start_t + ), + } + ).dropna() + ) plot_dfs(dfs, plot_gpu_util, "gpu_util") plot_dfs(dfs, plot_gpu_power, "gpu_power") + plot_dfs(dfs, plot_gpu_mem_used, "gpu_mem_used") + plot_dfs(dfs, plot_gpu_mem_util, "gpu_mem_util") From 3b78128775e003bf1ec385580cabb242f561d30f Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 26 Aug 2021 20:21:00 +0300 Subject: [PATCH 115/157] layernorm --- parameters/cms-dev.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 9050efa2d..1415bc841 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -81,7 +81,7 @@ parameters: model: gnn_dense input_encoding: cms activation: gelu - layernorm: no + layernorm: yes hidden_dim: 256 bin_size: 32 distance_dim: 8 From fe6b328e33bcc7fb392d67274747bfb7f53a1f94 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 27 Aug 2021 08:47:03 +0300 Subject: [PATCH 116/157] added gen training --- mlpf/tallinn/cms-gen.sh | 10 ++++ parameters/cms-gen.yaml | 111 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100755 mlpf/tallinn/cms-gen.sh create mode 100644 parameters/cms-gen.yaml diff --git a/mlpf/tallinn/cms-gen.sh b/mlpf/tallinn/cms-gen.sh new file mode 100755 index 000000000..f119a0df1 --- /dev/null +++ b/mlpf/tallinn/cms-gen.sh @@ -0,0 +1,10 @@ +#!/bin/bash +#SBATCH -p gpu +#SBATCH --gpus 1 +#SBATCH --mem-per-gpu=8G + +IMG=/home/software/singularity/base.simg:latest +cd ~/particleflow + +#TF training +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/cms-gen.yaml diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml new file mode 100644 index 000000000..2147ff140 --- /dev/null +++ b/parameters/cms-gen.yaml @@ -0,0 +1,111 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: gen + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 1.0 + pt_loss_coef: 100.0 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 100.0 + cos_phi_loss_coef: 100.0 + energy_loss_coef: 100.0 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_gen/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 + energy_loss: + type: Huber + delta: 1.0 + pt_loss: + type: Huber + delta: 1.0 + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + 
batch_size: 4 + num_events_train: 80000 + num_events_test: 9000 + num_epochs: 100 + num_val_files: 10 + dtype: float32 + trainable: + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + activation: gelu + layernorm: yes + hidden_dim: 256 + bin_size: 320 + distance_dim: 128 + dropout: 0.0 + graph_kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_graph_layers: 5 + node_message: + type: GHConvDense + output_dim: 256 + activation: gelu + normalize_degrees: yes + num_node_messages: 1 + skip_connection: yes + regression_use_classification: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes From d301c5b8e48e29d410eaa4aac573307bb86b7b95 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 27 Aug 2021 10:24:14 +0300 Subject: [PATCH 117/157] more configurable options, use main pars for pipeline test --- mlpf/pipeline.py | 5 +- mlpf/tfmodel/model.py | 132 ++++++++++++++++++------- mlpf/tfmodel/model_setup.py | 2 +- parameters/cms-dev.yaml | 31 ++++++ parameters/cms-gen.yaml | 34 ++++++- parameters/cms.yaml | 31 ++++++ parameters/delphes.yaml | 31 ++++++ scripts/local_test_cms_pipeline.sh | 8 +- scripts/local_test_delphes_pipeline.sh | 6 +- 9 files changed, 233 insertions(+), 47 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index a8d259090..32d7546e1 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -97,11 +97,12 @@ def data(config, customize): @click.option("-w", "--weights", default=None, help="trained weights to load", type=click.Path()) @click.option("--ntrain", default=None, help="override the number of training events", type=int) @click.option("--ntest", default=None, help="override the number of testing events", type=int) +@click.option("--nepochs", default=None, help="override the number of training epochs", type=int) @click.option("-r", "--recreate", help="force creation of new experiment dir", is_flag=True) @click.option("-p", "--prefix", default="", help="prefix to put at beginning of training dir name", type=str) @click.option("--plot-freq", default=1, help="Plot detailed validation every N epochs", type=int) @click.option("--customize", help="customization function", type=str, default=None) -def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq, customize): +def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, customize): try: from comet_ml import Experiment @@ -123,6 +124,8 @@ def train(config, weights, ntrain, ntest, recreate, prefix, plot_freq, customize config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = parse_config( config, ntrain, ntest, weights ) + if nepochs: + n_epochs = nepochs if customize: prefix += customize + "_" diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 02b908c70..ad2ebc5b9 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -390,71 +390,116 @@ def call(self, x_msg, x_node, msk): return bins_split, x_features_binned, dm, msk_f_binned class OutputDecoding(tf.keras.Model): - def __init__(self, activation, hidden_dim, regression_use_classification, num_output_classes, schema, dropout, **kwargs): + def __init__(self, + activation="elu", + 
regression_use_classification=True, + num_output_classes=8, + schema="cms", + dropout=0.0, + + pt_skip_gate=True, + eta_skip_gate=True, + phi_skip_gate=True, + energy_skip_gate=True, + + id_dim_decrease=True, + charge_dim_decrease=True, + pt_dim_decrease=False, + eta_dim_decrease=False, + phi_dim_decrease=False, + energy_dim_decrease=False, + + id_hidden_dim=128, + charge_hidden_dim=128, + pt_hidden_dim=128, + eta_hidden_dim=128, + phi_hidden_dim=128, + energy_hidden_dim=128, + + id_num_layers=4, + charge_num_layers=2, + pt_num_layers=3, + eta_num_layers=3, + phi_num_layers=3, + energy_num_layers=3, + + layernorm=False, + + **kwargs): super(OutputDecoding, self).__init__(**kwargs) self.regression_use_classification = regression_use_classification self.schema = schema self.dropout = dropout + self.pt_skip_gate = pt_skip_gate + self.eta_skip_gate = eta_skip_gate + self.phi_skip_gate = phi_skip_gate + self.energy_skip_gate = energy_skip_gate + + self.do_layernorm = layernorm + if self.do_layernorm: + self.layernorm = tf.keras.layers.LayerNormalization(axis=-1) + self.ffn_id = point_wise_feed_forward_network( - num_output_classes, hidden_dim*4, + num_output_classes, id_hidden_dim, "ffn_cls", dtype=tf.dtypes.float32, - num_layers=4, + num_layers=id_num_layers, activation=activation, - dim_decrease=True, + dim_decrease=id_dim_decrease, dropout=dropout ) self.ffn_charge = point_wise_feed_forward_network( - 1, hidden_dim, + 1, charge_hidden_dim, "ffn_charge", dtype=tf.dtypes.float32, - num_layers=2, + num_layers=charge_num_layers, activation=activation, - dim_decrease=True, + dim_decrease=charge_dim_decrease, dropout=dropout ) self.ffn_pt = point_wise_feed_forward_network( - 2, hidden_dim, "ffn_pt", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, + 2, pt_hidden_dim, "ffn_pt", + dtype=tf.dtypes.float32, num_layers=pt_num_layers, activation=activation, dim_decrease=pt_dim_decrease, dropout=dropout ) self.ffn_eta = point_wise_feed_forward_network( - 2, hidden_dim, "ffn_eta", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, + 2, eta_hidden_dim, "ffn_eta", + dtype=tf.dtypes.float32, num_layers=eta_num_layers, activation=activation, dim_decrease=eta_dim_decrease, dropout=dropout ) self.ffn_phi = point_wise_feed_forward_network( - 4, hidden_dim, "ffn_phi", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, + 4, phi_hidden_dim, "ffn_phi", + dtype=tf.dtypes.float32, num_layers=phi_num_layers, activation=activation, dim_decrease=phi_dim_decrease, dropout=dropout ) self.ffn_energy = point_wise_feed_forward_network( - 2, hidden_dim*4, "ffn_energy", - dtype=tf.dtypes.float32, num_layers=3, activation=activation, dim_decrease=False, + 2, energy_hidden_dim, "ffn_energy", + dtype=tf.dtypes.float32, num_layers=energy_num_layers, activation=activation, dim_decrease=energy_dim_decrease, dropout=dropout ) """ - X_input: (n_batch, n_elements, n_input_features) - X_encoded_id: (n_batch, n_elements, n_encoded_features) - X_encoded_reg: (n_batch, n_elements, n_encoded_features) - msk_input: (n_batch, n_elements) boolean mask + X_input: (n_batch, n_elements, n_input_features) raw node input features + X_encoded: (n_batch, n_elements, n_encoded_features) encoded/transformed node features + msk_input: (n_batch, n_elements) boolean mask of active nodes """ def call(self, args, training=False): X_input, X_encoded, msk_input = args + if self.do_layernorm: + X_encoded = self.layernorm(X_encoded) + out_id_logits = 
self.ffn_id(X_encoded, training)*msk_input out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) out_charge = self.ffn_charge(X_encoded, training)*msk_input - #orig_pt = X_input[:, :, 1:2] orig_eta = X_input[:, :, 2:3] #FIXME: better schema propagation @@ -474,31 +519,44 @@ def call(self, args, training=False): pred_eta_corr = self.ffn_eta(X_encoded, training)*msk_input pred_phi_corr = self.ffn_phi(X_encoded, training)*msk_input - eta_sigmoid = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) - pred_eta = orig_eta + eta_sigmoid*pred_eta_corr[:, :, 1:2] + if self.eta_skip_gate: + eta_gate = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) + else: + eta_gate = 1.0 + pred_eta = orig_eta + eta_gate*pred_eta_corr[:, :, 1:2] + + if self.phi_skip_gate: + sin_phi_gate = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 0:1]) + cos_phi_gate = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 2:3]) + else: + sin_phi_gate = 1.0 + cos_phi_gate = 1.0 - sin_phi_sigmoid = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 0:1]) - cos_phi_sigmoid = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 2:3]) - pred_sin_phi = orig_sin_phi + sin_phi_sigmoid*pred_phi_corr[:, :, 1:2] - pred_cos_phi = orig_cos_phi + cos_phi_sigmoid*pred_phi_corr[:, :, 3:4] + pred_sin_phi = orig_sin_phi + sin_phi_gate*pred_phi_corr[:, :, 1:2] + pred_cos_phi = orig_cos_phi + cos_phi_gate*pred_phi_corr[:, :, 3:4] X_encoded = tf.concat([X_encoded, tf.stop_gradient(pred_eta)], axis=-1) pred_energy_corr = self.ffn_energy(X_encoded, training)*msk_input pred_pt_corr = self.ffn_pt(X_encoded, training)*msk_input - energy_sigmoid = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) + if self.energy_skip_gate: + energy_gate = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) + else: + energy_gate = 1.0 #prediction is pred_log_energy=log(energy + 1.0), energy=exp(pred_log_energy) - 1.0 - #pred_log_energy = orig_log_energy*energy_sigmoid + (1.0 - energy_sigmoid)*pred_energy_corr[:, :, 1:2] - pred_log_energy = orig_log_energy+ energy_sigmoid*pred_energy_corr[:, :, 1:2] + pred_log_energy = orig_log_energy + energy_gate*pred_energy_corr[:, :, 1:2] pred_energy = tf.math.exp(tf.clip_by_value(pred_log_energy, -6, 6)) - 1.0 #compute pt=E/cosh(eta) orig_pt = tf.stop_gradient(pred_energy/tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8))) orig_log_pt = tf.math.log(orig_pt + 1.0) - pt_sigmoid = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) - pred_log_pt = orig_log_pt + pt_sigmoid*pred_pt_corr[:, :, 1:2] + if self.pt_skip_gate: + pt_gate = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) + else: + pt_gate = 1.0 + pred_log_pt = orig_log_pt + pt_gate*pred_pt_corr[:, :, 1:2] msk_output = tf.expand_dims(tf.cast(tf.argmax(out_id_softmax, axis=-1)!=0, tf.float32), axis=-1) @@ -600,8 +658,8 @@ def __init__(self, focal_loss_from_logits=False, graph_kernel={"type": "NodePairGaussianKernel"}, skip_connection=True, - regression_use_classification=True, node_message={"type": "GHConvDense", "activation": "elu", "output_dim": 128, "normalize_degrees": True}, + output_decoding={}, debug=False, schema="cms" ): @@ -611,7 +669,7 @@ def __init__(self, self.activation = activation self.focal_loss_from_logits = focal_loss_from_logits self.debug = debug - self.separate_graph_layers = False + self.do_layernorm = layernorm self.skip_connection = skip_connection @@ -627,8 +685,8 @@ def __init__(self, "max_num_bins": max_num_bins, "bin_size": bin_size, "distance_dim": distance_dim, - "layernorm": layernorm, - "num_node_messages": 
num_node_messages, + "layernorm": self.do_layernorm, + "num_node_messages": self.num_node_messages, "dropout": dropout, "kernel": graph_kernel, "node_message": node_message, @@ -637,7 +695,9 @@ def __init__(self, self.cg = [CombinedGraphLayer(name="cg_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] - self.output_dec = OutputDecoding(self.activation, hidden_dim, regression_use_classification, num_output_classes, schema, dropout) + output_decoding["schema"] = schema + output_decoding["num_output_classes"] = num_output_classes + self.output_dec = OutputDecoding(**output_decoding) def call(self, inputs, training=False): X = inputs diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 469be3d59..dce7d6376 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -401,8 +401,8 @@ def make_gnn_dense(config, dtype): "input_encoding", "graph_kernel", "skip_connection", - "regression_use_classification", "node_message", + "output_decoding", "debug" ] diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 1415bc841..ff0bbbe9d 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -101,6 +101,37 @@ parameters: activation: gelu aggregation_direction: dst num_node_messages: 1 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: yes + eta_skip_gate: yes + phi_skip_gate: yes + energy_skip_gate: no + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: no + eta_dim_decrease: no + phi_dim_decrease: no + energy_dim_decrease: no + + id_hidden_dim: 1024 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 1024 + + id_num_layers: 4 + charge_num_layers: 2 + pt_num_layers: 3 + eta_num_layers: 3 + phi_num_layers: 3 + energy_num_layers: 4 + layernorm: yes skip_connection: yes regression_use_classification: yes debug: no diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml index 2147ff140..8c72a5f1c 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/cms-gen.yaml @@ -57,7 +57,7 @@ setup: train: yes weights: weights_config: - lr: 1e-3 + lr: 1e-4 batch_size: 4 num_events_train: 80000 num_events_test: 9000 @@ -97,8 +97,38 @@ parameters: activation: gelu normalize_degrees: yes num_node_messages: 1 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: yes + eta_skip_gate: yes + phi_skip_gate: yes + energy_skip_gate: no + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: no + eta_dim_decrease: no + phi_dim_decrease: no + energy_dim_decrease: no + + id_hidden_dim: 1024 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 1024 + + id_num_layers: 4 + charge_num_layers: 2 + pt_num_layers: 3 + eta_num_layers: 3 + phi_num_layers: 3 + energy_num_layers: 4 + layernorm: yes skip_connection: yes - regression_use_classification: yes debug: no timing: diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 2fdf498ff..368a72a4a 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -97,6 +97,37 @@ parameters: activation: gelu normalize_degrees: yes num_node_messages: 1 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: yes + eta_skip_gate: yes + phi_skip_gate: yes + energy_skip_gate: no + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: no + eta_dim_decrease: no + phi_dim_decrease: no + 
energy_dim_decrease: no + + id_hidden_dim: 1024 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 1024 + + id_num_layers: 4 + charge_num_layers: 2 + pt_num_layers: 3 + eta_num_layers: 3 + phi_num_layers: 3 + energy_num_layers: 4 + layernorm: yes skip_connection: yes regression_use_classification: yes debug: no diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index 3bf60990b..30601c7d4 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -84,6 +84,37 @@ parameters: output_dim: 256 activation: elu num_node_messages: 1 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: yes + eta_skip_gate: yes + phi_skip_gate: yes + energy_skip_gate: no + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: no + eta_dim_decrease: no + phi_dim_decrease: no + energy_dim_decrease: no + + id_hidden_dim: 1024 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 1024 + + id_num_layers: 4 + charge_num_layers: 2 + pt_num_layers: 3 + eta_num_layers: 3 + phi_num_layers: 3 + energy_num_layers: 4 + layernorm: yes skip_connection: yes regression_use_classification: yes debug: no diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index 20ab50376..92527b24c 100755 --- a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -32,13 +32,13 @@ rm -Rf experiments/test-* #Run a simple training on a few events rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr -python3 mlpf/pipeline.py data -c parameters/test-cms.yaml +python3 mlpf/pipeline.py data -c parameters/cms.yaml #Run a simple training on a few events -python3 mlpf/pipeline.py train -c parameters/test-cms.yaml -p test-cms- +python3 mlpf/pipeline.py train -c parameters/cms.yaml --nepochs 2 --ntrain 5 --ntest 5 #Generate the pred.npz file of predictions -python3 mlpf/pipeline.py evaluate -c parameters/test-cms.yaml -t ./experiments/test-cms-* +python3 mlpf/pipeline.py evaluate -c parameters/cms.yaml -t ./experiments/cms-* #Load the model -python3 scripts/test_load_tfmodel.py ./experiments/test-cms-*/model_frozen/frozen_graph.pb \ No newline at end of file +python3 scripts/test_load_tfmodel.py ./experiments/cms-*/model_frozen/frozen_graph.pb \ No newline at end of file diff --git a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh index 58c550a77..93f7f34ab 100755 --- a/scripts/local_test_delphes_pipeline.sh +++ b/scripts/local_test_delphes_pipeline.sh @@ -17,10 +17,10 @@ rm -Rf experiments/test-* #Run a simple training on a few events rm -Rf data/pythia8_ttbar/tfr -python3 mlpf/pipeline.py data -c parameters/test-delphes.yaml +python3 mlpf/pipeline.py data -c parameters/delphes.yaml #Run a simple training on a few events -python3 mlpf/pipeline.py train -c parameters/test-delphes.yaml -p test-delphes- +python3 mlpf/pipeline.py train -c parameters/delphes.yaml --nepochs 2 --ntrain 5 --ntest 5 #Generate the pred.npz file of predictions -python3 mlpf/pipeline.py evaluate -c parameters/test-delphes.yaml -t ./experiments/test-delphes-* +python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t ./experiments/delphes-* From 204c651034bc0617f54f048da33518144bf6909a Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 27 Aug 2021 10:31:54 +0300 Subject: [PATCH 118/157] fix paths for pipeline --- parameters/cms.yaml | 4 ++-- parameters/delphes.yaml | 4 ++-- 
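A rough illustration of how an output_decoding block like the one above reaches the model (names follow the diff; the YAML values here are made up): the dict from the config is passed through as keyword arguments, with schema and num_output_classes filled in by PFNetDense before the decoder is constructed.

import yaml

cfg = yaml.safe_load("""
output_decoding:
  activation: gelu
  id_hidden_dim: 1024
  pt_num_layers: 3
""")

kwargs = dict(cfg["output_decoding"])
kwargs["schema"] = "cms"            # set by PFNetDense before constructing the decoder
kwargs["num_output_classes"] = 8
# OutputDecoding(**kwargs) then builds one feed-forward head per target (cls, charge, pt, eta, phi, energy)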
scripts/local_test_cms_pipeline.sh | 3 +-- scripts/local_test_delphes_pipeline.sh | 4 ++-- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 368a72a4a..cd3a09e62 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -30,10 +30,10 @@ dataset: sin_phi_loss_coef: 100.0 cos_phi_loss_coef: 100.0 energy_loss_coef: 100.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* energy_loss: type: Huber delta: 1.0 diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index 30601c7d4..90a713665 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -18,10 +18,10 @@ dataset: sin_phi_loss_coef: 100.0 cos_phi_loss_coef: 100.0 energy_loss_coef: 100.0 - raw_path: data/pythia8_ttbar/raw/*.pkl.bz2 + raw_path: data/pythia8_ttbar/raw/*.pkl* processed_path: data/pythia8_ttbar/tfr/*.tfrecords num_files_per_chunk: 5 - validation_file_path: data/pythia8_qcd/val/*.pkl.bz2 + validation_file_path: data/pythia8_qcd/val/*.pkl* energy_loss: type: Huber delta: 1.0 diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index 92527b24c..b0775446c 100755 --- a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -28,10 +28,9 @@ mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/val mv data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/pfntuple_3_0.pkl data/TTbar_14TeV_TuneCUETP8M1_cfi/val/ mkdir -p experiments -rm -Rf experiments/test-* #Run a simple training on a few events -rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr +rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand python3 mlpf/pipeline.py data -c parameters/cms.yaml #Run a simple training on a few events diff --git a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh index 93f7f34ab..69aa4789e 100755 --- a/scripts/local_test_delphes_pipeline.sh +++ b/scripts/local_test_delphes_pipeline.sh @@ -1,19 +1,19 @@ #!/bin/bash set -e -mkdir -p data/pythia8_ttbar +mkdir -p data/pythia8_ttbar/raw mkdir -p data/pythia8_ttbar/val cd data/pythia8_ttbar #download a test input file (you can also download everything from Zenodo at 10.5281/zenodo.4559324) wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_0.pkl.bz2 +mv tev14_pythia8_ttbar_0_0.pkl.bz2 raw/ wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_1.pkl.bz2 mv tev14_pythia8_ttbar_0_1.pkl.bz2 val/ cd ../.. 
mkdir -p experiments -rm -Rf experiments/test-* #Run a simple training on a few events rm -Rf data/pythia8_ttbar/tfr From e9d5bc57b5c1f9397d47159177e591d3d0ea9541 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 27 Aug 2021 10:45:45 +0300 Subject: [PATCH 119/157] fix paths once again --- parameters/cms-dev.yaml | 4 ++-- parameters/cms-gen.yaml | 4 ++-- scripts/local_test_cms_pipeline.sh | 6 ++++-- scripts/local_test_delphes_pipeline.sh | 6 +++--- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index ff0bbbe9d..dd46b1025 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -30,10 +30,10 @@ dataset: sin_phi_loss_coef: 100.0 cos_phi_loss_coef: 100.0 energy_loss_coef: 100.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* energy_loss: type: Huber delta: 1.0 diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml index 8c72a5f1c..8aac1e304 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/cms-gen.yaml @@ -30,10 +30,10 @@ dataset: sin_phi_loss_coef: 100.0 cos_phi_loss_coef: 100.0 energy_loss_coef: 100.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_gen/*.tfrecords num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* energy_loss: type: Huber delta: 1.0 diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index b0775446c..eddfd8471 100755 --- a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -36,8 +36,10 @@ python3 mlpf/pipeline.py data -c parameters/cms.yaml #Run a simple training on a few events python3 mlpf/pipeline.py train -c parameters/cms.yaml --nepochs 2 --ntrain 5 --ntest 5 +ls ./experiments/cms-*/weights/ + #Generate the pred.npz file of predictions -python3 mlpf/pipeline.py evaluate -c parameters/cms.yaml -t ./experiments/cms-* +python3 mlpf/pipeline.py evaluate -c parameters/cms.yaml -t ./experiments/cms_* #Load the model -python3 scripts/test_load_tfmodel.py ./experiments/cms-*/model_frozen/frozen_graph.pb \ No newline at end of file +python3 scripts/test_load_tfmodel.py ./experiments/cms_*/model_frozen/frozen_graph.pb \ No newline at end of file diff --git a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh index 69aa4789e..484233345 100755 --- a/scripts/local_test_delphes_pipeline.sh +++ b/scripts/local_test_delphes_pipeline.sh @@ -8,8 +8,8 @@ cd data/pythia8_ttbar #download a test input file (you can also download everything from Zenodo at 10.5281/zenodo.4559324) wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_0.pkl.bz2 mv tev14_pythia8_ttbar_0_0.pkl.bz2 raw/ -wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_1.pkl.bz2 -mv tev14_pythia8_ttbar_0_1.pkl.bz2 val/ +wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_qcd_10_0.pkl.bz2 +mv tev14_pythia8_qcd_10_0.pkl.bz2 val/ cd ../.. 
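The last step of the CMS test script loads the exported frozen graph via scripts/test_load_tfmodel.py (not shown in this series). A minimal sketch of what loading such a frozen_graph.pb involves, assuming a standard serialized GraphDef; the actual script may do more:

import tensorflow as tf

def load_frozen_graph(path):
    # read the serialized GraphDef written by the freezing step
    with tf.io.gfile.GFile(path, "rb") as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())
    # import it into a fresh graph so its tensors and ops can be inspected
    with tf.Graph().as_default() as graph:
        tf.compat.v1.import_graph_def(graph_def, name="")
    return graph

# graph = load_frozen_graph("experiments/cms_.../model_frozen/frozen_graph.pb")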
@@ -23,4 +23,4 @@ python3 mlpf/pipeline.py data -c parameters/delphes.yaml python3 mlpf/pipeline.py train -c parameters/delphes.yaml --nepochs 2 --ntrain 5 --ntest 5 #Generate the pred.npz file of predictions -python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t ./experiments/delphes-* +python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t ./experiments/delphes_* From 722b41bc14c3d09e5b673834070b89af562cd28b Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 27 Aug 2021 10:54:54 +0300 Subject: [PATCH 120/157] fix --- README_delphes.md | 4 ++-- scripts/local_test_cms_pipeline.sh | 2 +- scripts/local_test_delphes_pipeline.sh | 9 +++++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/README_delphes.md b/README_delphes.md index 859237b9b..89fe9b6c6 100644 --- a/README_delphes.md +++ b/README_delphes.md @@ -17,8 +17,8 @@ python3 mlpf/pipeline.py data -c parameters/delphes.yaml CUDA_VISIBLE_DEVICES=0,1,2,3,4 python3 mlpf/pipeline.py train -c parameters/delphes.yaml #Run the validation to produce the predictions file -python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t experiments/delphes-* -v "data/pythia8_qcd/val/*.pkl.bz2" -e evaluate_qcd -python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t experiments/delphes-* -v "data/pythia8_ttbar/val/*.pkl.bz2" -e evaluate_ttbar +python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t experiments/delphes_* -v "data/pythia8_qcd/val/*.pkl.bz2" -e evaluate_qcd +python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t experiments/delphes_* -v "data/pythia8_ttbar/val/*.pkl.bz2" -e evaluate_ttbar ``` ## Recipe for generation diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index eddfd8471..1e6d12877 100755 --- a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -36,7 +36,7 @@ python3 mlpf/pipeline.py data -c parameters/cms.yaml #Run a simple training on a few events python3 mlpf/pipeline.py train -c parameters/cms.yaml --nepochs 2 --ntrain 5 --ntest 5 -ls ./experiments/cms-*/weights/ +ls ./experiments/cms_*/weights/ #Generate the pred.npz file of predictions python3 mlpf/pipeline.py evaluate -c parameters/cms.yaml -t ./experiments/cms_* diff --git a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh index 484233345..175a08ed4 100755 --- a/scripts/local_test_delphes_pipeline.sh +++ b/scripts/local_test_delphes_pipeline.sh @@ -2,14 +2,13 @@ set -e mkdir -p data/pythia8_ttbar/raw -mkdir -p data/pythia8_ttbar/val -cd data/pythia8_ttbar +mkdir -p data/pythia8_qcd/val #download a test input file (you can also download everything from Zenodo at 10.5281/zenodo.4559324) wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_0.pkl.bz2 -mv tev14_pythia8_ttbar_0_0.pkl.bz2 raw/ +mv tev14_pythia8_ttbar_0_0.pkl.bz2 data/pythia8_ttbar/raw/ wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_qcd_10_0.pkl.bz2 -mv tev14_pythia8_qcd_10_0.pkl.bz2 val/ +mv tev14_pythia8_qcd_10_0.pkl.bz2 data/pythia8_qcd/val/ cd ../.. 
@@ -22,5 +21,7 @@ python3 mlpf/pipeline.py data -c parameters/delphes.yaml #Run a simple training on a few events python3 mlpf/pipeline.py train -c parameters/delphes.yaml --nepochs 2 --ntrain 5 --ntest 5 +ls ./experiments/delphes_*/weights/ + #Generate the pred.npz file of predictions python3 mlpf/pipeline.py evaluate -c parameters/delphes.yaml -t ./experiments/delphes_* From 21270d4c0f1779afa0d145e3a63906bdf543ab54 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 27 Aug 2021 11:05:03 +0300 Subject: [PATCH 121/157] fix --- scripts/local_test_cms_pipeline.sh | 3 +-- scripts/local_test_delphes_pipeline.sh | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index 1e6d12877..4614aef06 100755 --- a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -4,14 +4,13 @@ set -e rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/root -cd data/TTbar_14TeV_TuneCUETP8M1_cfi/root #Only CMS-internal use is permitted by CMS rules! Do not use these MC simulation files otherwise! wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_1.root wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_2.root wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_3.root -cd ../../.. +mv *.root data/TTbar_14TeV_TuneCUETP8M1_cfi/root/ #Create the ntuples rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi/raw diff --git a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh index 175a08ed4..6cf616fef 100755 --- a/scripts/local_test_delphes_pipeline.sh +++ b/scripts/local_test_delphes_pipeline.sh @@ -10,8 +10,6 @@ mv tev14_pythia8_ttbar_0_0.pkl.bz2 data/pythia8_ttbar/raw/ wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_qcd_10_0.pkl.bz2 mv tev14_pythia8_qcd_10_0.pkl.bz2 data/pythia8_qcd/val/ -cd ../.. 
- mkdir -p experiments #Run a simple training on a few events From 5be47dd11e282c5d9be75420e767624542a14e99 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 27 Aug 2021 14:48:47 +0300 Subject: [PATCH 122/157] up --- mlpf/pipeline.py | 5 +++ mlpf/tfmodel/model.py | 82 ++++++++++++++++++++++++------------ mlpf/tfmodel/utils.py | 15 ++++++- notebooks/pfnet-debug.ipynb | 48 ++++----------------- parameters/cms-dev.yaml | 19 +++++---- parameters/cms-gen.yaml | 24 ++++++----- parameters/cms.yaml | 19 +++++---- parameters/delphes.yaml | 23 +++++----- scripts/test_load_tfmodel.py | 2 +- 9 files changed, 127 insertions(+), 110 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 32d7546e1..34c6411b0 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -50,6 +50,7 @@ parse_config, get_best_checkpoint, delete_all_but_best_checkpoint, + classwise_energy_normalization ) from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler @@ -140,6 +141,10 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, dataset_def = get_dataset_def(config) ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) + # for X, y, w in ds_train_r: + # classwise_energy_normalization(X,y,w) + # break + #FIXME: split up training/test and validation dataset and parameters dataset_def.padded_num_elem_size = 6400 diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index ad2ebc5b9..2d82d9eec 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -149,8 +149,8 @@ def call(self, X): Xpt_0p5 = tf.math.sqrt(Xpt) Xpt_2 = tf.math.pow(Xpt, 2) - Xeta1 = tf.expand_dims(tf.sinh(X[:, :, 2]), axis=-1) - Xeta2 = tf.expand_dims(tf.cosh(X[:, :, 2]), axis=-1) + Xeta1 = tf.clip_by_value(tf.expand_dims(tf.sinh(X[:, :, 2]), axis=-1), -10, 10) + Xeta2 = tf.clip_by_value(tf.expand_dims(tf.cosh(X[:, :, 2]), axis=-1), -10, 10) Xabs_eta = tf.expand_dims(tf.math.abs(X[:, :, 2]), axis=-1) Xphi1 = tf.expand_dims(tf.sin(X[:, :, 3]), axis=-1) Xphi2 = tf.expand_dims(tf.cos(X[:, :, 3]), axis=-1) @@ -160,6 +160,8 @@ def call(self, X): Xe_0p5 = tf.math.sqrt(log_energy) Xe_2 = tf.math.pow(log_energy, 2) + Xe_transverse = log_energy - tf.math.log(Xeta2) + Xlayer = tf.expand_dims(X[:, :, 5]*10.0, axis=-1) Xdepth = tf.expand_dims(X[:, :, 6]*10.0, axis=-1) @@ -175,6 +177,7 @@ def call(self, X): Xabs_eta, Xphi1, Xphi2, Xe, Xe_0p5, Xe_2, + Xe_transverse, Xlayer, Xdepth, Xphi_ecal1, Xphi_ecal2, Xphi_hcal1, Xphi_hcal2, @@ -424,8 +427,9 @@ def __init__(self, energy_num_layers=3, layernorm=False, - + mask_reg_cls0=True, **kwargs): + super(OutputDecoding, self).__init__(**kwargs) self.regression_use_classification = regression_use_classification @@ -437,9 +441,11 @@ def __init__(self, self.phi_skip_gate = phi_skip_gate self.energy_skip_gate = energy_skip_gate + self.mask_reg_cls0 = mask_reg_cls0 + self.do_layernorm = layernorm if self.do_layernorm: - self.layernorm = tf.keras.layers.LayerNormalization(axis=-1) + self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, name="output_layernorm") self.ffn_id = point_wise_feed_forward_network( num_output_classes, id_hidden_dim, @@ -484,6 +490,11 @@ def __init__(self, dropout=dropout ) + # self.classwise_energy_means = self.add_weight(shape=(num_output_classes, ), name="classwise_energy_means", + # initializer=tf.keras.initializers.RandomNormal(mean=0, stddev=0.01), trainable=True) + # self.classwise_energy_stds = self.add_weight(shape=(num_output_classes, ), 
name="classwise_energy_stds", + # initializer=tf.keras.initializers.RandomNormal(mean=1, stddev=0.01), trainable=True) + """ X_input: (n_batch, n_elements, n_input_features) raw node input features X_encoded: (n_batch, n_elements, n_encoded_features) encoded/transformed node features @@ -497,7 +508,9 @@ def call(self, args, training=False): X_encoded = self.layernorm(X_encoded) out_id_logits = self.ffn_id(X_encoded, training)*msk_input + out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) + out_id_hard_softmax = tf.clip_by_value(tf.stop_gradient(tf.nn.softmax(10*out_id_logits)), 0, 1) out_charge = self.ffn_charge(X_encoded, training)*msk_input orig_eta = X_input[:, :, 2:3] @@ -514,26 +527,25 @@ def call(self, args, training=False): orig_log_energy = tf.math.log(X_input[:, :, 5:6] + 1.0)*msk_input if self.regression_use_classification: - X_encoded = tf.concat([X_encoded, tf.stop_gradient(out_id_softmax)], axis=-1) + X_encoded = tf.concat([X_encoded, tf.stop_gradient(out_id_logits)], axis=-1) pred_eta_corr = self.ffn_eta(X_encoded, training)*msk_input pred_phi_corr = self.ffn_phi(X_encoded, training)*msk_input if self.eta_skip_gate: eta_gate = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) + pred_eta = orig_eta + eta_gate*pred_eta_corr[:, :, 1:2] else: - eta_gate = 1.0 - pred_eta = orig_eta + eta_gate*pred_eta_corr[:, :, 1:2] - + pred_eta = orig_eta*pred_eta_corr[:, :, 0:1] + eta_gate*pred_eta_corr[:, :, 1:2] + if self.phi_skip_gate: sin_phi_gate = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 0:1]) cos_phi_gate = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 2:3]) + pred_sin_phi = orig_sin_phi + sin_phi_gate*pred_phi_corr[:, :, 1:2] + pred_cos_phi = orig_cos_phi + cos_phi_gate*pred_phi_corr[:, :, 3:4] else: - sin_phi_gate = 1.0 - cos_phi_gate = 1.0 - - pred_sin_phi = orig_sin_phi + sin_phi_gate*pred_phi_corr[:, :, 1:2] - pred_cos_phi = orig_cos_phi + cos_phi_gate*pred_phi_corr[:, :, 3:4] + pred_sin_phi = orig_sin_phi*pred_phi_corr[:, :, 0:1] + sin_phi_gate*pred_phi_corr[:, :, 1:2] + pred_cos_phi = orig_cos_phi*pred_phi_corr[:, :, 2:3] + cos_phi_gate*pred_phi_corr[:, :, 3:4] X_encoded = tf.concat([X_encoded, tf.stop_gradient(pred_eta)], axis=-1) pred_energy_corr = self.ffn_energy(X_encoded, training)*msk_input @@ -541,11 +553,14 @@ def call(self, args, training=False): if self.energy_skip_gate: energy_gate = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) + pred_log_energy = orig_log_energy + energy_gate*pred_energy_corr[:, :, 1:2] else: - energy_gate = 1.0 + pred_log_energy = orig_log_energy*pred_energy_corr[:, :, 0:1] + pred_energy_corr[:, :, 1:2] + + # pred_log_energy = pred_log_energy - tf.reduce_sum(out_id_hard_softmax*self.classwise_energy_means, axis=-1, keepdims=True) + # pred_log_energy = pred_log_energy / tf.reduce_sum(out_id_hard_softmax*self.classwise_energy_stds, axis=-1, keepdims=True) #prediction is pred_log_energy=log(energy + 1.0), energy=exp(pred_log_energy) - 1.0 - pred_log_energy = orig_log_energy + energy_gate*pred_energy_corr[:, :, 1:2] pred_energy = tf.math.exp(tf.clip_by_value(pred_log_energy, -6, 6)) - 1.0 #compute pt=E/cosh(eta) @@ -554,20 +569,27 @@ def call(self, args, training=False): if self.pt_skip_gate: pt_gate = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) + pred_log_pt = orig_log_pt + pt_gate*pred_pt_corr[:, :, 1:2] else: - pt_gate = 1.0 - pred_log_pt = orig_log_pt + pt_gate*pred_pt_corr[:, :, 1:2] - - msk_output = tf.expand_dims(tf.cast(tf.argmax(out_id_softmax, axis=-1)!=0, tf.float32), axis=-1) + 
pred_log_pt = orig_log_pt*pred_pt_corr[:, :, 0:1] + pt_gate*pred_pt_corr[:, :, 1:2] + + if self.mask_reg_cls0: + msk_output = tf.expand_dims(tf.cast(tf.argmax(out_id_hard_softmax, axis=-1)!=0, tf.float32), axis=-1) + out_charge = out_charge*msk_output + pred_log_pt = pred_log_pt*msk_output + pred_eta = pred_eta*msk_output + pred_sin_phi = pred_sin_phi*msk_output + pred_cos_phi = pred_cos_phi*msk_output + pred_log_energy = pred_log_energy*msk_output ret = { "cls": out_id_softmax, - "charge": out_charge*msk_input*msk_output, - "pt": pred_log_pt*msk_input*msk_output, - "eta": pred_eta*msk_input*msk_output, - "sin_phi": pred_sin_phi*msk_input*msk_output, - "cos_phi": pred_cos_phi*msk_input*msk_output, - "energy": pred_log_energy*msk_input*msk_output, + "charge": out_charge*msk_input, + "pt": pred_log_pt*msk_input, + "eta": pred_eta*msk_input, + "sin_phi": pred_sin_phi*msk_input, + "cos_phi": pred_cos_phi*msk_input, + "energy": pred_log_energy*msk_input, } return ret @@ -578,8 +600,14 @@ def set_trainable_named(self, layer_names): for layer in self.layers: layer.trainable = False + layer_names = [l.name for l in self.layers] for layer in layer_names: - self.get_layer(layer).trainable = True + if layer in layer_names: + #it's a layer + self.get_layer(layer).trainable = True + else: + #it's a weight + getattr(self, layer).trainable = True class CombinedGraphLayer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): @@ -595,7 +623,7 @@ def __init__(self, *args, **kwargs): self.hidden_dim = kwargs.pop("hidden_dim") if self.do_layernorm: - self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6) + self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6, name=kwargs.get("name")+"_layernorm") self.ffn_dist = point_wise_feed_forward_network( self.distance_dim, diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 1909f07b9..d6aedb106 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -74,7 +74,7 @@ def delete_all_but_best_checkpoint(train_dir, dry_run): if len(checkpoint_list) == 1: raise UserWarning("There is only one checkpoint. No deletion was made.") elif len(checkpoint_list) == 0: - raise UserWarning("Couldn't find ant checkpoints. No deletion was made.") + raise UserWarning("Couldn't find any checkpoints. 
No deletion was made.") else: # Sort the checkpoints according to the loss in their filenames checkpoint_list.sort(key=lambda x: float(re.search("\d+-\d+.\d+", str(x))[0].split("-")[-1])) @@ -153,7 +153,7 @@ def compute_weights_none(X, y, w): def make_weight_function(config): def weight_func(X,y,w): - w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0) + w_signal_only = tf.where(y[:, 0]==0, 0.0, tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)) w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32) w_none = tf.ones_like(w) @@ -196,6 +196,14 @@ def func(X, y, w): return func +def classwise_energy_normalization(X,y,w): + mean_energies = tf.constant([1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.float32) + + energy_sub = y["cls"]*mean_energies + + import pdb;pdb.set_trace() + + return X,y,w def get_dataset_def(config): cds = config["dataset"] @@ -258,6 +266,9 @@ def get_train_val_datasets(config, global_batch_size, n_train, n_test, repeat=Tr else: dataset_transform = None + # ds_train = ds_train.map(classwise_energy_normalization) + # ds_test = ds_train.map(classwise_energy_normalization) + if repeat: ds_train_r = ds_train.repeat(config["setup"]["num_epochs"]) ds_test_r = ds_test.repeat(config["setup"]["num_epochs"]) diff --git a/notebooks/pfnet-debug.ipynb b/notebooks/pfnet-debug.ipynb index 54426da53..9e5c3b956 100644 --- a/notebooks/pfnet-debug.ipynb +++ b/notebooks/pfnet-debug.ipynb @@ -102,39 +102,10 @@ "metadata": {}, "outputs": [], "source": [ - "vals_a = X_val[:, :, 2].flatten()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vals_b = ycand_val[\"eta\"][:, :, 0].flatten()\n", - "vals_c = ygen_val[\"eta\"][:, :, 0].flatten()\n", - "\n", - "cls_cand = np.argmax(ycand_val[\"cls\"], axis=-1).flatten()\n", - "cls_gen = np.argmax(ygen_val[\"cls\"], axis=-1).flatten()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "msk = (np.argmax(ycand_val[\"cls\"], axis=-1).flatten()==6) & (np.argmax(ygen_val[\"cls\"], axis=-1).flatten()==6)\n", - "plt.scatter(vals_a[msk], vals_c[msk], marker=\".\", alpha=0.2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vals_a[msk]" + "ret = model(X_val[:1])\n", + "#model.set_trainable_classification()\n", + "model.load_weights(\"/home/joosep/particleflow/experiments/cms_20210827_131712.joosep-desktop/weights/weights-07-477.625885.hdf5\")\n", + "ret = model.predict(X_val, batch_size=1, verbose=1)" ] }, { @@ -143,7 +114,8 @@ "metadata": {}, "outputs": [], "source": [ - "vals_b[msk]" + "x = X_val[0]\n", + "msk = x[:, 0] == 8" ] }, { @@ -152,10 +124,7 @@ "metadata": {}, "outputs": [], "source": [ - "ret = model(X_val[:1])\n", - "#model.set_trainable_classification()\n", - "model.load_weights(\"/home/joosep/particleflow/experiments/cms_20210823_110858.joosep-desktop/weights/weights-02-95.751160.hdf5\")\n", - "ret = model.predict(X_val, batch_size=1, verbose=1)" + "model.output_dec.classwise_energy_means" ] }, { @@ -164,8 +133,7 @@ "metadata": {}, "outputs": [], "source": [ - "x = X_val[0]\n", - "msk = x[:, 0] == 8" + "model.output_dec.classwise_energy_stds" ] }, { diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index dd46b1025..e7266ebfd 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -109,31 +109,32 @@ parameters: pt_skip_gate: yes eta_skip_gate: yes phi_skip_gate: yes - energy_skip_gate: no + energy_skip_gate: yes id_dim_decrease: yes charge_dim_decrease: 
yes - pt_dim_decrease: no - eta_dim_decrease: no - phi_dim_decrease: no - energy_dim_decrease: no + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes - id_hidden_dim: 1024 + id_hidden_dim: 256 charge_hidden_dim: 256 pt_hidden_dim: 256 eta_hidden_dim: 256 phi_hidden_dim: 256 - energy_hidden_dim: 1024 + energy_hidden_dim: 256 id_num_layers: 4 charge_num_layers: 2 pt_num_layers: 3 eta_num_layers: 3 phi_num_layers: 3 - energy_num_layers: 4 + energy_num_layers: 3 layernorm: yes + mask_reg_cls0: yes + skip_connection: yes - regression_use_classification: yes debug: no timing: diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml index 8aac1e304..150c26816 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/cms-gen.yaml @@ -57,10 +57,10 @@ setup: train: yes weights: weights_config: - lr: 1e-4 + lr: 1e-3 batch_size: 4 - num_events_train: 80000 - num_events_test: 9000 + num_events_train: 1000 + num_events_test: 100 num_epochs: 100 num_val_files: 10 dtype: float32 @@ -105,29 +105,31 @@ parameters: pt_skip_gate: yes eta_skip_gate: yes phi_skip_gate: yes - energy_skip_gate: no + energy_skip_gate: yes id_dim_decrease: yes charge_dim_decrease: yes - pt_dim_decrease: no - eta_dim_decrease: no - phi_dim_decrease: no - energy_dim_decrease: no + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes - id_hidden_dim: 1024 + id_hidden_dim: 256 charge_hidden_dim: 256 pt_hidden_dim: 256 eta_hidden_dim: 256 phi_hidden_dim: 256 - energy_hidden_dim: 1024 + energy_hidden_dim: 256 id_num_layers: 4 charge_num_layers: 2 pt_num_layers: 3 eta_num_layers: 3 phi_num_layers: 3 - energy_num_layers: 4 + energy_num_layers: 3 layernorm: yes + mask_reg_cls0: yes + skip_connection: yes debug: no diff --git a/parameters/cms.yaml b/parameters/cms.yaml index cd3a09e62..bc4e2c466 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -105,31 +105,32 @@ parameters: pt_skip_gate: yes eta_skip_gate: yes phi_skip_gate: yes - energy_skip_gate: no + energy_skip_gate: yes id_dim_decrease: yes charge_dim_decrease: yes - pt_dim_decrease: no - eta_dim_decrease: no - phi_dim_decrease: no - energy_dim_decrease: no + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes - id_hidden_dim: 1024 + id_hidden_dim: 256 charge_hidden_dim: 256 pt_hidden_dim: 256 eta_hidden_dim: 256 phi_hidden_dim: 256 - energy_hidden_dim: 1024 + energy_hidden_dim: 256 id_num_layers: 4 charge_num_layers: 2 pt_num_layers: 3 eta_num_layers: 3 phi_num_layers: 3 - energy_num_layers: 4 + energy_num_layers: 3 layernorm: yes + mask_reg_cls0: yes + skip_connection: yes - regression_use_classification: yes debug: no timing: diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index 90a713665..b9ac417aa 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -45,7 +45,7 @@ setup: train: yes weights: weights_config: - lr: 1e-4 + lr: 1e-3 batch_size: 4 num_events_train: 45000 num_events_test: 5000 @@ -68,7 +68,7 @@ sample_weights: parameters: model: gnn_dense input_encoding: default - activation: elu + activation: gelu layernorm: yes hidden_dim: 256 bin_size: 320 @@ -83,6 +83,7 @@ parameters: type: GHConvDense output_dim: 256 activation: elu + normalize_degrees: yes num_node_messages: 1 output_decoding: activation: gelu @@ -92,31 +93,31 @@ parameters: pt_skip_gate: yes eta_skip_gate: yes phi_skip_gate: yes - energy_skip_gate: no + energy_skip_gate: yes id_dim_decrease: yes charge_dim_decrease: yes - pt_dim_decrease: 
no - eta_dim_decrease: no - phi_dim_decrease: no - energy_dim_decrease: no + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes - id_hidden_dim: 1024 + id_hidden_dim: 256 charge_hidden_dim: 256 pt_hidden_dim: 256 eta_hidden_dim: 256 phi_hidden_dim: 256 - energy_hidden_dim: 1024 + energy_hidden_dim: 256 id_num_layers: 4 charge_num_layers: 2 pt_num_layers: 3 eta_num_layers: 3 phi_num_layers: 3 - energy_num_layers: 4 + energy_num_layers: 3 layernorm: yes + mask_reg_cls0: yes skip_connection: yes - regression_use_classification: yes debug: no timing: diff --git a/scripts/test_load_tfmodel.py b/scripts/test_load_tfmodel.py index 2e30b6346..754921e2a 100644 --- a/scripts/test_load_tfmodel.py +++ b/scripts/test_load_tfmodel.py @@ -2,7 +2,7 @@ import sys import numpy as np -bin_size = 640 +bin_size = 320 num_features = 15 def load_graph(frozen_graph_filename): From 2b9f218cad7a7d6764ae139e8f6090a68cf858d5 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Fri, 27 Aug 2021 14:56:07 +0200 Subject: [PATCH 123/157] feat: Produce Ray analysis plots in raytune command --- mlpf/pipeline.py | 76 ++++++++++++++++++++++++++++++----- parameters/cms-gnn-dense.yaml | 2 +- 2 files changed, 68 insertions(+), 10 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index c0b47b0b4..00625b5f5 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -14,6 +14,7 @@ from functools import partial import shlex import subprocess +import matplotlib.pyplot as plt import tensorflow as tf from tensorflow.keras import mixed_precision @@ -62,6 +63,7 @@ from ray.tune.integration.keras import TuneReportCheckpointCallback from ray.tune.integration.tensorflow import DistributedTrainableCreator from ray.tune.logger import TBXLoggerCallback +from ray.tune import Analysis @click.group() @@ -474,6 +476,51 @@ def build_model_and_train(config, checkpoint_dir=None, full_config=None): ) +def get_hp_str(result): + def func(key): + if "config" in key: + return key.split("config/")[-1] + s = "" + for ii, hp in enumerate(list(filter(None.__ne__, [func(key) for key in result.keys()]))): + if ii % 6 == 0: + s += "\n" + s += "{}={}; ".format(hp, result["config/{}".format(hp)].values[0]) + return s + +def plot_ray_analysis(analysis, save=False): + to_plot = [ + 'adam_beta_1', 'charge_loss', 'cls_acc_unweighted', 'cls_loss', + 'cos_phi_loss', 'energy_loss', 'eta_loss', 'learning_rate', 'loss', + 'pt_loss', 'sin_phi_loss', 'val_charge_loss', + 'val_cls_acc_unweighted', 'val_cls_acc_weighted', 'val_cls_loss', + 'val_cos_phi_loss', 'val_energy_loss', 'val_eta_loss', 'val_loss', + 'val_pt_loss', 'val_sin_phi_loss', + ] + + dfs = analysis.fetch_trial_dataframes() + result_df = analysis.dataframe() + for key in tqdm(dfs.keys(), desc="Creating Ray analysis plots", total=len(dfs.keys())): + result = result_df[result_df["logdir"] == key] + + fig, axs = plt.subplots(4, 4, figsize=(12, 9), tight_layout=True) + for ax in axs.flat: + ax.label_outer() + + for var, ax in zip(to_plot, axs.flat): + ax.plot(dfs[key].index.values, dfs[key][var], alpha=0.8) + ax.set_xlabel("Epoch") + ax.set_ylabel(var) + ax.grid(alpha=0.3) + plt.suptitle(get_hp_str(result)) + + if save: + plt.savefig(key + "/trial_summary.jpg") + if not save: + plt.show() + else: + print("Saved plots in trial dirs.") + + @main.command() @click.help_option("-h", "--help") @click.option("-c", "--config", help="configuration file", type=click.Path()) @@ -481,11 +528,19 @@ def build_model_and_train(config, checkpoint_dir=None, full_config=None): 
@click.option("-l", "--local", help="run locally", is_flag=True) @click.option("--cpus", help="number of cpus per worker", type=int, default=1) @click.option("--gpus", help="number of gpus per worker", type=int, default=0) -def raytune(config, name, local, cpus, gpus): +@click.option("--tune_result_dir", help="Tune result dir", type=str, default=None) +def raytune(config, name, local, cpus, gpus, tune_result_dir): cfg = load_config(config) + config_file_path = config + + if tune_result_dir is not None: + os.environ["TUNE_RESULT_DIR"] = tune_result_dir + else: + trd = cfg["raytune"]["local_dir"] + "/tune_result_dir" + os.environ["TUNE_RESULT_DIR"] = trd + if not local: ray.init(address='auto') - config_file_path = config search_space = { # Optimizer parameters @@ -520,22 +575,25 @@ def raytune(config, name, local, cpus, gpus): config=search_space, name=name, scheduler=sched, - # metric="val_loss", - # mode="min", - # stop={"training_iteration": 32}, num_samples=1, - # resources_per_trial={ - # "cpu": 16, - # "gpu": 4 - # }, local_dir=cfg["raytune"]["local_dir"], callbacks=[TBXLoggerCallback()], log_to_file=True, ) print("Best hyperparameters found were: ", analysis.get_best_config("val_loss", "min")) + plot_ray_analysis(analysis, save=True) ray.shutdown() +@main.command() +@click.help_option("-h", "--help") +@click.option("-d", "--exp_dir", help="experiment dir", type=click.Path()) +@click.option("-s", "--save", help="save plots in trial dirs", is_flag=True) +def raytune_analysis(exp_dir, save): + analysis = Analysis(exp_dir) + plot_ray_analysis(analysis, save=save) + + if __name__ == "__main__": main() diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index e691827dd..e61024689 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -125,7 +125,7 @@ hypertune: executions_per_trial: 1 raytune: - local_dir: + local_dir: # Note: please specify an absolute path sched: "asha" # asha, hyperband parameters: # optimizer parameters From 0956d54ae5f9f0ff3db46ff3fb34c973e11c14e4 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Fri, 27 Aug 2021 14:57:30 +0200 Subject: [PATCH 124/157] fix: Add check for empty CUDA_VISIBLE_DEVICES --- mlpf/tfmodel/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index baf00952b..e886b6cbb 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -89,7 +89,12 @@ def delete_all_but_best_checkpoint(train_dir, dry_run): def get_strategy(global_batch_size): - gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "-1").split(",")] + if isinstance(os.environ.get("CUDA_VISIBLE_DEVICES"), type(None)) or len(os.environ.get("CUDA_VISIBLE_DEVICES")) == 0: + gpus = [-1] + print("WARNING: CUDA_VISIBLE_DEVICES variable is empty. 
\ + If you don't have or intend to use GPUs, this message can be ignored.") + else: + gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "-1").split(",")] if gpus[0] == -1: num_gpus = 0 else: From 9f22cb6d64194172b47a9b3315d6941503e116be Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 27 Aug 2021 16:41:13 +0300 Subject: [PATCH 125/157] fix num events --- parameters/cms-dev.yaml | 6 +++--- parameters/cms-gen.yaml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index e7266ebfd..61f0ead20 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -2,7 +2,7 @@ backend: tensorflow dataset: schema: cms - target_particles: cand + target_particles: gen num_input_features: 15 num_output_features: 7 # NONE = 0, @@ -31,7 +31,7 @@ dataset: cos_phi_loss_coef: 100.0 energy_loss_coef: 100.0 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_gen/*.tfrecords num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* energy_loss: @@ -60,7 +60,7 @@ setup: lr: 1e-3 batch_size: 4 num_events_train: 80000 - num_events_test: 10000 + num_events_test: 9000 num_epochs: 100 num_val_files: 10 dtype: float32 diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml index 150c26816..109205177 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/cms-gen.yaml @@ -59,8 +59,8 @@ setup: weights_config: lr: 1e-3 batch_size: 4 - num_events_train: 1000 - num_events_test: 100 + num_events_train: 80000 + num_events_test: 9000 num_epochs: 100 num_val_files: 10 dtype: float32 From b15b6d75274642f3bc15fd68e906fb92a360fbc0 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 27 Aug 2021 16:47:38 +0300 Subject: [PATCH 126/157] change epochs --- mlpf/tallinn/cms-gen.sh | 2 +- parameters/cms-gen.yaml | 2 +- parameters/cms.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlpf/tallinn/cms-gen.sh b/mlpf/tallinn/cms-gen.sh index f119a0df1..d2eddde9f 100755 --- a/mlpf/tallinn/cms-gen.sh +++ b/mlpf/tallinn/cms-gen.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH -p gpu -#SBATCH --gpus 1 +#SBATCH --gpus 4 #SBATCH --mem-per-gpu=8G IMG=/home/software/singularity/base.simg:latest diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml index 109205177..cf4a5097a 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/cms-gen.yaml @@ -61,7 +61,7 @@ setup: batch_size: 4 num_events_train: 80000 num_events_test: 9000 - num_epochs: 100 + num_epochs: 40 num_val_files: 10 dtype: float32 trainable: diff --git a/parameters/cms.yaml b/parameters/cms.yaml index bc4e2c466..b066fd947 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -61,7 +61,7 @@ setup: batch_size: 4 num_events_train: 80000 num_events_test: 10000 - num_epochs: 100 + num_epochs: 40 num_val_files: 10 dtype: float32 trainable: From 88cf8583257a9810ed01d0537fad7cac2f2c20f6 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 27 Aug 2021 17:30:13 +0300 Subject: [PATCH 127/157] add microsecond for simultaneous start --- mlpf/tfmodel/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index d6aedb106..056c05e9f 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -48,9 +48,9 @@ def parse_config(config, ntrain=None, ntest=None, weights=None): def create_experiment_dir(prefix=None, suffix=None): if prefix is None: - 
train_dir = Path("experiments") / datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + train_dir = Path("experiments") / datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f") else: - train_dir = Path("experiments") / (prefix + datetime.datetime.now().strftime("%Y%m%d_%H%M%S")) + train_dir = Path("experiments") / (prefix + datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")) if suffix is not None: train_dir = train_dir.with_name(train_dir.name + "." + platform.node()) From 4261cf4f4501475d11a54956a543428b87a58975 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 27 Aug 2021 22:53:45 +0300 Subject: [PATCH 128/157] added additional energy graph layer --- mlpf/tfmodel/model.py | 55 ++++++++++++++++++--------- mlpf/tfmodel/model_setup.py | 4 +- mlpf/tfmodel/utils.py | 2 +- notebooks/pfnet-debug.ipynb | 74 ++++++++++++++++++++++++++++++++++++- parameters/cms-gen.yaml | 6 +-- parameters/cms.yaml | 13 ++++--- parameters/delphes.yaml | 3 +- 7 files changed, 127 insertions(+), 30 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 2d82d9eec..913aa791d 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -485,15 +485,15 @@ def __init__(self, ) self.ffn_energy = point_wise_feed_forward_network( - 2, energy_hidden_dim, "ffn_energy", + 4, energy_hidden_dim, "ffn_energy", dtype=tf.dtypes.float32, num_layers=energy_num_layers, activation=activation, dim_decrease=energy_dim_decrease, dropout=dropout ) - # self.classwise_energy_means = self.add_weight(shape=(num_output_classes, ), name="classwise_energy_means", - # initializer=tf.keras.initializers.RandomNormal(mean=0, stddev=0.01), trainable=True) - # self.classwise_energy_stds = self.add_weight(shape=(num_output_classes, ), name="classwise_energy_stds", - # initializer=tf.keras.initializers.RandomNormal(mean=1, stddev=0.01), trainable=True) + self.classwise_energy_means = self.add_weight(shape=(num_output_classes, ), name="classwise_energy_means", + initializer=tf.keras.initializers.RandomNormal(mean=0, stddev=0.1), trainable=True) + self.classwise_energy_stds = self.add_weight(shape=(num_output_classes, ), name="classwise_energy_stds", + initializer=tf.keras.initializers.RandomNormal(mean=1, stddev=0.1), trainable=True) """ X_input: (n_batch, n_elements, n_input_features) raw node input features @@ -502,15 +502,15 @@ def __init__(self, """ def call(self, args, training=False): - X_input, X_encoded, msk_input = args + X_input, X_encoded, X_encoded_energy, msk_input = args if self.do_layernorm: X_encoded = self.layernorm(X_encoded) out_id_logits = self.ffn_id(X_encoded, training)*msk_input - out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits), 0, 1) - out_id_hard_softmax = tf.clip_by_value(tf.stop_gradient(tf.nn.softmax(10*out_id_logits)), 0, 1) + out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits, axis=-1), 0, 1) + out_id_hard_softmax = tf.clip_by_value(tf.stop_gradient(tf.nn.softmax(10*out_id_logits, axis=-1)), 0, 1) out_charge = self.ffn_charge(X_encoded, training)*msk_input orig_eta = X_input[:, :, 2:3] @@ -547,18 +547,23 @@ def call(self, args, training=False): pred_sin_phi = orig_sin_phi*pred_phi_corr[:, :, 0:1] + sin_phi_gate*pred_phi_corr[:, :, 1:2] pred_cos_phi = orig_cos_phi*pred_phi_corr[:, :, 2:3] + cos_phi_gate*pred_phi_corr[:, :, 3:4] - X_encoded = tf.concat([X_encoded, tf.stop_gradient(pred_eta)], axis=-1) - pred_energy_corr = self.ffn_energy(X_encoded, training)*msk_input - pred_pt_corr = self.ffn_pt(X_encoded, training)*msk_input + if self.regression_use_classification: + 
X_encoded_energy = tf.concat([X_encoded_energy, tf.stop_gradient(out_id_logits)], axis=-1) + + pred_energy_corr = self.ffn_energy(X_encoded_energy, training)*msk_input + pred_pt_corr = self.ffn_pt(X_encoded_energy, training)*msk_input if self.energy_skip_gate: energy_gate = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) - pred_log_energy = orig_log_energy + energy_gate*pred_energy_corr[:, :, 1:2] + energy_corr = energy_gate*pred_energy_corr[:, :, 1:2] + energy_corr = energy_corr - tf.reduce_sum(out_id_hard_softmax*self.classwise_energy_means, axis=-1, keepdims=True) + energy_corr = energy_corr / tf.reduce_sum(out_id_hard_softmax*self.classwise_energy_stds, axis=-1, keepdims=True) + pred_log_energy = orig_log_energy + energy_corr else: - pred_log_energy = orig_log_energy*pred_energy_corr[:, :, 0:1] + pred_energy_corr[:, :, 1:2] - - # pred_log_energy = pred_log_energy - tf.reduce_sum(out_id_hard_softmax*self.classwise_energy_means, axis=-1, keepdims=True) - # pred_log_energy = pred_log_energy / tf.reduce_sum(out_id_hard_softmax*self.classwise_energy_stds, axis=-1, keepdims=True) + #pred_log_energy = orig_log_energy*pred_energy_corr[:, :, 0:1] + pred_energy_corr[:, :, 1:2] + pred_log_energy = pred_energy_corr[:, :, 0:1] + pred_energy_corr[:, :, 1:2]*orig_log_energy + pred_energy_corr[:, :, 2:3]*orig_log_energy*orig_log_energy + pred_energy_corr[:, :, 3:4]*tf.math.sqrt(orig_log_energy) + pred_log_energy = pred_log_energy - tf.reduce_sum(out_id_hard_softmax*self.classwise_energy_means, axis=-1, keepdims=True) + pred_log_energy = pred_log_energy / tf.reduce_sum(out_id_hard_softmax*self.classwise_energy_stds, axis=-1, keepdims=True) #prediction is pred_log_energy=log(energy + 1.0), energy=exp(pred_log_energy) - 1.0 pred_energy = tf.math.exp(tf.clip_by_value(pred_log_energy, -6, 6)) - 1.0 @@ -681,6 +686,7 @@ def __init__(self, activation=tf.keras.activations.elu, num_node_messages=2, num_graph_layers=1, + num_graph_layers_energy=1, dropout=0.0, input_encoding="cms", focal_loss_from_logits=False, @@ -703,6 +709,7 @@ def __init__(self, self.num_node_messages = num_node_messages self.num_graph_layers = num_graph_layers + self.num_graph_layers_energy = num_graph_layers_energy if input_encoding == "cms": self.enc = InputEncodingCMS(num_input_classes) @@ -722,6 +729,7 @@ def __init__(self, } self.cg = [CombinedGraphLayer(name="cg_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] + self.cg_energy = [CombinedGraphLayer(name="cg_energy_{}".format(i), **kwargs_cg) for i in range(num_graph_layers_energy)] output_decoding["schema"] = schema output_decoding["num_output_classes"] = num_output_classes @@ -755,7 +763,20 @@ def call(self, inputs, training=False): if self.debug: debugging_data["dec_output"] = dec_output - ret = self.output_dec([X, dec_output, msk_input], training) + enc_cg = enc + encs_energy = [] + for cg in self.cg_energy: + enc_all = cg(enc_cg, msk, training) + enc_cg = enc_all["enc"] + if self.debug: + debugging_data[cg.name] = enc_all + encs_energy.append(enc_cg) + + dec_output_energy = tf.concat(encs_energy, axis=-1)*msk_input + if self.debug: + debugging_data["dec_output_energy"] = dec_output + + ret = self.output_dec([X, dec_output, dec_output_energy, msk_input], training) if self.debug: for k in debugging_data.keys(): diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index dce7d6376..6d0a62789 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -272,7 +272,8 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, 
icls, reg_variable, lo residual = vals_true - vals_pred residual[np.isnan(residual)] = 0 residual[np.isinf(residual)] = 0 - plt.hist(residual, bins=100) + plt.hist(residual, bins=np.linspace(-2,2,100)) + plt.yscale("log") plt.xlabel("true - pred") plt.title("{} residual, m={:.4f} s={:.4f}".format(reg_variable, np.mean(residual), np.std(residual))) @@ -396,6 +397,7 @@ def make_gnn_dense(config, dtype): "bin_size", "num_node_messages", "num_graph_layers", + "num_graph_layers_energy", "distance_dim", "dropout", "input_encoding", diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index d6aedb106..f36cbc288 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -153,7 +153,7 @@ def compute_weights_none(X, y, w): def make_weight_function(config): def weight_func(X,y,w): - w_signal_only = tf.where(y[:, 0]==0, 0.0, tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)) + w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0) w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32) w_none = tf.ones_like(w) diff --git a/notebooks/pfnet-debug.ipynb b/notebooks/pfnet-debug.ipynb index 9e5c3b956..2ec37eb0d 100644 --- a/notebooks/pfnet-debug.ipynb +++ b/notebooks/pfnet-debug.ipynb @@ -80,7 +80,7 @@ "ygens = []\n", "ycands = []\n", "\n", - "for fi in dataset_def.val_filelist[:2]:\n", + "for fi in dataset_def.val_filelist[:100]:\n", " print(fi)\n", " X, ygen, ycand = dataset_def.prepare_data(fi)\n", "\n", @@ -96,6 +96,78 @@ "X_val, ygen_val, _ = dataset_transform(X_val, ygen_val, None)\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cls_cand = np.argmax(ycand_val[\"cls\"], axis=-1)\n", + "cls_gen = np.argmax(ygen_val[\"cls\"], axis=-1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cm = sklearn.metrics.confusion_matrix(cls_gen[X_val[:, :, 0]!=0], cls_cand[X_val[:, :, 0]!=0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cls_id = 5\n", + "ngen = np.sum(cls_gen==cls_id, axis=1)\n", + "ncand = np.sum(cls_cand==cls_id, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.scatter(ngen, ncand)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cls_id = 4\n", + "variable = \"sin_phi\"\n", + "gen_energy = ygen_val[variable][(cls_cand==cls_id) & (cls_gen==cls_id)][:, 0].numpy()\n", + "cand_energy = ycand_val[variable][(cls_cand==cls_id) & (cls_gen==cls_id)][:, 0].numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "residual = gen_energy-cand_energy\n", + "plt.hist(residual, bins=100);\n", + "plt.xlabel(\"gen - PF\")\n", + "print(np.mean(residual), np.std(residual))" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml index cf4a5097a..20fa2dfa9 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/cms-gen.yaml @@ -57,7 +57,7 @@ setup: train: yes weights: weights_config: - lr: 1e-3 + lr: 1e-4 batch_size: 4 num_events_train: 80000 num_events_test: 9000 @@ -85,7 +85,7 @@ parameters: hidden_dim: 256 bin_size: 320 distance_dim: 128 - dropout: 0.0 + dropout: 0.2 graph_kernel: type: NodePairGaussianKernel dist_mult: 0.1 
@@ -100,7 +100,7 @@ parameters: output_decoding: activation: gelu regression_use_classification: yes - dropout: 0.0 + dropout: 0.2 pt_skip_gate: yes eta_skip_gate: yes diff --git a/parameters/cms.yaml b/parameters/cms.yaml index b066fd947..aa9eb7982 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -81,7 +81,7 @@ parameters: model: gnn_dense input_encoding: cms activation: gelu - layernorm: yes + layernorm: no hidden_dim: 256 bin_size: 320 distance_dim: 128 @@ -90,7 +90,8 @@ parameters: type: NodePairGaussianKernel dist_mult: 0.1 clip_value_low: 0.0 - num_graph_layers: 5 + num_graph_layers: 3 + num_graph_layers_energy: 3 node_message: type: GHConvDense output_dim: 256 @@ -105,7 +106,7 @@ parameters: pt_skip_gate: yes eta_skip_gate: yes phi_skip_gate: yes - energy_skip_gate: yes + energy_skip_gate: no id_dim_decrease: yes charge_dim_decrease: yes @@ -119,7 +120,7 @@ parameters: pt_hidden_dim: 256 eta_hidden_dim: 256 phi_hidden_dim: 256 - energy_hidden_dim: 256 + energy_hidden_dim: 1024 id_num_layers: 4 charge_num_layers: 2 @@ -127,8 +128,8 @@ parameters: eta_num_layers: 3 phi_num_layers: 3 energy_num_layers: 3 - layernorm: yes - mask_reg_cls0: yes + layernorm: no + mask_reg_cls0: no skip_connection: yes debug: no diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index b9ac417aa..c19cfc191 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -45,7 +45,7 @@ setup: train: yes weights: weights_config: - lr: 1e-3 + lr: 1e-4 batch_size: 4 num_events_train: 45000 num_events_test: 5000 @@ -79,6 +79,7 @@ parameters: dist_mult: 0.1 clip_value_low: 0.0 num_graph_layers: 5 + num_graph_layers_energy: 1 node_message: type: GHConvDense output_dim: 256 From 7b7cce15efeb2fa0c2b3499504bd33ff057029e8 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sat, 28 Aug 2021 17:55:17 +0300 Subject: [PATCH 129/157] custom train step for regression --- mlpf/tfmodel/model.py | 138 ++++++--------- mlpf/tfmodel/model_setup.py | 19 +- mlpf/tfmodel/utils.py | 2 +- notebooks/pfnet-debug.ipynb | 336 +++++++++++++++++------------------- parameters/cms.yaml | 46 ++--- parameters/delphes.yaml | 40 +++-- 6 files changed, 267 insertions(+), 314 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 913aa791d..d183a25a7 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -490,10 +490,11 @@ def __init__(self, dropout=dropout ) - self.classwise_energy_means = self.add_weight(shape=(num_output_classes, ), name="classwise_energy_means", - initializer=tf.keras.initializers.RandomNormal(mean=0, stddev=0.1), trainable=True) - self.classwise_energy_stds = self.add_weight(shape=(num_output_classes, ), name="classwise_energy_stds", - initializer=tf.keras.initializers.RandomNormal(mean=1, stddev=0.1), trainable=True) + if not self.energy_skip_gate: + self.classwise_energy_means = self.add_weight(shape=(num_output_classes, ), name="classwise_energy_means", + initializer=tf.keras.initializers.RandomNormal(mean=0, stddev=0.1), trainable=True) + self.classwise_energy_stds = self.add_weight(shape=(num_output_classes, ), name="classwise_energy_stds", + initializer=tf.keras.initializers.RandomNormal(mean=1, stddev=0.1), trainable=True) """ X_input: (n_batch, n_elements, n_input_features) raw node input features @@ -507,11 +508,11 @@ def call(self, args, training=False): if self.do_layernorm: X_encoded = self.layernorm(X_encoded) - out_id_logits = self.ffn_id(X_encoded, training)*msk_input + out_id_logits = self.ffn_id(X_encoded, training=training)*msk_input out_id_softmax = 
tf.clip_by_value(tf.nn.softmax(out_id_logits, axis=-1), 0, 1) - out_id_hard_softmax = tf.clip_by_value(tf.stop_gradient(tf.nn.softmax(10*out_id_logits, axis=-1)), 0, 1) - out_charge = self.ffn_charge(X_encoded, training)*msk_input + out_id_hard_softmax = tf.clip_by_value(tf.stop_gradient(tf.nn.softmax(100*out_id_logits, axis=-1)), 0, 1) + out_charge = self.ffn_charge(X_encoded, training=training)*msk_input orig_eta = X_input[:, :, 2:3] @@ -529,8 +530,8 @@ def call(self, args, training=False): if self.regression_use_classification: X_encoded = tf.concat([X_encoded, tf.stop_gradient(out_id_logits)], axis=-1) - pred_eta_corr = self.ffn_eta(X_encoded, training)*msk_input - pred_phi_corr = self.ffn_phi(X_encoded, training)*msk_input + pred_eta_corr = self.ffn_eta(X_encoded, training=training)*msk_input + pred_phi_corr = self.ffn_phi(X_encoded, training=training)*msk_input if self.eta_skip_gate: eta_gate = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) @@ -550,14 +551,12 @@ def call(self, args, training=False): if self.regression_use_classification: X_encoded_energy = tf.concat([X_encoded_energy, tf.stop_gradient(out_id_logits)], axis=-1) - pred_energy_corr = self.ffn_energy(X_encoded_energy, training)*msk_input - pred_pt_corr = self.ffn_pt(X_encoded_energy, training)*msk_input + pred_energy_corr = self.ffn_energy(X_encoded_energy, training=training)*msk_input + pred_pt_corr = self.ffn_pt(X_encoded_energy, training=training)*msk_input if self.energy_skip_gate: energy_gate = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) energy_corr = energy_gate*pred_energy_corr[:, :, 1:2] - energy_corr = energy_corr - tf.reduce_sum(out_id_hard_softmax*self.classwise_energy_means, axis=-1, keepdims=True) - energy_corr = energy_corr / tf.reduce_sum(out_id_hard_softmax*self.classwise_energy_stds, axis=-1, keepdims=True) pred_log_energy = orig_log_energy + energy_corr else: #pred_log_energy = orig_log_energy*pred_energy_corr[:, :, 0:1] + pred_energy_corr[:, :, 1:2] @@ -626,15 +625,17 @@ def __init__(self, *args, **kwargs): self.kernel = kwargs.pop("kernel") self.node_message = kwargs.pop("node_message") self.hidden_dim = kwargs.pop("hidden_dim") + self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) if self.do_layernorm: self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6, name=kwargs.get("name")+"_layernorm") + #self.gaussian_noise = tf.keras.layers.GaussianNoise(0.01) self.ffn_dist = point_wise_feed_forward_network( self.distance_dim, self.hidden_dim, kwargs.get("name") + "_ffn_dist", - num_layers=2, activation="elu", + num_layers=2, activation=self.activation, dropout=self.dropout ) self.message_building_layer = MessageBuildingLayerLSH( @@ -658,16 +659,20 @@ def call(self, x, msk, training=False): x = self.layernorm(x, training=training) #compute node features for graph building - x_dist = self.ffn_dist(x) + x_dist = self.activation(self.ffn_dist(x, training=training)) + #x_dist = self.gaussian_noise(x_dist, training=training) #compute the element-to-element messages / distance matrix / graph structure bins_split, x_binned, dm, msk_binned = self.message_building_layer(x_dist, x, msk) #run the node update with message passing for msg in self.message_passing_layers: x_binned = msg((x_binned, dm, msk_binned)) + + #x_binned = self.gaussian_noise(x_binned, training=training) + if self.dropout_layer: - x_binned = self.dropout_layer(x_binned, training) + x_binned = self.dropout_layer(x_binned, training=training) x_enc = reverse_lsh(bins_split, x_binned) @@ 
-678,21 +683,13 @@ def __init__(self, multi_output=False, num_input_classes=8, num_output_classes=3, - max_num_bins=200, - bin_size=320, - distance_dim=128, - hidden_dim=256, - layernorm=False, - activation=tf.keras.activations.elu, - num_node_messages=2, - num_graph_layers=1, + num_graph_layers_common=1, num_graph_layers_energy=1, - dropout=0.0, input_encoding="cms", - focal_loss_from_logits=False, - graph_kernel={"type": "NodePairGaussianKernel"}, skip_connection=True, - node_message={"type": "GHConvDense", "activation": "elu", "output_dim": 128, "normalize_degrees": True}, + graph_kernel={}, + combined_graph_layer={}, + node_message={}, output_decoding={}, debug=False, schema="cms" @@ -700,36 +697,17 @@ def __init__(self, super(PFNetDense, self).__init__() self.multi_output = multi_output - self.activation = activation - self.focal_loss_from_logits = focal_loss_from_logits self.debug = debug - self.do_layernorm = layernorm self.skip_connection = skip_connection - self.num_node_messages = num_node_messages - self.num_graph_layers = num_graph_layers - self.num_graph_layers_energy = num_graph_layers_energy - if input_encoding == "cms": self.enc = InputEncodingCMS(num_input_classes) elif input_encoding == "default": self.enc = InputEncoding(num_input_classes) - kwargs_cg = { - "max_num_bins": max_num_bins, - "bin_size": bin_size, - "distance_dim": distance_dim, - "layernorm": self.do_layernorm, - "num_node_messages": self.num_node_messages, - "dropout": dropout, - "kernel": graph_kernel, - "node_message": node_message, - "hidden_dim": hidden_dim - } - - self.cg = [CombinedGraphLayer(name="cg_{}".format(i), **kwargs_cg) for i in range(num_graph_layers)] - self.cg_energy = [CombinedGraphLayer(name="cg_energy_{}".format(i), **kwargs_cg) for i in range(num_graph_layers_energy)] + self.cg = [CombinedGraphLayer(name="cg_{}".format(i), **combined_graph_layer) for i in range(num_graph_layers_common)] + self.cg_energy = [CombinedGraphLayer(name="cg_energy_{}".format(i), **combined_graph_layer) for i in range(num_graph_layers_energy)] output_decoding["schema"] = schema output_decoding["num_output_classes"] = num_output_classes @@ -774,7 +752,7 @@ def call(self, inputs, training=False): dec_output_energy = tf.concat(encs_energy, axis=-1)*msk_input if self.debug: - debugging_data["dec_output_energy"] = dec_output + debugging_data["dec_output_energy"] = dec_output_energy ret = self.output_dec([X, dec_output, dec_output_energy, msk_input], training) @@ -795,37 +773,35 @@ def set_trainable_named(self, layer_names): self.output_dec.set_trainable_named(layer_names) - ##for eager mode debugging - # def train_step(self, data): - # # Unpack the data. Its structure depends on your model and - # # on what you pass to `fit()`. 
- # x, y, sample_weights = data - - # with tf.GradientTape() as tape: - # y_pred = self(x, training=True) # Forward pass - # # Compute the loss value - # # (the loss function is configured in `compile()`) - # loss = self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) - # import pdb;pdb.set_trace() - - # ya = {k: v.numpy() for k, v in y.items()} - # yb = {k: v.numpy() for k, v in y_pred.items()} - # sw = {k: v.numpy() for k, v in sample_weights.items()} - - # np.savez("ytrue.npz", **ya) - # np.savez("ypred.npz", **yb) - # np.savez("x.npz", x=x) - # np.savez("sample_weights.npz", **sample_weights) - - # # Compute gradients - # trainable_vars = self.trainable_variables - # gradients = tape.gradient(loss, trainable_vars) - # # Update weights - # self.optimizer.apply_gradients(zip(gradients, trainable_vars)) - # # Update metrics (includes the metric that tracks the loss) - # self.compiled_metrics.update_state(y, y_pred) - # # Return a dict mapping metric names to current value - # return {m.name: m.result() for m in self.metrics} + def train_step(self, data): + # Unpack the data. Its structure depends on your model and + # on what you pass to `fit()`. + x, y, sample_weights = data + + with tf.GradientTape() as tape: + y_pred = self(x, training=True) # Forward pass + + #regression losses computed only for correctly classified particles + pred_cls = tf.argmax(y_pred["cls"], axis=-1) + true_cls = tf.argmax(y["cls"], axis=-1) + msk_loss = tf.cast((pred_cls==true_cls) & (true_cls!=0), tf.float32) + sample_weights["energy"] *= msk_loss + sample_weights["pt"] *= msk_loss + sample_weights["eta"] *= msk_loss + sample_weights["sin_phi"] *= msk_loss + sample_weights["cos_phi"] *= msk_loss + + loss = self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) + + # Compute gradients + trainable_vars = self.trainable_variables + gradients = tape.gradient(loss, trainable_vars) + # Update weights + self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + # Update metrics (includes the metric that tracks the loss) + self.compiled_metrics.update_state(y, y_pred) + # Return a dict mapping metric names to current value + return {m.name: m.result() for m in self.metrics} class DummyNet(tf.keras.Model): def __init__(self, diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 6d0a62789..f8277d6c9 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -215,13 +215,7 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, lo vals_pred = ypred[reg_variable][msk][sel].flatten() vals_true = self.ytrue[reg_variable][msk][sel].flatten() - #FIXME: propagate from configuration - if reg_variable == "energy" or reg_variable == "pt": - delta = 1.0 - else: - delta = 0.1 - - loss = tf.keras.losses.Huber(delta=delta, reduction=tf.keras.losses.Reduction.NONE) + loss = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE) loss_vals = loss(np.expand_dims(vals_true, -1), np.expand_dims(vals_pred, axis=-1)).numpy() #suffix for log-transformed variable @@ -392,19 +386,12 @@ def make_model(config, dtype): def make_gnn_dense(config, dtype): parameters = [ - "layernorm", - "hidden_dim", - "bin_size", - "num_node_messages", - "num_graph_layers", + "num_graph_layers_common", "num_graph_layers_energy", - "distance_dim", - "dropout", "input_encoding", - "graph_kernel", "skip_connection", - "node_message", "output_decoding", + "combined_graph_layer", "debug" ] diff --git a/mlpf/tfmodel/utils.py 
b/mlpf/tfmodel/utils.py index c34562ccc..056c05e9f 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -153,7 +153,7 @@ def compute_weights_none(X, y, w): def make_weight_function(config): def weight_func(X,y,w): - w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0) + w_signal_only = tf.where(y[:, 0]==0, 0.0, tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)) w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32) w_none = tf.ones_like(w) diff --git a/notebooks/pfnet-debug.ipynb b/notebooks/pfnet-debug.ipynb index 2ec37eb0d..abecfb13f 100644 --- a/notebooks/pfnet-debug.ipynb +++ b/notebooks/pfnet-debug.ipynb @@ -80,7 +80,7 @@ "ygens = []\n", "ycands = []\n", "\n", - "for fi in dataset_def.val_filelist[:100]:\n", + "for fi in dataset_def.val_filelist[:2]:\n", " print(fi)\n", " X, ygen, ycand = dataset_def.prepare_data(fi)\n", "\n", @@ -102,8 +102,10 @@ "metadata": {}, "outputs": [], "source": [ - "cls_cand = np.argmax(ycand_val[\"cls\"], axis=-1)\n", - "cls_gen = np.argmax(ygen_val[\"cls\"], axis=-1)" + "ret = model(X_val[:1])\n", + "#model.set_trainable_classification()\n", + "model.load_weights(\"/home/joosep/particleflow/experiments/cms_20210828_144012_433706.joosep-desktop//weights/weights-03-28.697701.hdf5\")\n", + "ret = model.predict(X_val, batch_size=1, verbose=1)" ] }, { @@ -112,7 +114,18 @@ "metadata": {}, "outputs": [], "source": [ - "import sklearn" + "def get_bin_index(bs):\n", + " bin_index = []\n", + "\n", + " for ielem in range(6400):\n", + " if X_val[0, ielem, 0] != 0:\n", + " for ibin in range(bs.shape[0]):\n", + " if ielem in bs[ibin]:\n", + " bin_index.append(ibin)\n", + " break\n", + " else:\n", + " break\n", + " return bin_index" ] }, { @@ -121,7 +134,23 @@ "metadata": {}, "outputs": [], "source": [ - "cm = sklearn.metrics.confusion_matrix(cls_gen[X_val[:, :, 0]!=0], cls_cand[X_val[:, :, 0]!=0])" + "def plot_binning_in_layer(layer_name):\n", + " msk = X_val[0][:, 0] != 0\n", + " eta = X_val[0][msk, 2]\n", + " phi = X_val[0][msk, 3]\n", + " typ = X_val[0][msk, 0]\n", + " energy = X_val[0][msk, 4]\n", + "\n", + " evenly_spaced_interval = np.linspace(0, 1, ret[layer_name][\"bins\"].shape[1])\n", + " colorlist = [cm.Dark2(x) for x in evenly_spaced_interval]\n", + " bin_idx = get_bin_index(ret[layer_name][\"bins\"][0])\n", + "\n", + " plt.figure(figsize=(4,4))\n", + " plt.scatter(eta, phi, c=[colorlist[bi] for bi in bin_idx], marker=\".\", s=energy)\n", + " plt.xlabel(\"eta\")\n", + " plt.ylabel(\"phi\")\n", + " plt.title(\"Binning in {}\".format(layer_name))\n", + " plt.savefig(\"bins_{}.pdf\".format(layer_name))" ] }, { @@ -130,9 +159,7 @@ "metadata": {}, "outputs": [], "source": [ - "cls_id = 5\n", - "ngen = np.sum(cls_gen==cls_id, axis=1)\n", - "ncand = np.sum(cls_cand==cls_id, axis=1)" + "plot_binning_in_layer(\"cg_0\")" ] }, { @@ -141,7 +168,7 @@ "metadata": {}, "outputs": [], "source": [ - "plt.scatter(ngen, ncand)" + "plot_binning_in_layer(\"cg_1\")" ] }, { @@ -150,10 +177,7 @@ "metadata": {}, "outputs": [], "source": [ - "cls_id = 4\n", - "variable = \"sin_phi\"\n", - "gen_energy = ygen_val[variable][(cls_cand==cls_id) & (cls_gen==cls_id)][:, 0].numpy()\n", - "cand_energy = ycand_val[variable][(cls_cand==cls_id) & (cls_gen==cls_id)][:, 0].numpy()" + "plot_binning_in_layer(\"cg_2\")" ] }, { @@ -162,10 +186,7 @@ "metadata": {}, "outputs": [], "source": [ - "residual = gen_energy-cand_energy\n", - "plt.hist(residual, bins=100);\n", - "plt.xlabel(\"gen - PF\")\n", - "print(np.mean(residual), np.std(residual))" + "plot_binning_in_layer(\"cg_energy_0\")" ] }, { @@ 
-174,10 +195,7 @@ "metadata": {}, "outputs": [], "source": [ - "ret = model(X_val[:1])\n", - "#model.set_trainable_classification()\n", - "model.load_weights(\"/home/joosep/particleflow/experiments/cms_20210827_131712.joosep-desktop/weights/weights-07-477.625885.hdf5\")\n", - "ret = model.predict(X_val, batch_size=1, verbose=1)" + "plot_binning_in_layer(\"cg_energy_1\")" ] }, { @@ -186,8 +204,7 @@ "metadata": {}, "outputs": [], "source": [ - "x = X_val[0]\n", - "msk = x[:, 0] == 8" + "plot_binning_in_layer(\"cg_energy_2\")" ] }, { @@ -196,7 +213,17 @@ "metadata": {}, "outputs": [], "source": [ - "model.output_dec.classwise_energy_means" + "def plot_dms(dms):\n", + " fig = plt.figure(figsize=(4*4, 3*4))\n", + " for i in range(len(dms)):\n", + " ax = plt.subplot(4,4,i+1)\n", + " plt.axes(ax)\n", + " plt.imshow(dms[i], interpolation=\"none\", norm=matplotlib.colors.Normalize(vmin=0, vmax=1), cmap=\"Blues\")\n", + " plt.colorbar()\n", + " plt.title(\"bin {}\".format(i))\n", + " #plt.xlabel(\"elem index $i$\")\n", + " #plt.ylabel(\"elem index $j$\")\n", + " plt.tight_layout()" ] }, { @@ -205,7 +232,9 @@ "metadata": {}, "outputs": [], "source": [ - "model.output_dec.classwise_energy_stds" + "for layer in ['cg_0', 'cg_1', 'cg_2']:\n", + " dm_vals = ret[layer]['dm'].flatten()\n", + " plt.hist(dm_vals[dm_vals!=0], bins=np.linspace(0,1,100), density=True, alpha=0.8, lw=2)" ] }, { @@ -214,22 +243,23 @@ "metadata": {}, "outputs": [], "source": [ - "cls = np.argmax(ret[\"cls\"], axis=-1)\n", - "cls_true = np.argmax(ycand_val[\"cls\"], axis=-1)\n", - "energy = ret[\"energy\"]\n", - "eta = ret[\"eta\"]\n", - "energy_true = ycand_val[\"energy\"]\n", - "\n", - "msk = (cls==4) & (cls_true==4)" + "for layer in ['cg_energy_0', 'cg_energy_1', 'cg_energy_2']:\n", + " dm_vals = ret[layer]['dm'].flatten()\n", + " plt.hist(dm_vals[dm_vals!=0], bins=np.linspace(0,1,100), density=True, alpha=0.8, lw=2)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ - "plt.hist(energy[msk].flatten()-energy_true[msk].flatten(), bins=100);" + "dmn = ret['cg_0']['dm'][0, :, :, :, 0]\n", + "plot_dms(dmn)\n", + "plt.suptitle(\"Learned adjacency, cg_0\", y=1.01)\n", + "plt.savefig(\"dm_cg_0.pdf\")" ] }, { @@ -238,7 +268,10 @@ "metadata": {}, "outputs": [], "source": [ - "X_val[msk][:, 0]" + "dmn = ret['cg_1']['dm'][0, :, :, :, 0]\n", + "plot_dms(dmn)\n", + "plt.suptitle(\"Learned adjacency, cg_1\", y=1.01)\n", + "plt.savefig(\"dm_cg_1.pdf\")" ] }, { @@ -247,7 +280,10 @@ "metadata": {}, "outputs": [], "source": [ - "plt.scatter(eta[msk], energy[msk].flatten(), marker=\".\")" + "dmn = ret['cg_2']['dm'][0, :, :, :, 0]\n", + "plot_dms(dmn)\n", + "plt.suptitle(\"Learned adjacency, cg_2\", y=1.01)\n", + "plt.savefig(\"dm_cg_2.pdf\")" ] }, { @@ -256,18 +292,10 @@ "metadata": {}, "outputs": [], "source": [ - "def get_bin_index(bs):\n", - " bin_index = []\n", - "\n", - " for ielem in range(6400):\n", - " if X_val[0, ielem, 0] != 0:\n", - " for ibin in range(bs.shape[0]):\n", - " if ielem in bs[ibin]:\n", - " bin_index.append(ibin)\n", - " break\n", - " else:\n", - " break\n", - " return bin_index" + "dmn = ret['cg_energy_0']['dm'][0, :, :, :, 0]\n", + "plot_dms(dmn)\n", + "plt.suptitle(\"Learned adjacency, cg_energy_0\", y=1.01)\n", + "plt.savefig(\"dm_cg_energy_0.pdf\")" ] }, { @@ -276,7 +304,10 @@ "metadata": {}, "outputs": [], "source": [ - "preds.keys()" + "dmn = ret['cg_energy_1']['dm'][0, :, :, :, 0]\n", + "plot_dms(dmn)\n", + "plt.suptitle(\"Learned 
adjacency, cg_energy_1\", y=1.01)\n", + "plt.savefig(\"dm_cg_energy_1.pdf\")" ] }, { @@ -285,7 +316,10 @@ "metadata": {}, "outputs": [], "source": [ - "model.cg_id[0].name" + "dmn = ret['cg_energy_2']['dm'][0, :, :, :, 0]\n", + "plot_dms(dmn)\n", + "plt.suptitle(\"Learned adjacency, cg_energy_2\", y=1.01)\n", + "plt.savefig(\"dm_cg_energy_2.pdf\")" ] }, { @@ -294,7 +328,8 @@ "metadata": {}, "outputs": [], "source": [ - "preds.keys()" + "msk = X_val[0][:, 0]!=0\n", + "sel = ret['dec_output'][0][msk]" ] }, { @@ -303,7 +338,7 @@ "metadata": {}, "outputs": [], "source": [ - "dd = preds[\"dec_output_id\"][0, :, 50:].numpy().flatten()" + "plt.scatter(sel[:, 40], sel[:, 60], marker=\".\")" ] }, { @@ -312,7 +347,7 @@ "metadata": {}, "outputs": [], "source": [ - "plt.hist(dd, bins=100);" + "np.array(X_val[:1, :, 0]!=0, np.float32)" ] }, { @@ -321,9 +356,7 @@ "metadata": {}, "outputs": [], "source": [ - "plt.figure(figsize=(10,10))\n", - "plt.imshow(preds[\"dec_output_reg\"][0, :, 50:], cmap=\"Blues\")\n", - "plt.colorbar()" + "ret['dec_output_energy'].shape" ] }, { @@ -332,7 +365,13 @@ "metadata": {}, "outputs": [], "source": [ - "model.ffn_momentum[4].summary()" + "pred_debug1 = model.output_dec([\n", + " X_val,\n", + " ret['dec_output'],\n", + " ret['dec_output_energy'],\n", + " np.array(X_val[:, :, 0:1]!=0, np.float32)],\n", + " training=False\n", + ")" ] }, { @@ -341,22 +380,8 @@ "metadata": {}, "outputs": [], "source": [ - "msk = X_val[0][:, 0] != 0\n", - "eta = X_val[0][msk, 2]\n", - "phi = X_val[0][msk, 3]\n", - "typ = X_val[0][msk, 0]\n", - "energy = X_val[0][msk, 4]\n", - "\n", - "evenly_spaced_interval = np.linspace(0, 1, preds[\"combined_graph_layer\"][\"bins\"].shape[1])\n", - "colorlist = [cm.rainbow(x) for x in evenly_spaced_interval]\n", - "bin_idx = get_bin_index(preds[\"combined_graph_layer\"][\"bins\"][0].numpy())\n", - "\n", - "plt.figure(figsize=(4,4))\n", - "plt.scatter(eta, phi, c=[colorlist[bi] for bi in bin_idx], marker=\".\")\n", - "plt.xlabel(\"eta\")\n", - "plt.ylabel(\"phi\")\n", - "plt.title(\"Binning in classification layer 1\")\n", - "plt.savefig(\"bins_cls_layer1.pdf\")" + "true_id = np.argmax(ycand_val[\"cls\"], axis=-1)\n", + "pred_id1 = np.argmax(pred_debug1[\"cls\"], axis=-1)" ] }, { @@ -365,48 +390,17 @@ "metadata": {}, "outputs": [], "source": [ - "evenly_spaced_interval = np.linspace(0, 1, preds[\"combined_graph_layer_1\"][\"bins\"].shape[1])\n", - "colorlist = [cm.rainbow(x) for x in evenly_spaced_interval]\n", - "bin_idx = get_bin_index(preds[\"combined_graph_layer_1\"][\"bins\"][0].numpy())\n", - "\n", "plt.figure(figsize=(4,4))\n", - "plt.scatter(eta, phi, c=[colorlist[bi] for bi in bin_idx], marker=\".\")\n", - "plt.xlabel(\"eta\")\n", - "plt.ylabel(\"phi\")\n", - "plt.title(\"Binning in classification layer 2\")\n", - "plt.savefig(\"bins_cls_layer2.pdf\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bin_idx = get_bin_index(preds[\"combined_graph_layer_2\"][\"bins\"][0].numpy())\n", + "msk1 = (X_val[:, :, 0]!=0) & (true_id==2)\n", + "plt.scatter(\n", + " pred_debug1[\"energy\"][msk1][:, 0].numpy(),\n", + " ycand_val[\"energy\"][msk1][:, 0].numpy(),\n", + " marker=\".\", alpha=0.4\n", + ")\n", "\n", - "plt.figure(figsize=(4,4))\n", - "plt.scatter(eta, phi, c=[colorlist[bi] for bi in bin_idx], marker=\".\")\n", - "plt.xlabel(\"eta\")\n", - "plt.ylabel(\"phi\")\n", - "plt.title(\"Binning in regression layer 1\")\n", - "plt.savefig(\"bins_reg_layer1.pdf\")" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bin_idx = get_bin_index(preds[\"combined_graph_layer_3\"][\"bins\"][0].numpy())\n", + "#plt.plot([-1,1], [-1,1], color=\"black\")\n", "\n", - "plt.figure(figsize=(4,4))\n", - "plt.scatter(eta, phi, c=[colorlist[bi] for bi in bin_idx], marker=\".\")\n", - "plt.xlabel(\"eta\")\n", - "plt.ylabel(\"phi\")\n", - "plt.title(\"Binning in regression layer 1\")\n", - "plt.savefig(\"bins_reg_layer2.pdf\")" + "plt.plot([0,6], [0,6], color=\"black\")" ] }, { @@ -415,35 +409,33 @@ "metadata": {}, "outputs": [], "source": [ - "def plot_dms(dms):\n", - " fig = plt.figure(figsize=(4*4, 3*4))\n", - " for i in range(25):\n", - " ax = plt.subplot(5,5,i+1)\n", - " plt.axes(ax)\n", - " plt.imshow(dmn[i], interpolation=\"none\", norm=matplotlib.colors.Normalize(vmin=0, vmax=1), cmap=\"Blues\")\n", - " plt.colorbar()\n", - " plt.title(\"bin {}\".format(i))\n", - " #plt.xlabel(\"elem index $i$\")\n", - " #plt.ylabel(\"elem index $j$\")\n", - " plt.tight_layout()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dmnf = dmn.flatten()" + "model.cg[0].trainable = False\n", + "model.cg[1].trainable = False\n", + "model.cg[2].trainable = False\n", + "\n", + "# model.cg_energy[0].trainable = False\n", + "# model.cg_energy[1].trainable = False\n", + "# model.cg_energy[2].trainable = False\n", + "\n", + "model.output_dec.ffn_id.trainable = False\n", + "model.output_dec.ffn_charge.trainable = False\n", + "model.output_dec.ffn_phi.trainable = False\n", + "model.output_dec.ffn_eta.trainable = False\n", + "model.output_dec.ffn_pt.trainable = False\n", + "model.output_dec.ffn_energy.trainable = True\n", + "\n", + "model.output_dec.layernorm.trainable = False" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ - "plt.hist(dmnf[dmnf!=0], bins=100);" + "[w.name for w in model.trainable_weights]" ] }, { @@ -452,22 +444,7 @@ "metadata": {}, "outputs": [], "source": [ - "plt.imshow(dmn[1])\n", - "plt.colorbar()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "dmn = preds[\"combined_graph_layer\"][\"dm\"][0].numpy()\n", - "plot_dms(dmn)\n", - "plt.suptitle(\"Learned adjacency, classification layer 1\", y=1.01)\n", - "plt.savefig(\"dm_cls1.pdf\")" + "class_weights = tf.constant([0.0, 0.01, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0])" ] }, { @@ -478,10 +455,21 @@ }, "outputs": [], "source": [ - "dmn = preds[\"combined_graph_layer_1\"][\"dm\"][0].numpy()\n", - "plot_dms(dmn)\n", - "plt.suptitle(\"Learned adjacency, classification layer 2\", y=1.01)\n", - "plt.savefig(\"dm_cls2.pdf\")" + "loss = tf.keras.losses.Huber()\n", + "optimizer = tf.keras.optimizers.Adam(lr=1e-4)\n", + "for epoch in range(100):\n", + " with tf.GradientTape() as tape:\n", + " y_pred = model(X_val[:2], training=True)\n", + " pred_cls = tf.argmax(y_pred[\"cls\"], axis=-1)\n", + " true_cls = tf.argmax(ycand_val[\"cls\"][:2], axis=-1)\n", + " msk_loss = tf.expand_dims(tf.cast((pred_cls==true_cls) & (true_cls!=0), tf.float32), axis=-1)\n", + " sample_weights = tf.keras.activations.softmax(ycand_val[\"cls\"][:2]*100)*class_weights\n", + " sample_weights = tf.reduce_sum(class_weights, axis=-1, keepdims=True)\n", + " loss_val = loss(ycand_val[\"energy\"][:2]*msk_loss, y_pred[\"energy\"][:2]*msk_loss, sample_weight=sample_weights)\n", + " print(loss_val)\n", + 
" trainable_vars = model.trainable_variables\n", + " gradients = tape.gradient(loss_val, trainable_vars)\n", + " optimizer.apply_gradients(zip(gradients, trainable_vars))\n" ] }, { @@ -490,22 +478,10 @@ "metadata": {}, "outputs": [], "source": [ - "dmn = preds[\"combined_graph_layer_2\"][\"dm\"][0].numpy()\n", - "plot_dms(dmn)\n", - "plt.suptitle(\"Learned adjacency, regression layer 1\", y=1.01)\n", - "plt.savefig(\"dm_reg1.pdf\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dmn = preds[\"combined_graph_layer_3\"][\"dm\"][0].numpy()\n", - "plot_dms(dmn)\n", - "plt.suptitle(\"Learned adjacency, regression layer 2\", y=1.01)\n", - "plt.savefig(\"dm_reg2.pdf\")" + "y_pred = model(X_val[2:6], training=False)\n", + "\n", + "true_id = tf.argmax(ycand_val[\"cls\"][2:6], axis=-1)\n", + "pred_id = tf.argmax(y_pred[\"cls\"], axis=-1)" ] }, { @@ -514,8 +490,7 @@ "metadata": {}, "outputs": [], "source": [ - "arr = tf.random.normal((2,160,40,40,32))\n", - "msk = tf.cast(tf.random.normal((2,160,40,))>0.5, tf.float32)" + "sklearn.metrics.confusion_matrix(true_id.numpy().flatten(), pred_id.numpy().flatten())" ] }, { @@ -524,7 +499,17 @@ "metadata": {}, "outputs": [], "source": [ - "plt.imshow(arr[0, 0, :, :, 0])" + "plt.figure(figsize=(4,4))\n", + "cls = 3\n", + "print(np.sum((true_id==cls) & (pred_id==cls)))\n", + "plt.scatter(\n", + " y_pred[\"energy\"][(true_id==cls) & (pred_id==cls)],\n", + " ycand_val[\"energy\"][2:6][(true_id==cls) & (pred_id==cls)],\n", + " marker=\".\"\n", + ")\n", + "plt.plot([0,6], [0,6], color=\"black\")\n", + "plt.xlim(0,6)\n", + "plt.ylim(0,6)" ] }, { @@ -533,7 +518,7 @@ "metadata": {}, "outputs": [], "source": [ - "plt.imshow(tf.einsum(\"abijk,abi->abijk\", arr, msk)[0,0, :, :, 0])" + "vals = y_pred[\"energy\"][(true_id!=0)] - ycand_val[\"energy\"][2:6][(true_id!=0)]" ] }, { @@ -542,7 +527,8 @@ "metadata": {}, "outputs": [], "source": [ - "plt.imshow(tf.einsum(\"abijk,abj->abijk\", arr, msk)[0,0,:, :, 0])" + "plt.hist(vals.numpy().flatten(), bins=np.linspace(-2,2,100));\n", + "plt.yscale(\"log\")" ] }, { diff --git a/parameters/cms.yaml b/parameters/cms.yaml index aa9eb7982..e56553b7a 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -80,40 +80,42 @@ sample_weights: parameters: model: gnn_dense input_encoding: cms - activation: gelu - layernorm: no - hidden_dim: 256 - bin_size: 320 - distance_dim: 128 - dropout: 0.0 - graph_kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 - num_graph_layers: 3 - num_graph_layers_energy: 3 - node_message: - type: GHConvDense - output_dim: 256 + combined_graph_layer: + bin_size: 640 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + dropout: 0.2 + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 2 + node_message: + type: GHConvDense + output_dim: 512 + activation: gelu + normalize_degrees: yes + hidden_dim: 256 activation: gelu - normalize_degrees: yes - num_node_messages: 1 + num_graph_layers_common: 1 + num_graph_layers_energy: 1 output_decoding: activation: gelu regression_use_classification: yes - dropout: 0.0 + dropout: 0.2 pt_skip_gate: yes eta_skip_gate: yes phi_skip_gate: yes - energy_skip_gate: no + energy_skip_gate: yes id_dim_decrease: yes charge_dim_decrease: yes pt_dim_decrease: yes eta_dim_decrease: yes phi_dim_decrease: yes - energy_dim_decrease: yes + energy_dim_decrease: no id_hidden_dim: 256 charge_hidden_dim: 256 @@ -122,13 +124,13 @@ parameters: 
phi_hidden_dim: 256 energy_hidden_dim: 1024 - id_num_layers: 4 + id_num_layers: 3 charge_num_layers: 2 pt_num_layers: 3 eta_num_layers: 3 phi_num_layers: 3 energy_num_layers: 3 - layernorm: no + layernorm: yes mask_reg_cls0: no skip_connection: yes diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index c19cfc191..dcc2df24d 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -67,25 +67,27 @@ sample_weights: parameters: model: gnn_dense - input_encoding: default - activation: gelu - layernorm: yes - hidden_dim: 256 - bin_size: 320 - distance_dim: 128 - dropout: 0.0 - graph_kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 - num_graph_layers: 5 - num_graph_layers_energy: 1 - node_message: - type: GHConvDense - output_dim: 256 - activation: elu - normalize_degrees: yes - num_node_messages: 1 + input_encoding: cms + combined_graph_layer: + bin_size: 640 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + num_node_messages: 1 + dropout: 0.0 + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + node_message: + type: GHConvDense + output_dim: 256 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 3 + num_graph_layers_energy: 3 output_decoding: activation: gelu regression_use_classification: yes From 115419af90791ab30df672a12d9d1b009ad832a6 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sun, 29 Aug 2021 09:24:53 +0300 Subject: [PATCH 130/157] up --- mlpf/tfmodel/model.py | 14 +++++++------- mlpf/tfmodel/model_setup.py | 13 +++++++++++++ parameters/cms.yaml | 34 +++++++++++++++++----------------- 3 files changed, 37 insertions(+), 24 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index d183a25a7..4130b3612 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -535,18 +535,18 @@ def call(self, args, training=False): if self.eta_skip_gate: eta_gate = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) - pred_eta = orig_eta + eta_gate*pred_eta_corr[:, :, 1:2] + pred_eta = orig_eta + pred_eta_corr[:, :, 1:2] else: - pred_eta = orig_eta*pred_eta_corr[:, :, 0:1] + eta_gate*pred_eta_corr[:, :, 1:2] + pred_eta = orig_eta*pred_eta_corr[:, :, 0:1] + pred_eta_corr[:, :, 1:2] if self.phi_skip_gate: sin_phi_gate = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 0:1]) cos_phi_gate = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 2:3]) - pred_sin_phi = orig_sin_phi + sin_phi_gate*pred_phi_corr[:, :, 1:2] - pred_cos_phi = orig_cos_phi + cos_phi_gate*pred_phi_corr[:, :, 3:4] + pred_sin_phi = orig_sin_phi + pred_phi_corr[:, :, 1:2] + pred_cos_phi = orig_cos_phi + pred_phi_corr[:, :, 3:4] else: - pred_sin_phi = orig_sin_phi*pred_phi_corr[:, :, 0:1] + sin_phi_gate*pred_phi_corr[:, :, 1:2] - pred_cos_phi = orig_cos_phi*pred_phi_corr[:, :, 2:3] + cos_phi_gate*pred_phi_corr[:, :, 3:4] + pred_sin_phi = orig_sin_phi*pred_phi_corr[:, :, 0:1] + pred_phi_corr[:, :, 1:2] + pred_cos_phi = orig_cos_phi*pred_phi_corr[:, :, 2:3] + pred_phi_corr[:, :, 3:4] if self.regression_use_classification: X_encoded_energy = tf.concat([X_encoded_energy, tf.stop_gradient(out_id_logits)], axis=-1) @@ -575,7 +575,7 @@ def call(self, args, training=False): pt_gate = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) pred_log_pt = orig_log_pt + pt_gate*pred_pt_corr[:, :, 1:2] else: - pred_log_pt = orig_log_pt*pred_pt_corr[:, :, 0:1] + pt_gate*pred_pt_corr[:, :, 1:2] + pred_log_pt = orig_log_pt*pred_pt_corr[:, :, 0:1] + pred_pt_corr[:, :, 1:2] if 
self.mask_reg_cls0: msk_output = tf.expand_dims(tf.cast(tf.argmax(out_id_hard_softmax, axis=-1)!=0, tf.float32), axis=-1) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index f8277d6c9..fa33ac726 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -532,6 +532,19 @@ def configure_model_weights(model, trainable_layers): if trainable_layers == "all": model.trainable = True + elif trainable_layers == "regression": + for cg in model.cg: + cg.trainable = False + for cg in model.cg_energy: + cg.trainable = True + + model.output_dec.ffn_id.trainable = False + model.output_dec.ffn_charge.trainable = False + model.output_dec.ffn_phi.trainable = True + model.output_dec.ffn_eta.trainable = True + model.output_dec.ffn_pt.trainable = True + model.output_dec.ffn_energy.trainable = True + model.output_dec.layernorm.trainable = False else: if isinstance(trainable_layers, str): trainable_layers = [trainable_layers] diff --git a/parameters/cms.yaml b/parameters/cms.yaml index e56553b7a..75c963b51 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -29,7 +29,7 @@ dataset: eta_loss_coef: 100.0 sin_phi_loss_coef: 100.0 cos_phi_loss_coef: 100.0 - energy_loss_coef: 100.0 + energy_loss_coef: 1.0 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 @@ -57,11 +57,11 @@ setup: train: yes weights: weights_config: - lr: 1e-3 + lr: 1e-4 batch_size: 4 num_events_train: 80000 num_events_test: 10000 - num_epochs: 40 + num_epochs: 20 num_val_files: 10 dtype: float32 trainable: @@ -85,30 +85,30 @@ parameters: max_num_bins: 100 distance_dim: 128 layernorm: no - dropout: 0.2 + dropout: 0.0 kernel: type: NodePairGaussianKernel dist_mult: 0.1 clip_value_low: 0.0 - num_node_messages: 2 + num_node_messages: 1 node_message: type: GHConvDense - output_dim: 512 + output_dim: 256 activation: gelu normalize_degrees: yes hidden_dim: 256 activation: gelu - num_graph_layers_common: 1 - num_graph_layers_energy: 1 + num_graph_layers_common: 2 + num_graph_layers_energy: 2 output_decoding: activation: gelu regression_use_classification: yes - dropout: 0.2 + dropout: 0.0 - pt_skip_gate: yes - eta_skip_gate: yes - phi_skip_gate: yes - energy_skip_gate: yes + pt_skip_gate: no + eta_skip_gate: no + phi_skip_gate: no + energy_skip_gate: no id_dim_decrease: yes charge_dim_decrease: yes @@ -117,7 +117,7 @@ parameters: phi_dim_decrease: yes energy_dim_decrease: no - id_hidden_dim: 256 + id_hidden_dim: 1024 charge_hidden_dim: 256 pt_hidden_dim: 256 eta_hidden_dim: 256 @@ -130,7 +130,7 @@ parameters: eta_num_layers: 3 phi_num_layers: 3 energy_num_layers: 3 - layernorm: yes + layernorm: no mask_reg_cls0: no skip_connection: yes @@ -141,6 +141,6 @@ timing: num_iter: 3 exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 + decay_steps: 1000 + decay_rate: 0.98 staircase: yes From 53cec231d495d07bef71730f9be2b530f5a7155a Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sun, 29 Aug 2021 19:41:07 +0300 Subject: [PATCH 131/157] option to split energy regression classwise --- mlpf/tfmodel/model.py | 98 ++++++++++++++++++++++++------------- mlpf/tfmodel/model_setup.py | 8 +-- notebooks/pfnet-debug.ipynb | 25 ++++++++++ parameters/cms.yaml | 37 +++++++------- parameters/delphes.yaml | 4 +- 5 files changed, 112 insertions(+), 60 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 4130b3612..3f086bb87 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -428,6 +428,7 
@@ def __init__(self, layernorm=False, mask_reg_cls0=True, + classwise_split_energy=False, **kwargs): super(OutputDecoding, self).__init__(**kwargs) @@ -440,6 +441,7 @@ def __init__(self, self.eta_skip_gate = eta_skip_gate self.phi_skip_gate = phi_skip_gate self.energy_skip_gate = energy_skip_gate + self.classwise_split_energy = classwise_split_energy self.mask_reg_cls0 = mask_reg_cls0 @@ -484,17 +486,21 @@ def __init__(self, dropout=dropout ) - self.ffn_energy = point_wise_feed_forward_network( - 4, energy_hidden_dim, "ffn_energy", - dtype=tf.dtypes.float32, num_layers=energy_num_layers, activation=activation, dim_decrease=energy_dim_decrease, - dropout=dropout - ) - - if not self.energy_skip_gate: - self.classwise_energy_means = self.add_weight(shape=(num_output_classes, ), name="classwise_energy_means", - initializer=tf.keras.initializers.RandomNormal(mean=0, stddev=0.1), trainable=True) - self.classwise_energy_stds = self.add_weight(shape=(num_output_classes, ), name="classwise_energy_stds", - initializer=tf.keras.initializers.RandomNormal(mean=1, stddev=0.1), trainable=True) + num_energy_out = 1 + if self.energy_skip_gate: + num_energy_out = 2 + + if self.classwise_split_energy: + self.ffn_energy = [point_wise_feed_forward_network( + num_energy_out, energy_hidden_dim, "ffn_energy_cls{}".format(icls), + dtype=tf.dtypes.float32, num_layers=energy_num_layers, activation=activation, dim_decrease=energy_dim_decrease, + dropout=dropout + ) for icls in range(1, num_output_classes)] + else: + self.ffn_energy = point_wise_feed_forward_network( + num_energy_out, energy_hidden_dim, "ffn_energy", + dtype=tf.dtypes.float32, num_layers=energy_num_layers, activation=activation, dim_decrease=energy_dim_decrease, + dropout=dropout) """ X_input: (n_batch, n_elements, n_input_features) raw node input features @@ -548,21 +554,24 @@ def call(self, args, training=False): pred_sin_phi = orig_sin_phi*pred_phi_corr[:, :, 0:1] + pred_phi_corr[:, :, 1:2] pred_cos_phi = orig_cos_phi*pred_phi_corr[:, :, 2:3] + pred_phi_corr[:, :, 3:4] + X_encoded_energy = tf.concat([X_encoded, X_encoded_energy], axis=-1) if self.regression_use_classification: X_encoded_energy = tf.concat([X_encoded_energy, tf.stop_gradient(out_id_logits)], axis=-1) - pred_energy_corr = self.ffn_energy(X_encoded_energy, training=training)*msk_input + if self.classwise_split_energy: + pred_energy_corr = tf.stack([ffn(X_encoded_energy, training=training)for ffn in self.ffn_energy], axis=-1) + pred_log_energy0 = tf.reduce_sum(out_id_hard_softmax[:, :, 1:]*pred_energy_corr[:, :, 0, :], axis=-1, keepdims=True)*msk_input + else: + pred_log_energy0 = self.ffn_energy(X_encoded_energy, training=training)*msk_input + pred_pt_corr = self.ffn_pt(X_encoded_energy, training=training)*msk_input if self.energy_skip_gate: - energy_gate = tf.keras.activations.sigmoid(pred_energy_corr[:, :, 0:1]) - energy_corr = energy_gate*pred_energy_corr[:, :, 1:2] + energy_gate = tf.keras.activations.sigmoid(pred_log_energy0) + energy_corr = energy_gate*pred_log_energy1 pred_log_energy = orig_log_energy + energy_corr else: - #pred_log_energy = orig_log_energy*pred_energy_corr[:, :, 0:1] + pred_energy_corr[:, :, 1:2] - pred_log_energy = pred_energy_corr[:, :, 0:1] + pred_energy_corr[:, :, 1:2]*orig_log_energy + pred_energy_corr[:, :, 2:3]*orig_log_energy*orig_log_energy + pred_energy_corr[:, :, 3:4]*tf.math.sqrt(orig_log_energy) - pred_log_energy = pred_log_energy - tf.reduce_sum(out_id_hard_softmax*self.classwise_energy_means, axis=-1, keepdims=True) - pred_log_energy = 
pred_log_energy / tf.reduce_sum(out_id_hard_softmax*self.classwise_energy_stds, axis=-1, keepdims=True) + pred_log_energy = pred_log_energy0 #prediction is pred_log_energy=log(energy + 1.0), energy=exp(pred_log_energy) - 1.0 pred_energy = tf.math.exp(tf.clip_by_value(pred_log_energy, -6, 6)) - 1.0 @@ -598,20 +607,18 @@ def call(self, args, training=False): return ret - def set_trainable_named(self, layer_names): - self.trainable = True - - for layer in self.layers: - layer.trainable = False + def set_trainable_regression(self): + self.ffn_id.trainable = False + self.ffn_charge.trainable = False + self.ffn_phi.trainable = True + self.ffn_eta.trainable = True + self.ffn_pt.trainable = True - layer_names = [l.name for l in self.layers] - for layer in layer_names: - if layer in layer_names: - #it's a layer - self.get_layer(layer).trainable = True - else: - #it's a weight - getattr(self, layer).trainable = True + if self.classwise_split_energy: + for layer in self.ffn_energy: + layer.trainable = True + else: + self.ffn_energy.trainable = True class CombinedGraphLayer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): @@ -726,6 +733,8 @@ def call(self, inputs, training=False): enc_cg = enc encs = [] + if self.skip_connection: + encs.append(enc) for cg in self.cg: enc_all = cg(enc_cg, msk, training) enc_cg = enc_all["enc"] @@ -734,8 +743,6 @@ def call(self, inputs, training=False): encs.append(enc_cg) dec_input = [] - if self.skip_connection: - dec_input.append(enc) dec_input += encs dec_output = tf.concat(dec_input, axis=-1)*msk_input if self.debug: @@ -743,6 +750,8 @@ def call(self, inputs, training=False): enc_cg = enc encs_energy = [] + if self.skip_connection: + encs_energy.append(enc) for cg in self.cg_energy: enc_all = cg(enc_cg, msk, training) enc_cg = enc_all["enc"] @@ -803,6 +812,29 @@ def train_step(self, data): # Return a dict mapping metric names to current value return {m.name: m.result() for m in self.metrics} + def test_step(self, data): + # Unpack the data + x, y, sample_weights = data + # Compute predictions + y_pred = self(x, training=False) + + pred_cls = tf.argmax(y_pred["cls"], axis=-1) + true_cls = tf.argmax(y["cls"], axis=-1) + msk_loss = tf.cast((pred_cls==true_cls) & (true_cls!=0), tf.float32) + sample_weights["energy"] *= msk_loss + sample_weights["pt"] *= msk_loss + sample_weights["eta"] *= msk_loss + sample_weights["sin_phi"] *= msk_loss + sample_weights["cos_phi"] *= msk_loss + + # Updates the metrics tracking the loss + self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) + # Update the metrics. + self.compiled_metrics.update_state(y, y_pred) + # Return a dict mapping metric names to current value. + # Note that it will include the loss (tracked in self.metrics). 
+ return {m.name: m.result() for m in self.metrics} + class DummyNet(tf.keras.Model): def __init__(self, num_input_classes=8, diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index fa33ac726..014d59432 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -538,13 +538,7 @@ def configure_model_weights(model, trainable_layers): for cg in model.cg_energy: cg.trainable = True - model.output_dec.ffn_id.trainable = False - model.output_dec.ffn_charge.trainable = False - model.output_dec.ffn_phi.trainable = True - model.output_dec.ffn_eta.trainable = True - model.output_dec.ffn_pt.trainable = True - model.output_dec.ffn_energy.trainable = True - model.output_dec.layernorm.trainable = False + model.output_dec.set_trainable_regression() else: if isinstance(trainable_layers, str): trainable_layers = [trainable_layers] diff --git a/notebooks/pfnet-debug.ipynb b/notebooks/pfnet-debug.ipynb index abecfb13f..dacfdb213 100644 --- a/notebooks/pfnet-debug.ipynb +++ b/notebooks/pfnet-debug.ipynb @@ -96,6 +96,31 @@ "X_val, ygen_val, _ = dataset_transform(X_val, ygen_val, None)\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.std(ycand_val[\"energy\"][np.argmax(ycand_val[\"cls\"], axis=-1)==2].numpy().flatten())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist((ycand_val[\"energy\"][np.argmax(ycand_val[\"cls\"], axis=-1)==2].numpy().flatten()-1/59)/1.3, bins=100);" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 75c963b51..96f5494ca 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -24,22 +24,20 @@ dataset: #(pt, eta, sin phi, cos phi, E) num_momentum_outputs: 5 classification_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 100.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.1 eta_loss_coef: 100.0 - sin_phi_loss_coef: 100.0 - cos_phi_loss_coef: 100.0 - energy_loss_coef: 1.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.1 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* energy_loss: - type: Huber - delta: 1.0 + type: MeanSquaredError pt_loss: - type: Huber - delta: 1.0 + type: MeanSquaredError sin_phi_loss: type: Huber delta: 0.1 @@ -51,14 +49,14 @@ dataset: delta: 0.1 tensorflow: - eager: no + eager: yes setup: train: yes weights: weights_config: lr: 1e-4 - batch_size: 4 + batch_size: 2 num_events_train: 80000 num_events_test: 10000 num_epochs: 20 @@ -81,7 +79,7 @@ parameters: model: gnn_dense input_encoding: cms combined_graph_layer: - bin_size: 640 + bin_size: 320 max_num_bins: 100 distance_dim: 128 layernorm: no @@ -98,16 +96,16 @@ parameters: normalize_degrees: yes hidden_dim: 256 activation: gelu - num_graph_layers_common: 2 - num_graph_layers_energy: 2 + num_graph_layers_common: 3 + num_graph_layers_energy: 3 output_decoding: activation: gelu regression_use_classification: yes dropout: 0.0 pt_skip_gate: no - eta_skip_gate: no - phi_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes energy_skip_gate: no id_dim_decrease: yes @@ -115,23 +113,24 @@ parameters: pt_dim_decrease: yes eta_dim_decrease: yes phi_dim_decrease: yes - 
energy_dim_decrease: no + energy_dim_decrease: yes id_hidden_dim: 1024 charge_hidden_dim: 256 pt_hidden_dim: 256 eta_hidden_dim: 256 phi_hidden_dim: 256 - energy_hidden_dim: 1024 + energy_hidden_dim: 512 id_num_layers: 3 charge_num_layers: 2 pt_num_layers: 3 eta_num_layers: 3 phi_num_layers: 3 - energy_num_layers: 3 + energy_num_layers: 4 layernorm: no mask_reg_cls0: no + classwise_split_energy: no skip_connection: yes debug: no diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index dcc2df24d..ac359d2ca 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -67,7 +67,7 @@ sample_weights: parameters: model: gnn_dense - input_encoding: cms + input_encoding: default combined_graph_layer: bin_size: 640 max_num_bins: 100 @@ -120,6 +120,8 @@ parameters: energy_num_layers: 3 layernorm: yes mask_reg_cls0: yes + classwise_split_energy: no + skip_connection: yes debug: no From 47cd7a299a1de3837269185ab23c3c8fcbc08567 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sun, 29 Aug 2021 19:49:00 +0300 Subject: [PATCH 132/157] fix --- mlpf/tfmodel/utils.py | 9 --------- parameters/cms.yaml | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 056c05e9f..040792d76 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -196,15 +196,6 @@ def func(X, y, w): return func -def classwise_energy_normalization(X,y,w): - mean_energies = tf.constant([1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.float32) - - energy_sub = y["cls"]*mean_energies - - import pdb;pdb.set_trace() - - return X,y,w - def get_dataset_def(config): cds = config["dataset"] diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 96f5494ca..67d2c3890 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -49,7 +49,7 @@ dataset: delta: 0.1 tensorflow: - eager: yes + eager: no setup: train: yes From d03a64d1968e898057d17db53d69b9350e661307 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Sun, 29 Aug 2021 19:51:35 +0300 Subject: [PATCH 133/157] fix --- mlpf/pipeline.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 34c6411b0..82b5aa0d5 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -49,8 +49,7 @@ get_loss_dict, parse_config, get_best_checkpoint, - delete_all_but_best_checkpoint, - classwise_energy_normalization + delete_all_but_best_checkpoint ) from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler @@ -141,10 +140,6 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, dataset_def = get_dataset_def(config) ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) - # for X, y, w in ds_train_r: - # classwise_energy_normalization(X,y,w) - # break - #FIXME: split up training/test and validation dataset and parameters dataset_def.padded_num_elem_size = 6400 From 6e25bb2fb18e09cb1144c7700014ee1d281def0e Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Mon, 30 Aug 2021 18:16:11 +0300 Subject: [PATCH 134/157] remove log --- mlpf/pipeline.py | 7 +- mlpf/tfmodel/model.py | 245 +++++++++++---------- mlpf/tfmodel/model_setup.py | 93 ++++---- mlpf/tfmodel/utils.py | 8 +- notebooks/cms-mlpf.ipynb | 79 +++++-- notebooks/delphes-tf-mlpf-quickstart.ipynb | 28 +++ notebooks/pfnet-debug.ipynb | 55 ++++- parameters/cms-dev.yaml | 3 +- parameters/cms-gen.yaml | 143 ------------ parameters/cms.yaml | 34 ++- parameters/delphes.yaml | 4 +- 11 files changed, 352 insertions(+), 347 
deletions(-) delete mode 100644 parameters/cms-gen.yaml diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 82b5aa0d5..5e2c2ed38 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -190,6 +190,10 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, configure_model_weights(model, config["setup"]["trainable"]) model(tf.cast(X_val[:1], model_dtype)) + print("trainable weights") + for w in model.trainable_weights: + print(w.name) + loss_dict, loss_weights = get_loss_dict(config) model.compile( loss=loss_dict, @@ -214,7 +218,8 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, if config["dataset"]["target_particles"] == "cand": validation_particles = ycand_val elif config["dataset"]["target_particles"] == "gen": - validation_particles = ycand_val + validation_particles = ygen_val + callbacks = prepare_callbacks( model, outdir, diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 3f086bb87..25cc57a04 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -34,7 +34,7 @@ def pairwise_gaussian_dist(A, B): D = tf.sqrt(tf.maximum(na - 2*tf.matmul(A, B, False, True) + nb, 1e-6)) return D -def pairwise_learnable_dist(A, B, ffn): +def pairwise_learnable_dist(A, B, ffn, training=False): shp = tf.shape(A) #stack node feature vectors of src[i], dst[j] into a matrix res[i,j] = (src[i], dst[j]) @@ -47,7 +47,7 @@ def pairwise_learnable_dist(A, B, ffn): ) #(batch, bin, elem, elem, feat) #run a feedforward net on (src, dst) -> 1 - res_transformed = ffn(res) + res_transformed = ffn(res, training=training) return res_transformed @@ -297,7 +297,7 @@ def __init__(self, clip_value_low=0.0, dist_mult=0.1, **kwargs): returns: (n_batch, n_bins, n_points, n_points, 1) message matrix """ - def call(self, x_msg_binned): + def call(self, x_msg_binned, training=False): dm = tf.expand_dims(pairwise_gaussian_dist(x_msg_binned, x_msg_binned), axis=-1) dm = tf.exp(-self.dist_mult*dm) dm = tf.clip_by_value(dm, self.clip_value_low, 1) @@ -325,8 +325,8 @@ def __init__(self, output_dim=32, hidden_dim=32, num_layers=2, activation="elu", returns: (n_batch, n_bins, n_points, n_points, output_dim) message matrix """ - def call(self, x_msg_binned): - dm = pairwise_learnable_dist(x_msg_binned, x_msg_binned, self.ffn_kernel) + def call(self, x_msg_binned, training=False): + dm = pairwise_learnable_dist(x_msg_binned, x_msg_binned, self.ffn_kernel, training=training) dm = self.activation(dm) return dm @@ -363,7 +363,7 @@ def build(self, input_shape): x_msg: (n_batch, n_points, n_msg_features) x_node: (n_batch, n_points, n_node_features) """ - def call(self, x_msg, x_node, msk): + def call(self, x_msg, x_node, msk, training=False): msk_f = tf.expand_dims(tf.cast(msk, x_msg.dtype), -1) shp = tf.shape(x_msg) @@ -384,7 +384,7 @@ def call(self, x_msg, x_node, msk): msk_f_binned = tf.gather(msk_f, bins_split, batch_dims=1) #Run the node-to-node kernel (distance computation / graph building / attention) - dm = self.kernel(x_msg_binned) + dm = self.kernel(x_msg_binned, training=training) #remove the masked points row-wise and column-wise dm = tf.einsum("abijk,abi->abijk", dm, tf.squeeze(msk_f_binned, axis=-1)) @@ -428,7 +428,6 @@ def __init__(self, layernorm=False, mask_reg_cls0=True, - classwise_split_energy=False, **kwargs): super(OutputDecoding, self).__init__(**kwargs) @@ -440,8 +439,6 @@ def __init__(self, self.pt_skip_gate = pt_skip_gate self.eta_skip_gate = eta_skip_gate self.phi_skip_gate = phi_skip_gate - self.energy_skip_gate = energy_skip_gate - 
self.classwise_split_energy = classwise_split_energy self.mask_reg_cls0 = mask_reg_cls0 @@ -486,21 +483,15 @@ def __init__(self, dropout=dropout ) - num_energy_out = 1 - if self.energy_skip_gate: - num_energy_out = 2 + self.ffn_energy = point_wise_feed_forward_network( + num_output_classes, energy_hidden_dim, "ffn_energy", + dtype=tf.dtypes.float32, num_layers=energy_num_layers, activation=activation, dim_decrease=energy_dim_decrease, + dropout=dropout) - if self.classwise_split_energy: - self.ffn_energy = [point_wise_feed_forward_network( - num_energy_out, energy_hidden_dim, "ffn_energy_cls{}".format(icls), - dtype=tf.dtypes.float32, num_layers=energy_num_layers, activation=activation, dim_decrease=energy_dim_decrease, - dropout=dropout - ) for icls in range(1, num_output_classes)] - else: - self.ffn_energy = point_wise_feed_forward_network( - num_energy_out, energy_hidden_dim, "ffn_energy", - dtype=tf.dtypes.float32, num_layers=energy_num_layers, activation=activation, dim_decrease=energy_dim_decrease, - dropout=dropout) + self.ffn_energy_classwise = point_wise_feed_forward_network( + 1, energy_hidden_dim, "ffn_energy_classwise_shift", + dtype=tf.dtypes.float32, num_layers=energy_num_layers, activation=activation, dim_decrease=energy_dim_decrease, + dropout=dropout) """ X_input: (n_batch, n_elements, n_input_features) raw node input features @@ -516,8 +507,8 @@ def call(self, args, training=False): out_id_logits = self.ffn_id(X_encoded, training=training)*msk_input - out_id_softmax = tf.clip_by_value(tf.nn.softmax(out_id_logits, axis=-1), 0, 1) - out_id_hard_softmax = tf.clip_by_value(tf.stop_gradient(tf.nn.softmax(100*out_id_logits, axis=-1)), 0, 1) + out_id_softmax = tf.nn.softmax(out_id_logits, axis=-1) + out_id_hard_softmax = tf.stop_gradient(tf.nn.softmax(100*out_id_logits, axis=-1)) out_charge = self.ffn_charge(X_encoded, training=training)*msk_input orig_eta = X_input[:, :, 2:3] @@ -527,11 +518,11 @@ def call(self, args, training=False): if self.schema == "cms": orig_sin_phi = tf.math.sin(X_input[:, :, 3:4])*msk_input orig_cos_phi = tf.math.cos(X_input[:, :, 3:4])*msk_input - orig_log_energy = tf.math.log(X_input[:, :, 4:5] + 1.0)*msk_input + orig_energy = X_input[:, :, 4:5]*msk_input elif self.schema == "delphes": orig_sin_phi = X_input[:, :, 3:4]*msk_input orig_cos_phi = X_input[:, :, 4:5]*msk_input - orig_log_energy = tf.math.log(X_input[:, :, 5:6] + 1.0)*msk_input + orig_energy = X_input[:, :, 5:6]*msk_input if self.regression_use_classification: X_encoded = tf.concat([X_encoded, tf.stop_gradient(out_id_logits)], axis=-1) @@ -558,51 +549,47 @@ def call(self, args, training=False): if self.regression_use_classification: X_encoded_energy = tf.concat([X_encoded_energy, tf.stop_gradient(out_id_logits)], axis=-1) - if self.classwise_split_energy: - pred_energy_corr = tf.stack([ffn(X_encoded_energy, training=training)for ffn in self.ffn_energy], axis=-1) - pred_log_energy0 = tf.reduce_sum(out_id_hard_softmax[:, :, 1:]*pred_energy_corr[:, :, 0, :], axis=-1, keepdims=True)*msk_input - else: - pred_log_energy0 = self.ffn_energy(X_encoded_energy, training=training)*msk_input + pred_energy_corr = self.ffn_energy(X_encoded_energy, training=training)*msk_input + pred_energy = tf.reduce_sum(out_id_hard_softmax*pred_energy_corr, axis=-1, keepdims=True) - pred_pt_corr = self.ffn_pt(X_encoded_energy, training=training)*msk_input + pred_energy += tf.reduce_sum( + out_id_hard_softmax*self.ffn_energy_classwise( + tf.concat([orig_energy, tf.stop_gradient(out_id_logits)], axis=-1), 
training=training), + axis=-1, keepdims=True) - if self.energy_skip_gate: - energy_gate = tf.keras.activations.sigmoid(pred_log_energy0) - energy_corr = energy_gate*pred_log_energy1 - pred_log_energy = orig_log_energy + energy_corr - else: - pred_log_energy = pred_log_energy0 + pred_energy = tf.math.exp(tf.clip_by_value(pred_energy, -3, 7)) #prediction is pred_log_energy=log(energy + 1.0), energy=exp(pred_log_energy) - 1.0 - pred_energy = tf.math.exp(tf.clip_by_value(pred_log_energy, -6, 6)) - 1.0 + #pred_energy = tf.math.exp(tf.clip_by_value(pred_log_energy, -6, 6)) - 1.0 #compute pt=E/cosh(eta) orig_pt = tf.stop_gradient(pred_energy/tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8))) - orig_log_pt = tf.math.log(orig_pt + 1.0) + #orig_log_pt = tf.math.log(orig_pt + 1.0) + pred_pt_corr = self.ffn_pt(X_encoded_energy, training=training)*msk_input if self.pt_skip_gate: pt_gate = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) - pred_log_pt = orig_log_pt + pt_gate*pred_pt_corr[:, :, 1:2] + pred_pt = orig_pt + pt_gate*pred_pt_corr[:, :, 1:2] else: - pred_log_pt = orig_log_pt*pred_pt_corr[:, :, 0:1] + pred_pt_corr[:, :, 1:2] + pred_pt = orig_pt*pred_pt_corr[:, :, 0:1] + pred_pt_corr[:, :, 1:2] if self.mask_reg_cls0: msk_output = tf.expand_dims(tf.cast(tf.argmax(out_id_hard_softmax, axis=-1)!=0, tf.float32), axis=-1) out_charge = out_charge*msk_output - pred_log_pt = pred_log_pt*msk_output + pred_pt = pred_pt*msk_output pred_eta = pred_eta*msk_output pred_sin_phi = pred_sin_phi*msk_output pred_cos_phi = pred_cos_phi*msk_output - pred_log_energy = pred_log_energy*msk_output + pred_energy = pred_energy*msk_output ret = { "cls": out_id_softmax, "charge": out_charge*msk_input, - "pt": pred_log_pt*msk_input, + "pt": pred_pt*msk_input, "eta": pred_eta*msk_input, "sin_phi": pred_sin_phi*msk_input, "cos_phi": pred_cos_phi*msk_input, - "energy": pred_log_energy*msk_input, + "energy": pred_energy*msk_input, } return ret @@ -610,15 +597,10 @@ def call(self, args, training=False): def set_trainable_regression(self): self.ffn_id.trainable = False self.ffn_charge.trainable = False - self.ffn_phi.trainable = True - self.ffn_eta.trainable = True - self.ffn_pt.trainable = True - - if self.classwise_split_energy: - for layer in self.ffn_energy: - layer.trainable = True - else: - self.ffn_energy.trainable = True + self.ffn_phi.trainable = False + self.ffn_eta.trainable = False + self.ffn_pt.trainable = False + self.ffn_energy.trainable = True class CombinedGraphLayer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): @@ -736,7 +718,7 @@ def call(self, inputs, training=False): if self.skip_connection: encs.append(enc) for cg in self.cg: - enc_all = cg(enc_cg, msk, training) + enc_all = cg(enc_cg, msk, training=training) enc_cg = enc_all["enc"] if self.debug: debugging_data[cg.name] = enc_all @@ -753,7 +735,7 @@ def call(self, inputs, training=False): if self.skip_connection: encs_energy.append(enc) for cg in self.cg_energy: - enc_all = cg(enc_cg, msk, training) + enc_all = cg(enc_cg, msk, training=training) enc_cg = enc_all["enc"] if self.debug: debugging_data[cg.name] = enc_all @@ -763,7 +745,7 @@ def call(self, inputs, training=False): if self.debug: debugging_data["dec_output_energy"] = dec_output_energy - ret = self.output_dec([X, dec_output, dec_output_energy, msk_input], training) + ret = self.output_dec([X, dec_output, dec_output_energy, msk_input], training=training) if self.debug: for k in debugging_data.keys(): @@ -782,58 +764,97 @@ def set_trainable_named(self, layer_names): 
self.output_dec.set_trainable_named(layer_names) - def train_step(self, data): - # Unpack the data. Its structure depends on your model and - # on what you pass to `fit()`. - x, y, sample_weights = data - - with tf.GradientTape() as tape: - y_pred = self(x, training=True) # Forward pass - - #regression losses computed only for correctly classified particles - pred_cls = tf.argmax(y_pred["cls"], axis=-1) - true_cls = tf.argmax(y["cls"], axis=-1) - msk_loss = tf.cast((pred_cls==true_cls) & (true_cls!=0), tf.float32) - sample_weights["energy"] *= msk_loss - sample_weights["pt"] *= msk_loss - sample_weights["eta"] *= msk_loss - sample_weights["sin_phi"] *= msk_loss - sample_weights["cos_phi"] *= msk_loss - - loss = self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) - - # Compute gradients - trainable_vars = self.trainable_variables - gradients = tape.gradient(loss, trainable_vars) - # Update weights - self.optimizer.apply_gradients(zip(gradients, trainable_vars)) - # Update metrics (includes the metric that tracks the loss) - self.compiled_metrics.update_state(y, y_pred) - # Return a dict mapping metric names to current value - return {m.name: m.result() for m in self.metrics} - - def test_step(self, data): - # Unpack the data - x, y, sample_weights = data - # Compute predictions - y_pred = self(x, training=False) - - pred_cls = tf.argmax(y_pred["cls"], axis=-1) - true_cls = tf.argmax(y["cls"], axis=-1) - msk_loss = tf.cast((pred_cls==true_cls) & (true_cls!=0), tf.float32) - sample_weights["energy"] *= msk_loss - sample_weights["pt"] *= msk_loss - sample_weights["eta"] *= msk_loss - sample_weights["sin_phi"] *= msk_loss - sample_weights["cos_phi"] *= msk_loss - - # Updates the metrics tracking the loss - self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) - # Update the metrics. - self.compiled_metrics.update_state(y, y_pred) - # Return a dict mapping metric names to current value. - # Note that it will include the loss (tracked in self.metrics). - return {m.name: m.result() for m in self.metrics} + # def train_step(self, data): + # # Unpack the data. Its structure depends on your model and + # # on what you pass to `fit()`. 
+ # x, y, sample_weights = data + + # if not hasattr(self, "step"): + # self.step = 0 + + # with tf.GradientTape() as tape: + # y_pred = self(x, training=True) # Forward pass + + # #regression losses computed only for correctly classified particles + # pred_cls = tf.argmax(y_pred["cls"], axis=-1) + # true_cls = tf.argmax(y["cls"], axis=-1) + # #msk_loss = tf.cast((pred_cls==true_cls) & (true_cls!=0), tf.float32) + # #sample_weights["energy"] *= msk_loss + # #sample_weights["pt"] *= msk_loss + # #sample_weights["eta"] *= msk_loss + # #sample_weights["sin_phi"] *= msk_loss + # #sample_weights["cos_phi"] *= msk_loss + + # for icls in [3, ]: + # msk1 = (true_cls==icls) + # msk2 = (pred_cls==icls) + # import matplotlib + # import matplotlib.pyplot as plt + # bins = np.linspace(0,6,100) + + # plt.figure(figsize=(4,4)) + # plt.scatter( + # y["energy"][msk1&msk2].numpy().flatten(), + # y_pred["energy"][msk1&msk2].numpy().flatten(), + # marker=".", alpha=0.5 + # ) + # plt.xlabel("true") + # plt.ylabel("pred") + # plt.plot([0,6], [0,6]) + # plt.savefig("train_cls{}_{}.png".format(icls, self.step), bbox_inches="tight") + # plt.close("all") + + # loss = self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) + + # # Compute gradients + # trainable_vars = self.trainable_variables + # gradients = tape.gradient(loss, trainable_vars) + # # Update weights + # self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + # # Update metrics (includes the metric that tracks the loss) + # self.compiled_metrics.update_state(y, y_pred) + # # Return a dict mapping metric names to current value + + # self.step += 1 + # return {m.name: m.result() for m in self.metrics} + + # def test_step(self, data): + # # Unpack the data + # x, y, sample_weights = data + # # Compute predictions + # y_pred = self(x, training=False) + + # pred_cls = tf.argmax(y_pred["cls"], axis=-1) + # true_cls = tf.argmax(y["cls"], axis=-1) + + # for icls in [3, ]: + # msk1 = (true_cls==icls) + # msk2 = (pred_cls==icls) + # import matplotlib + # import matplotlib.pyplot as plt + # bins = np.linspace(0,6,100) + + # plt.figure(figsize=(4,4)) + # plt.scatter( + # y["energy"][msk1&msk2].numpy().flatten(), + # y_pred["energy"][msk1&msk2].numpy().flatten(), + # marker=".", alpha=0.5 + # ) + # plt.xlabel("true") + # plt.ylabel("pred") + # plt.plot([0,6], [0,6]) + # plt.savefig("test_cls{}_{}.png".format(icls, self.step), bbox_inches="tight") + # plt.close("all") + + # # Updates the metrics tracking the loss + # self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) + # # Update the metrics. + # self.compiled_metrics.update_state(y, y_pred) + # # Return a dict mapping metric names to current value. + # # Note that it will include the loss (tracked in self.metrics). 
+ + # self.step += 1 + # return {m.name: m.result() for m in self.metrics} class DummyNet(tf.keras.Model): def __init__(self, diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 014d59432..a7fcd9e27 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -1,7 +1,6 @@ from .model import DummyNet, PFNetDense import tensorflow as tf -import tensorflow_probability import tensorflow_addons as tfa import pickle import numpy as np @@ -92,11 +91,11 @@ def __init__(self, dataset_def, outpath, X, y, dataset_transform, num_output_cla } self.reg_bins = { - "pt": np.linspace(0, 5, 100), + "pt": np.linspace(0, 100, 100), "eta": np.linspace(-6, 6, 100), "sin_phi": np.linspace(-1,1,100), "cos_phi": np.linspace(-1,1,100), - "energy": np.linspace(0, 7,100), + "energy": None, } def plot_cm(self, epoch, outpath, ypred_id, msk): @@ -155,7 +154,7 @@ def plot_event_visualization(self, epoch, outpath, ypred, ypred_id, msk, ievent= sphi = ypred["sin_phi"][ievent][msk] cphi = ypred["cos_phi"][ievent][msk] phi = np.arctan2(sphi, cphi) - energy = np.exp(np.clip(ypred["energy"][ievent][msk], -6, 6)) - 1.0 + energy = ypred["energy"][ievent][msk] pdgid = ypred_id[ievent][msk] plt.scatter(eta, phi, marker="o", s=energy, c=[self.color_map[p] for p in pdgid], alpha=0.5, linewidths=0) plt.xlim(-8,8) @@ -169,7 +168,7 @@ def plot_event_visualization(self, epoch, outpath, ypred, ypred_id, msk, ievent= sphi = self.ytrue["sin_phi"][ievent][msk] cphi = self.ytrue["cos_phi"][ievent][msk] phi = np.arctan2(sphi, cphi) - energy = np.exp(np.clip(self.ytrue["energy"][ievent][msk], -6, 6)) - 1.0 + energy = self.ytrue["energy"][ievent][msk] pdgid = self.ytrue_id[ievent][msk] plt.scatter(eta, phi, marker="o", s=energy, c=[self.color_map[p] for p in pdgid], alpha=0.5, linewidths=0) plt.xlim(-8,8) @@ -181,16 +180,18 @@ def plot_event_visualization(self, epoch, outpath, ypred, ypred_id, msk, ievent= if self.comet_experiment: self.comet_experiment.log_image(image_path, step=epoch) - def plot_reg_distribution(self, outpath, ypred, ypred_id, msk, icls, reg_variable): + def plot_reg_distribution(self, outpath, ypred, ypred_id, icls, reg_variable): if icls==0: - vals_pred = ypred[reg_variable][msk][ypred_id[msk]!=icls].flatten() - vals_true = self.ytrue[reg_variable][msk][self.ytrue_id[msk]!=icls].flatten() + vals_pred = ypred[reg_variable][ypred_id!=icls].flatten() + vals_true = self.ytrue[reg_variable][self.ytrue_id!=icls].flatten() else: - vals_pred = ypred[reg_variable][msk][ypred_id[msk]==icls].flatten() - vals_true = self.ytrue[reg_variable][msk][self.ytrue_id[msk]==icls].flatten() + vals_pred = ypred[reg_variable][ypred_id==icls].flatten() + vals_true = self.ytrue[reg_variable][self.ytrue_id==icls].flatten() bins = self.reg_bins[reg_variable] + if bins is None: + bins = 100 plt.hist(vals_true, bins=bins, histtype="step", lw=2, label="true") plt.hist(vals_pred, bins=bins, histtype="step", lw=2, label="predicted") @@ -205,40 +206,35 @@ def plot_reg_distribution(self, outpath, ypred, ypred_id, msk, icls, reg_variabl plt.savefig(str(outpath / "{}_cls{}.png".format(reg_variable, icls)), bbox_inches="tight") plt.close("all") - def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, log=False): + def plot_corr(self, epoch, outpath, ypred, ypred_id, icls, reg_variable): if icls==0: - sel = (self.ytrue_id[msk]!=0) & (ypred_id[msk]!=0) + sel = (ypred_id!=0) & (self.ytrue_id!=0) else: - sel = (ypred_id[msk]==icls) & (self.ytrue_id[msk]==icls) + sel = (ypred_id==icls) & 
(self.ytrue_id==icls) - vals_pred = ypred[reg_variable][msk][sel].flatten() - vals_true = self.ytrue[reg_variable][msk][sel].flatten() + vals_pred = ypred[reg_variable][sel].flatten() + vals_true = self.ytrue[reg_variable][sel].flatten() loss = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE) loss_vals = loss(np.expand_dims(vals_true, -1), np.expand_dims(vals_pred, axis=-1)).numpy() - #suffix for log-transformed variable - s = "" - if log: - vals_pred = np.log(vals_pred) - vals_true = np.log(vals_true) - s = "_log" - #save correlation histogram plt.figure() bins = self.reg_bins[reg_variable] - plt.hist2d(vals_pred, vals_true, bins=(bins, bins), cmap="Blues") + if bins is None: + bins = 100 + plt.hist2d(vals_true, vals_pred, bins=(bins, bins), cmap="Blues") plt.colorbar() if len(vals_true) > 0: minval = np.min(vals_true) maxval = np.max(vals_true) if not (math.isnan(minval) or math.isnan(maxval) or math.isinf(minval) or math.isinf(maxval)): plt.plot([minval, maxval], [minval, maxval], color="black", ls="--", lw=0.5) - plt.xlabel("predicted") - plt.ylabel("true") + plt.xlabel("true") + plt.ylabel("predicted") plt.title("{}, particle weighted, L={:.4f}".format(reg_variable, np.sum(loss_vals))) - image_path = str(outpath / "{}_cls{}_corr{}.png".format(reg_variable, icls, s)) + image_path = str(outpath / "{}_cls{}_corr.png".format(reg_variable, icls)) plt.savefig(image_path, bbox_inches="tight") if self.comet_experiment: self.comet_experiment.log_image(image_path, step=epoch) @@ -246,17 +242,17 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, lo #save loss-weighted correlation histogram plt.figure() - plt.hist2d(vals_pred, vals_true, bins=(bins, bins), weights=loss_vals, cmap="Blues") + plt.hist2d(vals_true, vals_pred, bins=(bins, bins), weights=loss_vals, cmap="Blues") plt.colorbar() if len(vals_true) > 0: minval = np.min(vals_true) maxval = np.max(vals_true) if not (math.isnan(minval) or math.isnan(maxval) or math.isinf(minval) or math.isinf(maxval)): plt.plot([minval, maxval], [minval, maxval], color="black", ls="--", lw=0.5) - plt.xlabel("predicted") - plt.ylabel("true") + plt.xlabel("true") + plt.ylabel("predicted") plt.title("{}, loss weighted, L={:.4f}".format(reg_variable, np.sum(loss_vals))) - image_path = str(outpath / "{}_cls{}_corr{}_weighted.png".format(reg_variable, icls, s)) + image_path = str(outpath / "{}_cls{}_corr_weighted.png".format(reg_variable, icls)) plt.savefig(image_path, bbox_inches="tight") if self.comet_experiment: self.comet_experiment.log_image(image_path, step=epoch) @@ -266,21 +262,21 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, msk, icls, reg_variable, lo residual = vals_true - vals_pred residual[np.isnan(residual)] = 0 residual[np.isinf(residual)] = 0 - plt.hist(residual, bins=np.linspace(-2,2,100)) + plt.hist(residual, bins=100) plt.yscale("log") plt.xlabel("true - pred") plt.title("{} residual, m={:.4f} s={:.4f}".format(reg_variable, np.mean(residual), np.std(residual))) - image_path = str(outpath / "{}{}_cls{}_residual.png".format(reg_variable, s, icls)) + image_path = str(outpath / "{}_cls{}_residual.png".format(reg_variable, icls)) plt.savefig(image_path, bbox_inches="tight") if self.comet_experiment: self.comet_experiment.log_image(image_path, step=epoch) plt.close("all") if self.comet_experiment: - self.comet_experiment.log_metric('residual_{}{}_cls{}_mean'.format(reg_variable, s, icls), np.mean(residual), step=epoch) - 
self.comet_experiment.log_metric('residual_{}{}_cls{}_std'.format(reg_variable, s, icls), np.std(residual), step=epoch) - self.comet_experiment.log_metric('val_loss_{}{}_cls{}'.format(reg_variable, s, icls), np.sum(loss_vals), step=epoch) + self.comet_experiment.log_metric('residual_{}_cls{}_mean'.format(reg_variable, icls), np.mean(residual), step=epoch) + self.comet_experiment.log_metric('residual_{}_cls{}_std'.format(reg_variable, icls), np.std(residual), step=epoch) + self.comet_experiment.log_metric('val_loss_{}_cls{}'.format(reg_variable, icls), np.sum(loss_vals), step=epoch) def on_epoch_end(self, epoch, logs=None): @@ -296,6 +292,8 @@ def on_epoch_end(self, epoch, logs=None): #run the model inference on the validation dataset ypred = self.model.predict(self.X, batch_size=1) + #ypred = self.model(self.X, training=False) + #ypred = {k: v.numpy() for k, v in ypred.items()} #choose the class with the highest probability as the prediction #this is a shortcut, in actual inference, we may want to apply additional per-class thresholds @@ -312,14 +310,18 @@ def on_epoch_end(self, epoch, logs=None): cp_dir_cls = cp_dir / "cls_{}".format(icls) cp_dir_cls.mkdir(parents=True, exist_ok=True) for variable in ["pt", "eta", "sin_phi", "cos_phi", "energy"]: - self.plot_reg_distribution(cp_dir_cls, ypred, ypred_id, msk, icls, variable) - self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, msk, icls, variable) - #self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, msk, icls, "energy", log=True) - #self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, msk, icls, "pt", log=True) + self.plot_reg_distribution(cp_dir_cls, ypred, ypred_id, icls, variable) + self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, icls, variable) np.savez(str(cp_dir/"pred.npz"), X=self.X, ytrue=self.y, **ypred) -def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output_classes, dataset_def, plot_freq=1, comet_experiment=None): +def prepare_callbacks( + model, outdir, + X_val, y_val, + dataset_transform, + num_output_classes, + dataset_def, + plot_freq=1, comet_experiment=None): callbacks = [] tb = CustomTensorBoard( log_dir=outdir + "/tensorboard_logs", histogram_freq=1, write_graph=False, write_images=False, @@ -346,7 +348,14 @@ def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output history_path = Path(outdir) / "history" history_path.mkdir(parents=True, exist_ok=True) history_path = str(history_path) - cb = CustomCallback(dataset_def, history_path, X_val, y_val, dataset_transform, num_output_classes, plot_freq=plot_freq, comet_experiment=comet_experiment) + cb = CustomCallback( + dataset_def, history_path, + X_val, y_val, + dataset_transform, + num_output_classes, + plot_freq=plot_freq, + comet_experiment=comet_experiment + ) cb.set_model(model) callbacks += [cb] @@ -416,7 +425,7 @@ def make_dense(config, dtype): def eval_model(X, ygen, ycand, model, config, outdir, global_batch_size): import scipy - for ibatch in tqdm(range(X.shape[0]//global_batch_size), desc="Evaluating model"): + for ibatch in tqdm(range(max(1, X.shape[0]//global_batch_size)), desc="Evaluating model"): nb1 = ibatch*global_batch_size nb2 = (ibatch+1)*global_batch_size diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 040792d76..6327adb98 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -185,11 +185,11 @@ def func(X, y, w): { "cls": tf.one_hot(tf.cast(y[:, :, 0], tf.int32), num_output_classes), "charge": y[:, :, 1:2]*msk, - "pt": tf.math.log(y[:, :, 2:3] + 1.0)*msk, + "pt": y[:, :, 
2:3]*msk, "eta": y[:, :, 3:4]*msk, "sin_phi": y[:, :, 4:5]*msk, "cos_phi": y[:, :, 5:6]*msk, - "energy": tf.math.log(y[:, :, 6:7] + 1.0)*msk, + "energy": y[:, :, 6:7]*msk, }, w, ) @@ -302,6 +302,10 @@ def set_config_loss(config, trainable): elif trainable == "regression": config["dataset"]["classification_loss_coef"] = 0.0 config["dataset"]["charge_loss_coef"] = 0.0 + config["dataset"]["pt_loss_coef"] = 0.0 + config["dataset"]["eta_loss_coef"] = 0.0 + config["dataset"]["sin_phi_loss_coef"] = 0.0 + config["dataset"]["cos_phi_loss_coef"] = 0.0 elif trainable == "all": pass return config diff --git a/notebooks/cms-mlpf.ipynb b/notebooks/cms-mlpf.ipynb index 9271963ba..e7bbd7f16 100644 --- a/notebooks/cms-mlpf.ipynb +++ b/notebooks/cms-mlpf.ipynb @@ -3,6 +3,7 @@ { "cell_type": "code", "execution_count": null, + "id": "impressive-ethiopia", "metadata": {}, "outputs": [], "source": [ @@ -24,6 +25,7 @@ { "cell_type": "code", "execution_count": null, + "id": "statistical-ordering", "metadata": {}, "outputs": [], "source": [ @@ -34,6 +36,7 @@ { "cell_type": "code", "execution_count": null, + "id": "visible-destruction", "metadata": {}, "outputs": [], "source": [ @@ -49,6 +52,7 @@ { "cell_type": "code", "execution_count": null, + "id": "undefined-judges", "metadata": {}, "outputs": [], "source": [ @@ -80,6 +84,7 @@ { "cell_type": "code", "execution_count": null, + "id": "respective-theater", "metadata": {}, "outputs": [], "source": [ @@ -121,15 +126,17 @@ { "cell_type": "code", "execution_count": null, + "id": "stone-spanking", "metadata": {}, "outputs": [], "source": [ - "path = \"../experiments/cms-gnn-dense-dev_20210805_123408.gpu0.local/evaluation/\"" + "path = \"../experiments/cms_20210830_174614_166595.joosep-desktop-work//evaluation/\"" ] }, { "cell_type": "code", "execution_count": null, + "id": "blind-promotion", "metadata": {}, "outputs": [], "source": [ @@ -152,6 +159,7 @@ "ygen = np.concatenate(ygens)\n", "ycand = np.concatenate(ycands)\n", "ypred = np.concatenate(ypreds)\n", + "\n", "ypred_raw = np.concatenate(ypreds_raw)\n", "\n", "X_f = X.reshape((X.shape[0]*X.shape[1], X.shape[2]))\n", @@ -165,6 +173,22 @@ { "cell_type": "code", "execution_count": null, + "id": "opened-lyric", + "metadata": {}, + "outputs": [], + "source": [ + "icls = 2\n", + "msk = (ycand_f[:, 0]==icls) & (ypred_f[:, 0]==icls)\n", + "bins = np.linspace(0,100,100)\n", + "plt.hist(ypred_f[msk][:, 6], bins=bins, histtype=\"step\", lw=2);\n", + "plt.hist(ycand_f[msk][:, 6], bins=bins, histtype=\"step\", lw=2);\n", + "plt.yscale(\"log\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "linear-eleven", "metadata": {}, "outputs": [], "source": [ @@ -174,6 +198,7 @@ { "cell_type": "code", "execution_count": null, + "id": "based-wrestling", "metadata": {}, "outputs": [], "source": [ @@ -183,9 +208,8 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "id": "documented-savage", + "metadata": {}, "outputs": [], "source": [ "glob_iter = 0\n", @@ -219,6 +243,7 @@ { "cell_type": "code", "execution_count": null, + "id": "resistant-abraham", "metadata": {}, "outputs": [], "source": [ @@ -228,6 +253,7 @@ { "cell_type": "code", "execution_count": null, + "id": "complex-difficulty", "metadata": {}, "outputs": [], "source": [ @@ -238,6 +264,7 @@ { "cell_type": "code", "execution_count": null, + "id": "patient-thunder", "metadata": {}, "outputs": [], "source": [ @@ -247,6 +274,7 @@ { "cell_type": "code", "execution_count": null, + "id": "revised-pollution", "metadata": 
{}, "outputs": [], "source": [ @@ -256,9 +284,8 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "id": "virgin-nicaragua", + "metadata": {}, "outputs": [], "source": [ "for icls in range(1,8):\n", @@ -281,6 +308,7 @@ { "cell_type": "code", "execution_count": null, + "id": "ready-macedonia", "metadata": {}, "outputs": [], "source": [ @@ -302,6 +330,7 @@ { "cell_type": "code", "execution_count": null, + "id": "formal-county", "metadata": {}, "outputs": [], "source": [ @@ -311,6 +340,7 @@ { "cell_type": "code", "execution_count": null, + "id": "neural-witch", "metadata": {}, "outputs": [], "source": [ @@ -335,6 +365,7 @@ { "cell_type": "code", "execution_count": null, + "id": "formal-maryland", "metadata": {}, "outputs": [], "source": [ @@ -346,6 +377,7 @@ { "cell_type": "code", "execution_count": null, + "id": "committed-clothing", "metadata": {}, "outputs": [], "source": [ @@ -357,6 +389,7 @@ { "cell_type": "code", "execution_count": null, + "id": "recreational-enhancement", "metadata": {}, "outputs": [], "source": [ @@ -368,6 +401,7 @@ { "cell_type": "code", "execution_count": null, + "id": "frank-alberta", "metadata": {}, "outputs": [], "source": [ @@ -379,6 +413,7 @@ { "cell_type": "code", "execution_count": null, + "id": "significant-breeding", "metadata": {}, "outputs": [], "source": [ @@ -390,6 +425,7 @@ { "cell_type": "code", "execution_count": null, + "id": "living-egyptian", "metadata": {}, "outputs": [], "source": [ @@ -401,6 +437,7 @@ { "cell_type": "code", "execution_count": null, + "id": "healthy-willow", "metadata": {}, "outputs": [], "source": [ @@ -412,6 +449,7 @@ { "cell_type": "code", "execution_count": null, + "id": "accomplished-brave", "metadata": {}, "outputs": [], "source": [ @@ -423,9 +461,8 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "id": "august-feeding", + "metadata": {}, "outputs": [], "source": [ "for icls in range(1,8):\n", @@ -449,6 +486,7 @@ { "cell_type": "code", "execution_count": null, + "id": "simple-forestry", "metadata": {}, "outputs": [], "source": [ @@ -471,6 +509,7 @@ { "cell_type": "code", "execution_count": null, + "id": "empirical-network", "metadata": {}, "outputs": [], "source": [ @@ -493,6 +532,7 @@ { "cell_type": "code", "execution_count": null, + "id": "prepared-fruit", "metadata": {}, "outputs": [], "source": [ @@ -514,6 +554,7 @@ { "cell_type": "code", "execution_count": null, + "id": "civilian-diving", "metadata": {}, "outputs": [], "source": [ @@ -529,9 +570,8 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "id": "expressed-samba", + "metadata": {}, "outputs": [], "source": [ "for icls in range(1,8):\n", @@ -554,6 +594,7 @@ { "cell_type": "code", "execution_count": null, + "id": "minor-beast", "metadata": {}, "outputs": [], "source": [ @@ -603,11 +644,19 @@ "plt.tight_layout()\n", "plt.savefig(\"full_performance.png\", bbox_inches=\"tight\", dpi=400)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "linear-ceramic", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -621,7 +670,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/notebooks/delphes-tf-mlpf-quickstart.ipynb b/notebooks/delphes-tf-mlpf-quickstart.ipynb index 
39988fbfe..24a81391b 100644 --- a/notebooks/delphes-tf-mlpf-quickstart.ipynb +++ b/notebooks/delphes-tf-mlpf-quickstart.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "damaged-gentleman", "metadata": {}, "source": [ "This quickstart notebook allows to test and mess around with the MLPF GNN model in a standalone way. For actual training, we don't use a notebook, please refer to `README.md`.\n", @@ -17,6 +18,7 @@ { "cell_type": "code", "execution_count": null, + "id": "happy-presence", "metadata": {}, "outputs": [], "source": [ @@ -31,6 +33,7 @@ { "cell_type": "code", "execution_count": null, + "id": "gentle-prompt", "metadata": {}, "outputs": [], "source": [ @@ -41,6 +44,7 @@ { "cell_type": "code", "execution_count": null, + "id": "imported-nightlife", "metadata": {}, "outputs": [], "source": [ @@ -50,6 +54,7 @@ { "cell_type": "code", "execution_count": null, + "id": "attached-helen", "metadata": {}, "outputs": [], "source": [ @@ -59,6 +64,7 @@ { "cell_type": "code", "execution_count": null, + "id": "enormous-merchant", "metadata": {}, "outputs": [], "source": [ @@ -68,6 +74,7 @@ { "cell_type": "code", "execution_count": null, + "id": "cloudy-warren", "metadata": {}, "outputs": [], "source": [ @@ -78,6 +85,7 @@ { "cell_type": "code", "execution_count": null, + "id": "blessed-noise", "metadata": {}, "outputs": [], "source": [ @@ -103,6 +111,7 @@ { "cell_type": "code", "execution_count": null, + "id": "upset-tractor", "metadata": {}, "outputs": [], "source": [ @@ -115,6 +124,7 @@ { "cell_type": "code", "execution_count": null, + "id": "hundred-cosmetic", "metadata": {}, "outputs": [], "source": [ @@ -124,6 +134,7 @@ { "cell_type": "code", "execution_count": null, + "id": "champion-institute", "metadata": {}, "outputs": [], "source": [ @@ -133,6 +144,7 @@ { "cell_type": "code", "execution_count": null, + "id": "previous-stranger", "metadata": {}, "outputs": [], "source": [ @@ -152,6 +164,7 @@ { "cell_type": "code", "execution_count": null, + "id": "nasty-staff", "metadata": {}, "outputs": [], "source": [ @@ -161,6 +174,7 @@ { "cell_type": "code", "execution_count": null, + "id": "optical-trinity", "metadata": {}, "outputs": [], "source": [ @@ -170,6 +184,7 @@ { "cell_type": "code", "execution_count": null, + "id": "pleasant-textbook", "metadata": {}, "outputs": [], "source": [ @@ -179,6 +194,7 @@ { "cell_type": "code", "execution_count": null, + "id": "acute-southwest", "metadata": {}, "outputs": [], "source": [ @@ -190,6 +206,7 @@ { "cell_type": "code", "execution_count": null, + "id": "elementary-hepatitis", "metadata": {}, "outputs": [], "source": [ @@ -200,6 +217,7 @@ { "cell_type": "code", "execution_count": null, + "id": "white-enhancement", "metadata": {}, "outputs": [], "source": [ @@ -210,6 +228,7 @@ { "cell_type": "code", "execution_count": null, + "id": "appointed-alberta", "metadata": {}, "outputs": [], "source": [ @@ -220,6 +239,7 @@ { "cell_type": "code", "execution_count": null, + "id": "variable-appointment", "metadata": {}, "outputs": [], "source": [ @@ -231,6 +251,7 @@ { "cell_type": "code", "execution_count": null, + "id": "steady-stock", "metadata": {}, "outputs": [], "source": [ @@ -263,6 +284,7 @@ { "cell_type": "code", "execution_count": null, + "id": "explicit-friendship", "metadata": {}, "outputs": [], "source": [ @@ -272,6 +294,7 @@ { "cell_type": "code", "execution_count": null, + "id": "interim-consciousness", "metadata": {}, "outputs": [], "source": [ @@ -281,6 +304,7 @@ { "cell_type": "code", "execution_count": null, + "id": 
"healthy-constraint", "metadata": {}, "outputs": [], "source": [ @@ -290,6 +314,7 @@ { "cell_type": "code", "execution_count": null, + "id": "annoying-fleet", "metadata": {}, "outputs": [], "source": [ @@ -301,6 +326,7 @@ { "cell_type": "code", "execution_count": null, + "id": "filled-suspension", "metadata": {}, "outputs": [], "source": [ @@ -313,6 +339,7 @@ { "cell_type": "code", "execution_count": null, + "id": "valued-better", "metadata": {}, "outputs": [], "source": [ @@ -325,6 +352,7 @@ { "cell_type": "code", "execution_count": null, + "id": "spiritual-fancy", "metadata": {}, "outputs": [], "source": [] diff --git a/notebooks/pfnet-debug.ipynb b/notebooks/pfnet-debug.ipynb index dacfdb213..7cb5864fe 100644 --- a/notebooks/pfnet-debug.ipynb +++ b/notebooks/pfnet-debug.ipynb @@ -3,6 +3,7 @@ { "cell_type": "code", "execution_count": null, + "id": "solved-relations", "metadata": {}, "outputs": [], "source": [ @@ -30,6 +31,7 @@ { "cell_type": "code", "execution_count": null, + "id": "unavailable-applicant", "metadata": {}, "outputs": [], "source": [ @@ -42,6 +44,7 @@ { "cell_type": "code", "execution_count": null, + "id": "becoming-district", "metadata": {}, "outputs": [], "source": [ @@ -51,6 +54,7 @@ { "cell_type": "code", "execution_count": null, + "id": "exact-landing", "metadata": {}, "outputs": [], "source": [ @@ -73,6 +77,7 @@ { "cell_type": "code", "execution_count": null, + "id": "identified-header", "metadata": {}, "outputs": [], "source": [ @@ -99,6 +104,7 @@ { "cell_type": "code", "execution_count": null, + "id": "reduced-collar", "metadata": {}, "outputs": [], "source": [] @@ -106,6 +112,7 @@ { "cell_type": "code", "execution_count": null, + "id": "expensive-incidence", "metadata": {}, "outputs": [], "source": [ @@ -115,6 +122,7 @@ { "cell_type": "code", "execution_count": null, + "id": "painful-delight", "metadata": {}, "outputs": [], "source": [ @@ -124,6 +132,7 @@ { "cell_type": "code", "execution_count": null, + "id": "collective-mounting", "metadata": {}, "outputs": [], "source": [ @@ -136,6 +145,7 @@ { "cell_type": "code", "execution_count": null, + "id": "western-petersburg", "metadata": {}, "outputs": [], "source": [ @@ -156,6 +166,7 @@ { "cell_type": "code", "execution_count": null, + "id": "possible-prime", "metadata": {}, "outputs": [], "source": [ @@ -181,6 +192,7 @@ { "cell_type": "code", "execution_count": null, + "id": "listed-quarterly", "metadata": {}, "outputs": [], "source": [ @@ -190,6 +202,7 @@ { "cell_type": "code", "execution_count": null, + "id": "convenient-winner", "metadata": {}, "outputs": [], "source": [ @@ -199,6 +212,7 @@ { "cell_type": "code", "execution_count": null, + "id": "cardiac-regression", "metadata": {}, "outputs": [], "source": [ @@ -208,6 +222,7 @@ { "cell_type": "code", "execution_count": null, + "id": "religious-rendering", "metadata": {}, "outputs": [], "source": [ @@ -217,6 +232,7 @@ { "cell_type": "code", "execution_count": null, + "id": "weekly-penetration", "metadata": {}, "outputs": [], "source": [ @@ -226,6 +242,7 @@ { "cell_type": "code", "execution_count": null, + "id": "upper-rapid", "metadata": {}, "outputs": [], "source": [ @@ -235,6 +252,7 @@ { "cell_type": "code", "execution_count": null, + "id": "superior-waterproof", "metadata": {}, "outputs": [], "source": [ @@ -254,6 +272,7 @@ { "cell_type": "code", "execution_count": null, + "id": "harmful-ultimate", "metadata": {}, "outputs": [], "source": [ @@ -265,6 +284,7 @@ { "cell_type": "code", "execution_count": null, + "id": "streaming-license", "metadata": {}, 
"outputs": [], "source": [ @@ -276,9 +296,8 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "id": "celtic-techno", + "metadata": {}, "outputs": [], "source": [ "dmn = ret['cg_0']['dm'][0, :, :, :, 0]\n", @@ -290,6 +309,7 @@ { "cell_type": "code", "execution_count": null, + "id": "silent-medium", "metadata": {}, "outputs": [], "source": [ @@ -302,6 +322,7 @@ { "cell_type": "code", "execution_count": null, + "id": "prostate-spider", "metadata": {}, "outputs": [], "source": [ @@ -314,6 +335,7 @@ { "cell_type": "code", "execution_count": null, + "id": "certified-enforcement", "metadata": {}, "outputs": [], "source": [ @@ -326,6 +348,7 @@ { "cell_type": "code", "execution_count": null, + "id": "portuguese-automation", "metadata": {}, "outputs": [], "source": [ @@ -338,6 +361,7 @@ { "cell_type": "code", "execution_count": null, + "id": "superb-explorer", "metadata": {}, "outputs": [], "source": [ @@ -350,6 +374,7 @@ { "cell_type": "code", "execution_count": null, + "id": "organized-unemployment", "metadata": {}, "outputs": [], "source": [ @@ -360,6 +385,7 @@ { "cell_type": "code", "execution_count": null, + "id": "competitive-flashing", "metadata": {}, "outputs": [], "source": [ @@ -369,6 +395,7 @@ { "cell_type": "code", "execution_count": null, + "id": "maritime-beaver", "metadata": {}, "outputs": [], "source": [ @@ -378,6 +405,7 @@ { "cell_type": "code", "execution_count": null, + "id": "acting-combat", "metadata": {}, "outputs": [], "source": [ @@ -387,6 +415,7 @@ { "cell_type": "code", "execution_count": null, + "id": "governmental-height", "metadata": {}, "outputs": [], "source": [ @@ -402,6 +431,7 @@ { "cell_type": "code", "execution_count": null, + "id": "worse-album", "metadata": {}, "outputs": [], "source": [ @@ -412,6 +442,7 @@ { "cell_type": "code", "execution_count": null, + "id": "combined-convention", "metadata": {}, "outputs": [], "source": [ @@ -431,6 +462,7 @@ { "cell_type": "code", "execution_count": null, + "id": "caroline-afghanistan", "metadata": {}, "outputs": [], "source": [ @@ -455,9 +487,8 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "id": "later-hudson", + "metadata": {}, "outputs": [], "source": [ "[w.name for w in model.trainable_weights]" @@ -466,6 +497,7 @@ { "cell_type": "code", "execution_count": null, + "id": "contemporary-peeing", "metadata": {}, "outputs": [], "source": [ @@ -475,9 +507,8 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "id": "scheduled-proposal", + "metadata": {}, "outputs": [], "source": [ "loss = tf.keras.losses.Huber()\n", @@ -500,6 +531,7 @@ { "cell_type": "code", "execution_count": null, + "id": "acoustic-opening", "metadata": {}, "outputs": [], "source": [ @@ -512,6 +544,7 @@ { "cell_type": "code", "execution_count": null, + "id": "accompanied-musical", "metadata": {}, "outputs": [], "source": [ @@ -521,6 +554,7 @@ { "cell_type": "code", "execution_count": null, + "id": "ahead-literature", "metadata": {}, "outputs": [], "source": [ @@ -540,6 +574,7 @@ { "cell_type": "code", "execution_count": null, + "id": "progressive-auckland", "metadata": {}, "outputs": [], "source": [ @@ -549,6 +584,7 @@ { "cell_type": "code", "execution_count": null, + "id": "lesser-grant", "metadata": {}, "outputs": [], "source": [ @@ -559,6 +595,7 @@ { "cell_type": "code", "execution_count": null, + "id": "deluxe-twenty", "metadata": {}, "outputs": [], "source": [] diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml 
index 61f0ead20..d541a3f58 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -109,7 +109,6 @@ parameters: pt_skip_gate: yes eta_skip_gate: yes phi_skip_gate: yes - energy_skip_gate: yes id_dim_decrease: yes charge_dim_decrease: yes @@ -132,7 +131,7 @@ parameters: phi_num_layers: 3 energy_num_layers: 3 layernorm: yes - mask_reg_cls0: yes + mask_reg_cls0: no skip_connection: yes debug: no diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml deleted file mode 100644 index 20fa2dfa9..000000000 --- a/parameters/cms-gen.yaml +++ /dev/null @@ -1,143 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: gen - num_input_features: 15 - num_output_features: 7 -# NONE = 0, -# TRACK = 1, -# PS1 = 2, -# PS2 = 3, -# ECAL = 4, -# HCAL = 5, -# GSF = 6, -# BREM = 7, -# HFEM = 8, -# HFHAD = 9, -# SC = 10, -# HO = 11, - num_input_classes: 12 - #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) - num_output_classes: 8 - padded_num_elem_size: 6400 - #(pt, eta, sin phi, cos phi, E) - num_momentum_outputs: 5 - classification_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 100.0 - eta_loss_coef: 100.0 - sin_phi_loss_coef: 100.0 - cos_phi_loss_coef: 100.0 - energy_loss_coef: 100.0 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_gen/*.tfrecords - num_files_per_chunk: 1 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* - energy_loss: - type: Huber - delta: 1.0 - pt_loss: - type: Huber - delta: 1.0 - sin_phi_loss: - type: Huber - delta: 0.1 - cos_phi_loss: - type: Huber - delta: 0.1 - eta_loss: - type: Huber - delta: 0.1 - -tensorflow: - eager: no - -setup: - train: yes - weights: - weights_config: - lr: 1e-4 - batch_size: 4 - num_events_train: 80000 - num_events_test: 9000 - num_epochs: 40 - num_val_files: 10 - dtype: float32 - trainable: - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: inverse_sqrt - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - input_encoding: cms - activation: gelu - layernorm: yes - hidden_dim: 256 - bin_size: 320 - distance_dim: 128 - dropout: 0.2 - graph_kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 - num_graph_layers: 5 - node_message: - type: GHConvDense - output_dim: 256 - activation: gelu - normalize_degrees: yes - num_node_messages: 1 - output_decoding: - activation: gelu - regression_use_classification: yes - dropout: 0.2 - - pt_skip_gate: yes - eta_skip_gate: yes - phi_skip_gate: yes - energy_skip_gate: yes - - id_dim_decrease: yes - charge_dim_decrease: yes - pt_dim_decrease: yes - eta_dim_decrease: yes - phi_dim_decrease: yes - energy_dim_decrease: yes - - id_hidden_dim: 256 - charge_hidden_dim: 256 - pt_hidden_dim: 256 - eta_hidden_dim: 256 - phi_hidden_dim: 256 - energy_hidden_dim: 256 - - id_num_layers: 4 - charge_num_layers: 2 - pt_num_layers: 3 - eta_num_layers: 3 - phi_num_layers: 3 - energy_num_layers: 3 - layernorm: yes - mask_reg_cls0: yes - - skip_connection: yes - debug: no - -timing: - num_ev: 100 - num_iter: 3 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 67d2c3890..c0f5d5258 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -25,19 +25,19 @@ dataset: num_momentum_outputs: 5 
classification_loss_coef: 1.0 charge_loss_coef: 0.01 - pt_loss_coef: 0.1 + pt_loss_coef: 0.0001 eta_loss_coef: 100.0 sin_phi_loss_coef: 10.0 cos_phi_loss_coef: 10.0 - energy_loss_coef: 0.1 + energy_loss_coef: 0.0001 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* energy_loss: - type: MeanSquaredError + type: Huber pt_loss: - type: MeanSquaredError + type: Huber sin_phi_loss: type: Huber delta: 0.1 @@ -56,10 +56,10 @@ setup: weights: weights_config: lr: 1e-4 - batch_size: 2 + batch_size: 5 num_events_train: 80000 num_events_test: 10000 - num_epochs: 20 + num_epochs: 100 num_val_files: 10 dtype: float32 trainable: @@ -79,7 +79,7 @@ parameters: model: gnn_dense input_encoding: cms combined_graph_layer: - bin_size: 320 + bin_size: 160 max_num_bins: 100 distance_dim: 128 layernorm: no @@ -91,10 +91,10 @@ parameters: num_node_messages: 1 node_message: type: GHConvDense - output_dim: 256 + output_dim: 128 activation: gelu normalize_degrees: yes - hidden_dim: 256 + hidden_dim: 128 activation: gelu num_graph_layers_common: 3 num_graph_layers_energy: 3 @@ -106,7 +106,6 @@ parameters: pt_skip_gate: no eta_skip_gate: yes phi_skip_gate: yes - energy_skip_gate: no id_dim_decrease: yes charge_dim_decrease: yes @@ -115,22 +114,21 @@ parameters: phi_dim_decrease: yes energy_dim_decrease: yes - id_hidden_dim: 1024 + id_hidden_dim: 256 charge_hidden_dim: 256 pt_hidden_dim: 256 eta_hidden_dim: 256 phi_hidden_dim: 256 - energy_hidden_dim: 512 + energy_hidden_dim: 256 - id_num_layers: 3 + id_num_layers: 2 charge_num_layers: 2 - pt_num_layers: 3 - eta_num_layers: 3 - phi_num_layers: 3 - energy_num_layers: 4 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 layernorm: no mask_reg_cls0: no - classwise_split_energy: no skip_connection: yes debug: no diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index ac359d2ca..10f04eb57 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -96,7 +96,6 @@ parameters: pt_skip_gate: yes eta_skip_gate: yes phi_skip_gate: yes - energy_skip_gate: yes id_dim_decrease: yes charge_dim_decrease: yes @@ -119,8 +118,7 @@ parameters: phi_num_layers: 3 energy_num_layers: 3 layernorm: yes - mask_reg_cls0: yes - classwise_split_energy: no + mask_reg_cls0: no skip_connection: yes debug: no From 8da1044005b10483bf5c3c39fec5fd4c22425274 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 31 Aug 2021 09:50:34 +0300 Subject: [PATCH 135/157] final update --- mlpf/pipeline.py | 4 +- mlpf/tfmodel/model.py | 2 +- mlpf/tfmodel/model_setup.py | 6 +-- notebooks/cms-mlpf.ipynb | 22 +++++++--- parameters/test-cms.yaml | 83 ------------------------------------ parameters/test-delphes.yaml | 83 ------------------------------------ scripts/test_load_tfmodel.py | 2 +- 7 files changed, 22 insertions(+), 180 deletions(-) delete mode 100644 parameters/test-cms.yaml delete mode 100644 parameters/test-delphes.yaml diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 5e2c2ed38..5bddd912b 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -186,7 +186,7 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, initial_epoch = int(weights.split("/")[-1].split("-")[1]) model(tf.cast(X_val[:1], model_dtype)) - #config = set_config_loss(config, config["setup"]["trainable"]) + config = set_config_loss(config, config["setup"]["trainable"]) 
configure_model_weights(model, config["setup"]["trainable"]) model(tf.cast(X_val[:1], model_dtype)) @@ -342,7 +342,7 @@ def find_lr(config, outdir, figname, logscale): model_dtype = tf.dtypes.float32 model = make_model(config, model_dtype) - #config = set_config_loss(config, config["setup"]["trainable"]) + config = set_config_loss(config, config["setup"]["trainable"]) # Run model once to build the layers model(tf.cast(X_val[:1], model_dtype)) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 25cc57a04..a40451eeb 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -557,7 +557,7 @@ def call(self, args, training=False): tf.concat([orig_energy, tf.stop_gradient(out_id_logits)], axis=-1), training=training), axis=-1, keepdims=True) - pred_energy = tf.math.exp(tf.clip_by_value(pred_energy, -3, 7)) + pred_energy = tf.math.exp(tf.clip_by_value(pred_energy, -3, 8)) #prediction is pred_log_energy=log(energy + 1.0), energy=exp(pred_log_energy) - 1.0 #pred_energy = tf.math.exp(tf.clip_by_value(pred_log_energy, -6, 6)) - 1.0 diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index a7fcd9e27..e42fdbc18 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -280,13 +280,13 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, icls, reg_variable): def on_epoch_end(self, epoch, logs=None): - if epoch%self.plot_freq!=0: - return - #save the training logs (losses) for this epoch with open("{}/history_{}.json".format(self.outpath, epoch), "w") as fi: json.dump(logs, fi) + if epoch%self.plot_freq!=0: + return + cp_dir = Path(self.outpath) / "epoch_{}".format(epoch) cp_dir.mkdir(parents=True, exist_ok=True) diff --git a/notebooks/cms-mlpf.ipynb b/notebooks/cms-mlpf.ipynb index e7bbd7f16..b77607840 100644 --- a/notebooks/cms-mlpf.ipynb +++ b/notebooks/cms-mlpf.ipynb @@ -130,7 +130,7 @@ "metadata": {}, "outputs": [], "source": [ - "path = \"../experiments/cms_20210830_174614_166595.joosep-desktop-work//evaluation/\"" + "path = \"../experiments/cms_20210830_181309_198541.joosep-desktop-work/evaluation/\"" ] }, { @@ -179,10 +179,8 @@ "source": [ "icls = 2\n", "msk = (ycand_f[:, 0]==icls) & (ypred_f[:, 0]==icls)\n", - "bins = np.linspace(0,100,100)\n", - "plt.hist(ypred_f[msk][:, 6], bins=bins, histtype=\"step\", lw=2);\n", - "plt.hist(ycand_f[msk][:, 6], bins=bins, histtype=\"step\", lw=2);\n", - "plt.yscale(\"log\")" + "plt.scatter(ycand_f[msk, 2], 2*ypred_f[msk, 2], marker=\".\", alpha=0.4)\n", + "plt.plot([0,100], [0,100], color=\"black\", ls=\"--\")" ] }, { @@ -247,7 +245,17 @@ "metadata": {}, "outputs": [], "source": [ - "thresholds = ret.x" + "thresholds = 0.0*ret.x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "involved-tobago", + "metadata": {}, + "outputs": [], + "source": [ + "thresholds = [0.5, 0.6, 0.45, 0.56, 0.2, 0.85, 0.19]" ] }, { @@ -538,7 +546,7 @@ "source": [ "plt.figure(figsize=(8, 8))\n", "ax = plt.axes()\n", - "plt.imshow(cm, cmap=\"Blues\")\n", + "plt.imshow(cm, cmap=\"Blues\", norm=matplotlib.colors.LogNorm())\n", "plt.colorbar()\n", "\n", "cms_label(x1=0.18, x2=0.52, y=0.82)\n", diff --git a/parameters/test-cms.yaml b/parameters/test-cms.yaml deleted file mode 100644 index eba673909..000000000 --- a/parameters/test-cms.yaml +++ /dev/null @@ -1,83 +0,0 @@ -backend: tensorflow - -dataset: - schema: cms - target_particles: cand - num_input_features: 15 - num_output_features: 7 - num_input_classes: 12 - #(none, ch.had, n.had, hfem, hfhad, gamma, e, mu) - num_output_classes: 8 - 
num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.001 - raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr/*.tfrecords - num_files_per_chunk: 5 - validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-6 - batch_size: 5 - num_events_train: 5 - num_events_test: 5 - num_epochs: 1 - num_val_files: 1 - dtype: float32 - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - input_encoding: cms - activation: elu - layernorm: no - hidden_dim: 128 - bin_size: 320 - distance_dim: 128 - dropout: 0.0 - graph_kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.1 - num_graph_layers: 1 - node_message: - type: GHConvDense - output_dim: 128 - activation: elu - normalize_degrees: yes - num_node_messages: 1 - regression_use_classification: yes - skip_connection: yes - debug: no - -timing: - num_ev: 1 - num_iter: 1 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes diff --git a/parameters/test-delphes.yaml b/parameters/test-delphes.yaml deleted file mode 100644 index 5bacba3c8..000000000 --- a/parameters/test-delphes.yaml +++ /dev/null @@ -1,83 +0,0 @@ -backend: tensorflow - -dataset: - schema: delphes - target_particles: gen - num_input_features: 12 - num_output_features: 7 - num_input_classes: 3 - num_output_classes: 6 - num_momentum_outputs: 5 - padded_num_elem_size: 6400 - classification_loss_coef: 1.0 - momentum_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 1.0 - eta_loss_coef: 1.0 - sin_phi_loss_coef: 1.0 - cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.001 - raw_path: data/pythia8_ttbar/*.pkl.bz2 - processed_path: data/pythia8_ttbar/tfr/*.tfrecords - num_files_per_chunk: 5 - validation_file_path: data/pythia8_ttbar/val/*.pkl.bz2 - -tensorflow: - eager: no - -setup: - train: yes - weights: - lr: 1e-6 - batch_size: 5 - num_events_train: 5 - num_events_test: 5 - num_epochs: 1 - num_val_files: 1 - dtype: float32 - trainable: all - classification_loss_type: categorical_cross_entropy - lr_schedule: exponentialdecay # exponentialdecay, onecycle - -sample_weights: - cls: none - charge: signal_only - pt: signal_only - eta: signal_only - sin_phi: signal_only - cos_phi: signal_only - energy: signal_only - -parameters: - model: gnn_dense - input_encoding: default - activation: elu - layernorm: no - hidden_dim: 128 - bin_size: 320 - distance_dim: 128 - dropout: 0.0 - graph_kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.1 - num_graph_layers: 1 - node_message: - type: GHConvDense - output_dim: 128 - activation: elu - normalize_degrees: yes - num_node_messages: 1 - regression_use_classification: yes - skip_connection: yes - debug: no - -timing: - num_ev: 1 - num_iter: 1 - -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes - diff --git a/scripts/test_load_tfmodel.py b/scripts/test_load_tfmodel.py index 754921e2a..76020bc3d 100644 --- a/scripts/test_load_tfmodel.py +++ b/scripts/test_load_tfmodel.py @@ -2,7 +2,7 @@ 
import sys import numpy as np -bin_size = 320 +bin_size = 160 num_features = 15 def load_graph(frozen_graph_filename): From dff2f1dde6061f0088a54fff82fd63270b6ca1f0 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 31 Aug 2021 11:48:41 +0300 Subject: [PATCH 136/157] update cms-dev model --- mlpf/tfmodel/model.py | 11 +++--- parameters/cms-dev.yaml | 77 +++++++++++++++++++++-------------------- 2 files changed, 46 insertions(+), 42 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index a40451eeb..58132546b 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -790,9 +790,10 @@ def set_trainable_named(self, layer_names): # msk2 = (pred_cls==icls) # import matplotlib # import matplotlib.pyplot as plt - # bins = np.linspace(0,6,100) # plt.figure(figsize=(4,4)) + # minval = np.min(y["energy"][msk1].numpy().flatten()) + # maxval = np.max(y["energy"][msk1].numpy().flatten()) # plt.scatter( # y["energy"][msk1&msk2].numpy().flatten(), # y_pred["energy"][msk1&msk2].numpy().flatten(), @@ -800,7 +801,7 @@ def set_trainable_named(self, layer_names): # ) # plt.xlabel("true") # plt.ylabel("pred") - # plt.plot([0,6], [0,6]) + # plt.plot([minval,maxval], [minval,maxval], color="black", ls="--", lw=1.0) # plt.savefig("train_cls{}_{}.png".format(icls, self.step), bbox_inches="tight") # plt.close("all") @@ -832,9 +833,10 @@ def set_trainable_named(self, layer_names): # msk2 = (pred_cls==icls) # import matplotlib # import matplotlib.pyplot as plt - # bins = np.linspace(0,6,100) # plt.figure(figsize=(4,4)) + # minval = np.min(y["energy"][msk1].numpy().flatten()) + # maxval = np.max(y["energy"][msk1].numpy().flatten()) # plt.scatter( # y["energy"][msk1&msk2].numpy().flatten(), # y_pred["energy"][msk1&msk2].numpy().flatten(), @@ -842,10 +844,11 @@ def set_trainable_named(self, layer_names): # ) # plt.xlabel("true") # plt.ylabel("pred") - # plt.plot([0,6], [0,6]) + # plt.plot([minval,maxval], [minval,maxval], color="black", ls="--", lw=1.0) # plt.savefig("test_cls{}_{}.png".format(icls, self.step), bbox_inches="tight") # plt.close("all") + # # Updates the metrics tracking the loss # self.compiled_loss(y, y_pred, sample_weights, regularization_losses=self.losses) # # Update the metrics. 
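The model.py change above only widens the upper clip applied to the raw energy regression output before it is exponentiated. A minimal sketch of that decoding step follows, assuming the helper name decode_energy, which is not part of the repository; with the previous bound the output saturated near exp(7) ~ 1.1 TeV, while the new bound of 8 allows up to exp(8) ~ 3.0 TeV, so very energetic candidates are truncated less aggressively.

import tensorflow as tf

def decode_energy(pred_log_energy, clip_low=-3.0, clip_high=8.0):
    # The energy head produces a log-scale value; clipping before the
    # exponential keeps the decoded energy finite and positive,
    # roughly 0.05 GeV to 3 TeV with the new upper bound of 8.
    return tf.math.exp(tf.clip_by_value(pred_log_energy, clip_low, clip_high))
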
diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index d541a3f58..9cb4a785b 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -24,22 +24,20 @@ dataset: #(pt, eta, sin phi, cos phi, E) num_momentum_outputs: 5 classification_loss_coef: 1.0 - charge_loss_coef: 1.0 - pt_loss_coef: 100.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 eta_loss_coef: 100.0 - sin_phi_loss_coef: 100.0 - cos_phi_loss_coef: 100.0 - energy_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* - processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_gen/*.tfrecords + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* energy_loss: type: Huber - delta: 1.0 pt_loss: type: Huber - delta: 1.0 sin_phi_loss: type: Huber delta: 0.1 @@ -57,7 +55,7 @@ setup: train: yes weights: weights_config: - lr: 1e-3 + lr: 1e-4 batch_size: 4 num_events_train: 80000 num_events_test: 9000 @@ -80,33 +78,36 @@ sample_weights: parameters: model: gnn_dense input_encoding: cms - activation: gelu - layernorm: yes - hidden_dim: 256 - bin_size: 32 - distance_dim: 8 - dropout: 0.0 - graph_kernel: - type: NodePairTrainableKernel - output_dim: 8 - hidden_dim: 32 - num_layers: 2 - activation: gelu - num_graph_layers: 3 - node_message: - type: NodeMessageLearnable - output_dim: 256 + combined_graph_layer: + bin_size: 32 + max_num_bins: 500 + distance_dim: 8 + layernorm: no + dropout: 0.2 + kernel: + type: NodePairTrainableKernel + output_dim: 8 + hidden_dim: 32 + num_layers: 2 + activation: gelu + num_node_messages: 1 + node_message: + type: NodeMessageLearnable + output_dim: 128 + activation: gelu + hidden_dim: 128 + num_layers: 2 + aggregation_direction: dst hidden_dim: 128 - num_layers: 2 activation: gelu - aggregation_direction: dst - num_node_messages: 1 + num_graph_layers_common: 2 + num_graph_layers_energy: 2 output_decoding: activation: gelu regression_use_classification: yes - dropout: 0.0 + dropout: 0.2 - pt_skip_gate: yes + pt_skip_gate: no eta_skip_gate: yes phi_skip_gate: yes @@ -124,13 +125,13 @@ parameters: phi_hidden_dim: 256 energy_hidden_dim: 256 - id_num_layers: 4 + id_num_layers: 2 charge_num_layers: 2 - pt_num_layers: 3 - eta_num_layers: 3 - phi_num_layers: 3 - energy_num_layers: 3 - layernorm: yes + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no mask_reg_cls0: no skip_connection: yes @@ -141,6 +142,6 @@ timing: num_iter: 3 exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 + decay_steps: 1000 + decay_rate: 0.98 staircase: yes From 7cb3d490a2ed7a7ca884bfd7c6d9c62caaa128b7 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 31 Aug 2021 13:05:30 +0300 Subject: [PATCH 137/157] update cms-dev --- parameters/cms-dev.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 9cb4a785b..2a7fed42b 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -2,7 +2,7 @@ backend: tensorflow dataset: schema: cms - target_particles: gen + target_particles: cand num_input_features: 15 num_output_features: 7 # NONE = 0, @@ -58,7 +58,7 @@ setup: lr: 1e-4 batch_size: 4 num_events_train: 80000 - num_events_test: 9000 + num_events_test: 10000 num_epochs: 100 num_val_files: 10 dtype: float32 From 13644f5568e05789eb264d590f0d1184a2b8e617 Mon Sep 17 00:00:00 2001 From: Eric Wulff 
Date: Tue, 31 Aug 2021 12:41:12 +0200 Subject: [PATCH 138/157] fix: raytune-analysis plots include val_loss --- mlpf/pipeline.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 00625b5f5..a6cb9c8ff 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -487,9 +487,10 @@ def func(key): s += "{}={}; ".format(hp, result["config/{}".format(hp)].values[0]) return s -def plot_ray_analysis(analysis, save=False): +def plot_ray_analysis(analysis, save=False, skip=0): to_plot = [ - 'adam_beta_1', 'charge_loss', 'cls_acc_unweighted', 'cls_loss', + #'adam_beta_1', + 'charge_loss', 'cls_acc_unweighted', 'cls_loss', 'cos_phi_loss', 'energy_loss', 'eta_loss', 'learning_rate', 'loss', 'pt_loss', 'sin_phi_loss', 'val_charge_loss', 'val_cls_acc_unweighted', 'val_cls_acc_weighted', 'val_cls_loss', @@ -502,12 +503,10 @@ def plot_ray_analysis(analysis, save=False): for key in tqdm(dfs.keys(), desc="Creating Ray analysis plots", total=len(dfs.keys())): result = result_df[result_df["logdir"] == key] - fig, axs = plt.subplots(4, 4, figsize=(12, 9), tight_layout=True) - for ax in axs.flat: - ax.label_outer() - + fig, axs = plt.subplots(5, 4, figsize=(12, 9), tight_layout=True) for var, ax in zip(to_plot, axs.flat): - ax.plot(dfs[key].index.values, dfs[key][var], alpha=0.8) + # Skip first `skip` values so loss plots don't include the very large losses which occur at start of training + ax.plot(dfs[key].index.values[skip:], dfs[key][var][skip:], alpha=0.8) ax.set_xlabel("Epoch") ax.set_ylabel(var) ax.grid(alpha=0.3) @@ -515,6 +514,7 @@ def plot_ray_analysis(analysis, save=False): if save: plt.savefig(key + "/trial_summary.jpg") + plt.close() if not save: plt.show() else: @@ -582,7 +582,7 @@ def raytune(config, name, local, cpus, gpus, tune_result_dir): ) print("Best hyperparameters found were: ", analysis.get_best_config("val_loss", "min")) - plot_ray_analysis(analysis, save=True) + plot_ray_analysis(analysis, save=True, skip=20) ray.shutdown() @@ -590,9 +590,10 @@ def raytune(config, name, local, cpus, gpus, tune_result_dir): @click.help_option("-h", "--help") @click.option("-d", "--exp_dir", help="experiment dir", type=click.Path()) @click.option("-s", "--save", help="save plots in trial dirs", is_flag=True) -def raytune_analysis(exp_dir, save): - analysis = Analysis(exp_dir) - plot_ray_analysis(analysis, save=save) +@click.option("-k", "--skip", help="skip first values to avoid large losses at start of training", type=int) +def raytune_analysis(exp_dir, save, skip): + analysis = Analysis(exp_dir, default_metric="val_loss", default_mode="min") + plot_ray_analysis(analysis, save=save, skip=skip) if __name__ == "__main__": From 515c1497cc3dbe24789c83d92d1ab3e820a8f067 Mon Sep 17 00:00:00 2001 From: Farouk Date: Tue, 31 Aug 2021 09:26:41 -0700 Subject: [PATCH 139/157] remove lrp part from local script --- scripts/local_test_delphes_pytorch.sh | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/scripts/local_test_delphes_pytorch.sh b/scripts/local_test_delphes_pytorch.sh index 134cd9fcd..4d0fa84be 100755 --- a/scripts/local_test_delphes_pytorch.sh +++ b/scripts/local_test_delphes_pytorch.sh @@ -14,7 +14,7 @@ mkdir -p data/pythia8_qcd mkdir -p data/pythia8_qcd/raw mkdir -p data/pythia8_qcd/processed -# download 2 files for training/validation +#download 2 files for training/validation cd data/pythia8_ttbar/raw echo Downloading the training/validation data files.. 
wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_0.pkl.bz2 @@ -22,7 +22,7 @@ wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14 bzip2 -d * cd ../../.. -# download 1 file for testing +#download 1 file for testing cd data/pythia8_qcd/raw echo Downloading the testing data files.. wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_qcd_10_0.pkl.bz2 @@ -39,7 +39,7 @@ echo Processing the testing data files.. python3 ../mlpf/pytorch_delphes/graph_data_delphes.py --dataset data/pythia8_qcd/ \ --processed_dir data/pythia8_qcd/processed --num-files-merge 1 --num-proc 1 -# before training a model, first get rid of any previous models stored +#before training a model, first get rid of any previous models stored rm -Rf experiments/* cd ../mlpf/ @@ -53,10 +53,11 @@ python3 pytorch_pipeline.py \ --outpath='../test_tmp_delphes/experiments' echo Finished the training.. -echo Begining the LRP machinery.. -python3 lrp_pipeline.py \ - --n_test=1 --batch_size=4 \ - --lrp_dataset_qcd='../test_tmp_delphes/data/pythia8_qcd' \ - --lrp_outpath='../test_tmp_delphes/experiments/' \ - --lrp_load_model='PFNet7_gen_ntrain_1_nepochs_10_batch_size_4_lr_0.0001_alpha_0.0002_both__nn1_nn3' - --lrp_load_epoch=9 +# #to run lrp uncomment the next few lines (note: lrp requires huge amounts of memory ~128Gi) +# echo Begining the LRP machinery.. +# python3 lrp_pipeline.py \ +# --n_test=1 --batch_size=4 \ +# --lrp_dataset_qcd='../test_tmp_delphes/data/pythia8_qcd' \ +# --lrp_outpath='../test_tmp_delphes/experiments/' \ +# --lrp_load_model='PFNet7_gen_ntrain_1_nepochs_10_batch_size_4_lr_0.0001_alpha_0.0002_both__nn1_nn3' +# --lrp_load_epoch=9 From 9ce68d092ddee7bbfb8b4fe83cfec6e546908bd2 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 31 Aug 2021 22:56:06 +0300 Subject: [PATCH 140/157] add dist activation as configurable --- mlpf/tfmodel/model.py | 3 ++- parameters/cms-dev.yaml | 35 ++++++++++++++++------------------- parameters/cms.yaml | 1 + 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 58132546b..f6c90a0fd 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -615,6 +615,7 @@ def __init__(self, *args, **kwargs): self.node_message = kwargs.pop("node_message") self.hidden_dim = kwargs.pop("hidden_dim") self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) + self.dist_activation = getattr(tf.keras.activations, kwargs.pop("dist_activation")) if self.do_layernorm: self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6, name=kwargs.get("name")+"_layernorm") @@ -648,7 +649,7 @@ def call(self, x, msk, training=False): x = self.layernorm(x, training=training) #compute node features for graph building - x_dist = self.activation(self.ffn_dist(x, training=training)) + x_dist = self.dist_activation(self.ffn_dist(x, training=training)) #x_dist = self.gaussian_noise(x_dist, training=training) #compute the element-to-element messages / distance matrix / graph structure diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 2a7fed42b..58578bce8 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -55,8 +55,8 @@ setup: train: yes weights: weights_config: - lr: 1e-4 - batch_size: 4 + lr: 1e-3 + batch_size: 5 num_events_train: 80000 num_events_test: 10000 num_epochs: 100 @@ -79,33 +79,30 @@ parameters: model: gnn_dense input_encoding: cms combined_graph_layer: - bin_size: 32 - 
max_num_bins: 500 - distance_dim: 8 + bin_size: 640 + max_num_bins: 100 + distance_dim: 128 layernorm: no - dropout: 0.2 + dropout: 0.0 + dist_activation: linear kernel: - type: NodePairTrainableKernel - output_dim: 8 - hidden_dim: 32 - num_layers: 2 - activation: gelu + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 num_node_messages: 1 node_message: - type: NodeMessageLearnable + type: GHConvDense output_dim: 128 activation: gelu - hidden_dim: 128 - num_layers: 2 - aggregation_direction: dst + normalize_degrees: yes hidden_dim: 128 activation: gelu - num_graph_layers_common: 2 - num_graph_layers_energy: 2 + num_graph_layers_common: 3 + num_graph_layers_energy: 3 output_decoding: activation: gelu regression_use_classification: yes - dropout: 0.2 + dropout: 0.0 pt_skip_gate: no eta_skip_gate: yes @@ -125,7 +122,7 @@ parameters: phi_hidden_dim: 256 energy_hidden_dim: 256 - id_num_layers: 2 + id_num_layers: 3 charge_num_layers: 2 pt_num_layers: 2 eta_num_layers: 2 diff --git a/parameters/cms.yaml b/parameters/cms.yaml index c0f5d5258..07bfb8d08 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -84,6 +84,7 @@ parameters: distance_dim: 128 layernorm: no dropout: 0.0 + dist_activation: gelu kernel: type: NodePairGaussianKernel dist_mult: 0.1 From a2356f36367417eec38c6930c07cf92f90c15b7a Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Wed, 1 Sep 2021 11:27:29 +0300 Subject: [PATCH 141/157] more monitoring --- mlpf/tfmodel/data.py | 2 +- mlpf/tfmodel/model.py | 34 +++- mlpf/tfmodel/model_setup.py | 118 +++++++++++- notebooks/cms-mlpf.ipynb | 358 ++++++++++++++++++++++++++++-------- notebooks/cmssw.ipynb | 4 +- parameters/cms-dev.yaml | 20 +- parameters/cms.yaml | 4 + 7 files changed, 447 insertions(+), 93 deletions(-) diff --git a/mlpf/tfmodel/data.py b/mlpf/tfmodel/data.py index d89786b8b..b9e0c0f57 100644 --- a/mlpf/tfmodel/data.py +++ b/mlpf/tfmodel/data.py @@ -237,7 +237,7 @@ def serialize_chunk(self, path, files, ichunk): Xs = np.concatenate(Xs) ys = np.concatenate(ys) - #set weights for each sample to be equal to the number of samples of this type + #set weights for each sample to be equal to the number of target particles of this type #in the training script, this can be used to compute either inverse or class-balanced weights uniq_vals, uniq_counts = np.unique(np.concatenate([y[:, 0] for y in ys]), return_counts=True) for i in range(len(ys)): diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index f6c90a0fd..d835deab1 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -601,6 +601,16 @@ def set_trainable_regression(self): self.ffn_eta.trainable = False self.ffn_pt.trainable = False self.ffn_energy.trainable = True + self.ffn_energy_classwise.trainable = True + + def set_trainable_classification(self): + self.ffn_id.trainable = True + self.ffn_charge.trainable = True + self.ffn_phi.trainable = False + self.ffn_eta.trainable = False + self.ffn_pt.trainable = False + self.ffn_energy.trainable = False + self.ffn_energy_classwise.trainable = False class CombinedGraphLayer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): @@ -670,6 +680,10 @@ def call(self, x, msk, training=False): class PFNetDense(tf.keras.Model): def __init__(self, + do_node_encoding=False, + hidden_dim=128, + dropout=0.0, + activation="gelu", multi_output=False, num_input_classes=8, num_output_classes=3, @@ -690,6 +704,21 @@ def __init__(self, self.debug = debug self.skip_connection = skip_connection + + self.do_node_encoding = do_node_encoding + 
self.hidden_dim = hidden_dim + self.dropout = dropout + self.activation = getattr(tf.keras.activations, activation) + + if self.do_node_encoding: + self.node_encoding = point_wise_feed_forward_network( + self.hidden_dim, + self.hidden_dim, + "node_encoding", + num_layers=2, + activation=self.activation, + dropout=self.dropout + ) if input_encoding == "cms": self.enc = InputEncodingCMS(num_input_classes) @@ -714,10 +743,13 @@ def call(self, inputs, training=False): #encode the elements for classification (id) enc = self.enc(X) - enc_cg = enc + encs = [] if self.skip_connection: encs.append(enc) + enc_cg = enc + if self.do_node_encoding: + enc_cg = self.node_encoding(enc_cg, training=training) for cg in self.cg: enc_all = cg(enc_cg, msk, training=training) enc_cg = enc_all["enc"] diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index e42fdbc18..a44d60a98 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -21,6 +21,7 @@ import random import math import platform +import mplhep from tqdm import tqdm from pathlib import Path from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler @@ -180,7 +181,7 @@ def plot_event_visualization(self, epoch, outpath, ypred, ypred_id, msk, ievent= if self.comet_experiment: self.comet_experiment.log_image(image_path, step=epoch) - def plot_reg_distribution(self, outpath, ypred, ypred_id, icls, reg_variable): + def plot_reg_distribution(self, epoch, outpath, ypred, ypred_id, icls, reg_variable): if icls==0: vals_pred = ypred[reg_variable][ypred_id!=icls].flatten() @@ -203,8 +204,11 @@ def plot_reg_distribution(self, outpath, ypred, ypred_id, icls, reg_variable): plt.ylabel("Number of particles") plt.legend(loc="best") plt.title("Regression output, cls {}".format(icls)) - plt.savefig(str(outpath / "{}_cls{}.png".format(reg_variable, icls)), bbox_inches="tight") + image_path = str(outpath / "{}_cls{}.png".format(reg_variable, icls)) + plt.savefig(image_path, bbox_inches="tight") plt.close("all") + if self.comet_experiment: + self.comet_experiment.log_image(image_path, step=epoch) def plot_corr(self, epoch, outpath, ypred, ypred_id, icls, reg_variable): @@ -219,13 +223,13 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, icls, reg_variable): loss = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE) loss_vals = loss(np.expand_dims(vals_true, -1), np.expand_dims(vals_pred, axis=-1)).numpy() - #save correlation histogram + #save scatterplot of raw values plt.figure() bins = self.reg_bins[reg_variable] if bins is None: bins = 100 - plt.hist2d(vals_true, vals_pred, bins=(bins, bins), cmap="Blues") - plt.colorbar() + plt.scatter(vals_true, vals_pred, marker=".", alpha=0.4) + if len(vals_true) > 0: minval = np.min(vals_true) maxval = np.max(vals_true) @@ -278,6 +282,91 @@ def plot_corr(self, epoch, outpath, ypred, ypred_id, icls, reg_variable): self.comet_experiment.log_metric('residual_{}_cls{}_std'.format(reg_variable, icls), np.std(residual), step=epoch) self.comet_experiment.log_metric('val_loss_{}_cls{}'.format(reg_variable, icls), np.sum(loss_vals), step=epoch) + def plot_elem_to_pred(self, epoch, cp_dir, msk, ypred_id): + X_id = self.X[msk][:, 0] + max_elem = int(np.max(X_id)) + cand_id = self.ytrue_id[msk] + pred_id = ypred_id[msk] + cm1 = sklearn.metrics.confusion_matrix(X_id, cand_id, labels=range(max_elem)) + cm2 = sklearn.metrics.confusion_matrix(X_id, pred_id, labels=range(max_elem)) + + plt.figure(figsize=(10,4)) + + ax = plt.subplot(1,2,1) + 
plt.title("Targets") + plt.imshow(cm1, cmap="Blues", norm=matplotlib.colors.LogNorm()) + plt.xticks(range(12)); + plt.yticks(range(12)); + plt.xlabel("Particle id") + plt.ylabel("PFElement id") + plt.colorbar() + + ax = plt.subplot(1,2,2) + plt.title("Predictions") + plt.imshow(cm2, cmap="Blues", norm=matplotlib.colors.LogNorm()) + plt.xticks(range(12)); + plt.yticks(range(12)); + plt.xlabel("Particle id") + plt.ylabel("PFElement id") + plt.colorbar() + + image_path = str(cp_dir / "elem_to_pred.png") + plt.savefig(image_path, bbox_inches="tight") + + if self.comet_experiment: + self.comet_experiment.log_image(image_path, step=epoch) + + def plot_eff_and_fake_rate( + self, + epoch, + icls, + msk, + ypred_id, + cp_dir, + ivar=4, + bins=np.linspace(-3,6,100), + xlabel="PFElement log[E/GeV]", + log=True + ): + + values = self.X[msk][:, ivar] + cand_id = self.ytrue_id[msk] + pred_id = ypred_id[msk] + + if log: + values = np.log(values) + + hist_cand = np.histogram(values[(cand_id==icls)], bins=bins); + hist_cand_true = np.histogram(values[(cand_id==icls) & (pred_id==icls)], bins=bins); + + hist_pred = np.histogram(values[(pred_id==icls)], bins=bins); + hist_pred_fake = np.histogram(values[(cand_id!=icls) & (pred_id==icls)], bins=bins); + + eff = hist_cand_true[0]/hist_cand[0] + fake = hist_pred_fake[0]/hist_pred[0] + + plt.figure(figsize=(8,8)) + ax = plt.subplot(2,1,1) + mplhep.histplot(hist_cand, label="PF") + mplhep.histplot(hist_pred, label="MLPF") + plt.legend() + plt.xlabel(xlabel) + plt.ylabel("Number of particles") + + ax = plt.subplot(2,1,2, sharex=ax) + mplhep.histplot(eff, bins=hist_cand[1], label="efficiency", color="black") + mplhep.histplot(fake, bins=hist_cand[1], label="fake rate", color="red") + plt.legend(frameon=False) + plt.ylim(0,1.4) + plt.xlabel(xlabel) + plt.ylabel("Fraction of particles / bin") + + image_path = str(cp_dir / "eff_fake_cls{}.png".format(icls)) + plt.savefig(image_path, bbox_inches="tight") + + if self.comet_experiment: + self.comet_experiment.log_image(image_path, step=epoch) + def on_epoch_end(self, epoch, logs=None): #save the training logs (losses) for this epoch @@ -302,6 +391,8 @@ def on_epoch_end(self, epoch, logs=None): #exclude padded elements from the plotting msk = self.X[:, :, 0] != 0 + self.plot_elem_to_pred(epoch, cp_dir, msk, ypred_id) + self.plot_cm(epoch, cp_dir, ypred_id, msk) for ievent in range(min(5, self.X.shape[0])): self.plot_event_visualization(epoch, cp_dir, ypred, ypred_id, msk, ievent=ievent) @@ -309,8 +400,12 @@ def on_epoch_end(self, epoch, logs=None): for icls in range(self.num_output_classes): cp_dir_cls = cp_dir / "cls_{}".format(icls) cp_dir_cls.mkdir(parents=True, exist_ok=True) + + if icls!=0: + self.plot_eff_and_fake_rate(epoch, icls, msk, ypred_id, cp_dir_cls) + for variable in ["pt", "eta", "sin_phi", "cos_phi", "energy"]: - self.plot_reg_distribution(cp_dir_cls, ypred, ypred_id, icls, variable) + self.plot_reg_distribution(epoch, cp_dir_cls, ypred, ypred_id, icls, variable) self.plot_corr(epoch, cp_dir_cls, ypred, ypred_id, icls, variable) np.savez(str(cp_dir/"pred.npz"), X=self.X, ytrue=self.y, **ypred) @@ -395,6 +490,10 @@ def make_model(config, dtype): def make_gnn_dense(config, dtype): parameters = [ + "do_node_encoding", + "hidden_dim", + "dropout", + "activation", "num_graph_layers_common", "num_graph_layers_energy", "input_encoding", @@ -548,6 +647,13 @@ def configure_model_weights(model, trainable_layers): cg.trainable = True model.output_dec.set_trainable_regression() + elif trainable_layers == 
"classification": + for cg in model.cg: + cg.trainable = True + for cg in model.cg_energy: + cg.trainable = False + + model.output_dec.set_trainable_classification() else: if isinstance(trainable_layers, str): trainable_layers = [trainable_layers] diff --git a/notebooks/cms-mlpf.ipynb b/notebooks/cms-mlpf.ipynb index b77607840..c78738457 100644 --- a/notebooks/cms-mlpf.ipynb +++ b/notebooks/cms-mlpf.ipynb @@ -130,7 +130,7 @@ "metadata": {}, "outputs": [], "source": [ - "path = \"../experiments/cms_20210830_181309_198541.joosep-desktop-work/evaluation/\"" + "path = \"../experiments/cms-dev_20210831_225815_541048.gpu0.local/evaluation/\"" ] }, { @@ -167,150 +167,358 @@ "ygen_f = ygen.reshape((ygen.shape[0]*ygen.shape[1], ygen.shape[2]))\n", "ycand_f = ycand.reshape((ycand.shape[0]*ycand.shape[1], ycand.shape[2]))\n", "ypred_f = ypred.reshape((ypred.shape[0]*ypred.shape[1], ypred.shape[2]))\n", - "ypred_raw_f = ypred_raw.reshape((ypred_raw.shape[0]*ypred_raw.shape[1], ypred_raw.shape[2]))" + "\n", + "# ypred_raw[X[:, :, 0]==1, 6] = 0.0\n", + "\n", + "# ypred_raw[X[:, :, 0]==4, 1] = 0.0\n", + "# #ypred_raw[X[:, :, 0]==4, 6] = 0.0\n", + "# ypred_raw[X[:, :, 0]==5, 1] = 0.0\n", + "# ypred_raw[X[:, :, 0]==5, 7] = 0.0\n", + "\n", + "# ypred_raw[X[:, :, 0]==8, 1] = 0.0\n", + "# ypred_raw[X[:, :, 0]==9, 1] = 0.0\n", + "\n", + "# ypred_raw[X[:, :, 0]==8, 2] = 0.0\n", + "# ypred_raw[X[:, :, 0]==9, 2] = 0.0\n", + "\n", + "ypred_raw_f = ypred_raw.reshape((ypred_raw.shape[0]*ypred_raw.shape[1], ypred_raw.shape[2]))\n", + "\n", + "ypred_id = np.argmax(ypred_raw, axis=-1)\n", + "\n", + "ypred_id_f = ypred_id.flatten()" ] }, { "cell_type": "code", "execution_count": null, - "id": "opened-lyric", + "id": "floral-people", "metadata": {}, "outputs": [], "source": [ - "icls = 2\n", - "msk = (ycand_f[:, 0]==icls) & (ypred_f[:, 0]==icls)\n", - "plt.scatter(ycand_f[msk, 2], 2*ypred_f[msk, 2], marker=\".\", alpha=0.4)\n", - "plt.plot([0,100], [0,100], color=\"black\", ls=\"--\")" + "np.unique(ypred_id[X[:, :, 0]==4], return_counts=True)" ] }, { "cell_type": "code", "execution_count": null, - "id": "linear-eleven", + "id": "cooked-bullet", "metadata": {}, "outputs": [], "source": [ - "np.std(ycand_f[ycand_f[:, 0]!=0, 4])" + "# thresholds = [0.6, 0.7, 0.0, 0, 0, 0, 0]\n", + "# ypred_id = apply_thresholds(ypred_raw, thresholds)\n", + "# ypred_id_f = apply_thresholds_f(ypred_raw_f, thresholds)" ] }, { "cell_type": "code", "execution_count": null, - "id": "based-wrestling", + "id": "virgin-nicaragua", "metadata": {}, "outputs": [], "source": [ - "plt.hist(np.log(ycand_f[ycand_f[:, 0]!=0, 6]), bins=100);" + "for icls in range(1,8):\n", + " npred = np.sum(ypred_id == icls, axis=1)\n", + " ncand = np.sum(ycand[:, :, 0] == icls, axis=1)\n", + " plt.figure(figsize=(6,6))\n", + " plt.scatter(ncand, npred, marker=\".\", alpha=0.8)\n", + " a = 0.5*min(np.min(npred), np.min(ncand))\n", + " b = 1.5*max(np.max(npred), np.max(ncand))\n", + " plt.xlim(a,b)\n", + " plt.ylim(a,b)\n", + " plt.plot([a,b],[a,b], color=\"black\", ls=\"--\")\n", + " plt.title(pid_names_long[icls],y=1.05)\n", + " plt.xlabel(\"number of PFCandidates\")\n", + " plt.ylabel(\"number of MLPFCandidates\")\n", + " cms_label(x2=0.6, y=0.89)\n", + " plt.savefig(\"num_cls{}.pdf\".format(icls))\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "documented-savage", + "id": "reported-button", "metadata": {}, "outputs": [], "source": [ - "glob_iter = 0\n", - "def multiplicity_score(thresholds):\n", - " global glob_iter\n", - " ypred_id = 
apply_thresholds(ypred_raw, thresholds)\n", - " total_scores = []\n", - " for icls in range(1,8):\n", - " ntrue = np.sum((ycand[:, :, 0]==icls)*msk_X, axis=1)\n", - " npred = np.sum((ypred_id==icls)*msk_X, axis=1)\n", - " diff = np.sqrt(np.sum((ntrue-npred)**2))/np.mean(ntrue)\n", - " total_scores.append(diff)\n", - " #print(\" \", icls, np.mean(ntrue), np.mean(npred), diff)\n", - " glob_iter += 1\n", - " if glob_iter%10 == 0:\n", - " print(glob_iter, np.sum(total_scores))\n", - " print(\",\\t\".join([\"{:.2f}\".format(x) for x in thresholds]))\n", - " print(\",\\t\".join([\"{:.2f}\".format(x) for x in total_scores]))\n", - " return np.sum(total_scores)\n", + "energy_bins_classwise = {\n", + " 1: [-2, 5],\n", + " 2: [-2, 6],\n", + " 3: [1, 7],\n", + " 4: [2, 5],\n", + " 5: [2, 5],\n", + " 6: [2, 5],\n", + " 7: [2, 5],\n", + "}\n", "\n", - "ret = scipy.optimize.minimize(\n", - " multiplicity_score,\n", - " 0.5*np.ones(7),\n", - " tol=1e-5,\n", - " method=\"Powell\",\n", - " bounds=[(0,1) for i in range(7)],\n", - " #options={\"ftol\": 1e-6, \"disp\":True}\n", - ")" + "energy_correction_factors = {\n", + " 1: [1, 1],\n", + " 2: [1, 1],\n", + " 3: [1.0, 1.2],\n", + " 4: [1, 1],\n", + " 5: [1, 1],\n", + " 6: [1, 1],\n", + " 7: [1, 1],\n", + "}" ] }, { "cell_type": "code", "execution_count": null, - "id": "resistant-abraham", + "id": "chronic-discovery", "metadata": {}, "outputs": [], "source": [ - "thresholds = 0.0*ret.x" + "b = np.linspace(0,1,101)\n", + "plt.figure(figsize=(4,4))\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==1) & (ycand_f[:, 0]==0), 1], bins=b, histtype=\"step\", lw=2, label=\"no PFCandidate\", density=True);\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==1) & (ycand_f[:, 0]==1), 1], bins=b, histtype=\"step\", lw=2, label=\"charged PFCandidate\", density=True);\n", + "plt.legend(loc=2, frameon=False)\n", + "plt.xlabel(\"Charged hadron probability\")\n", + "plt.title(\"Tracks\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "involved-tobago", + "id": "multiple-disco", "metadata": {}, "outputs": [], "source": [ - "thresholds = [0.5, 0.6, 0.45, 0.56, 0.2, 0.85, 0.19]" + "b = np.linspace(0,1,101)\n", + "plt.figure(figsize=(4,4))\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==1) & (ycand_f[:, 0]==0), 0], bins=b, histtype=\"step\", lw=2, label=\"no PFCandidate\", density=True);\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==1) & (ycand_f[:, 0]==1), 0], bins=b, histtype=\"step\", lw=2, label=\"charged PFCandidate\", density=True);\n", + "plt.legend(loc=1, frameon=False)\n", + "plt.xlabel(\"No particle probability\")\n", + "plt.title(\"Tracks\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "complex-difficulty", + "id": "innocent-black", "metadata": {}, "outputs": [], "source": [ - "ypred_id = apply_thresholds(ypred_raw, thresholds)\n", - "ypred_id_f = apply_thresholds_f(ypred_raw_f, thresholds)" + "b = np.linspace(0,1,101)\n", + "plt.figure(figsize=(4,4))\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==5) & (ycand_f[:, 0]==0), 2], bins=b, histtype=\"step\", lw=2, label=\"no PFCandidate\", density=True);\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==5) & (ycand_f[:, 0]==2), 2], bins=b, histtype=\"step\", lw=2, label=\"neutral PFCandidate\", density=True);\n", + "plt.legend(loc=2, frameon=False)\n", + "plt.xlabel(\"Neutral probability\")\n", + "plt.title(\"HCAL clusters\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "patient-thunder", + "id": "flying-mason", "metadata": {}, "outputs": [], "source": [ - "sklearn.metrics.balanced_accuracy_score(ycand_f[msk_X_f, 0], ypred_f[:, 
0][msk_X_f])" + "b = np.linspace(0,1,101)\n", + "plt.figure(figsize=(4,4))\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==5) & (ycand_f[:, 0]==0), 0], bins=b, histtype=\"step\", lw=2, label=\"no PFCandidate\", density=True);\n", + "plt.hist(ypred_raw_f[(X_f[:, 0]==5) & (ycand_f[:, 0]==2), 0], bins=b, histtype=\"step\", lw=2, label=\"neutral PFCandidate\", density=True);\n", + "plt.legend(loc=\"best\", frameon=False)\n", + "plt.xlabel(\"No particle probability\")\n", + "plt.title(\"HCAL clusters\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "revised-pollution", + "id": "equipped-subcommittee", "metadata": {}, "outputs": [], "source": [ - "sklearn.metrics.balanced_accuracy_score(ycand_f[msk_X_f, 0], ypred_id_f[msk_X_f])" + "elem_type = 5\n", + "icls = 2\n", + "\n", + "def plot_elem_energy_cls_prob(elem_type):\n", + " plt.figure(figsize=(4*5,2*4))\n", + " plt.suptitle(\"PFElement type {}\".format(elem_type))\n", + " \n", + " for icls in range(8):\n", + " plt.subplot(2,4,icls+1)\n", + " plt.hist2d(\n", + " np.log(X_f[X_f[:, 0]==elem_type, 4]),\n", + " ypred_raw_f[X_f[:, 0]==elem_type, icls],\n", + " bins=(np.linspace(-2,6,100), np.linspace(0,1,100)), cmap=\"Blues\");\n", + " plt.colorbar()\n", + " plt.xlabel(\"PFElement log[E/GeV]\")\n", + " plt.ylabel(\"MLPF probability for class {}\".format(icls))\n", + " plt.tight_layout()" ] }, { "cell_type": "code", "execution_count": null, - "id": "virgin-nicaragua", + "id": "worst-coating", "metadata": {}, "outputs": [], "source": [ - "for icls in range(1,8):\n", - " npred = np.sum(ypred_id == icls, axis=1)\n", - " ncand = np.sum(ycand[:, :, 0] == icls, axis=1)\n", - " plt.figure(figsize=(6,6))\n", - " plt.scatter(ncand, npred, marker=\".\", alpha=0.8)\n", - " a = 0.5*min(np.min(npred), np.min(ncand))\n", - " b = 1.5*max(np.max(npred), np.max(ncand))\n", - " plt.xlim(a,b)\n", - " plt.ylim(a,b)\n", - " plt.plot([a,b],[a,b], color=\"black\", ls=\"--\")\n", - " plt.title(pid_names_long[icls],y=1.05)\n", - " plt.xlabel(\"number of PFCandidates\")\n", - " plt.ylabel(\"number of MLPFCandidates\")\n", - " cms_label(x2=0.6, y=0.89)\n", - " plt.savefig(\"num_cls{}.pdf\".format(icls))\n" + "plot_elem_energy_cls_prob(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "responsible-georgia", + "metadata": {}, + "outputs": [], + "source": [ + "plot_elem_energy_cls_prob(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "noble-guess", + "metadata": {}, + "outputs": [], + "source": [ + "plot_elem_energy_cls_prob(4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "honest-tackle", + "metadata": {}, + "outputs": [], + "source": [ + "reco_label = X_f[X_f[:, 0]!=0, 0]\n", + "cand_label = ycand_f[X_f[:, 0]!=0, 0]\n", + "pred_label = ypred_id_f[X_f[:, 0]!=0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fourth-approval", + "metadata": {}, + "outputs": [], + "source": [ + "cm1 = sklearn.metrics.confusion_matrix(reco_label, cand_label, labels=range(12))\n", + "cm2 = sklearn.metrics.confusion_matrix(reco_label, pred_label, labels=range(12))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abroad-wallpaper", + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(cm1, cmap=\"Blues\", norm=matplotlib.colors.LogNorm())\n", + "plt.xticks(range(12));\n", + "plt.yticks(range(12));\n", + "plt.colorbar()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "honest-runner", + "metadata": {}, + "outputs": [], + "source": [ + 
"plt.imshow(cm2, cmap=\"Blues\", norm=matplotlib.colors.LogNorm())\n", + "plt.xticks(range(12));\n", + "plt.yticks(range(12));\n", + "plt.colorbar()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "incorporated-prerequisite", + "metadata": {}, + "outputs": [], + "source": [ + "ycand_id_f = ycand_f[:, 0]\n", + "\n", + "b = np.linspace(-3,6,100)\n", + "\n", + "icls = 2\n", + "\n", + "def plot_eff_and_fake_rate(\n", + " icls,\n", + " ivar=4,\n", + " bins=np.linspace(-3,6,100),\n", + " xlabel=\"PFElement log[E/GeV]\", log=True\n", + " ):\n", + " \n", + " values = X_f[:, ivar]\n", + " if log:\n", + " values = np.log(values)\n", + " \n", + " hist_cand = np.histogram(values[(ycand_id_f==icls)], bins=bins);\n", + " hist_cand_true = np.histogram(values[(ycand_id_f==icls) & (ypred_id_f==icls)], bins=bins);\n", + "\n", + " hist_pred = np.histogram(values[(ypred_id_f==icls)], bins=bins);\n", + " hist_pred_fake = np.histogram(values[(ycand_id_f!=icls) & (ypred_id_f==icls)], bins=bins);\n", + "\n", + " eff = hist_cand_true[0]/hist_cand[0]\n", + " fake = hist_pred_fake[0]/hist_pred[0]\n", + "\n", + " plt.figure(figsize=(8,8))\n", + " ax = plt.subplot(2,1,1)\n", + " mplhep.histplot(hist_cand, label=\"PF\")\n", + " mplhep.histplot(hist_pred, label=\"MLPF\")\n", + " plt.legend()\n", + " plt.xlabel(xlabel)\n", + " plt.ylabel(\"Number of particles\")\n", + "\n", + " ax = plt.subplot(2,1,2, sharex=ax)\n", + " mplhep.histplot(eff, bins=hist_cand[1], label=\"efficiency\", color=\"black\")\n", + " mplhep.histplot(fake, bins=hist_cand[1], label=\"fake rate\", color=\"red\")\n", + " plt.legend(frameon=False)\n", + " plt.ylim(0,1.4)\n", + " plt.xlabel(xlabel)\n", + " plt.ylabel(\"Fraction of particles / bin\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dedicated-indonesia", + "metadata": {}, + "outputs": [], + "source": [ + "plot_eff_and_fake_rate(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "variable-potter", + "metadata": {}, + "outputs": [], + "source": [ + "plot_eff_and_fake_rate(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hybrid-chuck", + "metadata": {}, + "outputs": [], + "source": [ + "plot_eff_and_fake_rate(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "golden-catalyst", + "metadata": {}, + "outputs": [], + "source": [ + "plot_eff_and_fake_rate(4)" ] }, { @@ -479,8 +687,8 @@ " msk = (ycand_f[:, 0] == icls)\n", " plt.hist(ypred_raw_f[msk & (X_f[:, 0] != 0), icls], bins=100, density=1, histtype=\"step\", lw=2, color=\"blue\", label=\"true \"+pid_names[icls]);\n", " plt.hist(ypred_raw_f[~msk & (X_f[:, 0] != 0), icls], bins=100, density=1, histtype=\"step\", lw=2, color=\"red\", label=\"other particles\");\n", - " #plt.axvline(ret.x[icls-1], 0, 0.7, ls=\"--\",\n", - " # color=\"black\", label=\"threshold: {:.2f}\".format(ret.x[icls-1]), lw=1)\n", + " plt.axvline(ret.x[icls-1], 0, 0.7, ls=\"--\",\n", + " color=\"black\", label=\"threshold: {:.2f}\".format(ret.x[icls-1]), lw=1)\n", " plt.yscale(\"log\")\n", " plt.title(\"Particle reconstruction for {}\".format(pid_names[icls]), y=1.05)\n", " plt.xlabel(\"Classification output {}\".format(icls))\n", @@ -501,16 +709,16 @@ "#perm = np.random.permutation(ycand_f[msk_X].shape[0])[:100000]\n", "\n", "cm_norm = sklearn.metrics.confusion_matrix(\n", - " ycand_f[msk_X_f & (ycand_f[:, 0]!=0), 0],\n", - " ypred_id_f[msk_X_f & (ycand_f[:, 0]!=0)],\n", - " labels=range(1,8),\n", + " ycand_f[msk_X_f, 0],\n", + " 
ypred_id_f[msk_X_f],\n", + " labels=range(0,8),\n", " normalize=\"true\"\n", ")\n", "\n", "cm = sklearn.metrics.confusion_matrix(\n", - " ycand_f[msk_X_f & (ycand_f[:, 0]!=0), 0],\n", - " ypred_id_f[msk_X_f & (ycand_f[:, 0]!=0)],\n", - " labels=range(1,8),\n", + " ycand_f[msk_X_f, 0],\n", + " ypred_id_f[msk_X_f],\n", + " labels=range(0,8),\n", ")" ] }, @@ -528,8 +736,8 @@ "\n", "cms_label(x1=0.18, x2=0.52, y=0.82)\n", "#sample_label(ax, x=0.8, y=1.0)\n", - "plt.xticks(range(len(y_labels)), y_labels);\n", - "plt.yticks(range(len(y_labels)), y_labels);\n", + "#plt.xticks(range(len(y_labels)), y_labels);\n", + "#plt.yticks(range(len(y_labels)), y_labels);\n", "plt.xlabel(\"Predicted PFCandidate\")\n", "plt.ylabel(\"True PFCandidate\")\n", "plt.title(\"MLPF trained on PF\", y=1.03)\n", @@ -656,7 +864,7 @@ { "cell_type": "code", "execution_count": null, - "id": "linear-ceramic", + "id": "dirty-rebecca", "metadata": {}, "outputs": [], "source": [] diff --git a/notebooks/cmssw.ipynb b/notebooks/cmssw.ipynb index da95c7bd2..be3f9855c 100644 --- a/notebooks/cmssw.ipynb +++ b/notebooks/cmssw.ipynb @@ -231,7 +231,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -245,7 +245,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 58578bce8..2fbcc59dc 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -23,7 +23,7 @@ dataset: padded_num_elem_size: 6400 #(pt, eta, sin phi, cos phi, E) num_momentum_outputs: 5 - classification_loss_coef: 1.0 + classification_loss_coef: 100.0 charge_loss_coef: 0.01 pt_loss_coef: 0.0001 eta_loss_coef: 100.0 @@ -62,13 +62,13 @@ setup: num_epochs: 100 num_val_files: 10 dtype: float32 - trainable: - classification_loss_type: categorical_cross_entropy + trainable: classification + classification_loss_type: categorical_cross_entropy #categorical_cross_entropy, sigmoid_focal_crossentropy lr_schedule: exponentialdecay # exponentialdecay, onecycle sample_weights: - cls: inverse_sqrt - charge: signal_only + cls: none + charge: none pt: signal_only eta: signal_only sin_phi: signal_only @@ -78,6 +78,10 @@ sample_weights: parameters: model: gnn_dense input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu combined_graph_layer: bin_size: 640 max_num_bins: 100 @@ -97,7 +101,7 @@ parameters: normalize_degrees: yes hidden_dim: 128 activation: gelu - num_graph_layers_common: 3 + num_graph_layers_common: 4 num_graph_layers_energy: 3 output_decoding: activation: gelu @@ -108,14 +112,14 @@ parameters: eta_skip_gate: yes phi_skip_gate: yes - id_dim_decrease: yes + id_dim_decrease: no charge_dim_decrease: yes pt_dim_decrease: yes eta_dim_decrease: yes phi_dim_decrease: yes energy_dim_decrease: yes - id_hidden_dim: 256 + id_hidden_dim: 512 charge_hidden_dim: 256 pt_hidden_dim: 256 eta_hidden_dim: 256 diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 07bfb8d08..7341bc09e 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -78,6 +78,10 @@ sample_weights: parameters: model: gnn_dense input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu combined_graph_layer: bin_size: 160 max_num_bins: 100 From cb79eea640d2b03d43b9ad917e7e10ce1f27782e Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Wed, 1 Sep 2021 14:41:46 +0200 Subject: 
[PATCH 142/157] fix: add missing settings to delphes.yaml --- parameters/delphes.yaml | 82 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index 10f04eb57..902cff14b 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -55,6 +55,29 @@ setup: trainable: classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle + optimizer: adam # adam, adamw, sgd + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 + +# LR Schedules +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes +onecycle: + mom_min: 0.85 + mom_max: 0.95 + warmup_ratio: 0.3 + div_factor: 25.0 + final_div: 100000.0 sample_weights: cls: inverse_sqrt @@ -131,3 +154,62 @@ exponentialdecay: decay_steps: 10000 decay_rate: 0.99 staircase: yes + +callbacks: + checkpoint: + save_weights_only: yes + monitor: "val_loss" + save_best_only: no + plot_freq: 10 + tensorboard: + dump_history: yes + hist_freq: 1 + +hypertune: + algorithm: hyperband # random, bayesian, hyperband + random: + objective: val_loss + max_trials: 100 + bayesian: + objective: val_loss + max_trials: 100 + num_initial_points: 2 + hyperband: + objective: val_loss + max_epochs: 100 + factor: 3 + iterations: 1 + executions_per_trial: 1 + +raytune: + local_dir: # Note: please specify an absolute path + sched: "asha" # asha, hyperband + parameters: + # optimizer parameters + lr: [1e-4] + batch_size: [32] + expdecay_decay_steps: [10000] + # model parameters + combined_graph_layer: + layernorm: [False] + hidden_dim: [64, 128, 256] + distance_dim: [128, 256] + num_node_messages: [1] + node_message: + normalize_degrees: [True] + output_dim: [64, 128, 256] + dropout: [0.0] + bin_size: [80, 160, 320] + kernel: + clip_value_low: [0.0] + num_graph_layers_common: [2, 3, 4] + num_graph_layers_energy: [2, 3, 4] + # Tune schedule specific parameters + asha: + max_t: 100 + reduction_factor: 3 + brackets: 1 + grace_period: 5 + hyperband: + max_t: 100 + reduction_factor: 3 From 7c49878672fb6d420a93253ce0e193139ee0348d Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Wed, 1 Sep 2021 15:36:07 +0200 Subject: [PATCH 143/157] feat: add option to resume a raytune run --- mlpf/pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 4c1f7e60d..a34aed31d 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -671,7 +671,8 @@ def plot_ray_analysis(analysis, save=False, skip=0): @click.option("--cpus", help="number of cpus per worker", type=int, default=1) @click.option("--gpus", help="number of gpus per worker", type=int, default=0) @click.option("--tune_result_dir", help="Tune result dir", type=str, default=None) -def raytune(config, name, local, cpus, gpus, tune_result_dir): +@click.option("-r", "--resume", help="resume run from local_dir", is_flag=True) +def raytune(config, name, local, cpus, gpus, tune_result_dir, resume): cfg = load_config(config) config_file_path = config @@ -723,6 +724,7 @@ def raytune(config, name, local, cpus, gpus, tune_result_dir): local_dir=cfg["raytune"]["local_dir"], callbacks=[TBXLoggerCallback()], log_to_file=True, + resume=resume, ) print("Best hyperparameters found were: ", analysis.get_best_config("val_loss", "min")) From bc0c94013e36b811545430a200f452b1e93f5461 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Wed, 1 Sep 2021 17:50:01 +0300 Subject: [PATCH 
144/157] lsh configurable --- mlpf/tfmodel/model.py | 68 ++++++++++++--- mlpf/tfmodel/model_setup.py | 18 ++-- notebooks/cms-mlpf.ipynb | 161 ++++++++++++++++++++++++++---------- parameters/cms-dev.yaml | 17 ++-- 4 files changed, 193 insertions(+), 71 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index d835deab1..d0dbd85a2 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -392,6 +392,33 @@ def call(self, x_msg, x_node, msk, training=False): return bins_split, x_features_binned, dm, msk_f_binned +class MessageBuildingLayerFull(tf.keras.layers.Layer): + def __init__(self, distance_dim=128, kernel=NodePairGaussianKernel(), **kwargs): + self.distance_dim = distance_dim + self.kernel = kernel + + super(MessageBuildingLayerFull, self).__init__(**kwargs) + + """ + x_msg: (n_batch, n_points, n_msg_features) + """ + def call(self, x_msg, msk, training=False): + msk_f = tf.expand_dims(tf.cast(msk, x_msg.dtype), -1) + + shp = tf.shape(x_msg) + n_batches = shp[0] + n_points = shp[1] + n_message_features = shp[2] + + #Run the node-to-node kernel (distance computation / graph building / attention) + dm = self.kernel(x_msg, training=training) + + #remove the masked points row-wise and column-wise + dm = tf.einsum("bijk,bi->bijk", dm, tf.squeeze(msk_f, axis=-1)) + dm = tf.einsum("bijk,bj->bijk", dm, tf.squeeze(msk_f, axis=-1)) + + return dm + class OutputDecoding(tf.keras.Model): def __init__(self, activation="elu", @@ -624,6 +651,7 @@ def __init__(self, *args, **kwargs): self.kernel = kwargs.pop("kernel") self.node_message = kwargs.pop("node_message") self.hidden_dim = kwargs.pop("hidden_dim") + self.do_lsh = kwargs.pop("do_lsh") self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) self.dist_activation = getattr(tf.keras.activations, kwargs.pop("dist_activation")) @@ -638,12 +666,20 @@ def __init__(self, *args, **kwargs): num_layers=2, activation=self.activation, dropout=self.dropout ) - self.message_building_layer = MessageBuildingLayerLSH( - distance_dim=self.distance_dim, - max_num_bins=self.max_num_bins, - bin_size=self.bin_size, - kernel=build_kernel_from_conf(self.kernel, kwargs.get("name")+"_kernel") - ) + + if self.do_lsh: + self.message_building_layer = MessageBuildingLayerLSH( + distance_dim=self.distance_dim, + max_num_bins=self.max_num_bins, + bin_size=self.bin_size, + kernel=build_kernel_from_conf(self.kernel, kwargs.get("name")+"_kernel") + ) + else: + self.message_building_layer = MessageBuildingLayerFull( + distance_dim=self.distance_dim, + kernel=build_kernel_from_conf(self.kernel, kwargs.get("name")+"_kernel") + ) + self.message_passing_layers = [ get_message_layer(self.node_message, "{}_msg_{}".format(kwargs.get("name"), iconv)) for iconv in range(self.num_node_messages) ] @@ -662,21 +698,29 @@ def call(self, x, msk, training=False): x_dist = self.dist_activation(self.ffn_dist(x, training=training)) #x_dist = self.gaussian_noise(x_dist, training=training) + #compute the element-to-element messages / distance matrix / graph structure - bins_split, x_binned, dm, msk_binned = self.message_building_layer(x_dist, x, msk) + if self.do_lsh: + bins_split, x, dm, msk_f = self.message_building_layer(x_dist, x, msk) + else: + dm = self.message_building_layer(x_dist, msk) + msk_f = tf.expand_dims(tf.cast(msk, x.dtype), axis=-1) + bins_split = None #run the node update with message passing for msg in self.message_passing_layers: - x_binned = msg((x_binned, dm, msk_binned)) + x = msg((x, dm, msk_f)) - #x_binned = 
self.gaussian_noise(x_binned, training=training) + #x = self.gaussian_noise(x, training=training) if self.dropout_layer: - x_binned = self.dropout_layer(x_binned, training=training) + x = self.dropout_layer(x, training=training) - x_enc = reverse_lsh(bins_split, x_binned) + #undo the binning according to the element-to-bin indices + if self.do_lsh: + x = reverse_lsh(bins_split, x) - return {"enc": x_enc, "dist": x_dist, "bins": bins_split, "dm": dm} + return {"enc": x, "dist": x_dist, "bins": bins_split, "dm": dm} class PFNetDense(tf.keras.Model): def __init__(self, diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index a44d60a98..b47822dde 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -324,16 +324,17 @@ def plot_eff_and_fake_rate( ypred_id, cp_dir, ivar=4, - bins=np.linspace(-3,6,100), - xlabel="PFElement log[E/GeV]", - log=True + bins=np.linspace(0, 200, 100), + xlabel="PFElement E", + log_var=False, + do_log_y=True ): values = self.X[msk][:, ivar] cand_id = self.ytrue_id[msk] pred_id = ypred_id[msk] - if log: + if log_var: values = np.log(values) hist_cand = np.histogram(values[(cand_id==icls)], bins=bins); @@ -352,12 +353,14 @@ def plot_eff_and_fake_rate( plt.legend() plt.xlabel(xlabel) plt.ylabel("Number of particles") + if do_log_y: + ax.set_yscale("log") ax = plt.subplot(2,1,2, sharex=ax) mplhep.histplot(eff, bins=hist_cand[1], label="efficiency", color="black") mplhep.histplot(fake, bins=hist_cand[1], label="fake rate", color="red") plt.legend(frameon=False) - plt.ylim(0,1.4) + plt.ylim(0, 1.4) plt.xlabel(xlabel) plt.ylabel("Fraction of particles / bin") @@ -503,7 +506,10 @@ def make_gnn_dense(config, dtype): "debug" ] - kwargs = {par: config['parameters'][par] for par in parameters} + kwargs = {} + for par in parameters: + if par in config['parameters'].keys(): + kwargs[par] = config['parameters'][par] model = PFNetDense( multi_output=config["setup"]["multi_output"], diff --git a/notebooks/cms-mlpf.ipynb b/notebooks/cms-mlpf.ipynb index c78738457..4424b3211 100644 --- a/notebooks/cms-mlpf.ipynb +++ b/notebooks/cms-mlpf.ipynb @@ -17,7 +17,7 @@ "import sklearn.metrics\n", "import matplotlib\n", "import scipy\n", - "import mplhep as hep\n", + "import mplhep\n", "\n", "import pandas" ] @@ -130,7 +130,7 @@ "metadata": {}, "outputs": [], "source": [ - "path = \"../experiments/cms-dev_20210831_225815_541048.gpu0.local/evaluation/\"" + "path = \"../experiments/cms-dev_20210901_112919_500542.gpu0.local/evaluation/\"" ] }, { @@ -172,10 +172,22 @@ "\n", "# ypred_raw[X[:, :, 0]==4, 1] = 0.0\n", "# #ypred_raw[X[:, :, 0]==4, 6] = 0.0\n", + "\n", + "# ypred_raw[X[:, :, 0]==5, 0] += ypred_raw[X[:, :, 0]==5, 1]\n", + "# ypred_raw[X[:, :, 0]==5, 0] += ypred_raw[X[:, :, 0]==5, 7]\n", "# ypred_raw[X[:, :, 0]==5, 1] = 0.0\n", "# ypred_raw[X[:, :, 0]==5, 7] = 0.0\n", "\n", + "# ypred_raw[X[:, :, 0]==8, 3] += ypred_raw[X[:, :, 0]==8, 1]\n", + "# ypred_raw[X[:, :, 0]==8, 3] += ypred_raw[X[:, :, 0]==8, 2]\n", "# ypred_raw[X[:, :, 0]==8, 1] = 0.0\n", + "# ypred_raw[X[:, :, 0]==8, 2] = 0.0\n", + "\n", + "\n", + "# ypred_raw[X[:, :, 0]==9, 3] += ypred_raw[X[:, :, 0]==9, 1]\n", + "# ypred_raw[X[:, :, 0]==9, 3] += ypred_raw[X[:, :, 0]==9, 2]\n", + "# ypred_raw[X[:, :, 0]==9, 1] = 0.0\n", + "# ypred_raw[X[:, :, 0]==9, 2] = 0.0\n", "# ypred_raw[X[:, :, 0]==9, 1] = 0.0\n", "\n", "# ypred_raw[X[:, :, 0]==8, 2] = 0.0\n", @@ -184,14 +196,13 @@ "ypred_raw_f = ypred_raw.reshape((ypred_raw.shape[0]*ypred_raw.shape[1], ypred_raw.shape[2]))\n", "\n", "ypred_id = 
np.argmax(ypred_raw, axis=-1)\n", - "\n", "ypred_id_f = ypred_id.flatten()" ] }, { "cell_type": "code", "execution_count": null, - "id": "floral-people", + "id": "corrected-tunisia", "metadata": {}, "outputs": [], "source": [ @@ -201,13 +212,52 @@ { "cell_type": "code", "execution_count": null, - "id": "cooked-bullet", + "id": "extensive-kuwait", + "metadata": {}, + "outputs": [], + "source": [ + "thresholds = [0.0, 0.0, 0.0, 0, 0, 0, 0]\n", + "ypred_id = apply_thresholds(ypred_raw, thresholds)\n", + "ypred_id_f = apply_thresholds_f(ypred_raw_f, thresholds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "interim-chosen", + "metadata": {}, + "outputs": [], + "source": [ + "icls = 2\n", + "ielem = 5\n", + "\n", + "energy_msk = (X_f[:, 4]>0)\n", + "elem_msk = (X_f[:, 0]==ielem)\n", + "\n", + "vals_sig = ypred_raw_f[energy_msk & elem_msk & (ycand_f[:, 0]==icls), icls]\n", + "vals_bkg = ypred_raw_f[energy_msk & elem_msk & (ycand_f[:, 0]!=icls), icls]\n", + "hsig = np.histogram(vals_sig, bins=b)[0]\n", + "hbkg = np.histogram(vals_bkg, bins=b)[0]\n", + "\n", + "a = np.cumsum(hsig)/np.sum(hsig)\n", + "b = np.cumsum(hbkg)/np.sum(hbkg)\n", + "\n", + "plt.figure(figsize=(4,4))\n", + "plt.plot(a, b, marker=\".\")\n", + "plt.plot([0,1], [0,1], color=\"black\", lw=0.5, ls=\"--\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "becoming-application", "metadata": {}, "outputs": [], "source": [ - "# thresholds = [0.6, 0.7, 0.0, 0, 0, 0, 0]\n", - "# ypred_id = apply_thresholds(ypred_raw, thresholds)\n", - "# ypred_id_f = apply_thresholds_f(ypred_raw_f, thresholds)" + "b = np.linspace(0,1,100)\n", + "mplhep.histplot(np.histogram(vals_sig, bins=b, density=1), label=\"sig\");\n", + "mplhep.histplot(np.histogram(vals_bkg, bins=b, density=1), label=\"bkg\");\n", + "plt.legend(loc=2)" ] }, { @@ -237,7 +287,7 @@ { "cell_type": "code", "execution_count": null, - "id": "reported-button", + "id": "funky-destination", "metadata": {}, "outputs": [], "source": [ @@ -265,7 +315,7 @@ { "cell_type": "code", "execution_count": null, - "id": "chronic-discovery", + "id": "authorized-greensboro", "metadata": {}, "outputs": [], "source": [ @@ -281,7 +331,7 @@ { "cell_type": "code", "execution_count": null, - "id": "multiple-disco", + "id": "incorporate-vanilla", "metadata": {}, "outputs": [], "source": [ @@ -297,7 +347,7 @@ { "cell_type": "code", "execution_count": null, - "id": "innocent-black", + "id": "comic-privacy", "metadata": {}, "outputs": [], "source": [ @@ -313,7 +363,7 @@ { "cell_type": "code", "execution_count": null, - "id": "flying-mason", + "id": "sustainable-passage", "metadata": {}, "outputs": [], "source": [ @@ -329,7 +379,7 @@ { "cell_type": "code", "execution_count": null, - "id": "equipped-subcommittee", + "id": "funny-batch", "metadata": {}, "outputs": [], "source": [ @@ -343,9 +393,9 @@ " for icls in range(8):\n", " plt.subplot(2,4,icls+1)\n", " plt.hist2d(\n", - " np.log(X_f[X_f[:, 0]==elem_type, 4]),\n", + " np.log10(X_f[X_f[:, 0]==elem_type, 4]),\n", " ypred_raw_f[X_f[:, 0]==elem_type, icls],\n", - " bins=(np.linspace(-2,6,100), np.linspace(0,1,100)), cmap=\"Blues\");\n", + " bins=(np.linspace(-2,4,100), np.linspace(0,1,100)), cmap=\"Blues\");\n", " plt.colorbar()\n", " plt.xlabel(\"PFElement log[E/GeV]\")\n", " plt.ylabel(\"MLPF probability for class {}\".format(icls))\n", @@ -355,7 +405,7 @@ { "cell_type": "code", "execution_count": null, - "id": "worst-coating", + "id": "strange-combine", "metadata": {}, "outputs": [], "source": [ @@ -365,27 +415,27 
@@ { "cell_type": "code", "execution_count": null, - "id": "responsible-georgia", + "id": "private-communication", "metadata": {}, "outputs": [], "source": [ - "plot_elem_energy_cls_prob(5)" + "plot_elem_energy_cls_prob(4)" ] }, { "cell_type": "code", "execution_count": null, - "id": "noble-guess", + "id": "differential-steal", "metadata": {}, "outputs": [], "source": [ - "plot_elem_energy_cls_prob(4)" + "plot_elem_energy_cls_prob(5)" ] }, { "cell_type": "code", "execution_count": null, - "id": "honest-tackle", + "id": "direct-crowd", "metadata": {}, "outputs": [], "source": [ @@ -397,7 +447,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fourth-approval", + "id": "fitting-thriller", "metadata": {}, "outputs": [], "source": [ @@ -408,7 +458,7 @@ { "cell_type": "code", "execution_count": null, - "id": "abroad-wallpaper", + "id": "frozen-ethnic", "metadata": {}, "outputs": [], "source": [ @@ -421,7 +471,7 @@ { "cell_type": "code", "execution_count": null, - "id": "honest-runner", + "id": "anticipated-robinson", "metadata": {}, "outputs": [], "source": [ @@ -434,7 +484,7 @@ { "cell_type": "code", "execution_count": null, - "id": "incorporated-prerequisite", + "id": "micro-saying", "metadata": {}, "outputs": [], "source": [ @@ -465,60 +515,83 @@ " fake = hist_pred_fake[0]/hist_pred[0]\n", "\n", " plt.figure(figsize=(8,8))\n", - " ax = plt.subplot(2,1,1)\n", - " mplhep.histplot(hist_cand, label=\"PF\")\n", - " mplhep.histplot(hist_pred, label=\"MLPF\")\n", + " ax1 = plt.subplot(2,1,1)\n", + " mplhep.histplot(hist_cand, label=\"with PF candidate\")\n", + " mplhep.histplot(hist_pred, label=\"with MLPF candidate\")\n", " plt.legend()\n", " plt.xlabel(xlabel)\n", " plt.ylabel(\"Number of particles\")\n", "\n", - " ax = plt.subplot(2,1,2, sharex=ax)\n", + " ax2 = plt.subplot(2,1,2, sharex=ax1)\n", " mplhep.histplot(eff, bins=hist_cand[1], label=\"efficiency\", color=\"black\")\n", " mplhep.histplot(fake, bins=hist_cand[1], label=\"fake rate\", color=\"red\")\n", " plt.legend(frameon=False)\n", " plt.ylim(0,1.4)\n", " plt.xlabel(xlabel)\n", - " plt.ylabel(\"Fraction of particles / bin\")" + " plt.ylabel(\"Fraction of particles / bin\")\n", + " \n", + " return ax1, ax2" ] }, { "cell_type": "code", "execution_count": null, - "id": "dedicated-indonesia", + "id": "inner-christianity", "metadata": {}, "outputs": [], "source": [ - "plot_eff_and_fake_rate(1)" + "b = np.linspace(0,100, 100)\n", + "plt.hist(X_f[(X_f[:, 0]==5), 4], bins=b, histtype=\"step\", lw=2, label=\"all clusters\");\n", + "plt.hist(X_f[(X_f[:, 0]==5) & (ycand_f[:, 0]==2), 4], bins=b, histtype=\"step\", lw=2, label=\"with PF candidate\");\n", + "plt.hist(X_f[(X_f[:, 0]==5) & (ypred_id_f==2), 4], bins=b, histtype=\"step\", lw=2, label=\"with MLPF candidate\");\n", + "plt.yscale(\"log\")\n", + "plt.legend()" ] }, { "cell_type": "code", "execution_count": null, - "id": "variable-potter", + "id": "automated-quarter", "metadata": {}, "outputs": [], "source": [ - "plot_eff_and_fake_rate(2)" + "ax1, ax2 = plot_eff_and_fake_rate(1, bins=np.linspace(0, 300, 100), log=False)\n", + "ax1.set_yscale(\"log\")\n", + "ax1.set_title(\"track, charged hadron predictions\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "hybrid-chuck", + "id": "military-professor", "metadata": {}, "outputs": [], "source": [ - "plot_eff_and_fake_rate(3)" + "ax1, ax2 = plot_eff_and_fake_rate(2, bins=np.linspace(0, 300, 100), log=False)\n", + "ax1.set_yscale(\"log\")\n", + "ax1.set_title(\"HCAL cluster, neutral hadron predictions\")" ] }, { 
"cell_type": "code", "execution_count": null, - "id": "golden-catalyst", + "id": "characteristic-colleague", "metadata": {}, "outputs": [], "source": [ - "plot_eff_and_fake_rate(4)" + "ax1, ax2 = plot_eff_and_fake_rate(3, bins=np.linspace(0, 300, 100), log=False)\n", + "ax1.set_yscale(\"log\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "composed-principal", + "metadata": {}, + "outputs": [], + "source": [ + "ax1, ax2 = plot_eff_and_fake_rate(4, bins=np.linspace(0, 300, 100), log=False)\n", + "ax1.set_yscale(\"log\")" ] }, { @@ -687,8 +760,6 @@ " msk = (ycand_f[:, 0] == icls)\n", " plt.hist(ypred_raw_f[msk & (X_f[:, 0] != 0), icls], bins=100, density=1, histtype=\"step\", lw=2, color=\"blue\", label=\"true \"+pid_names[icls]);\n", " plt.hist(ypred_raw_f[~msk & (X_f[:, 0] != 0), icls], bins=100, density=1, histtype=\"step\", lw=2, color=\"red\", label=\"other particles\");\n", - " plt.axvline(ret.x[icls-1], 0, 0.7, ls=\"--\",\n", - " color=\"black\", label=\"threshold: {:.2f}\".format(ret.x[icls-1]), lw=1)\n", " plt.yscale(\"log\")\n", " plt.title(\"Particle reconstruction for {}\".format(pid_names[icls]), y=1.05)\n", " plt.xlabel(\"Classification output {}\".format(icls))\n", @@ -754,13 +825,13 @@ "source": [ "plt.figure(figsize=(8, 8))\n", "ax = plt.axes()\n", - "plt.imshow(cm, cmap=\"Blues\", norm=matplotlib.colors.LogNorm())\n", + "plt.imshow(cm, cmap=\"Blues\")\n", "plt.colorbar()\n", "\n", "cms_label(x1=0.18, x2=0.52, y=0.82)\n", "#sample_label(ax, x=0.8, y=1.0)\n", - "plt.xticks(range(len(y_labels)), y_labels);\n", - "plt.yticks(range(len(y_labels)), y_labels);\n", + "#plt.xticks(range(len(y_labels)), y_labels);\n", + "#plt.yticks(range(len(y_labels)), y_labels);\n", "plt.xlabel(\"Predicted PFCandidate\")\n", "plt.ylabel(\"True PFCandidate\")\n", "plt.title(\"MLPF trained on PF\", y=1.03)\n", @@ -864,7 +935,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dirty-rebecca", + "id": "scheduled-worst", "metadata": {}, "outputs": [], "source": [] diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 2fbcc59dc..0e5cc101f 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -56,7 +56,7 @@ setup: weights: weights_config: lr: 1e-3 - batch_size: 5 + batch_size: 2 num_events_train: 80000 num_events_test: 10000 num_epochs: 100 @@ -83,8 +83,9 @@ parameters: dropout: 0.0 activation: gelu combined_graph_layer: - bin_size: 640 - max_num_bins: 100 + do_lsh: no + bin_size: 1600 + max_num_bins: 10 distance_dim: 128 layernorm: no dropout: 0.0 @@ -93,16 +94,16 @@ parameters: type: NodePairGaussianKernel dist_mult: 0.1 clip_value_low: 0.0 - num_node_messages: 1 + num_node_messages: 2 node_message: type: GHConvDense - output_dim: 128 + output_dim: 512 activation: gelu normalize_degrees: yes - hidden_dim: 128 + hidden_dim: 256 activation: gelu - num_graph_layers_common: 4 - num_graph_layers_energy: 3 + num_graph_layers_common: 1 + num_graph_layers_energy: 1 output_decoding: activation: gelu regression_use_classification: yes From 4a60f46468b346a98d7de965599579f52805ee94 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 2 Sep 2021 09:51:46 +0300 Subject: [PATCH 145/157] added LSH scanning --- mlpf/tallinn/test-gnn.sh | 14 +++ mlpf/tfmodel/model_setup.py | 9 +- parameters/cms.yaml | 3 +- parameters/test-gnn/cms-0l.yaml | 149 ++++++++++++++++++++++++++ parameters/test-gnn/cms-lsh-1l.yaml | 149 ++++++++++++++++++++++++++ parameters/test-gnn/cms-lsh-2l.yaml | 149 ++++++++++++++++++++++++++ parameters/test-gnn/cms-lsh-3l.yaml | 149 
++++++++++++++++++++++++++ parameters/test-gnn/cms-nolsh-1l.yaml | 149 ++++++++++++++++++++++++++ 8 files changed, 768 insertions(+), 3 deletions(-) create mode 100755 mlpf/tallinn/test-gnn.sh create mode 100644 parameters/test-gnn/cms-0l.yaml create mode 100644 parameters/test-gnn/cms-lsh-1l.yaml create mode 100644 parameters/test-gnn/cms-lsh-2l.yaml create mode 100644 parameters/test-gnn/cms-lsh-3l.yaml create mode 100644 parameters/test-gnn/cms-nolsh-1l.yaml diff --git a/mlpf/tallinn/test-gnn.sh b/mlpf/tallinn/test-gnn.sh new file mode 100755 index 000000000..15017b885 --- /dev/null +++ b/mlpf/tallinn/test-gnn.sh @@ -0,0 +1,14 @@ +#!/bin/bash +#SBATCH -p gpu +#SBATCH --gpus 1 +#SBATCH --mem-per-gpu=8G + +IMG=/home/software/singularity/base.simg:latest +cd ~/particleflow + +#TF training +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/test-gnn/cms-0l.yaml --plot-freq 10 +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/test-gnn/cms-lsh-1l.yaml --plot-freq 10 +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/test-gnn/cms-lsh-2l.yaml --plot-freq 10 +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/test-gnn/cms-lsh-3l.yaml --plot-freq 10 +singularity exec --nv $IMG python3 mlpf/pipeline.py train -c parameters/test-gnn/cms-nolsh-1l.yaml --plot-freq 10 diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index b47822dde..aef53ee6e 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -193,6 +193,8 @@ def plot_reg_distribution(self, epoch, outpath, ypred, ypred_id, icls, reg_varia bins = self.reg_bins[reg_variable] if bins is None: bins = 100 + + plt.figure() plt.hist(vals_true, bins=bins, histtype="step", lw=2, label="true") plt.hist(vals_pred, bins=bins, histtype="step", lw=2, label="predicted") @@ -312,6 +314,7 @@ def plot_elem_to_pred(self, epoch, cp_dir, msk, ypred_id): image_path = str(cp_dir / "elem_to_pred.png") plt.savefig(image_path, bbox_inches="tight") + plt.close("all") if self.comet_experiment: self.comet_experiment.log_image(image_path, step=epoch) @@ -366,6 +369,7 @@ def plot_eff_and_fake_rate( image_path = str(cp_dir / "eff_fake_cls{}.png".format(icls)) plt.savefig(image_path, bbox_inches="tight") + plt.close("all") if self.comet_experiment: self.comet_experiment.log_image(image_path, step=epoch) @@ -376,8 +380,9 @@ def on_epoch_end(self, epoch, logs=None): with open("{}/history_{}.json".format(self.outpath, epoch), "w") as fi: json.dump(logs, fi) - if epoch%self.plot_freq!=0: - return + if self.plot_freq>1: + if (epoch+1)%self.plot_freq!=0 or epoch==0: + return cp_dir = Path(self.outpath) / "epoch_{}".format(epoch) cp_dir.mkdir(parents=True, exist_ok=True) diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 7341bc09e..61e0b6ba3 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -55,7 +55,7 @@ setup: train: yes weights: weights_config: - lr: 1e-4 + lr: 1e-3 batch_size: 5 num_events_train: 80000 num_events_test: 10000 @@ -83,6 +83,7 @@ parameters: dropout: 0.0 activation: gelu combined_graph_layer: + do_lsh: yes bin_size: 160 max_num_bins: 100 distance_dim: 128 diff --git a/parameters/test-gnn/cms-0l.yaml b/parameters/test-gnn/cms-0l.yaml new file mode 100644 index 000000000..3e230c7cb --- /dev/null +++ b/parameters/test-gnn/cms-0l.yaml @@ -0,0 +1,149 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, 
+# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 20 + num_events_train: 1000 + num_events_test: 1000 + num_epochs: 50 + num_val_files: 20 + dtype: float32 + trainable: + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: no + bin_size: 160 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + dropout: 0.0 + dist_activation: gelu + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 0 + num_graph_layers_energy: 0 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 1000 + decay_rate: 0.98 + staircase: yes diff --git a/parameters/test-gnn/cms-lsh-1l.yaml b/parameters/test-gnn/cms-lsh-1l.yaml new file mode 100644 index 000000000..bdf62a034 --- /dev/null +++ b/parameters/test-gnn/cms-lsh-1l.yaml @@ -0,0 +1,149 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + 
cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 10 + num_events_train: 1000 + num_events_test: 1000 + num_epochs: 50 + num_val_files: 20 + dtype: float32 + trainable: + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: yes + bin_size: 160 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + dropout: 0.0 + dist_activation: gelu + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 1 + num_graph_layers_energy: 1 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 1000 + decay_rate: 0.98 + staircase: yes diff --git a/parameters/test-gnn/cms-lsh-2l.yaml b/parameters/test-gnn/cms-lsh-2l.yaml new file mode 100644 index 000000000..69320ceba --- /dev/null +++ b/parameters/test-gnn/cms-lsh-2l.yaml @@ -0,0 +1,149 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + 
delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 5 + num_events_train: 1000 + num_events_test: 1000 + num_epochs: 50 + num_val_files: 20 + dtype: float32 + trainable: + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: yes + bin_size: 160 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + dropout: 0.0 + dist_activation: gelu + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 2 + num_graph_layers_energy: 2 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 1000 + decay_rate: 0.98 + staircase: yes diff --git a/parameters/test-gnn/cms-lsh-3l.yaml b/parameters/test-gnn/cms-lsh-3l.yaml new file mode 100644 index 000000000..5cf0226c0 --- /dev/null +++ b/parameters/test-gnn/cms-lsh-3l.yaml @@ -0,0 +1,149 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 5 + num_events_train: 1000 + num_events_test: 1000 + num_epochs: 50 + num_val_files: 20 + dtype: float32 + trainable: + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only 
+ sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: yes + bin_size: 160 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + dropout: 0.0 + dist_activation: gelu + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 3 + num_graph_layers_energy: 3 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 1000 + decay_rate: 0.98 + staircase: yes diff --git a/parameters/test-gnn/cms-nolsh-1l.yaml b/parameters/test-gnn/cms-nolsh-1l.yaml new file mode 100644 index 000000000..edb43d666 --- /dev/null +++ b/parameters/test-gnn/cms-nolsh-1l.yaml @@ -0,0 +1,149 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 2 + num_events_train: 1000 + num_events_test: 1000 + num_epochs: 50 + num_val_files: 20 + dtype: float32 + trainable: + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: no + bin_size: 160 + max_num_bins: 100 + distance_dim: 128 + layernorm: no + dropout: 0.0 + dist_activation: gelu + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + 
num_node_messages: 1 + node_message: + type: GHConvDense + output_dim: 128 + activation: gelu + normalize_degrees: yes + hidden_dim: 128 + activation: gelu + num_graph_layers_common: 1 + num_graph_layers_energy: 1 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 1000 + decay_rate: 0.98 + staircase: yes From 63446fd09ce4ed6d773b68e331b628f01935b881 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 2 Sep 2021 14:46:33 +0300 Subject: [PATCH 146/157] up --- mlpf/tfmodel/model.py | 10 ++++++---- parameters/cms-dev.yaml | 44 ++++++++++++++++++++++------------------- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index d0dbd85a2..08200745f 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -37,10 +37,12 @@ def pairwise_gaussian_dist(A, B): def pairwise_learnable_dist(A, B, ffn, training=False): shp = tf.shape(A) + # tf.print("shp", shp) + # import pdb;pdb.set_trace() #stack node feature vectors of src[i], dst[j] into a matrix res[i,j] = (src[i], dst[j]) - a, b, c, d = tf.meshgrid(tf.range(shp[0]), tf.range(shp[1]), tf.range(shp[2]), tf.range(shp[2]), indexing="ij") - inds1 = tf.stack([a,b,c], axis=-1) - inds2 = tf.stack([a,b,d], axis=-1) + mg = tf.meshgrid(tf.range(shp[0]), tf.range(shp[1]), tf.range(shp[2]), tf.range(shp[2]), indexing="ij") + inds1 = tf.stack([mg[0],mg[1],mg[2]], axis=-1) + inds2 = tf.stack([mg[0],mg[1],mg[3]], axis=-1) res = tf.concat([ tf.gather_nd(A, inds1), tf.gather_nd(B, inds2)], axis=-1 @@ -651,7 +653,7 @@ def __init__(self, *args, **kwargs): self.kernel = kwargs.pop("kernel") self.node_message = kwargs.pop("node_message") self.hidden_dim = kwargs.pop("hidden_dim") - self.do_lsh = kwargs.pop("do_lsh") + self.do_lsh = kwargs.pop("do_lsh", True) self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) self.dist_activation = getattr(tf.keras.activations, kwargs.pop("dist_activation")) diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 0e5cc101f..2b7781b75 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -63,12 +63,12 @@ setup: num_val_files: 10 dtype: float32 trainable: classification - classification_loss_type: categorical_cross_entropy #categorical_cross_entropy, sigmoid_focal_crossentropy + classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle sample_weights: - cls: none - charge: none + cls: inverse_sqrt + charge: signal_only pt: signal_only eta: signal_only sin_phi: signal_only @@ -83,27 +83,31 @@ parameters: dropout: 0.0 activation: gelu combined_graph_layer: - do_lsh: no - bin_size: 1600 - max_num_bins: 10 + do_lsh: yes + bin_size: 32 + max_num_bins: 500 distance_dim: 128 layernorm: no dropout: 0.0 dist_activation: linear kernel: - type: NodePairGaussianKernel - dist_mult: 0.1 - clip_value_low: 0.0 - num_node_messages: 2 + 
type: NodePairTrainableKernel + output_dim: 8 + hidden_dim: 32 + num_layers: 2 + activation: gelu node_message: - type: GHConvDense - output_dim: 512 + type: NodeMessageLearnable + output_dim: 256 + hidden_dim: 128 + num_layers: 2 activation: gelu - normalize_degrees: yes + aggregation_direction: dst + num_node_messages: 1 hidden_dim: 256 activation: gelu - num_graph_layers_common: 1 - num_graph_layers_energy: 1 + num_graph_layers_common: 2 + num_graph_layers_energy: 2 output_decoding: activation: gelu regression_use_classification: yes @@ -113,21 +117,21 @@ parameters: eta_skip_gate: yes phi_skip_gate: yes - id_dim_decrease: no + id_dim_decrease: yes charge_dim_decrease: yes pt_dim_decrease: yes eta_dim_decrease: yes phi_dim_decrease: yes energy_dim_decrease: yes - id_hidden_dim: 512 + id_hidden_dim: 256 charge_hidden_dim: 256 pt_hidden_dim: 256 eta_hidden_dim: 256 phi_hidden_dim: 256 energy_hidden_dim: 256 - id_num_layers: 3 + id_num_layers: 2 charge_num_layers: 2 pt_num_layers: 2 eta_num_layers: 2 @@ -144,6 +148,6 @@ timing: num_iter: 3 exponentialdecay: - decay_steps: 1000 - decay_rate: 0.98 + decay_steps: 2000 + decay_rate: 0.99 staircase: yes From 590a1007188b87e4e71d944792d04bbb9bc8de30 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 2 Sep 2021 14:46:43 +0300 Subject: [PATCH 147/157] epoch one-based --- mlpf/tfmodel/model_setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index aef53ee6e..50be3662b 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -376,12 +376,15 @@ def plot_eff_and_fake_rate( def on_epoch_end(self, epoch, logs=None): + #first epoch is 1, not 0 + epoch = epoch + 1 + #save the training logs (losses) for this epoch with open("{}/history_{}.json".format(self.outpath, epoch), "w") as fi: json.dump(logs, fi) if self.plot_freq>1: - if (epoch+1)%self.plot_freq!=0 or epoch==0: + if epoch%self.plot_freq!=0 or epoch==1: return cp_dir = Path(self.outpath) / "epoch_{}".format(epoch) From 32a32ec13b1091eeb108a5dfb2f984ab134b8382 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 2 Sep 2021 15:17:44 +0300 Subject: [PATCH 148/157] optimization with cls only --- parameters/test-gnn/cms-0l.yaml | 2 +- parameters/test-gnn/cms-lsh-1l.yaml | 2 +- parameters/test-gnn/cms-lsh-2l.yaml | 2 +- parameters/test-gnn/cms-lsh-3l.yaml | 2 +- parameters/test-gnn/cms-nolsh-1l.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/parameters/test-gnn/cms-0l.yaml b/parameters/test-gnn/cms-0l.yaml index 3e230c7cb..5977abbc6 100644 --- a/parameters/test-gnn/cms-0l.yaml +++ b/parameters/test-gnn/cms-0l.yaml @@ -62,7 +62,7 @@ setup: num_epochs: 50 num_val_files: 20 dtype: float32 - trainable: + trainable: classification classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle diff --git a/parameters/test-gnn/cms-lsh-1l.yaml b/parameters/test-gnn/cms-lsh-1l.yaml index bdf62a034..c8c4dfb7e 100644 --- a/parameters/test-gnn/cms-lsh-1l.yaml +++ b/parameters/test-gnn/cms-lsh-1l.yaml @@ -62,7 +62,7 @@ setup: num_epochs: 50 num_val_files: 20 dtype: float32 - trainable: + trainable: classification classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle diff --git a/parameters/test-gnn/cms-lsh-2l.yaml b/parameters/test-gnn/cms-lsh-2l.yaml index 69320ceba..5eb0a83f2 100644 --- a/parameters/test-gnn/cms-lsh-2l.yaml +++ b/parameters/test-gnn/cms-lsh-2l.yaml @@ -62,7 
+62,7 @@ setup: num_epochs: 50 num_val_files: 20 dtype: float32 - trainable: + trainable: classification classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle diff --git a/parameters/test-gnn/cms-lsh-3l.yaml b/parameters/test-gnn/cms-lsh-3l.yaml index 5cf0226c0..6ac8b76c7 100644 --- a/parameters/test-gnn/cms-lsh-3l.yaml +++ b/parameters/test-gnn/cms-lsh-3l.yaml @@ -62,7 +62,7 @@ setup: num_epochs: 50 num_val_files: 20 dtype: float32 - trainable: + trainable: classification classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle diff --git a/parameters/test-gnn/cms-nolsh-1l.yaml b/parameters/test-gnn/cms-nolsh-1l.yaml index edb43d666..697aac9ed 100644 --- a/parameters/test-gnn/cms-nolsh-1l.yaml +++ b/parameters/test-gnn/cms-nolsh-1l.yaml @@ -62,7 +62,7 @@ setup: num_epochs: 50 num_val_files: 20 dtype: float32 - trainable: + trainable: classification classification_loss_type: categorical_cross_entropy lr_schedule: exponentialdecay # exponentialdecay, onecycle From 9a921ebeb394bfa36649896d2e546678c63f5c4a Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 2 Sep 2021 15:19:45 +0300 Subject: [PATCH 149/157] mpnn optimization --- parameters/cms-dev.yaml | 4 +- parameters/test-gnn/cms-lsh-mpnn.yaml | 153 ++++++++++++++++++++++++++ 2 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 parameters/test-gnn/cms-lsh-mpnn.yaml diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index 2b7781b75..b9d603c52 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -23,7 +23,7 @@ dataset: padded_num_elem_size: 6400 #(pt, eta, sin phi, cos phi, E) num_momentum_outputs: 5 - classification_loss_coef: 100.0 + classification_loss_coef: 1.0 charge_loss_coef: 0.01 pt_loss_coef: 0.0001 eta_loss_coef: 100.0 @@ -56,7 +56,7 @@ setup: weights: weights_config: lr: 1e-3 - batch_size: 2 + batch_size: 4 num_events_train: 80000 num_events_test: 10000 num_epochs: 100 diff --git a/parameters/test-gnn/cms-lsh-mpnn.yaml b/parameters/test-gnn/cms-lsh-mpnn.yaml new file mode 100644 index 000000000..6b4ccc9ff --- /dev/null +++ b/parameters/test-gnn/cms-lsh-mpnn.yaml @@ -0,0 +1,153 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 4 + num_events_train: 1000 + num_events_test: 1000 + num_epochs: 50 + num_val_files: 20 + dtype: float32 + trainable: classification + 
classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + do_node_encoding: no + hidden_dim: 128 + dropout: 0.0 + activation: gelu + combined_graph_layer: + do_lsh: yes + bin_size: 32 + max_num_bins: 500 + distance_dim: 128 + layernorm: no + dropout: 0.0 + dist_activation: linear + kernel: + type: NodePairTrainableKernel + output_dim: 8 + hidden_dim: 32 + num_layers: 2 + activation: gelu + node_message: + type: NodeMessageLearnable + output_dim: 256 + hidden_dim: 128 + num_layers: 2 + activation: gelu + aggregation_direction: dst + num_node_messages: 1 + hidden_dim: 256 + activation: gelu + num_graph_layers_common: 2 + num_graph_layers_energy: 2 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 1000 + decay_rate: 0.98 + staircase: yes From 557ccf625e54d128db4295502b53fac672ede523 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 2 Sep 2021 16:36:47 +0300 Subject: [PATCH 150/157] up --- mlpf/tfmodel/model.py | 8 +- notebooks/cms-mlpf.ipynb | 231 +++++++++++++++++--------- parameters/cms.yaml | 19 +-- parameters/test-gnn/cms-lsh-mpnn.yaml | 2 +- 4 files changed, 169 insertions(+), 91 deletions(-) diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 08200745f..609952079 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -241,7 +241,13 @@ def __init__(self, *args, **kwargs): elif self.aggregation_direction == "src": self.agg_dim = -3 - self.ffn = point_wise_feed_forward_network(self.output_dim, self.hidden_dim, num_layers=self.num_layers, activation=self.activation, name=kwargs.get("name")+"_ffn") + self.ffn = point_wise_feed_forward_network( + self.output_dim, + self.hidden_dim, + num_layers=self.num_layers, + activation=self.activation, + name=kwargs.get("name")+"_ffn" + ) super(NodeMessageLearnable, self).__init__(*args, **kwargs) def call(self, inputs): diff --git a/notebooks/cms-mlpf.ipynb b/notebooks/cms-mlpf.ipynb index 4424b3211..bfdc07ec7 100644 --- a/notebooks/cms-mlpf.ipynb +++ b/notebooks/cms-mlpf.ipynb @@ -89,6 +89,7 @@ "outputs": [], "source": [ "pid_names = {\n", + " 0: \"no ptcl\",\n", " 1: \"ch.had\",\n", " 2: \"n.had\",\n", " 3: \"HFEM\",\n", @@ -99,6 +100,7 @@ "}\n", "\n", "pid_names_long = {\n", + " 0: \"no particle\",\n", " 1: \"charged hadrons\",\n", " 2: \"neutral hadrons\",\n", " 3: \"HFEM\",\n", @@ -120,7 +122,7 @@ "x_labels = [\n", " \"track\", \"PS1\", \"PS2\", \"ECAL\", \"HCAL\", \"GSF\", \"BREM\", \"HFEM\", \"HFHAD\", \"SC\", \"HO\"\n", "]\n", - "y_labels = [pid_names[i] for i in range(1,8)]" + "y_labels = [pid_names[i] for i in range(0,8)]" ] }, { @@ -130,7 +132,7 @@ "metadata": {}, "outputs": [], "source": 
[ - "path = \"../experiments/cms-dev_20210901_112919_500542.gpu0.local/evaluation/\"" + "path = \"../experiments/cms-dev_20210831_225815_541048.gpu0.local/evaluation/\"" ] }, { @@ -199,16 +201,6 @@ "ypred_id_f = ypred_id.flatten()" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "corrected-tunisia", - "metadata": {}, - "outputs": [], - "source": [ - "np.unique(ypred_id[X[:, :, 0]==4], return_counts=True)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -236,8 +228,10 @@ "\n", "vals_sig = ypred_raw_f[energy_msk & elem_msk & (ycand_f[:, 0]==icls), icls]\n", "vals_bkg = ypred_raw_f[energy_msk & elem_msk & (ycand_f[:, 0]!=icls), icls]\n", - "hsig = np.histogram(vals_sig, bins=b)[0]\n", - "hbkg = np.histogram(vals_bkg, bins=b)[0]\n", + "\n", + "bins = np.linspace(0,1,100)\n", + "hsig = np.histogram(vals_sig, bins=bins)[0]\n", + "hbkg = np.histogram(vals_bkg, bins=bins)[0]\n", "\n", "a = np.cumsum(hsig)/np.sum(hsig)\n", "b = np.cumsum(hbkg)/np.sum(hbkg)\n", @@ -264,7 +258,9 @@ "cell_type": "code", "execution_count": null, "id": "virgin-nicaragua", - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ "for icls in range(1,8):\n", @@ -432,6 +428,26 @@ "plot_elem_energy_cls_prob(5)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "utility-beverage", + "metadata": {}, + "outputs": [], + "source": [ + "plot_elem_energy_cls_prob(8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "moderate-india", + "metadata": {}, + "outputs": [], + "source": [ + "plot_elem_energy_cls_prob(9)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -518,7 +534,7 @@ " ax1 = plt.subplot(2,1,1)\n", " mplhep.histplot(hist_cand, label=\"with PF candidate\")\n", " mplhep.histplot(hist_pred, label=\"with MLPF candidate\")\n", - " plt.legend()\n", + " plt.legend(frameon=False)\n", " plt.xlabel(xlabel)\n", " plt.ylabel(\"Number of particles\")\n", "\n", @@ -633,16 +649,21 @@ "metadata": {}, "outputs": [], "source": [ - "def loss_plot(train, test, margin=0.05):\n", + "def loss_plot(train, test, margin=0.05, smoothing=False):\n", " fig = plt.figure(figsize=(8,4))\n", " ax = plt.axes()\n", - " p0 = plt.plot(train, alpha=0.2)\n", - " p1 = plt.plot(test, alpha=0.2)\n", " \n", - " train_smooth = np.convolve(train, np.ones(5)/5, mode='valid')\n", - " plt.plot(train_smooth, color=p0[0].get_color(), lw=2, label=\"train\")\n", - " test_smooth = np.convolve(test, np.ones(5)/5, mode='valid')\n", - " plt.plot(test_smooth, color=p1[0].get_color(), lw=2, label=\"test\")\n", + " alpha = 0.2 if smoothing else 1.0\n", + " l0 = None if smoothing else \"train\"\n", + " l1 = None if smoothing else \"test\"\n", + " p0 = plt.plot(train, alpha=alpha, label=l0)\n", + " p1 = plt.plot(test, alpha=alpha, label=l1)\n", + " \n", + " if smoothing:\n", + " train_smooth = np.convolve(train, np.ones(5)/5, mode='valid')\n", + " plt.plot(train_smooth, color=p0[0].get_color(), lw=2, label=\"train\")\n", + " test_smooth = np.convolve(test, np.ones(5)/5, mode='valid')\n", + " plt.plot(test_smooth, color=p1[0].get_color(), lw=2, label=\"test\")\n", " \n", " plt.ylim(test[-1]*(1.0-margin), test[-1]*(1.0+margin))\n", " plt.legend(loc=\"best\", frameon=False)\n", @@ -658,7 +679,7 @@ "metadata": {}, "outputs": [], "source": [ - "p0 = loss_plot(history[\"loss\"].values, history[\"val_loss\"].values)\n", + "p0 = loss_plot(history[\"loss\"].values, history[\"val_loss\"].values, margin=0.02)\n", "plt.ylabel(\"Total loss\")\n", "plt.savefig(\"loss.pdf\", 
bbox_inches=\"tight\")" ] @@ -682,7 +703,7 @@ "metadata": {}, "outputs": [], "source": [ - "p0 = loss_plot(history[\"energy_loss\"].values, history[\"val_energy_loss\"].values, margin=0.05)\n", + "p0 = loss_plot(history[\"energy_loss\"].values, history[\"val_energy_loss\"].values, margin=0.01)\n", "plt.ylabel(\"Energy loss\")\n", "plt.savefig(\"energy_loss.pdf\", bbox_inches=\"tight\")" ] @@ -694,7 +715,7 @@ "metadata": {}, "outputs": [], "source": [ - "p0 = loss_plot(history[\"pt_loss\"].values, history[\"val_pt_loss\"].values, margin=0.1)\n", + "p0 = loss_plot(history[\"pt_loss\"].values, history[\"val_pt_loss\"].values, margin=0.02)\n", "plt.ylabel(\"$p_T$ loss\")\n", "plt.savefig(\"pt_loss.pdf\", bbox_inches=\"tight\")" ] @@ -706,7 +727,7 @@ "metadata": {}, "outputs": [], "source": [ - "p0 = loss_plot(history[\"sin_phi_loss\"].values, history[\"val_sin_phi_loss\"].values, margin=0.01)\n", + "p0 = loss_plot(history[\"sin_phi_loss\"].values, history[\"val_sin_phi_loss\"].values, margin=0.02)\n", "plt.ylabel(\"$\\sin \\phi$ loss\")\n", "plt.savefig(\"sin_phi_loss.pdf\", bbox_inches=\"tight\")" ] @@ -730,7 +751,7 @@ "metadata": {}, "outputs": [], "source": [ - "p0 = loss_plot(history[\"eta_loss\"].values, history[\"val_eta_loss\"].values, margin=0.01)\n", + "p0 = loss_plot(history[\"eta_loss\"].values, history[\"val_eta_loss\"].values, margin=0.005)\n", "plt.ylabel(\"$\\eta$ loss\")\n", "plt.savefig(\"eta_loss.pdf\", bbox_inches=\"tight\")" ] @@ -751,7 +772,9 @@ "cell_type": "code", "execution_count": null, "id": "august-feeding", - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ "for icls in range(1,8):\n", @@ -807,8 +830,8 @@ "\n", "cms_label(x1=0.18, x2=0.52, y=0.82)\n", "#sample_label(ax, x=0.8, y=1.0)\n", - "#plt.xticks(range(len(y_labels)), y_labels);\n", - "#plt.yticks(range(len(y_labels)), y_labels);\n", + "plt.xticks(range(len(y_labels)), y_labels);\n", + "plt.yticks(range(len(y_labels)), y_labels);\n", "plt.xlabel(\"Predicted PFCandidate\")\n", "plt.ylabel(\"True PFCandidate\")\n", "plt.title(\"MLPF trained on PF\", y=1.03)\n", @@ -830,8 +853,8 @@ "\n", "cms_label(x1=0.18, x2=0.52, y=0.82)\n", "#sample_label(ax, x=0.8, y=1.0)\n", - "#plt.xticks(range(len(y_labels)), y_labels);\n", - "#plt.yticks(range(len(y_labels)), y_labels);\n", + "plt.xticks(range(len(y_labels)), y_labels);\n", + "plt.yticks(range(len(y_labels)), y_labels);\n", "plt.xlabel(\"Predicted PFCandidate\")\n", "plt.ylabel(\"True PFCandidate\")\n", "plt.title(\"MLPF trained on PF\", y=1.03)\n", @@ -858,7 +881,9 @@ "cell_type": "code", "execution_count": null, "id": "expressed-samba", - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ "for icls in range(1,8):\n", @@ -881,64 +906,116 @@ { "cell_type": "code", "execution_count": null, - "id": "minor-beast", + "id": "paperback-timeline", "metadata": {}, "outputs": [], "source": [ - "fig, axes = plt.subplots(7, 6, figsize=(6*6,7*5))\n", - "\n", - "for axs, icls in zip(axes, range(1,8)): \n", - " axes = axs.flatten()\n", + "def plot_particle_regression(ivar=6, icls=2, particle_label=\"Neutral hadrons\", log=True, minval=-1, maxval=3, norm=matplotlib.colors.LogNorm()):\n", + " plt.figure(figsize=(6,5))\n", + " ax = plt.axes()\n", " \n", - " npred = np.sum(ypred_id == icls, axis=1)\n", - " ncand = np.sum(ycand[:, :, 0] == icls, axis=1)\n", - " ngen = np.sum(ygen[:, :, 0] == icls, axis=1)\n", " \n", - " a = 0.5*min(np.min(npred), np.min(ncand))\n", - " b = 1.5*max(np.max(npred), np.max(ncand))\n", + " bins 
= np.linspace(minval, maxval, 100)\n", + " msk_both = (ypred_id_f == icls) & (ycand_f[:, 0]==icls)\n", " \n", - " axes[0].scatter(ncand, npred, marker=\".\")\n", + " vals_true = ycand_f[msk_both, ivar]\n", + " vals_pred = ypred_f[msk_both, ivar]\n", " \n", - " axes[0].set_xlim(a,b)\n", - " axes[0].set_ylim(a,b)\n", - " axes[0].plot([a,b],[a,b], color=\"black\", ls=\"--\")\n", - " axes[0].set_title(pid_names[icls])\n", - " axes[0].set_xlabel(\"number of PFCandidates\")\n", - " axes[0].set_ylabel(\"number of MLPFCandidates\")\n", + " if log:\n", + " vals_true = np.log10(vals_true)\n", + " vals_pred = np.log10(vals_pred)\n", " \n", - " msk_both = (ycand_f[:, 0]==icls) & (ypred_id_f==icls)\n", - " print(icls, np.sum(msk_both))\n", - "\n", - " for ivar, ax in zip([2,3,4,5,6], axes[1:]):\n", - " \n", - "# hist = np.histogram2d(\n", - "# ycand_f[msk_both, ivar],\n", - "# ypred_f[msk_both, ivar], bins=(bins[ivar], bins[ivar])\n", - "# )\n", - "# norm = matplotlib.colors.Normalize(vmin=0, vmax=max(10, np.max(hist[0])))\n", - "# if ivar == 2 or ivar == 6:\n", - "# norm = matplotlib.colors.LogNorm(vmin=1, vmax=max(10, 10*np.max(hist[0])))\n", - "# hep.hist2dplot(\n", - "# hist, cmap=\"Blues\",\n", - "# norm=norm,\n", - "# ax=ax\n", - "# )\n", - " ax.scatter(ycand_f[msk_both, ivar], ypred_f[msk_both, ivar], marker=\".\", alpha=0.2)\n", - " ax.plot([bins[ivar][0],bins[ivar][-1]], [bins[ivar][0], bins[ivar][-1]], color=\"black\", ls=\"--\")\n", - " ax.set_title(\"pred. {}, {}\".format(pid_names[icls], var_names[ivar]))\n", - " ax.set_xlabel(\"true value (PFCandidate)\")\n", - " ax.set_ylabel(\"reconstructed value (MLPF)\")\n", - "plt.tight_layout()\n", - "plt.savefig(\"full_performance.png\", bbox_inches=\"tight\", dpi=400)" + " plt.hist2d(\n", + " vals_true,\n", + " vals_pred,\n", + " bins=(bins, bins),\n", + " cmap=\"Blues\", norm=norm\n", + " )\n", + " \n", + " plt.colorbar()\n", + " plt.plot([minval, maxval], [minval, maxval], color=\"black\", ls=\"--\", lw=0.5)\n", + " plt.xlim(minval, maxval)\n", + " plt.ylim(minval, maxval)\n", + " cms_label(x1=0.2, x2=0.48)\n", + " plt.text(0.02, 0.95, particle_label, transform=ax.transAxes)\n", + " ax.set_xticks(ax.get_yticks());" ] }, { "cell_type": "code", "execution_count": null, - "id": "scheduled-worst", + "id": "ecological-toner", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "plot_particle_regression(ivar=6, icls=1, particle_label=\"Charged hadrons\")\n", + "plt.xlabel(\"PFCandidate $\\log_{10}$ E/GeV\")\n", + "plt.ylabel(\"MLPFCandidate $\\log_{10}$ E/GeV\")\n", + "plt.savefig(\"energy_corr_cls1.pdf\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "transparent-remedy", + "metadata": {}, + "outputs": [], + "source": [ + "plot_particle_regression(ivar=6, icls=2, particle_label=\"Neutral hadrons\")\n", + "plt.xlabel(\"PFCandidate $\\log_{10}$ E/GeV\")\n", + "plt.ylabel(\"MLPFCandidate $\\log_{10}$ E/GeV\")\n", + "plt.savefig(\"energy_corr_cls2.pdf\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "promotional-checklist", + "metadata": {}, + "outputs": [], + "source": [ + "plot_particle_regression(ivar=3, icls=1, particle_label=\"Charged hadrons\", log=False, minval=-4, maxval=4, norm=None)\n", + "plt.xlabel(\"PFCandidate $\\eta$\")\n", + "plt.ylabel(\"MLPFCandidate $\\eta$\")\n", + "plt.savefig(\"eta_corr_cls1.pdf\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "suitable-kansas", + "metadata": 
{}, + "outputs": [], + "source": [ + "plot_particle_regression(ivar=3, icls=2, particle_label=\"Neutral hadrons\", log=False, minval=-4, maxval=4, norm=None)\n", + "plt.xlabel(\"PFCandidate $\\eta$\")\n", + "plt.ylabel(\"MLPFCandidate $\\eta$\")\n", + "plt.savefig(\"eta_corr_cls2.pdf\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "restricted-million", + "metadata": {}, + "outputs": [], + "source": [ + "plot_particle_regression(ivar=6, icls=3, particle_label=\"HF\", minval=0.0, maxval=4, norm=None)\n", + "plt.xlabel(\"PFCandidate $\\log_{10}$ E/GeV\")\n", + "plt.ylabel(\"MLPFCandidate $\\log_{10}$ E/GeV\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "raising-first", + "metadata": {}, + "outputs": [], + "source": [ + "plot_particle_regression(ivar=6, icls=4, particle_label=\"HF\", minval=0.0, maxval=4, norm=None)\n", + "plt.xlabel(\"PFCandidate $\\log_{10}$ E/GeV\")\n", + "plt.ylabel(\"MLPFCandidate $\\log_{10}$ E/GeV\")" + ] } ], "metadata": { diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 61e0b6ba3..818762c9c 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -59,7 +59,7 @@ setup: batch_size: 5 num_events_train: 80000 num_events_test: 10000 - num_epochs: 100 + num_epochs: 50 num_val_files: 10 dtype: float32 trainable: @@ -78,18 +78,13 @@ sample_weights: parameters: model: gnn_dense input_encoding: cms - do_node_encoding: no - hidden_dim: 128 - dropout: 0.0 - activation: gelu combined_graph_layer: - do_lsh: yes - bin_size: 160 + bin_size: 640 max_num_bins: 100 distance_dim: 128 layernorm: no dropout: 0.0 - dist_activation: gelu + dist_activation: linear kernel: type: NodePairGaussianKernel dist_mult: 0.1 @@ -97,7 +92,7 @@ parameters: num_node_messages: 1 node_message: type: GHConvDense - output_dim: 128 + output_dim: 256 activation: gelu normalize_degrees: yes hidden_dim: 128 @@ -127,7 +122,7 @@ parameters: phi_hidden_dim: 256 energy_hidden_dim: 256 - id_num_layers: 2 + id_num_layers: 3 charge_num_layers: 2 pt_num_layers: 2 eta_num_layers: 2 @@ -145,5 +140,5 @@ timing: exponentialdecay: decay_steps: 1000 - decay_rate: 0.98 - staircase: yes + decay_rate: 0.99 + staircase: yes \ No newline at end of file diff --git a/parameters/test-gnn/cms-lsh-mpnn.yaml b/parameters/test-gnn/cms-lsh-mpnn.yaml index 6b4ccc9ff..291cd98a5 100644 --- a/parameters/test-gnn/cms-lsh-mpnn.yaml +++ b/parameters/test-gnn/cms-lsh-mpnn.yaml @@ -102,7 +102,7 @@ parameters: hidden_dim: 128 num_layers: 2 activation: gelu - aggregation_direction: dst + aggregation_direction: src num_node_messages: 1 hidden_dim: 256 activation: gelu From df8fc6fb8f34bb9f13cd0778916c233ab3daaa73 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 2 Sep 2021 17:20:36 +0300 Subject: [PATCH 151/157] up --- parameters/cms-dev.yaml | 6 +++--- parameters/cms.yaml | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/parameters/cms-dev.yaml b/parameters/cms-dev.yaml index b9d603c52..0a1b2f203 100644 --- a/parameters/cms-dev.yaml +++ b/parameters/cms-dev.yaml @@ -56,7 +56,7 @@ setup: weights: weights_config: lr: 1e-3 - batch_size: 4 + batch_size: 2 num_events_train: 80000 num_events_test: 10000 num_epochs: 100 @@ -84,8 +84,8 @@ parameters: activation: gelu combined_graph_layer: do_lsh: yes - bin_size: 32 - max_num_bins: 500 + bin_size: 128 + max_num_bins: 100 distance_dim: 128 layernorm: no dropout: 0.0 diff --git a/parameters/cms.yaml b/parameters/cms.yaml index 818762c9c..fedd887cf 100644 --- a/parameters/cms.yaml +++ 
b/parameters/cms.yaml @@ -82,7 +82,7 @@ parameters: bin_size: 640 max_num_bins: 100 distance_dim: 128 - layernorm: no + layernorm: yes dropout: 0.0 dist_activation: linear kernel: @@ -92,7 +92,7 @@ parameters: num_node_messages: 1 node_message: type: GHConvDense - output_dim: 256 + output_dim: 128 activation: gelu normalize_degrees: yes hidden_dim: 128 @@ -115,7 +115,7 @@ parameters: phi_dim_decrease: yes energy_dim_decrease: yes - id_hidden_dim: 256 + id_hidden_dim: 512 charge_hidden_dim: 256 pt_hidden_dim: 256 eta_hidden_dim: 256 @@ -128,7 +128,7 @@ parameters: eta_num_layers: 2 phi_num_layers: 2 energy_num_layers: 2 - layernorm: no + layernorm: yes mask_reg_cls0: no skip_connection: yes @@ -139,6 +139,6 @@ timing: num_iter: 3 exponentialdecay: - decay_steps: 1000 + decay_steps: 2000 decay_rate: 0.99 staircase: yes \ No newline at end of file From 7b0f17dfd50c71715019a9cb6d5863f8dbbfb8a2 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Fri, 3 Sep 2021 12:10:36 +0200 Subject: [PATCH 152/157] feat: limit raytune pending trials SLURM_NNODES --- mlpf/flatiron/raytune.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlpf/flatiron/raytune.sh b/mlpf/flatiron/raytune.sh index e6abd033f..6dfbb71bb 100755 --- a/mlpf/flatiron/raytune.sh +++ b/mlpf/flatiron/raytune.sh @@ -20,6 +20,8 @@ echo "#################### Job submission script. #############################" cat $0 echo "################# End of job submission script. #########################" +export TUNE_RESULT_DIR="/mnt/ceph/users/ewulff/ray_results/tune_result_dir" +export TUNE_MAX_PENDING_TRIALS_PG=${SLURM_NNODES} module purge module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1 From a7985ea8c8c6895b728bcc96aaa5970c1540d810 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 3 Sep 2021 13:55:30 +0300 Subject: [PATCH 153/157] add charge to clic pf candidate --- clic/dumper.py | 6 +- notebooks/clic.ipynb | 192 ++++++++++++++++++++++++++++++------------- 2 files changed, 139 insertions(+), 59 deletions(-) diff --git a/clic/dumper.py b/clic/dumper.py index 15307f354..6e1b523c4 100644 --- a/clic/dumper.py +++ b/clic/dumper.py @@ -61,7 +61,8 @@ def pfParticleToDict(par): "px": mom[0], "py": mom[1], "pz": mom[2], - "energy": par.getEnergy() + "energy": par.getEnergy(), + "charge": par.getCharge() } return vec @@ -210,6 +211,7 @@ def caloHitToDict(par, calohit_to_cluster, genparticle_dict, calohit_recotosim): nPF=colPF.getNumberOfElements() nCl=colCl.getNumberOfElements() nTr=colTr.getNumberOfElements() + nHit=simTrackHits.getNumberOfElements() nHCB=colHCB.getNumberOfElements() nHCE=colHCE.getNumberOfElements() nECB=colECB.getNumberOfElements() @@ -223,7 +225,7 @@ def caloHitToDict(par, calohit_to_cluster, genparticle_dict, calohit_recotosim): assert(not (recohit in calohit_recotosim)) calohit_recotosim[recohit] = simhit - print "Event %d, nGen=%d, nPF=%d, nClusters=%d, nTracks=%d, nHCAL=%d, nECAL=%d" % (nEvent, nMc, nPF, nCl, nTr, nHCB+nHCE, nECB+nECE) + print "Event %d, nGen=%d, nPF=%d, nClusters=%d, nTracks=%d, nHCAL=%d, nECAL=%d, nHits=%d" % (nEvent, nMc, nPF, nCl, nTr, nHCB+nHCE, nECB+nECE, nHit) genparticles = [] genparticle_dict = {} diff --git a/notebooks/clic.ipynb b/notebooks/clic.ipynb index 8d52cbf8a..3f79ffd8e 100644 --- a/notebooks/clic.ipynb +++ b/notebooks/clic.ipynb @@ -22,7 +22,8 @@ "metadata": {}, "outputs": [], "source": [ - "data = json.load(bz2.BZ2File(\"/home/joosep/Downloads/pythia6_ttbar_0001_pandora.json.bz2\", \"r\"))" + "#data = 
json.load(bz2.BZ2File(\"/home/joosep/Downloads/pythia6_ttbar_0001_pandora.json.bz2\", \"r\"))\n", + "data = json.load(bz2.BZ2File(\"/home/joosep/particleflow/data/clic/gev380ee_pythia6_ttbar_rfull201/raw/pythia6_ttbar_0001_pandora_0.json.bz2\", \"r\"))" ] }, { @@ -55,7 +56,7 @@ "metadata": {}, "outputs": [], "source": [ - "iev = 28\n", + "iev = 0\n", "df_gen = pandas.DataFrame(data[iev][\"genparticles\"])\n", "\n", "df_hit = pandas.DataFrame(data[iev][\"track_hits\"])\n", @@ -71,6 +72,46 @@ "df_tr[\"pz\"] = df_tr[\"tan_lambda\"]*df_tr[\"pt\"]" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6cc1ff5", + "metadata": {}, + "outputs": [], + "source": [ + "df_hit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9089cfae", + "metadata": {}, + "outputs": [], + "source": [ + "df_ecal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2e01940", + "metadata": {}, + "outputs": [], + "source": [ + "df_hcal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efc9be54", + "metadata": {}, + "outputs": [], + "source": [ + "df_gen" + ] + }, { "cell_type": "code", "execution_count": null, @@ -150,6 +191,9 @@ " if filter_gp(gp):\n", " dg.add_node((\"gp\", gp))\n", " gps.add(gp)\n", + " \n", + " #the track is added to the genparticle with a very high weight\n", + " #because we always want to associate the genparticle to a track if it's possible\n", " dg.add_edge((\"gp\", gp), (\"tr\", itr), weight=9999.0)\n", "\n", " \n", @@ -157,22 +201,26 @@ "gps = set(gps)\n", "\n", "#now loop over all the genparticles\n", - "#for each genparticle, find the neighboring reco elements (clusters and tracks)\n", - "#sort the neighbors by the edge weight (deposited energy)\n", - "#for each genparticle, choose the closest neighbor as the \"key\" reco element\n", - "#remove the reco element from the list\n", "pairs = {}\n", "for gp in gps:\n", " gp_node = (\"gp\", gp)\n", + "\n", + " #find the neighboring reco elements (clusters and tracks)\n", " neighbors = list(dg.neighbors(gp_node))\n", " weights = [dg.edges[gp_node, n][\"weight\"] for n in neighbors]\n", " nw = zip(neighbors, weights)\n", + " \n", + " #sort the neighbors by the edge weight (deposited energy)\n", " nw = sorted(nw, key=lambda x: x[1], reverse=True)\n", " reco_obj = None\n", " if len(nw)>0:\n", + " #choose the closest neighbor as the \"key\" reco element\n", " reco_obj = nw[0][0]\n", - " dg.remove_node(reco_obj)\n", " \n", + " #remove the reco element from the list, so it can't be associated to anything else\n", + " dg.remove_node(reco_obj)\n", + " \n", + " #this genparticle had a unique reco element\n", " if reco_obj:\n", " pf_obj = None\n", " if reco_obj and reco_obj in reco_to_pf:\n", @@ -180,8 +228,11 @@ "\n", " assert(not (reco_obj in pairs))\n", " pairs[reco_obj] = (gp, pf_obj)\n", + " \n", + " #this is a case where a genparticle did not have a key reco element, but instead was smeared between others\n", " else:\n", - " print(\"genparticle {} is merged and cannot be reconstructed\".format(gp))" + " print(\"genparticle {} is merged and cannot be reconstructed\".format(gp))\n", + " print(df_gen.loc[gp])" ] }, { @@ -201,27 +252,27 @@ "metadata": {}, "outputs": [], "source": [ - "def track_as_array(itr):\n", + "def track_as_array(df_tr, itr):\n", " row = df_tr.loc[itr]\n", " return [0, row[\"px\"], row[\"py\"], row[\"pz\"], row[\"nhits\"], row[\"d0\"], row[\"z0\"]]\n", "\n", - "def cluster_as_array(icl):\n", + "def cluster_as_array(df_cl, icl):\n", " row = 
df_cl.loc[icl]\n", " return [1, row[\"x\"], row[\"y\"], row[\"z\"], row[\"nhits_ecal\"], row[\"nhits_hcal\"], 0.0]\n", "\n", - "def gen_as_array(igen):\n", + "def gen_as_array(df_gen, igen):\n", " if igen:\n", " row = df_gen.loc[igen]\n", - " return np.array([row[\"pdgid\"], row[\"px\"], row[\"py\"], row[\"pz\"], row[\"energy\"]])\n", + " return np.array([abs(row[\"pdgid\"]), row[\"charge\"], row[\"px\"], row[\"py\"], row[\"pz\"], row[\"energy\"]])\n", " else:\n", - " return np.zeros(5)\n", + " return np.zeros(6)\n", " \n", - "def pf_as_array(igen):\n", + "def pf_as_array(df_pfs, igen):\n", " if igen:\n", " row = df_pfs.loc[igen]\n", - " return np.array([row[\"type\"], row[\"px\"], row[\"py\"], row[\"pz\"], row[\"energy\"]])\n", + " return np.array([abs(row[\"type\"]), row[\"charge\"], row[\"px\"], row[\"py\"], row[\"pz\"], row[\"energy\"]])\n", " else:\n", - " return np.zeros(5)" + " return np.zeros(6)" ] }, { @@ -231,37 +282,42 @@ "metadata": {}, "outputs": [], "source": [ - "Xs = []\n", - "ys_gen = []\n", - "ys_cand = []\n", - "for itr in range(len(df_tr)):\n", - " Xs.append(track_as_array(itr))\n", + "def flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs):\n", + " Xs = []\n", + " ys_gen = []\n", + " ys_cand = []\n", " \n", - " k = (\"tr\", itr)\n", - " gp = None\n", - " rp = None\n", - " if k in pairs:\n", - " gp = pairs[k][0]\n", - " rp = pairs[k][1]\n", - " ys_gen.append(gen_as_array(gp))\n", - " ys_cand.append(pf_as_array(rp))\n", + " #find all track-associated particles\n", + " for itr in range(len(df_tr)):\n", + " Xs.append(track_as_array(df_tr, itr))\n", "\n", + " k = (\"tr\", itr)\n", + " gp = None\n", + " rp = None\n", + " if k in pairs:\n", + " gp = pairs[k][0]\n", + " rp = pairs[k][1]\n", + " ys_gen.append(gen_as_array(df_gen, gp))\n", + " ys_cand.append(pf_as_array(df_pfs, rp))\n", " \n", - "for icl in range(len(df_cl)):\n", - " Xs.append(cluster_as_array(icl))\n", - " \n", - " k = (\"cl\", icl)\n", - " gp = None\n", - " rp = None\n", - " if k in pairs:\n", - " gp = pairs[k][0]\n", - " rp = pairs[k][1]\n", - " ys_gen.append(gen_as_array(gp))\n", - " ys_cand.append(pf_as_array(rp))\n", + " #find all cluster-associated particles\n", + " for icl in range(len(df_cl)):\n", + " Xs.append(cluster_as_array(df_cl, icl))\n", + "\n", + " k = (\"cl\", icl)\n", + " gp = None\n", + " rp = None\n", + " if k in pairs:\n", + " gp = pairs[k][0]\n", + " rp = pairs[k][1]\n", + " ys_gen.append(gen_as_array(df_gen, gp))\n", + " ys_cand.append(pf_as_array(df_pfs, rp))\n", + "\n", + " Xs = np.stack(Xs, axis=-1).T\n", + " ys_gen = np.stack(ys_gen, axis=-1).T\n", + " ys_cand = np.stack(ys_cand, axis=-1).T\n", " \n", - "Xs = np.stack(Xs, axis=-1).T\n", - "ys_gen = np.stack(ys_gen, axis=-1).T\n", - "ys_cand = np.stack(ys_cand, axis=-1).T" + " return Xs, ys_gen, ys_cand" ] }, { @@ -271,58 +327,80 @@ "metadata": {}, "outputs": [], "source": [ - "len(Xs)\n", - "i = 106" + "Xs, ys_gen, ys_cand = flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs)\n", + "len(Xs), len(ys_gen), len(ys_cand)" ] }, { "cell_type": "code", "execution_count": null, - "id": "mexican-immune", + "id": "c022fce0", "metadata": {}, "outputs": [], "source": [ - "Xs[i]" + "import sklearn\n", + "import sklearn.metrics" ] }, { "cell_type": "code", "execution_count": null, - "id": "fossil-cornell", + "id": "16dde9e2", "metadata": {}, "outputs": [], "source": [ - "ys_gen[i]" + "np.unique(ys_gen[:, 0])" ] }, { "cell_type": "code", "execution_count": null, - "id": "medium-armor", + "id": "012ef075", "metadata": {}, "outputs": [], "source": [ - 
"ys_cand[i]" + "np.unique(ys_cand[:, 0])" ] }, { "cell_type": "code", "execution_count": null, - "id": "confident-publisher", + "id": "e9c5b8cd", "metadata": {}, "outputs": [], "source": [ - "ys_gen[:, 0]" + "labels = [0, 13, 11, 22, 130, 211, 321, 2112, 2212]\n", + "labels_text = {\n", + " 0: \"none\",\n", + " 13: \"mu\",\n", + " 11: \"el\",\n", + " 22: \"$\\gamma$\",\n", + " 130: \"$K^0_L$\",\n", + " 211: \"$\\pi^\\pm$\",\n", + " 321: \"$K^+$\",\n", + " 2112: \"n\",\n", + " 2212: \"p\"\n", + "}\n", + "cm = sklearn.metrics.confusion_matrix(\n", + " ys_gen[:, 0],\n", + " ys_cand[:, 0],\n", + " labels=labels,\n", + " normalize=\"true\"\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "cardiovascular-majority", + "id": "8817f3e5", "metadata": {}, "outputs": [], "source": [ - "ys_cand[:, 0]" + "plt.imshow(cm, cmap=\"Blues\")\n", + "plt.xticks(range(len(labels)), [labels_text[l] for l in labels], rotation=90);\n", + "plt.yticks(range(len(labels)), [labels_text[l] for l in labels]);\n", + "plt.xlabel(\"reco\")\n", + "plt.ylabel(\"gen\")" ] }, { @@ -461,7 +539,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -475,7 +553,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.10" } }, "nbformat": 4, From f2d75e0f602360caad2c0c9cd822005f379983d8 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 3 Sep 2021 13:56:46 +0300 Subject: [PATCH 154/157] added cms gen config --- mlpf/pipeline.py | 7 +- notebooks/cms-mlpf.ipynb | 106 +++++++++++++++++++++++++++- parameters/cms-gen.yaml | 144 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 253 insertions(+), 4 deletions(-) create mode 100644 parameters/cms-gen.yaml diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 5bddd912b..c7859d4cb 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -190,9 +190,10 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, configure_model_weights(model, config["setup"]["trainable"]) model(tf.cast(X_val[:1], model_dtype)) - print("trainable weights") - for w in model.trainable_weights: - print(w.name) + print("model weights") + tw_names = [m.name for m in model.trainable_weights] + for w in model.weights: + print("layer={} trainable={} shape={} num_weights={}".format(w.name, w.name in tw_names, w.shape, np.prod(w.shape))) loss_dict, loss_weights = get_loss_dict(config) model.compile( diff --git a/notebooks/cms-mlpf.ipynb b/notebooks/cms-mlpf.ipynb index bfdc07ec7..d2b76df1e 100644 --- a/notebooks/cms-mlpf.ipynb +++ b/notebooks/cms-mlpf.ipynb @@ -132,7 +132,8 @@ "metadata": {}, "outputs": [], "source": [ - "path = \"../experiments/cms-dev_20210831_225815_541048.gpu0.local/evaluation/\"" + "#path = \"../experiments/cms-dev_20210831_225815_541048.gpu0.local/evaluation/\"\n", + "path = \"../experiments/cms-gen_20210903_114315_805349.joosep-desktop-work/evaluation/\"" ] }, { @@ -1016,6 +1017,109 @@ "plt.xlabel(\"PFCandidate $\\log_{10}$ E/GeV\")\n", "plt.ylabel(\"MLPFCandidate $\\log_{10}$ E/GeV\")" ] + }, + { + "cell_type": "markdown", + "id": "4a3ab75a", + "metadata": {}, + "source": [ + "## Gen level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "700c7700", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "for icls in range(1,8):\n", + " npred = np.sum(ypred_id == icls, axis=1)\n", + " ncand = np.sum(ycand[:, :, 0] == icls, axis=1)\n", + 
" ngen = np.sum(ygen[:, :, 0] == icls, axis=1)\n", + " plt.figure(figsize=(6,6))\n", + " plt.scatter(ngen, ncand, marker=\".\", alpha=0.5, label=\"PF\")\n", + " plt.scatter(ngen, npred, marker=\".\", alpha=0.5, label=\"MLPF\")\n", + " plt.legend(loc=\"best\", frameon=False)\n", + " a = 0.5*min(np.min(ngen), np.min(ngen))\n", + " b = 2*max(np.max(ngen), np.max(ngen))\n", + " plt.xlim(a,b)\n", + " plt.ylim(a,b)\n", + " plt.plot([a,b],[a,b], color=\"black\", ls=\"--\")\n", + " plt.title(pid_names_long[icls],y=1.05)\n", + " plt.xlabel(\"number of gen particles\")\n", + " plt.ylabel(\"number of PFCandidates\")\n", + " cms_label(x2=0.6, y=0.89)\n", + "# plt.savefig(\"num_cls{}.pdf\".format(icls))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5661ff16", + "metadata": {}, + "outputs": [], + "source": [ + "bins = np.linspace(0,500,100)\n", + "mplhep.histplot(np.histogram(ygen_f[ygen_f[:, 0]==2, 6], bins=bins))\n", + "mplhep.histplot(np.histogram(ycand_f[ycand_f[:, 0]==2, 6], bins=bins))\n", + "mplhep.histplot(np.histogram(ypred_f[ypred_f[:, 0]==2, 6], bins=bins))\n", + "plt.yscale(\"log\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82f29ef8", + "metadata": {}, + "outputs": [], + "source": [ + "icls = 4\n", + "bins = np.linspace(-200,200,100)\n", + "particle_label = \"neutral hadrons\"\n", + "\n", + "msk_cand = (ygen_f[:, 0]==icls) & (ycand_f[:, 0]==icls)\n", + "msk_pred = (ygen_f[:, 0]==icls) & (ypred_f[:, 0]==icls)\n", + "\n", + "vals_gen1 = ygen_f[msk_cand, 6]\n", + "vals_gen2 = ygen_f[msk_pred, 6]\n", + "vals_cand = ycand_f[msk_cand, 6]\n", + "vals_pred = ypred_f[msk_pred, 6]\n", + "\n", + "res_cand = vals_gen1 - vals_cand\n", + "res_pred = vals_gen2 - vals_pred\n", + "\n", + "plt.figure(figsize=(5,5))\n", + "ax = plt.axes()\n", + "plt.hist(\n", + " res_cand,\n", + " bins=bins, histtype=\"step\", lw=2,\n", + " label=\"PF, $\\mu={:.2f}, \\sigma={:.2f}$\".format(np.mean(res_cand), np.std(res_cand)));\n", + "\n", + "plt.hist(res_pred,\n", + " bins=bins,\n", + " histtype=\"step\", lw=2,\n", + " label=\"MLPF, $\\mu={:.2f}, \\sigma={:.2f}$\".format(np.mean(res_pred), np.std(res_pred))\n", + ");\n", + "\n", + "plt.yscale(\"log\")\n", + "plt.ylabel(\"Number of particles / bin\")\n", + "cms_label(x1=0.21, x2=0.55)\n", + "plt.ylim(top=10**9)\n", + "plt.text(0.02, 0.95, particle_label, transform=ax.transAxes)\n", + "plt.xlabel(\"particle $E_{\\mathrm{gen}} - E_{\\mathrm{reco}}$ [GeV]\")\n", + "plt.legend(frameon=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "338f50e9", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml new file mode 100644 index 000000000..f7df7746b --- /dev/null +++ b/parameters/cms-gen.yaml @@ -0,0 +1,144 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: gen + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.01 + pt_loss_coef: 0.0001 + eta_loss_coef: 100.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 0.0001 + raw_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl* + 
processed_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_gen/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl* + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + delta: 0.1 + cos_phi_loss: + type: Huber + delta: 0.1 + eta_loss: + type: Huber + delta: 0.1 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 1e-3 + batch_size: 4 + num_events_train: 80000 + num_events_test: 10000 + num_epochs: 50 + num_val_files: 10 + dtype: float32 + trainable: classification + classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle + +sample_weights: + cls: inverse_sqrt + charge: signal_only + pt: signal_only + eta: signal_only + sin_phi: signal_only + cos_phi: signal_only + energy: signal_only + +parameters: + model: gnn_dense + input_encoding: cms + combined_graph_layer: + bin_size: 640 + max_num_bins: 100 + distance_dim: 64 + layernorm: no + dropout: 0.0 + dist_activation: linear + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + num_node_messages: 2 + node_message: + type: GHConvDense + output_dim: 512 + activation: gelu + normalize_degrees: yes + hidden_dim: 512 + activation: gelu + num_graph_layers_common: 2 + num_graph_layers_energy: 2 + output_decoding: + activation: gelu + regression_use_classification: yes + dropout: 0.0 + + pt_skip_gate: no + eta_skip_gate: yes + phi_skip_gate: yes + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 512 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 3 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: no + mask_reg_cls0: no + + skip_connection: yes + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +exponentialdecay: + decay_steps: 2000 + decay_rate: 0.99 + staircase: yes From 1519195be4e246d939f72e2b582faa786e6bc12b Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 3 Sep 2021 16:00:58 +0300 Subject: [PATCH 155/157] fix bin size --- scripts/test_load_tfmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/test_load_tfmodel.py b/scripts/test_load_tfmodel.py index 76020bc3d..2e30b6346 100644 --- a/scripts/test_load_tfmodel.py +++ b/scripts/test_load_tfmodel.py @@ -2,7 +2,7 @@ import sys import numpy as np -bin_size = 160 +bin_size = 640 num_features = 15 def load_graph(frozen_graph_filename): From 47ae9a46aef6d16e4b6619b6bc4a859967e6d5ec Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 3 Sep 2021 16:01:03 +0300 Subject: [PATCH 156/157] fixes for gun sample training --- mlpf/pipeline.py | 29 +++++++++++++++++++---------- mlpf/tfmodel/model.py | 5 ++--- mlpf/tfmodel/model_setup.py | 1 - parameters/delphes.yaml | 1 + 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 348666f71..2daa8d642 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -149,6 +149,11 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, prefix += customize + "_" config = customization_functions[customize](config) + if recreate or (weights is None): + outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node()) + else: + outdir = str(Path(weights).parent) + # Decide 
tf.distribute.strategy depending on number of available GPUs strategy, maybe_global_batch_size = get_strategy(global_batch_size) if "CPU" not in strategy.extended.worker_devices[0]: @@ -166,10 +171,6 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=False) - if recreate or (weights is None): - outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node()) - else: - outdir = str(Path(weights).parent) if experiment: experiment.set_name(outdir) experiment.log_code("mlpf/tfmodel/model.py") @@ -196,7 +197,11 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, # Run model once to build the layers print(X_val.shape) - model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) + + if config["tensorflow"]["eager"]: + model(X_val[:1]) + else: + model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) initial_epoch = 0 if weights: @@ -402,16 +407,19 @@ def find_lr(config, outdir, figname, logscale): def customize_gun_sample(config): - config["dataset"]["padded_num_elem_size"] = 640 + + #FIXME: must be at least 2x bin_size + config["dataset"]["padded_num_elem_size"] = 1280 + config["dataset"]["processed_path"] = "data/SinglePiFlatPt0p7To10_cfi/tfr_cand/*.tfrecords" - config["dataset"]["raw_path"] = "data/SinglePiFlatPt0p7To10_cfi/raw/*.pkl.bz2" + config["dataset"]["raw_path"] = "data/SinglePiFlatPt0p7To10_cfi/raw/*.pkl*" config["dataset"]["classification_loss_coef"] = 0.0 config["dataset"]["charge_loss_coef"] = 0.0 config["dataset"]["eta_loss_coef"] = 0.0 config["dataset"]["sin_phi_loss_coef"] = 0.0 config["dataset"]["cos_phi_loss_coef"] = 0.0 - config["setup"]["trainable"] = "ffn_energy" - config["setup"]["batch_size"] = 10*config["setup"]["batch_size"] + config["setup"]["trainable"] = "regression" + config["setup"]["batch_size"] = 20*config["setup"]["batch_size"] return config customization_functions = { @@ -472,6 +480,7 @@ def hypertune(config, outdir, ntrain, ntest, recreate): config["dataset"]["num_output_classes"], dataset_def, ) + callbacks.append(optim_callbacks) callbacks.append(tf.keras.callbacks.EarlyStopping(patience=20, monitor='val_loss')) @@ -487,7 +496,7 @@ def hypertune(config, outdir, ntrain, ntest, recreate): #callbacks=[tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss')] callbacks=callbacks, ) - print("Hyperparamter search complete.") + print("Hyperparameter search complete.") shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference tuner.results_summary() diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 609952079..7bf0db3ae 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -37,8 +37,6 @@ def pairwise_gaussian_dist(A, B): def pairwise_learnable_dist(A, B, ffn, training=False): shp = tf.shape(A) - # tf.print("shp", shp) - # import pdb;pdb.set_trace() #stack node feature vectors of src[i], dst[j] into a matrix res[i,j] = (src[i], dst[j]) mg = tf.meshgrid(tf.range(shp[0]), tf.range(shp[1]), tf.range(shp[2]), tf.range(shp[2]), indexing="ij") inds1 = tf.stack([mg[0],mg[1],mg[2]], axis=-1) @@ -384,6 +382,7 @@ def call(self, x_msg, x_node, msk, training=False): n_bins = tf.math.floordiv(n_points, self.bin_size) #put each input item into a bin defined by the argmax output across the LSH embedding + #FIXME: this needs n_bins to be 
at least 2 to work correctly! mul = tf.linalg.matmul(x_msg, self.codebook_random_rotations[:, :n_bins//2]) cmul = tf.concat([mul, -mul], axis=-1) bins_split = split_indices_to_bins_batch(cmul, n_bins, self.bin_size, msk) @@ -661,7 +660,7 @@ def __init__(self, *args, **kwargs): self.hidden_dim = kwargs.pop("hidden_dim") self.do_lsh = kwargs.pop("do_lsh", True) self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) - self.dist_activation = getattr(tf.keras.activations, kwargs.pop("dist_activation")) + self.dist_activation = getattr(tf.keras.activations, kwargs.pop("dist_activation", "linear")) if self.do_layernorm: self.layernorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6, name=kwargs.get("name")+"_layernorm") diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index b818406d4..5383587be 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -662,7 +662,6 @@ def configure_model_weights(model, trainable_layers): cg.trainable = False for cg in model.cg_energy: cg.trainable = True - model.output_dec.set_trainable_regression() elif trainable_layers == "classification": for cg in model.cg: diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index 902cff14b..6d46b54f0 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -98,6 +98,7 @@ parameters: layernorm: no num_node_messages: 1 dropout: 0.0 + dist_activation: linear kernel: type: NodePairGaussianKernel dist_mult: 0.1 From cf7d9743df6228b65b571747ecd5b667d0fda7af Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 3 Sep 2021 16:06:00 +0300 Subject: [PATCH 157/157] make sure batch size is propagated --- mlpf/pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 2daa8d642..0c80dff1b 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -148,6 +148,8 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, if customize: prefix += customize + "_" config = customization_functions[customize](config) + #FIXME: refactor this + global_batch_size = config["setup"]["batch_size"] if recreate or (weights is None): outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node()) @@ -419,7 +421,7 @@ def customize_gun_sample(config): config["dataset"]["sin_phi_loss_coef"] = 0.0 config["dataset"]["cos_phi_loss_coef"] = 0.0 config["setup"]["trainable"] = "regression" - config["setup"]["batch_size"] = 20*config["setup"]["batch_size"] + config["setup"]["batch_size"] = 10*config["setup"]["batch_size"] return config customization_functions = {