From b1cb537b4e89b82048c73a42c750b4c6f4ae1990 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Wed, 24 Aug 2022 18:41:25 +0300 Subject: [PATCH] Faster test, pre-commit formatting, general cleanup (#129) * add pre commit hooks * add reqs * fix configs * cleanup * install from reqs file --- .github/workflows/pre-commit.yml | 27 + .github/workflows/test.yml | 22 +- .gitignore | 4 + .pre-commit-config.yaml | 40 ++ mlpf/pipeline.py | 464 +++++++------- mlpf/tfmodel/data.py | 7 - mlpf/tfmodel/datasets/BaseDatasetFactory.py | 56 +- mlpf/tfmodel/datasets/CMSDatasetFactory.py | 6 +- .../tfmodel/datasets/DelphesDatasetFactory.py | 6 +- mlpf/tfmodel/delphes_data.py | 179 ------ mlpf/tfmodel/fast_attention.py | 486 --------------- mlpf/tfmodel/fast_attention_util.py | 195 ------ mlpf/tfmodel/model.py | 590 ++++++++++-------- mlpf/tfmodel/model_setup.py | 350 +++++------ mlpf/tfmodel/mpnn.py | 291 --------- mlpf/tfmodel/opt.py | 91 --- mlpf/tfmodel/pred_tf_model.py | 156 ----- mlpf/tfmodel/tf_data.py | 128 ---- mlpf/tfmodel/utils.py | 292 ++++----- parameters/cms-gen.yaml | 8 +- parameters/cms.yaml | 8 +- parameters/delphes.yaml | 13 +- requirements.txt | 35 ++ scripts/local_test_cms_pipeline.sh | 2 +- scripts/local_test_delphes_pipeline.sh | 2 +- 25 files changed, 964 insertions(+), 2494 deletions(-) create mode 100644 .github/workflows/pre-commit.yml create mode 100644 .pre-commit-config.yaml delete mode 100644 mlpf/tfmodel/data.py delete mode 100644 mlpf/tfmodel/delphes_data.py delete mode 100644 mlpf/tfmodel/fast_attention.py delete mode 100644 mlpf/tfmodel/fast_attention_util.py delete mode 100644 mlpf/tfmodel/mpnn.py delete mode 100644 mlpf/tfmodel/opt.py delete mode 100644 mlpf/tfmodel/pred_tf_model.py delete mode 100644 mlpf/tfmodel/tf_data.py create mode 100644 requirements.txt diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 000000000..b10f1c781 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,27 @@ +name: Run pre-commit + +on: + pull_request: + branches: [ main ] + push: + branches: [ main ] + +jobs: + lint: + name: Lint PR or Push to main + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.9] + + steps: + - name: Checkout + uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Run Lint + uses: pre-commit/action@v2.0.0 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 96b3d24c5..65210ca9d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -20,15 +20,8 @@ jobs: python-version: '3.9' - name: Install python deps run: | - pip install tensorflow==2.9 setGPU \ - sklearn matplotlib boost_histogram mplhep pandas scipy uproot \ - awkward vector pyarrow fastjet keras-tuner networkx \ - tensorflow-probability tensorflow-addons \ - tqdm click tensorflow-datasets 'ray[default]'==1.6.0 'ray[tune]==1.6.0' \ - tf-models-official tensorflow-text \ - tf2onnx onnxruntime zenodo_get seaborn scikit-optimize nevergrad \ - tensorflow-estimator keras \ - notebook papermill ./hep_tfds + pip install -r requirements.txt + pip install ./hep_tfds HOROVOD_WITH_TENSORFLOW=1 pip install horovod[tensorflow,keras] - name: Run delphes TF model run: ./scripts/local_test_delphes_pipeline.sh @@ -44,15 +37,8 @@ jobs: python-version: '3.9' - name: Install python deps run: | - pip install tensorflow==2.9 setGPU \ - sklearn matplotlib boost_histogram mplhep pandas scipy 
uproot \ - awkward vector pyarrow fastjet keras-tuner networkx \ - tensorflow-probability tensorflow-addons \ - tqdm click tensorflow-datasets 'ray[default]'==1.6.0 'ray[tune]==1.6.0' \ - tf-models-official tensorflow-text \ - tf2onnx onnxruntime zenodo_get seaborn scikit-optimize nevergrad \ - tensorflow-estimator keras \ - notebook papermill ./hep_tfds + pip install -r requirements.txt + pip install ./hep_tfds HOROVOD_WITH_TENSORFLOW=1 pip install horovod[tensorflow,keras] - name: Run CMS TF model using the pipeline run: ./scripts/local_test_cms_pipeline.sh diff --git a/.gitignore b/.gitignore index a378a4529..6f720c1de 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ *.npz *.pt *.pdf +*.png data/* experiments/* prp/* @@ -19,3 +20,6 @@ test/__pycache__/ *playground.py nohup.out + +*.pkl +*.pkl.bz2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..f989e4e10 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,40 @@ +default_language_version: + python: python3.9 + +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: check-added-large-files + - id: check-ast + - id: check-json + - id: check-merge-conflict + - id: check-xml + - id: check-yaml + - id: debug-statements + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: mixed-line-ending + args: ['--fix=no'] + +- repo: https://github.com/PyCQA/isort + rev: 5.10.1 + hooks: + - id: isort + args: ['--profile', 'black', '--filter-files'] + +- repo: https://github.com/psf/black + rev: 22.6.0 + hooks: + - id: black-jupyter + language_version: python3 + args: [--line-length=125] + +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.9.2 + hooks: + - id: flake8 + # black-compatible flake-8 config + args: ['--max-line-length=125', # github viewer width + '--extend-ignore=E203,W605'] # E203 is not PEP8 compliant diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index fe75e32e8..5c202b4d4 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -1,87 +1,60 @@ -try: - import comet_ml -except ModuleNotFoundError as e: - print("comet_ml not found, ignoring") - try: import horovod.tensorflow.keras as hvd except ModuleNotFoundError: print("hvd not enabled, ignoring") -import sys -import os -import yaml import json -from datetime import datetime -import glob -import random +import logging +import os +import pickle import platform -import numpy as np -from pathlib import Path -import click -from tqdm import tqdm +import random import shutil +from datetime import datetime from functools import partial -import shlex -import subprocess -import matplotlib.pyplot as plt -import logging -import pickle +from pathlib import Path +import click +import numpy as np import tensorflow as tf from tensorflow.keras import mixed_precision -import tensorflow_addons as tfa -import keras - - -from tfmodel.data import Dataset -from tfmodel.datasets import CMSDatasetFactory, DelphesDatasetFactory +from tfmodel import hypertuning +from tfmodel.lr_finder import LRFinder from tfmodel.model_setup import ( - make_model, - configure_model_weights, - LearningRateLoggingCallback, - prepare_callbacks, FlattenedCategoricalAccuracy, SingleClassRecall, + configure_model_weights, eval_model, freeze_model, + make_model, + prepare_callbacks, ) - from tfmodel.utils import ( + create_experiment_dir, + delete_all_but_best_checkpoint, + get_best_checkpoint, + get_datasets, + get_heptfds_dataset, + get_loss_dict, get_lr_schedule, get_optimizer, - 
create_experiment_dir, get_strategy, - make_weight_function, + get_tuner, load_config, - compute_weights_invsqrt, - compute_weights_none, - get_train_val_datasets, - get_dataset_def, - set_config_loss, - get_loss_dict, parse_config, - get_best_checkpoint, - delete_all_but_best_checkpoint, - get_tuner, - get_heptfds_dataset, - get_datasets, + set_config_loss, ) - -from tfmodel.lr_finder import LRFinder -from tfmodel import hypertuning from tfmodel.utils_analysis import ( - plot_ray_analysis, analyze_ray_experiment, - topk_summary_plot_v2, - summarize_top_k, count_skipped_configurations, + plot_ray_analysis, + summarize_top_k, + topk_summary_plot_v2, ) - def customize_pipeline_test(config): - #for cms.yaml, keep only ttbar + # for cms.yaml, keep only ttbar if "physical" in config["train_test_datasets"]: config["train_test_datasets"]["physical"]["datasets"] = ["cms_pf_ttbar"] config["train_test_datasets"] = {"physical": config["train_test_datasets"]["physical"]} @@ -90,15 +63,16 @@ def customize_pipeline_test(config): return config -customization_functions = { - "pipeline_test": customize_pipeline_test -} + +customization_functions = {"pipeline_test": customize_pipeline_test} + @click.group() @click.help_option("-h", "--help") def main(): pass + @main.command() @click.help_option("-h", "--help") @click.option("-c", "--config", help="configuration file", type=click.Path()) @@ -113,25 +87,35 @@ def main(): @click.option("--comet-offline", help="log comet-ml experiment locally", is_flag=True) def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, customize, comet_offline): - #tf.debugging.enable_check_numerics() + # tf.debugging.enable_check_numerics() """Train a model defined by config""" config_file_path = config - config, config_file_stem = parse_config( - config, nepochs=nepochs, weights=weights - ) + config, config_file_stem = parse_config(config, nepochs=nepochs, weights=weights) if plot_freq: config["callbacks"]["plot_freq"] = plot_freq if customize: config = customization_functions[customize](config) - + + # Decide tf.distribute.strategy depending on number of available GPUs + horovod_enabled = config["setup"]["horovod_enabled"] + if horovod_enabled: + num_gpus = initialize_horovod() + else: + strategy, num_gpus = get_strategy() + + outdir = "" + if not horovod_enabled or hvd.rank() == 0: + outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node()) + shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference try: if comet_offline: print("Using comet-ml OfflineExperiment, saving logs locally.") from comet_ml import OfflineExperiment + experiment = OfflineExperiment( project_name="particleflow-tf", auto_metric_logging=True, @@ -144,7 +128,7 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, else: print("Using comet-ml Experiment, streaming logs to www.comet.ml.") from comet_ml import Experiment - offline_dir = None + experiment = Experiment( project_name="particleflow-tf", auto_metric_logging=True, @@ -154,31 +138,26 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, auto_histogram_activation_logging=False, ) except Exception as e: - print("Failed to initialize comet-ml dashboard") + print("Failed to initialize comet-ml dashboard: {}".format(e)) experiment = None - - # Decide tf.distribute.strategy depending on number of available GPUs - horovod_enabled = config["setup"]["horovod_enabled"] - if horovod_enabled: 
- num_gpus = initialize_horovod() - else: - strategy, num_gpus = get_strategy() - outdir = '' - if not horovod_enabled or hvd.rank() == 0: - outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node()) - if experiment: - experiment.set_name(outdir) - experiment.log_code("mlpf/tfmodel/model.py") - experiment.log_code("mlpf/tfmodel/utils.py") - experiment.log_code(config_file_path) - - shutil.copy(config_file_path, outdir + "/config.yaml") # Copy the config file to the train dir for later reference + if experiment: + experiment.set_name(outdir) + experiment.log_code("mlpf/tfmodel/model.py") + experiment.log_code("mlpf/tfmodel/utils.py") + experiment.log_code(config_file_path) ds_train, num_train_steps = get_datasets(config["train_test_datasets"], config, num_gpus, "train") ds_test, num_test_steps = get_datasets(config["train_test_datasets"], config, num_gpus, "test") - ds_val, ds_info = get_heptfds_dataset(config["validation_datasets"][0], config, num_gpus, "test", config["setup"]["num_events_validation"], supervised=False) + ds_val, ds_info = get_heptfds_dataset( + config["validation_datasets"][0], + config, + num_gpus, + "test", + config["setup"]["num_events_validation"], + supervised=False, + ) ds_val = ds_val.batch(5) - + if ntrain: ds_train = ds_train.take(ntrain) num_train_steps = ntrain @@ -191,23 +170,18 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, total_steps = num_train_steps * config["setup"]["num_epochs"] print("total_steps", total_steps) - - if horovod_enabled : - model,optim_callbacks,initial_epoch = model_scope(config, total_steps, weights, horovod_enabled) + if horovod_enabled: + model, optim_callbacks, initial_epoch = model_scope(config, total_steps, weights, horovod_enabled) else: with strategy.scope(): - model,optim_callbacks,initial_epoch = model_scope(config, total_steps, weights) + model, optim_callbacks, initial_epoch = model_scope(config, total_steps, weights) callbacks = prepare_callbacks( - config, - outdir, - ds_val, - comet_experiment=experiment, - horovod_enabled=config["setup"]["horovod_enabled"] + config, outdir, ds_val, comet_experiment=experiment, horovod_enabled=config["setup"]["horovod_enabled"] ) verbose = 1 - if horovod_enabled: + if horovod_enabled: callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0)) callbacks.append(hvd.callbacks.MetricAverageCallback()) verbose = 1 if hvd.rank() == 0 else 0 @@ -215,11 +189,9 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, num_train_steps /= hvd.size() num_test_steps /= hvd.size() - callbacks.append(optim_callbacks) - - fit_result = model.fit( + model.fit( ds_train.repeat(), validation_data=ds_test.repeat(), epochs=initial_epoch + config["setup"]["num_epochs"], @@ -227,12 +199,13 @@ def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, steps_per_epoch=num_train_steps, validation_steps=num_test_steps, initial_epoch=initial_epoch, - verbose=verbose + verbose=verbose, ) # if not horovod_enabled or hvd.rank()==0: # model_save(outdir, fit_result, model, weights) + def model_save(outdir, fit_result, model, weights): history_path = Path(outdir) / "history" history_path = str(history_path) @@ -243,13 +216,13 @@ def model_save(outdir, fit_result, model, weights): print("Loading best weights that could be found from {}".format(weights)) model.load_weights(weights, by_name=True) - #model.save(outdir + "/model_full", save_format="tf") + # model.save(outdir + "/model_full", 
save_format="tf") print("Training done.") + def model_scope(config, total_steps, weights, horovod_enabled=False): lr_schedule, optim_callbacks, lr = get_lr_schedule(config, steps=total_steps) opt = get_optimizer(config, lr_schedule) - if config["setup"]["dtype"] == "float16": model_dtype = tf.dtypes.float16 @@ -266,7 +239,7 @@ def model_scope(config, total_steps, weights, horovod_enabled=False): initial_epoch = 0 loaded_opt = None - + if weights: if lr_schedule: raise Exception("Restoring the optimizer state with a learning rate schedule is currently not supported") @@ -301,19 +274,19 @@ def model_scope(config, total_steps, weights, horovod_enabled=False): "cls": [ FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), - ] + [ - SingleClassRecall( - icls, - name="rec_cls{}".format(icls), - dtype=tf.float64) for icls in range(config["dataset"]["num_output_classes"]) + ] + + [ + SingleClassRecall(icls, name="rec_cls{}".format(icls), dtype=tf.float64) + for icls in range(config["dataset"]["num_output_classes"]) ] }, ) model.summary() - #Set the optimizer weights + # Set the optimizer weights if loaded_opt: + def model_weight_setting(): grad_vars = model.trainable_weights zero_grads = [tf.zeros_like(w) for w in grad_vars] @@ -322,26 +295,25 @@ def model_weight_setting(): model.optimizer.optimizer.optimizer.set_weights(loaded_opt["weights"]) else: model.optimizer.set_weights(loaded_opt["weights"]) - try: - strategy.run(model_weight_setting) - except Exception as e: - print(e) - return model,optim_callbacks,initial_epoch + # FIXME: check that this still works with multiple GPUs + strategy = tf.distribute.get_strategy() + strategy.run(model_weight_setting) + + return model, optim_callbacks, initial_epoch + def initialize_horovod(): hvd.init() - gpus = tf.config.experimental.list_physical_devices('GPU') + gpus = tf.config.experimental.list_physical_devices("GPU") for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) if gpus: - tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU") return hvd.size() - - @main.command() @click.help_option("-h", "--help") @click.option("-t", "--train_dir", required=True, help="directory containing a completed training", type=click.Path()) @@ -358,7 +330,6 @@ def compute_validation_loss(config, train_dir, weights): model_dtype = tf.dtypes.float16 policy = mixed_precision.Policy("mixed_float16") mixed_precision.set_global_policy(policy) - opt = mixed_precision.LossScaleOptimizer(opt) else: model_dtype = tf.dtypes.float32 @@ -387,11 +358,10 @@ def compute_validation_loss(config, train_dir, weights): "cls": [ FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), - ] + [ - SingleClassRecall( - icls, - name="rec_cls{}".format(icls), - dtype=tf.float64) for icls in range(config["dataset"]["num_output_classes"]) + ] + + [ + SingleClassRecall(icls, name="rec_cls{}".format(icls), dtype=tf.float64) + for icls in range(config["dataset"]["num_output_classes"]) ] }, ) @@ -404,6 +374,7 @@ def compute_validation_loss(config, train_dir, weights): with open("{}/losses.txt".format(train_dir), "w") as loss_file: loss_file.write(json.dumps(losses) + "\n") + @main.command() @click.help_option("-h", "--help") @click.option("-t", "--train_dir", required=True, help="directory 
containing a completed training", type=click.Path()) @@ -417,7 +388,7 @@ def evaluate(config, train_dir, weights, customize, nevents): config = Path(train_dir) / "config.yaml" assert config.exists(), "Could not find config file in train_dir, please provide one with -c " config, _ = parse_config(config, weights=weights) - + if customize: config = customization_functions[customize](config) @@ -425,18 +396,17 @@ def evaluate(config, train_dir, weights, customize, nevents): model_dtype = tf.dtypes.float16 policy = mixed_precision.Policy("mixed_float16") mixed_precision.set_global_policy(policy) - opt = mixed_precision.LossScaleOptimizer(opt) else: model_dtype = tf.dtypes.float32 strategy, num_gpus = get_strategy() - #physical_devices = tf.config.list_physical_devices('GPU') - #for dev in physical_devices: + # physical_devices = tf.config.list_physical_devices('GPU') + # for dev in physical_devices: # tf.config.experimental.set_memory_growth(dev, True) model = make_model(config, model_dtype) model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])) - + # need to load the weights in the same trainable configuration as the model was set up configure_model_weights(model, config["setup"].get("weights_config", "all")) if weights: @@ -459,6 +429,7 @@ def evaluate(config, train_dir, weights, customize, nevents): freeze_model(model, config, train_dir) + @main.command() @click.help_option("-h", "--help") @click.option("-c", "--config", help="configuration file", type=click.Path()) @@ -547,10 +518,19 @@ def hypertune(config, outdir, ntrain, ntest, recreate): config["setup"]["num_epochs"] = config["hypertune"]["hyperband"]["max_epochs"] strategy, num_gpus = get_strategy() - - ds_train, ds_info = get_heptfds_dataset(config["training_dataset"], config, num_gpus, "train", config["setup"]["num_events_train"]) + + ds_train, ds_info = get_heptfds_dataset( + config["training_dataset"], config, num_gpus, "train", config["setup"]["num_events_train"] + ) ds_test, _ = get_heptfds_dataset(config["testing_dataset"], config, num_gpus, "test", config["setup"]["num_events_test"]) - ds_val, _ = get_heptfds_dataset(config["validation_datasets"][0], config, num_gpus, "test", config["setup"]["num_events_validation"], supervised=False) + ds_val, _ = get_heptfds_dataset( + config["validation_datasets"][0], + config, + num_gpus, + "test", + config["setup"]["num_events_validation"], + supervised=False, + ) ds_val = ds_val.batch(5) num_train_steps = 0 @@ -569,7 +549,7 @@ def hypertune(config, outdir, ntrain, ntest, recreate): ) callbacks.append(optim_callbacks) - callbacks.append(tf.keras.callbacks.EarlyStopping(patience=20, monitor='val_loss')) + callbacks.append(tf.keras.callbacks.EarlyStopping(patience=20, monitor="val_loss")) tuner = get_tuner(config["hypertune"], model_builder, outdir, recreate, strategy) tuner.search_space_summary() @@ -591,80 +571,89 @@ def hypertune(config, outdir, ntrain, ntest, recreate): def build_model_and_train(config, checkpoint_dir=None, full_config=None, ntrain=None, ntest=None, name=None, seeds=False): - from ray import tune - from raytune.search_space import set_raytune_search_parameters - from ray.tune.integration.keras import TuneReportCheckpointCallback - if seeds: - # Set seeds for reproducibility - random.seed(1234) - np.random.seed(1234) - tf.random.set_seed(1234) + from ray import tune + from ray.tune.integration.keras import TuneReportCheckpointCallback + from raytune.search_space import set_raytune_search_parameters - full_config, 
config_file_stem = parse_config(full_config) + if seeds: + # Set seeds for reproducibility + random.seed(1234) + np.random.seed(1234) + tf.random.set_seed(1234) - if config is not None: - full_config = set_raytune_search_parameters(search_space=config, config=full_config) + full_config, config_file_stem = parse_config(full_config) - strategy, num_gpus = get_strategy() + if config is not None: + full_config = set_raytune_search_parameters(search_space=config, config=full_config) - ds_train, num_train_steps = get_datasets(full_config["train_test_datasets"], full_config, num_gpus, "train") - ds_test, num_test_steps = get_datasets(full_config["train_test_datasets"], full_config, num_gpus, "test") - ds_val, ds_info = get_heptfds_dataset(full_config["validation_datasets"][0], full_config, num_gpus, "test", full_config["setup"]["num_events_validation"], supervised=False) - ds_val = ds_val.batch(5) - - if ntrain: - ds_train = ds_train.take(ntrain) - num_train_steps = ntrain - if ntest: - ds_test = ds_test.take(ntest) - num_test_steps = ntest - - print("num_train_steps", num_train_steps) - print("num_test_steps", num_test_steps) - total_steps = num_train_steps * full_config["setup"]["num_epochs"] - print("total_steps", total_steps) - - callbacks = prepare_callbacks( - full_config, - tune.get_trial_dir(), - ds_val, - ) + strategy, num_gpus = get_strategy() - callbacks = callbacks[:-1] # remove the CustomCallback at the end of the list + ds_train, num_train_steps = get_datasets(full_config["train_test_datasets"], full_config, num_gpus, "train") + ds_test, num_test_steps = get_datasets(full_config["train_test_datasets"], full_config, num_gpus, "test") + ds_val, ds_info = get_heptfds_dataset( + full_config["validation_datasets"][0], + full_config, + num_gpus, + "test", + full_config["setup"]["num_events_validation"], + supervised=False, + ) + ds_val = ds_val.batch(5) - with strategy.scope(): - lr_schedule, optim_callbacks = get_lr_schedule(full_config, steps=total_steps) - callbacks.append(optim_callbacks) - opt = get_optimizer(full_config, lr_schedule) - - model = make_model(full_config, dtype=tf.dtypes.float32) - - # Run model once to build the layers - model.build((1, full_config["dataset"]["padded_num_elem_size"], full_config["dataset"]["num_input_features"])) - - full_config = set_config_loss(full_config, full_config["setup"]["trainable"]) - configure_model_weights(model, full_config["setup"]["trainable"]) - model.build((1, full_config["dataset"]["padded_num_elem_size"], full_config["dataset"]["num_input_features"])) - - loss_dict, loss_weights = get_loss_dict(full_config) - model.compile( - loss=loss_dict, - optimizer=opt, - sample_weight_mode="temporal", - loss_weights=loss_weights, - metrics={ - "cls": [ - FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), - FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), - ] - }, - ) - model.summary() + if ntrain: + ds_train = ds_train.take(ntrain) + num_train_steps = ntrain + if ntest: + ds_test = ds_test.take(ntest) + num_test_steps = ntest + + print("num_train_steps", num_train_steps) + print("num_test_steps", num_test_steps) + total_steps = num_train_steps * full_config["setup"]["num_epochs"] + print("total_steps", total_steps) + + callbacks = prepare_callbacks( + full_config, + tune.get_trial_dir(), + ds_val, + ) + + callbacks = callbacks[:-1] # remove the CustomCallback at the end of the list + + with strategy.scope(): + lr_schedule, optim_callbacks = get_lr_schedule(full_config, 
steps=total_steps) + callbacks.append(optim_callbacks) + opt = get_optimizer(full_config, lr_schedule) + + model = make_model(full_config, dtype=tf.dtypes.float32) + + # Run model once to build the layers + model.build((1, full_config["dataset"]["padded_num_elem_size"], full_config["dataset"]["num_input_features"])) + + full_config = set_config_loss(full_config, full_config["setup"]["trainable"]) + configure_model_weights(model, full_config["setup"]["trainable"]) + model.build((1, full_config["dataset"]["padded_num_elem_size"], full_config["dataset"]["num_input_features"])) - callbacks.append(TuneReportCheckpointCallback( + loss_dict, loss_weights = get_loss_dict(full_config) + model.compile( + loss=loss_dict, + optimizer=opt, + sample_weight_mode="temporal", + loss_weights=loss_weights, + metrics={ + "cls": [ + FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), + FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), + ] + }, + ) + model.summary() + + callbacks.append( + TuneReportCheckpointCallback( metrics=[ "adam_beta_1", - 'charge_loss', + "charge_loss", "cls_acc_unweighted", "cls_loss", "cos_phi_loss", @@ -684,30 +673,30 @@ def build_model_and_train(config, checkpoint_dir=None, full_config=None, ntrain= "val_loss", "val_pt_loss", "val_sin_phi_loss", - ], - ), + ], + ), + ) + + try: + model.fit( + ds_train.repeat(), + validation_data=ds_test.repeat(), + epochs=full_config["setup"]["num_epochs"], + callbacks=callbacks, + steps_per_epoch=num_train_steps, + validation_steps=num_test_steps, ) + except tf.errors.ResourceExhaustedError: + logging.warning("Resource exhausted, skipping this hyperparameter configuration.") + skiplog_file_path = Path(full_config["raytune"]["local_dir"]) / name / "skipped_configurations.txt" + lines = ["{}: {}\n".format(item[0], item[1]) for item in config.items()] - try: - fit_result = model.fit( - ds_train.repeat(), - validation_data=ds_test.repeat(), - epochs=full_config["setup"]["num_epochs"], - callbacks=callbacks, - steps_per_epoch=num_train_steps, - validation_steps=num_test_steps, - ) - except tf.errors.ResourceExhaustedError: - logging.warning("Resource exhausted, skipping this hyperparameter configuration.") - skiplog_file_path = Path(full_config["raytune"]["local_dir"]) / name / "skipped_configurations.txt" - lines = ["{}: {}\n".format(item[0], item[1]) for item in config.items()] - - with open(skiplog_file_path, "a") as f: - f.write("#"*80 + "\n") - for line in lines: - f.write(line) - logging.warning(line[:-1]) - f.write("#"*80 + "\n\n") + with open(skiplog_file_path, "a") as f: + f.write("#" * 80 + "\n") + for line in lines: + f.write(line) + logging.warning(line[:-1]) + f.write("#" * 80 + "\n\n") @main.command() @@ -726,7 +715,7 @@ def raytune(config, name, local, cpus, gpus, tune_result_dir, resume, ntrain, nt import ray from ray import tune from ray.tune.logger import TBXLoggerCallback - from raytune.search_space import search_space, raytune_num_samples + from raytune.search_space import raytune_num_samples, search_space from raytune.utils import get_raytune_schedule, get_raytune_search_alg if seeds: @@ -748,12 +737,16 @@ def raytune(config, name, local, cpus, gpus, tune_result_dir, resume, ntrain, nt expdir = Path(cfg["raytune"]["local_dir"]) / name expdir.mkdir(parents=True, exist_ok=True) - shutil.copy("mlpf/raytune/search_space.py", str(Path(cfg["raytune"]["local_dir"]) / name / "search_space.py")) # Copy the config file to the train dir for later reference - shutil.copy(config_file_path, 
str(Path(cfg["raytune"]["local_dir"]) / name / "config.yaml")) # Copy the config file to the train dir for later reference + shutil.copy( + "mlpf/raytune/search_space.py", str(Path(cfg["raytune"]["local_dir"]) / name / "search_space.py") + ) # Copy the config file to the train dir for later reference + shutil.copy( + config_file_path, str(Path(cfg["raytune"]["local_dir"]) / name / "config.yaml") + ) # Copy the config file to the train dir for later reference ray.tune.ray_trial_executor.DEFAULT_GET_TIMEOUT = 1 * 60 * 60 # Avoid timeout errors if not local: - ray.init(address='auto') + ray.init(address="auto") sched = get_raytune_schedule(cfg["raytune"]) search_alg = get_raytune_search_alg(cfg["raytune"], seeds) @@ -780,8 +773,10 @@ def raytune(config, name, local, cpus, gpus, tune_result_dir, resume, ntrain, nt end = datetime.now() print("Total time of tune.run(...): {}".format(end - start)) - print("Best hyperparameters found according to {} were: ".format(cfg["raytune"]["default_metric"]), - analysis.get_best_config(cfg["raytune"]["default_metric"], cfg["raytune"]["default_mode"])) + print( + "Best hyperparameters found according to {} were: ".format(cfg["raytune"]["default_metric"]), + analysis.get_best_config(cfg["raytune"]["default_metric"], cfg["raytune"]["default_mode"]), + ) skip = 20 if skip > cfg["setup"]["num_epochs"]: @@ -822,36 +817,11 @@ def count_skipped(exp_dir): @click.option("--mode", help="experiment dir", type=str, default="min") def raytune_analysis(exp_dir, save, skip, mode, metric): from ray.tune import ExperimentAnalysis + experiment_analysis = ExperimentAnalysis(exp_dir, default_metric=metric, default_mode=mode) plot_ray_analysis(experiment_analysis, save=save, skip=skip) analyze_ray_experiment(exp_dir, default_metric=metric, default_mode=mode) -@main.command() -@click.help_option("-h", "--help") -@click.option("-c", "--config", help="configuration file", type=click.Path()) -@click.option("--ntrain", default=None, help="override the number of training events", type=int) -@click.option("--ntest", default=None, help="override the number of testing events", type=int) -def debug_data(config, ntrain, ntest): - """Train a model defined by config""" - config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = parse_config( - config, ntrain, ntest, weights=None, - ) - - dataset_def = get_dataset_def(config) - ds_train, ds_test, dataset_transform = get_train_val_datasets(config, global_batch_size=1, n_train=n_train, n_test=n_test) - - # cand_counts = np.zeros(8) - # for data_item in tqdm(ds_train, desc="Counting"): - # import pdb; pdb.set_trace() - # cand_vals, cand_count = np.unique(np.argmax(data_item[1]['cls'], axis=2), return_counts=True) - # cand_counts[cand_vals.astype("int32")] += cand_count - # print("cand_counts: ", cand_counts) - - dsf = CMSDatasetFactory(config) - ds_train, _ = dsf.get_dataset(split="train") - ds_test, _ = dsf.get_dataset(split="test") - for data_item in tqdm(ds_train, desc="Counting"): - import pdb; pdb.set_trace() if __name__ == "__main__": main() diff --git a/mlpf/tfmodel/data.py b/mlpf/tfmodel/data.py deleted file mode 100644 index 9e69459c6..000000000 --- a/mlpf/tfmodel/data.py +++ /dev/null @@ -1,7 +0,0 @@ - -class Dataset: - def __init__(self, **kwargs): - self.num_input_features = kwargs.get("num_input_features") - self.num_output_features = kwargs.get("num_output_features") - self.padded_num_elem_size = kwargs.get("padded_num_elem_size") - self.schema = kwargs.get("schema") diff --git 
a/mlpf/tfmodel/datasets/BaseDatasetFactory.py b/mlpf/tfmodel/datasets/BaseDatasetFactory.py index 0b0f9c6a9..7175d7e96 100644 --- a/mlpf/tfmodel/datasets/BaseDatasetFactory.py +++ b/mlpf/tfmodel/datasets/BaseDatasetFactory.py @@ -1,18 +1,17 @@ import tensorflow as tf -import tensorflow_datasets as tfds -import heptfds -#Unpacks a flat target array along the feature axis to a feature dict -#the feature order is defined in the data prep stage (postprocessing2.py) + +# Unpacks a flat target array along the feature axis to a feature dict +# the feature order is defined in the data prep stage (postprocessing2.py) def unpack_target(y, num_output_classes, config): - msk_pid = tf.cast(y[..., 0:1]!=0, tf.float32) - - pt = y[..., 2:3]*msk_pid - energy = y[..., 6:7]*msk_pid - eta = y[..., 3:4]*msk_pid - sin_phi = y[..., 4:5]*msk_pid - cos_phi = y[..., 5:6]*msk_pid - jet_idx = y[..., 7:8]*msk_pid + msk_pid = tf.cast(y[..., 0:1] != 0, tf.float32) + + pt = y[..., 2:3] * msk_pid + energy = y[..., 6:7] * msk_pid + eta = y[..., 3:4] * msk_pid + sin_phi = y[..., 4:5] * msk_pid + cos_phi = y[..., 5:6] * msk_pid + jet_idx = y[..., 7:8] * msk_pid ret = { "cls": tf.one_hot(tf.cast(y[..., 0], tf.int32), num_output_classes), @@ -30,6 +29,7 @@ def unpack_target(y, num_output_classes, config): return ret + class BaseDatasetFactory: def __init__(self, config): self.cfg = config @@ -38,35 +38,37 @@ def get_map_to_supervised(self): target_particles = self.cfg["dataset"]["target_particles"] num_output_classes = self.cfg["dataset"]["num_output_classes"] assert target_particles in ["gen", "cand"], "Target particles has to be 'cand' or 'gen'." + def func(data_item): X = data_item["X"] y = data_item["y{}".format(target_particles)] - #mask to keep only nonzero elements - msk_elems = tf.cast(X[:, 0:1]!=0, tf.float32) + # mask to keep only nonzero elements + msk_elems = tf.cast(X[:, 0:1] != 0, tf.float32) - #mask to keep only nonzero target particles - msk_signal = tf.cast(y[:, 0:1]!=0, tf.float32) + # mask to keep only nonzero target particles + msk_signal = tf.cast(y[:, 0:1] != 0, tf.float32) target = unpack_target(y, num_output_classes, self.cfg) - #inputs: X - #targets: dict by classification (cls) and regression feature columns - #weights: dict of weights for each target + # inputs: X + # targets: dict by classification (cls) and regression feature columns + # weights: dict of weights for each target return ( X, target, { "cls": msk_elems, - "charge": msk_elems*msk_signal, - "pt": msk_elems*msk_signal, - "eta": msk_elems*msk_signal, - "sin_phi": msk_elems*msk_signal, - "cos_phi": msk_elems*msk_signal, - "energy": msk_elems*msk_signal, - } + "charge": msk_elems * msk_signal, + "pt": msk_elems * msk_signal, + "eta": msk_elems * msk_signal, + "sin_phi": msk_elems * msk_signal, + "cos_phi": msk_elems * msk_signal, + "energy": msk_elems * msk_signal, + }, ) + return func - + def get_dataset(self, split, max_examples_per_split=None): raise NotImplementedError diff --git a/mlpf/tfmodel/datasets/CMSDatasetFactory.py b/mlpf/tfmodel/datasets/CMSDatasetFactory.py index 00ce69fae..8fdaddabd 100644 --- a/mlpf/tfmodel/datasets/CMSDatasetFactory.py +++ b/mlpf/tfmodel/datasets/CMSDatasetFactory.py @@ -1,9 +1,7 @@ -import tensorflow as tf -import tensorflow_datasets as tfds -import heptfds - from tfmodel.datasets import BaseDatasetFactory +import tensorflow_datasets as tfds + class CMSDatasetFactory(BaseDatasetFactory): def get_dataset(self, dataset_name, dataset_dict, split, max_examples_per_split=None): diff --git 
a/mlpf/tfmodel/datasets/DelphesDatasetFactory.py b/mlpf/tfmodel/datasets/DelphesDatasetFactory.py index e3395afa2..67b1db3bd 100644 --- a/mlpf/tfmodel/datasets/DelphesDatasetFactory.py +++ b/mlpf/tfmodel/datasets/DelphesDatasetFactory.py @@ -1,9 +1,7 @@ -import tensorflow as tf -import tensorflow_datasets as tfds -import heptfds - from tfmodel.datasets import BaseDatasetFactory +import tensorflow_datasets as tfds + class DelphesDatasetFactory(BaseDatasetFactory): def get_dataset(self, dataset_name, dataset_dict, split, max_examples_per_split=None): diff --git a/mlpf/tfmodel/delphes_data.py b/mlpf/tfmodel/delphes_data.py deleted file mode 100644 index 342ebb66e..000000000 --- a/mlpf/tfmodel/delphes_data.py +++ /dev/null @@ -1,179 +0,0 @@ -import numpy as np -import glob -import multiprocessing -import os -import pickle -import bz2 - -import tensorflow as tf - -#based on the dataset size distribution, divisible by 8 -padded_num_elem_size = 128*50 - -#based on ntuplizer.py make_tower_array and make_track_array -num_inputs = 12 - -#based on ntuplizer.py make_gen_array -num_outputs = 7 - -def prepare_data(fname): - - if fname.endswith(".pkl"): - data = pickle.load(open(fname, "rb")) - elif fname.endswith(".pkl.bz2"): - data = pickle.load(bz2.BZ2File(fname, "rb")) - else: - raise Exception("Unknown file: {}".format(fname)) - - #make all inputs and outputs the same size with padding - Xs = [] - ygens = [] - ycands = [] - for i in range(len(data["X"])): - X = np.array(data["X"][i][:padded_num_elem_size], np.float32) - X = np.pad(X, [(0, padded_num_elem_size - X.shape[0]), (0,0)]) - - ygen = np.array(data["ygen"][i][:padded_num_elem_size], np.float32) - ygen = np.pad(ygen, [(0, padded_num_elem_size - ygen.shape[0]), (0,0)]) - - ycand = np.array(data["ycand"][i][:padded_num_elem_size], np.float32) - ycand = np.pad(ycand, [(0, padded_num_elem_size - ycand.shape[0]), (0,0)]) - - X = np.expand_dims(X, 0) - ygen = np.expand_dims(ygen, 0) - ycand = np.expand_dims(ycand, 0) - - Xs.append(X) - ygens.append(ygen) - ycands.append(ycand) - - X = [np.concatenate(Xs)] - ygen = [np.concatenate(ygens)] - ycand = [np.concatenate(ycands)] - return X, ygen, ycand - -def parse_args(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--datapath", type=str, required=True, help="Input data path") - parser.add_argument("--num-files-per-tfr", type=int, default=10, help="Number of pickle files to merge to one TFRecord file") - args = parser.parse_args() - return args - -def chunks(lst, n): - """Yield successive n-sized chunks from lst.""" - for i in range(0, len(lst), n): - yield lst[i:i + n] - -#https://stackoverflow.com/questions/47861084/how-to-store-numpy-arrays-as-tfrecord -def _bytes_feature(value): - """Returns a bytes_list from a string / byte.""" - if isinstance(value, type(tf.constant(0))): # if value ist tensor - value = value.numpy() # get value of tensor - return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) - -def _parse_tfr_element(element): - parse_dic = { - 'X': tf.io.FixedLenFeature([], tf.string), - 'y': tf.io.FixedLenFeature([], tf.string), - 'w': tf.io.FixedLenFeature([], tf.string), - } - example_message = tf.io.parse_single_example(element, parse_dic) - - X = example_message['X'] - arr_X = tf.io.parse_tensor(X, out_type=tf.float32) - y = example_message['y'] - arr_y = tf.io.parse_tensor(y, out_type=tf.float32) - w = example_message['w'] - arr_w = tf.io.parse_tensor(w, out_type=tf.float32) - - 
#https://github.com/tensorflow/tensorflow/issues/24520#issuecomment-577325475 - arr_X.set_shape(tf.TensorShape((None, num_inputs))) - arr_y.set_shape(tf.TensorShape((None, num_outputs))) - arr_w.set_shape(tf.TensorShape((None, ))) - #inds = tf.stack([arr_dm_row, arr_dm_col], axis=-1) - #dm_sparse = tf.SparseTensor(values=arr_dm_data, indices=inds, dense_shape=[tf.shape(arr_X)[0], tf.shape(arr_X)[0]]) - - return arr_X, arr_y, arr_w - -def serialize_X_y_w(writer, X, y, w): - feature = { - 'X': _bytes_feature(tf.io.serialize_tensor(X)), - 'y': _bytes_feature(tf.io.serialize_tensor(y)), - 'w': _bytes_feature(tf.io.serialize_tensor(w)), - } - sample = tf.train.Example(features=tf.train.Features(feature=feature)) - writer.write(sample.SerializeToString()) - -def serialize_chunk(args): - path, files, ichunk = args - out_filename = os.path.join(path, "chunk_{}.tfrecords".format(ichunk)) - writer = tf.io.TFRecordWriter(out_filename) - Xs = [] - ys = [] - ws = [] - dms = [] - - for fi in files: - X, y, _ = prepare_data(fi) - - Xs += X - ys += y - - Xs = np.concatenate(Xs) - ys = np.concatenate(ys) - assert(Xs.shape[2] == num_inputs) - assert(Xs.shape[1] == padded_num_elem_size) - assert(ys.shape[2] == num_outputs) - assert(ys.shape[1] == padded_num_elem_size) - - #set weights for each sample to be equal to the number of samples of this type - #in the training script, this can be used to compute either inverse or class-balanced weights - uniq_vals, uniq_counts = np.unique(np.concatenate([y[:, 0] for y in ys]), return_counts=True) - for i in range(len(ys)): - w = np.ones(len(ys[i]), dtype=np.float32) - for uv, uc in zip(uniq_vals, uniq_counts): - w[ys[i][:, 0]==uv] = uc - ws += [w] - - for X, y, w in zip(Xs, ys, ws): - serialize_X_y_w(writer, X, y, w) - - print(out_filename) - writer.close() - -if __name__ == "__main__": - args = parse_args() - tf.config.experimental_run_functions_eagerly(True) - - datapath = args.datapath - - filelist = sorted(glob.glob("{}/*.pkl.bz2".format(datapath))) - print("found {} files".format(len(filelist))) - #means, stds = extract_means_stds(filelist) - outpath = "{}/tfr".format(datapath) - - if not os.path.isdir(outpath): - os.makedirs(outpath) - - pars = [] - for ichunk, files in enumerate(chunks(filelist, args.num_files_per_tfr)): - pars += [(outpath, files, ichunk)] - #serialize_chunk(pars[0]) - pool = multiprocessing.Pool(8) - pool.map(serialize_chunk, pars) - #for chunk in pars: - # serialize_chunk(chunk) - - - #Load and test the dataset - tfr_dataset = tf.data.TFRecordDataset(glob.glob(outpath + "/*.tfrecords")) - dataset = tfr_dataset.map(_parse_tfr_element) - num_ev = 0 - num_particles = 0 - for X, y, w in dataset: - num_ev += 1 - num_particles += len(X) - - print("Created TFRecords dataset in {} with {} events, {} particles".format( - datapath, num_ev, num_particles)) diff --git a/mlpf/tfmodel/fast_attention.py b/mlpf/tfmodel/fast_attention.py deleted file mode 100644 index 204f00e78..000000000 --- a/mlpf/tfmodel/fast_attention.py +++ /dev/null @@ -1,486 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Implementation of multiheaded FAVOR-attention & FAVOR-self-attention layers. - -Prefix Sum Tensorflow implementation by Valerii Likhosherstov. - -Minor modifications for TF 2.3 by Joosep Pata: - - remove seed, use tf.concat in create_projection_matrix, -""" -from . import fast_attention_util as util -import math -import tensorflow as tf - -BIG_CONSTANT = 1e8 - -@tf.function -def create_projection_matrix(m, d, seed=0, scaling=0, dtype=tf.float32): - r"""Constructs the matrix of random projections. - - Constructs a matrix of random orthogonal projections. Each projection vector - has direction chosen uniformly at random and either deterministic length - \sqrt{d} or length taken from the \chi(d) distribution (in the latter case - marginal distributions of the projections are d-dimensional Gaussian vectors - with associated identity covariance matrix). - - Args: - m: number of random projections. - d: dimensionality of each random projection. - seed: random seed used to construct projections. - scaling: 1 if all the random projections need to be renormalized to have - length \sqrt{d}, 0 if the lengths of random projections should follow - \chi(d) distribution. - - Returns: - The matrix of random projections of the shape [m, d]. - """ - nb_full_blocks = int(m / d) - block_list = [] - #current_seed = tf.constant(seed) - for iblock in range(nb_full_blocks): - unstructured_block = tf.random.normal((d, d), dtype=dtype) - q, _ = tf.linalg.qr(unstructured_block) - q = tf.transpose(q) - block_list.append(q) - #current_seed += 1 - remaining_rows = m - nb_full_blocks * d - if remaining_rows > 0: - unstructured_block = tf.random.normal((d, d), dtype=dtype) - q, _ = tf.linalg.qr(unstructured_block) - q = tf.transpose(q) - block_list.append(q[0:remaining_rows]) - final_matrix = tf.concat(block_list, axis=0) - #current_seed += 1 - - if scaling == 0: - multiplier = tf.norm(tf.random.normal((m, d), dtype=dtype), axis=1) - elif scaling == 1: - multiplier = tf.math.sqrt(float(d)) * tf.ones((m)) - else: - raise ValueError("Scaling must be one of {0, 1}. Was %s" % scaling) - - ret = tf.linalg.matmul(tf.linalg.diag(multiplier), final_matrix) - return ret - -def relu_kernel_transformation(data, - is_query, - projection_matrix=None, - numerical_stabilizer=0.001): - """Computes features for the ReLU-kernel. - - Computes random features for the ReLU kernel from - https://arxiv.org/pdf/2009.14794.pdf. - - Args: - data: input data tensor of the shape [B, L, H, D], where: B - batch - dimension, L - attention dimensions, H - heads, D - features. - is_query: indicates whether input data is a query oor key tensor. - projection_matrix: random Gaussian matrix of shape [M, D], where M stands - for the number of random features and each D x D sub-block has pairwise - orthogonal rows. - numerical_stabilizer: small positive constant for numerical stability. - - Returns: - Corresponding kernel feature map. 
- """ - del is_query - if projection_matrix is None: - return tf.nn.relu(data) + numerical_stabilizer - else: - ratio = 1.0 / tf.math.sqrt( - tf.dtypes.cast(projection_matrix.shape[0], data.dtype)) - data_dash = ratio * tf.einsum("blhd,md->blhm", data, projection_matrix) - return tf.nn.relu(data_dash) + numerical_stabilizer - - -def softmax_kernel_transformation(data, - is_query, - projection_matrix=None, - numerical_stabilizer=0.000001): - """Computes random features for the softmax kernel using FAVOR+ mechanism. - - Computes random features for the softmax kernel using FAVOR+ mechanism from - https://arxiv.org/pdf/2009.14794.pdf. - - Args: - data: input data tensor of the shape [B, L, H, D], where: B - batch - dimension, L - attention dimensions, H - heads, D - features. - is_query: indicates whether input data is a query oor key tensor. - projection_matrix: random Gaussian matrix of shape [M, D], where M stands - for the number of random features and each D x D sub-block has pairwise - orthogonal rows. - numerical_stabilizer: small positive constant for numerical stability. - - Returns: - Corresponding kernel feature map. - """ - data_normalizer = 1.0 / ( - tf.math.sqrt(tf.math.sqrt(tf.dtypes.cast(data.shape[-1], tf.float32)))) - ratio = 1.0 / tf.math.sqrt( - tf.dtypes.cast(projection_matrix.shape[0], tf.float32)) - data_dash = tf.einsum("blhd,md->blhm", data, projection_matrix) - diag_data = tf.math.square(data) - diag_data = tf.math.reduce_sum( - diag_data, axis=tf.keras.backend.ndim(data) - 1) - diag_data = (diag_data / 2.0) * data_normalizer * data_normalizer - diag_data = tf.expand_dims(diag_data, axis=tf.keras.backend.ndim(data) - 1) - if is_query: - last_dims_t = (len(data_dash.shape) - 1,) - data_dash = ratio * ( - tf.math.exp(data_dash - diag_data - tf.math.reduce_max( - data_dash, axis=last_dims_t, keepdims=True)) + numerical_stabilizer) - else: - data_dash = ratio * ( - tf.math.exp(data_dash - diag_data - tf.math.reduce_max(data_dash)) + - numerical_stabilizer) - - return data_dash - - -def noncausal_numerator(qs, ks, vs): - """Computes not-normalized FAVOR noncausal attention AV. - - Args: - qs: query_prime tensor of the shape [L,B,H,M]. - ks: key_prime tensor of the shape [L,B,H,M]. - vs: value tensor of the shape [L,B,H,D]. - - Returns: - Not-normalized FAVOR noncausal attention AV. - """ - kvs = tf.clip_by_value(tf.einsum("lbhm,lbhd->bhmd", ks, vs), -1e4, 1e4) - return tf.clip_by_value(tf.einsum("lbhm,bhmd->lbhd", qs, kvs), -1e4, 1e4) - - -def noncausal_denominator(qs, ks): - """Computes FAVOR normalizer in noncausal attention. - - Args: - qs: query_prime tensor of the shape [L,B,H,M]. - ks: key_prime tensor of the shape [L,B,H,M]. - - Returns: - FAVOR normalizer in noncausal attention. - """ - all_ones = tf.ones([ks.shape[0]], dtype=qs.dtype) - ks_sum = tf.clip_by_value(tf.einsum("lbhm,l->bhm", ks, all_ones), -1e-4, 1e4) - return tf.clip_by_value(tf.einsum("lbhm,bhm->lbh", qs, ks_sum), -1e-4, 1e4) - - -@tf.custom_gradient -def causal_numerator(qs, ks, vs): - """Computes not-normalized FAVOR causal attention A_{masked}V. - - Args: - qs: query_prime tensor of the shape [L,B,H,M]. - ks: key_prime tensor of the shape [L,B,H,M]. - vs: value tensor of the shape [L,B,H,D]. - - Returns: - Not-normalized FAVOR causal attention A_{masked}V. 
- """ - - result = [] - sums = tf.zeros_like(tf.einsum("ijk,ijl->ijkl", ks[0], vs[0])) - - for index in range(qs.shape[0]): - sums = sums + tf.einsum("ijk,ijl->ijkl", ks[index], vs[index]) - result.append(tf.einsum("ijkl,ijk->ijl", sums, qs[index])[None, Ellipsis]) - - result = tf.concat(result, axis=0) - - def grad(res_grad): - - grads = tf.zeros_like(tf.einsum("ijk,ijl->ijkl", ks[0], vs[0])) - - gr_sums = sums - - q_grads = [] - k_grads = [] - v_grads = [] - - for index in range(qs.shape[0] - 1, -1, -1): - - q_grads.append( - tf.einsum("ijkl,ijl->ijk", gr_sums, res_grad[index])[None, Ellipsis]) - grads = grads + tf.einsum("ijk,ijl->ijkl", qs[index], res_grad[index]) - k_grads.append(tf.einsum("ijkl,ijl->ijk", grads, vs[index])[None, Ellipsis]) - v_grads.append(tf.einsum("ijkl,ijk->ijl", grads, ks[index])[None, Ellipsis]) - gr_sums = gr_sums - tf.einsum("ijk,ijl->ijkl", ks[index], vs[index]) - - q_grads = tf.concat(q_grads[::-1], axis=0) - k_grads = tf.concat(k_grads[::-1], axis=0) - v_grads = tf.concat(v_grads[::-1], axis=0) - - return q_grads, k_grads, v_grads - - return result, grad - - -@tf.custom_gradient -def causal_denominator(qs, ks): - """Computes FAVOR normalizer in causal attention. - - Args: - qs: query_prime tensor of the shape [L,B,H,M]. - ks: key_prime tensor of the shape [L,B,H,M]. - - Returns: - FAVOR normalizer in causal attention. - """ - - result = [] - sums = tf.zeros_like(ks[0]) - - for index in range(qs.shape[0]): - sums = sums + ks[index] - result.append(tf.reduce_sum(qs[index] * sums, axis=2)[None, Ellipsis]) - - result = tf.concat(result, axis=0) - - def grad(res_grad): - - k_grad = tf.zeros_like(ks[0]) - - gr_sums = sums - - q_grads = [] - k_grads = [] - - for index in range(qs.shape[0] - 1, -1, -1): - - q_grads.append( - tf.einsum("ijk,ij->ijk", gr_sums, res_grad[index])[None, Ellipsis]) - k_grad = k_grad + tf.einsum("ijk,ij->ijk", qs[index], res_grad[index]) - k_grads.append(k_grad[None, Ellipsis]) - gr_sums = gr_sums - ks[index] - - q_grads = tf.concat(q_grads[::-1], axis=0) - k_grads = tf.concat(k_grads[::-1], axis=0) - - return q_grads, k_grads - - return result, grad - - -def favor_attention(query, - key, - value, - kernel_transformation, - causal, - projection_matrix=None): - """Computes FAVOR normalized attention. - - Args: - query: query tensor. - key: key tensor. - value: value tensor. - kernel_transformation: transformation used to get finite kernel features. - causal: whether attention is causal or not. - projection_matrix: projection matrix to be used. - - Returns: - FAVOR normalized attention. - """ - query_prime = kernel_transformation(query, True, - projection_matrix) # [B,L,H,M] - key_prime = kernel_transformation(key, False, projection_matrix) # [B,L,H,M] - query_prime = tf.transpose(query_prime, [1, 0, 2, 3]) # [L,B,H,M] - key_prime = tf.transpose(key_prime, [1, 0, 2, 3]) # [L,B,H,M] - value = tf.transpose(value, [1, 0, 2, 3]) # [L,B,H,D] - - if causal: - av_attention = causal_numerator(query_prime, key_prime, value) - attention_normalizer = causal_denominator(query_prime, key_prime) - else: - av_attention = noncausal_numerator(query_prime, key_prime, value) - attention_normalizer = noncausal_denominator(query_prime, key_prime) - # TODO(kchoro): Add more comments. 
- av_attention = tf.transpose(av_attention, [1, 0, 2, 3]) - attention_normalizer = tf.transpose(attention_normalizer, [1, 0, 2]) - attention_normalizer = tf.expand_dims(attention_normalizer, - len(attention_normalizer.shape)) - return av_attention / attention_normalizer - - -class Attention(tf.keras.layers.Layer): - """Multi-headed attention layer.""" - - def __init__(self, - hidden_size, - num_heads, - attention_dropout, - kernel_transformation=relu_kernel_transformation, - numerical_stabilizer=0.001, - causal=False, - projection_matrix_type=None, - nb_random_features=0): - """Initialize Attention. - - Args: - hidden_size: int, output dim of hidden layer. - num_heads: int, number of heads to repeat the same attention structure. - attention_dropout: float, dropout rate inside attention for training. - kernel_transformation: transformation used to produce kernel features for - attention. - numerical_stabilizer: used to bound away from zero kernel values. - causal: whether attention is causal or not. - projection_matrix_type: None if Identity should be used, otherwise random - projection matrix will be applied. - nb_random_features: number of random features to be used (relevant only if - projection_matrix is not None). - """ - if hidden_size % num_heads: - raise ValueError( - "Hidden size ({}) must be divisible by the number of heads ({})." - .format(hidden_size, num_heads)) - - super(Attention, self).__init__() - self.hidden_size = hidden_size - self.num_heads = num_heads - self.attention_dropout = attention_dropout - self.kernel_transformation = kernel_transformation - self.numerical_stabilizer = numerical_stabilizer - self.causal = causal - self.projection_matrix_type = projection_matrix_type - self.nb_random_features = nb_random_features - - def build(self, input_shape): - """Builds the layer.""" - # Layers for linearly projecting the queries, keys, and values. - size_per_head = self.hidden_size // self.num_heads - - def _glorot_initializer(fan_in, fan_out): - limit = math.sqrt(6.0 / (fan_in + fan_out)) - return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit) - - attention_initializer = _glorot_initializer(input_shape.as_list()[-1], - self.hidden_size) - self.query_dense_layer = util.DenseEinsum( - output_shape=(self.num_heads, size_per_head), - kernel_initializer=attention_initializer, - use_bias=False, - name="query") - self.key_dense_layer = util.DenseEinsum( - output_shape=(self.num_heads, size_per_head), - kernel_initializer=attention_initializer, - use_bias=False, - name="key") - self.value_dense_layer = util.DenseEinsum( - output_shape=(self.num_heads, size_per_head), - kernel_initializer=attention_initializer, - use_bias=False, - name="value") - - output_initializer = _glorot_initializer(self.hidden_size, self.hidden_size) - self.output_dense_layer = util.DenseEinsum( - output_shape=self.hidden_size, - num_summed_dimensions=2, - kernel_initializer=output_initializer, - use_bias=False, - name="output_transform") - super(Attention, self).build(input_shape) - - def get_config(self): - return { - "hidden_size": self.hidden_size, - "num_heads": self.num_heads, - "attention_dropout": self.attention_dropout, - } - - def call(self, - query_input, - source_input, - bias, - training, - cache=None, - decode_loop_step=None): - """Apply attention mechanism to query_input and source_input. - - Args: - query_input: A tensor with shape [batch_size, length_query, hidden_size]. - source_input: A tensor with shape [batch_size, length_source, - hidden_size]. 
- bias: A tensor with shape [batch_size, 1, length_query, length_source], - the attention bias that will be added to the result of the dot product. - training: A bool, whether in training mode or not. - cache: (Used during prediction) A dictionary with tensors containing - results of previous attentions. The dictionary must have the items: - {"k": tensor with shape [batch_size, i, heads, dim_per_head], - "v": tensor with shape [batch_size, i, heads, dim_per_head]} where - i is the current decoded length for non-padded decode, or max - sequence length for padded decode. - decode_loop_step: An integer, step number of the decoding loop. Used only - for autoregressive inference on TPU. - - Returns: - Attention layer output with shape [batch_size, length_query, hidden_size] - """ - # Linearly project the query, key and value using different learned - # projections. Splitting heads is automatically done during the linear - # projections --> [batch_size, length, num_heads, dim_per_head]. - query = self.query_dense_layer(query_input) - key = self.key_dense_layer(source_input) - value = self.value_dense_layer(source_input) - - if self.projection_matrix_type is None: - projection_matrix = None - else: - dim = query.shape[-1] - seed = tf.math.ceil(tf.math.abs(tf.math.reduce_sum(query) * BIG_CONSTANT)) - seed = tf.dtypes.cast(seed, tf.int32) - projection_matrix = create_projection_matrix( - self.nb_random_features, dim, seed=seed, dtype=query_input.dtype) - - if cache is not None: - # Combine cached keys and values with new keys and values. - if decode_loop_step is not None: - cache_k_shape = cache["k"].shape.as_list() - indices = tf.reshape( - tf.one_hot(decode_loop_step, cache_k_shape[1], dtype=key.dtype), - [1, cache_k_shape[1], 1, 1]) - key = cache["k"] + key * indices - cache_v_shape = cache["v"].shape.as_list() - indices = tf.reshape( - tf.one_hot(decode_loop_step, cache_v_shape[1], dtype=value.dtype), - [1, cache_v_shape[1], 1, 1]) - value = cache["v"] + value * indices - else: - key = tf.concat([tf.cast(cache["k"], key.dtype), key], axis=1) - value = tf.concat([tf.cast(cache["v"], value.dtype), value], axis=1) - - # Update cache - cache["k"] = key - cache["v"] = value - - attention_output = favor_attention(query, key, value, - self.kernel_transformation, self.causal, - projection_matrix) - attention_output = self.output_dense_layer(attention_output) - return attention_output - - -class SelfAttention(Attention): - """Multiheaded self-attention layer.""" - - def call(self, - query_input, - bias, - training, - cache=None, - decode_loop_step=None): - return super(SelfAttention, self).call(query_input, query_input, bias, - training, cache, decode_loop_step) diff --git a/mlpf/tfmodel/fast_attention_util.py b/mlpf/tfmodel/fast_attention_util.py deleted file mode 100644 index c29ff4550..000000000 --- a/mlpf/tfmodel/fast_attention_util.py +++ /dev/null @@ -1,195 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Keras-based einsum layer. - -Copied from -https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/dense_einsum.py. -""" -# pylint: disable=g-classes-have-attributes - -import tensorflow as tf - -_CHR_IDX = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"] - - -@tf.keras.utils.register_keras_serializable(package="Text") -class DenseEinsum(tf.keras.layers.Layer): - """A densely connected layer that uses tf.einsum as the backing computation. - - This layer can perform einsum calculations of arbitrary dimensionality. - - Arguments: - output_shape: Positive integer or tuple, dimensionality of the output space. - num_summed_dimensions: The number of dimensions to sum over. Standard 2D - matmul should use 1, 3D matmul should use 2, and so forth. - activation: Activation function to use. If you don't specify anything, no - activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix. - bias_initializer: Initializer for the bias vector. - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. - bias_regularizer: Regularizer function applied to the bias vector. - activity_regularizer: Regularizer function applied to the output of the - layer (its "activation").. - kernel_constraint: Constraint function applied to the `kernel` weights - matrix. - bias_constraint: Constraint function applied to the bias vector. - Input shape: - N-D tensor with shape: `(batch_size, ..., input_dim)`. The most common - situation would be a 2D input with shape `(batch_size, input_dim)`. - Output shape: - N-D tensor with shape: `(batch_size, ..., units)`. For instance, for a 2D - input with shape `(batch_size, input_dim)`, the output would have shape - `(batch_size, units)`. 
- """ - - def __init__(self, - output_shape, - num_summed_dimensions=1, - activation=None, - use_bias=True, - kernel_initializer="glorot_uniform", - bias_initializer="zeros", - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - **kwargs): - super(DenseEinsum, self).__init__(**kwargs) - self._output_shape = output_shape if isinstance( - output_shape, (list, tuple)) else (output_shape,) - self._activation = tf.keras.activations.get(activation) - self._use_bias = use_bias - self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) - self._bias_initializer = tf.keras.initializers.get(bias_initializer) - self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) - self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) - self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) - self._bias_constraint = tf.keras.constraints.get(bias_constraint) - self._num_summed_dimensions = num_summed_dimensions - self._einsum_string = None - - def _build_einsum_string(self, free_input_dims, bound_dims, output_dims): - input_str = "" - kernel_str = "" - output_str = "" - letter_offset = 0 - for i in range(free_input_dims): - char = _CHR_IDX[i + letter_offset] - input_str += char - output_str += char - - letter_offset += free_input_dims - for i in range(bound_dims): - char = _CHR_IDX[i + letter_offset] - input_str += char - kernel_str += char - - letter_offset += bound_dims - for i in range(output_dims): - char = _CHR_IDX[i + letter_offset] - kernel_str += char - output_str += char - - return input_str + "," + kernel_str + "->" + output_str - - def build(self, input_shape): - input_shape = tf.TensorShape(input_shape) - input_rank = input_shape.rank - free_input_dims = input_rank - self._num_summed_dimensions - output_dims = len(self._output_shape) - - self._einsum_string = self._build_einsum_string(free_input_dims, - self._num_summed_dimensions, - output_dims) - - # This is only saved for testing purposes. 
- self._kernel_shape = ( - input_shape[free_input_dims:].concatenate(self._output_shape)) - - self._kernel = self.add_weight( - "kernel", - shape=self._kernel_shape, - initializer=self._kernel_initializer, - regularizer=self._kernel_regularizer, - constraint=self._kernel_constraint, - dtype=self.dtype, - trainable=True) - if self._use_bias: - self._bias = self.add_weight( - "bias", - shape=self._output_shape, - initializer=self._bias_initializer, - regularizer=self._bias_regularizer, - constraint=self._bias_constraint, - dtype=self.dtype, - trainable=True) - else: - self._bias = None - super(DenseEinsum, self).build(input_shape) - - def get_config(self): - config = { - "output_shape": - self._output_shape, - "num_summed_dimensions": - self._num_summed_dimensions, - "activation": - tf.keras.activations.serialize(self._activation), - "use_bias": - self._use_bias, - "kernel_initializer": - tf.keras.initializers.serialize(self._kernel_initializer), - "bias_initializer": - tf.keras.initializers.serialize(self._bias_initializer), - "kernel_regularizer": - tf.keras.regularizers.serialize(self._kernel_regularizer), - "bias_regularizer": - tf.keras.regularizers.serialize(self._bias_regularizer), - "activity_regularizer": - tf.keras.regularizers.serialize(self._activity_regularizer), - "kernel_constraint": - tf.keras.constraints.serialize(self._kernel_constraint), - "bias_constraint": - tf.keras.constraints.serialize(self._bias_constraint) - } - base_config = super(DenseEinsum, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, inputs): - ret = tf.einsum(self._einsum_string, inputs, self._kernel) - if self._use_bias: - ret += self._bias - if self._activation is not None: - ret = self._activation(ret) - return ret diff --git a/mlpf/tfmodel/model.py b/mlpf/tfmodel/model.py index 418c5492e..d4036f288 100644 --- a/mlpf/tfmodel/model.py +++ b/mlpf/tfmodel/model.py @@ -1,19 +1,17 @@ -# This file contains the generic MLPF model definitions -# PFNetDense: the GNN-based model with graph building based on LSH and a Gaussian distance kernel -# PFNetTransformer: the transformer-based model using fast attention - import tensorflow as tf -from tfmodel.utils import batched_histogram_2d +# FIXME: this should be configurable regularizer_weight = 0.0 + def split_indices_to_bins(cmul, nbins, bin_size): bin_idx = tf.argmax(cmul, axis=-1) bins_split = tf.reshape(tf.argsort(bin_idx), (nbins, bin_size)) return bins_split + def split_indices_to_bins_batch(cmul, nbins, bin_size, msk): - bin_idx = tf.argmax(cmul, axis=-1) + tf.cast(tf.where(~msk, nbins-1, 0), tf.int64) + bin_idx = tf.argmax(cmul, axis=-1) + tf.cast(tf.where(~msk, nbins - 1, 0), tf.int64) bins_split = tf.reshape(tf.argsort(bin_idx), (tf.shape(cmul)[0], nbins, bin_size)) return bins_split @@ -28,39 +26,42 @@ def pairwise_l2_dist(A, B): # return pairwise euclidean difference matrix # note that this matrix multiplication can go out of range for float16 in case the absolute values of A and B are large - D = tf.sqrt(tf.maximum(na - 2*tf.matmul(A, B, False, True) + nb, 1e-6)) + D = tf.sqrt(tf.maximum(na - 2 * tf.matmul(A, B, False, True) + nb, 1e-6)) return D + def pairwise_l1_dist(A, B): na = tf.expand_dims(A, -2) nb = tf.expand_dims(B, -3) - D = tf.abs(tf.reduce_sum(na-nb, axis=-1)) + D = tf.abs(tf.reduce_sum(na - nb, axis=-1)) return D + def pairwise_learnable_dist(A, B, ffn, training=False): shp = tf.shape(A) - #stack node feature vectors of src[i], dst[j] into a matrix res[i,j] = (src[i], dst[j]) + # stack 
node feature vectors of src[i], dst[j] into a matrix res[i,j] = (src[i], dst[j]) mg = tf.meshgrid(tf.range(shp[0]), tf.range(shp[1]), tf.range(shp[2]), tf.range(shp[2]), indexing="ij") - inds1 = tf.stack([mg[0],mg[1],mg[2]], axis=-1) - inds2 = tf.stack([mg[0],mg[1],mg[3]], axis=-1) - res = tf.concat([ - tf.gather_nd(A, inds1), - tf.gather_nd(B, inds2)], axis=-1 - ) #(batch, bin, elem, elem, feat) - - #run a feedforward net on ffn(src, dst) -> output_dim + inds1 = tf.stack([mg[0], mg[1], mg[2]], axis=-1) + inds2 = tf.stack([mg[0], mg[1], mg[3]], axis=-1) + res = tf.concat([tf.gather_nd(A, inds1), tf.gather_nd(B, inds2)], axis=-1) # (batch, bin, elem, elem, feat) + + # run a feedforward net on ffn(src, dst) -> output_dim res_transformed = ffn(res, training=training) return res_transformed + def pairwise_sigmoid_dist(A, B): - return tf.nn.sigmoid(tf.matmul(A, tf.transpose(B, perm=[0,2,1]))) + return tf.nn.sigmoid(tf.matmul(A, tf.transpose(B, perm=[0, 2, 1]))) + """ sp_a: (nbatch, nelem, nelem) sparse distance matrices b: (nbatch, nelem, ncol) dense per-element feature matrices """ + + def sparse_dense_matmult_batch(sp_a, b): dtype = b.dtype @@ -71,41 +72,39 @@ def sparse_dense_matmult_batch(sp_a, b): def map_function(x): i, dense_slice = x[0], x[1] num_points = tf.shape(b)[1] - sparse_slice = tf.sparse.reshape(tf.sparse.slice( - tf.cast(sp_a, tf.float32), [i, 0, 0], [1, num_points, num_points]), - [num_points, num_points]) + sparse_slice = tf.sparse.reshape( + tf.sparse.slice(tf.cast(sp_a, tf.float32), [i, 0, 0], [1, num_points, num_points]), [num_points, num_points] + ) mult_slice = tf.sparse.sparse_dense_matmul(sparse_slice, dense_slice) return mult_slice elems = (tf.range(0, num_batches, delta=1, dtype=tf.int64), b) ret = tf.map_fn(map_function, elems, fn_output_signature=tf.TensorSpec((None, None), b.dtype), back_prop=True) - return tf.cast(ret, dtype) + return tf.cast(ret, dtype) + @tf.function def reverse_lsh(bins_split, points_binned_enc): # batch_dim = points_binned_enc.shape[0] # n_points = points_binned_enc.shape[1]*points_binned_enc.shape[2] # n_features = points_binned_enc.shape[-1] - + shp = tf.shape(points_binned_enc) batch_dim = shp[0] - n_points = shp[1]*shp[2] + n_points = shp[1] * shp[2] n_features = shp[-1] bins_split_flat = tf.reshape(bins_split, (batch_dim, n_points)) points_binned_enc_flat = tf.reshape(points_binned_enc, (batch_dim, n_points, n_features)) - + batch_inds = tf.reshape(tf.repeat(tf.range(batch_dim), n_points), (batch_dim, n_points)) bins_split_flat_batch = tf.stack([batch_inds, bins_split_flat], axis=-1) - ret = tf.scatter_nd( - bins_split_flat_batch, - points_binned_enc_flat, - shape=(batch_dim, n_points, n_features) - ) - + ret = tf.scatter_nd(bins_split_flat_batch, points_binned_enc_flat, shape=(batch_dim, n_points, n_features)) + return ret + class InputEncoding(tf.keras.layers.Layer): def __init__(self, num_input_classes): super(InputEncoding, self).__init__() @@ -113,17 +112,19 @@ def __init__(self, num_input_classes): """ X: [Nbatch, Nelem, Nfeat] array of all the input detector element feature data - """ + """ + @tf.function def call(self, X): - #X[:, :, 0] - categorical index of the element type + # X[:, :, 0] - categorical index of the element type Xid = tf.cast(tf.one_hot(tf.cast(X[:, :, 0], tf.int32), self.num_input_classes), dtype=X.dtype) - #X[:, :, 1:] - all the other non-categorical features + # X[:, :, 1:] - all the other non-categorical features Xprop = X[:, :, 1:] return tf.concat([Xid, Xprop], axis=-1) + """ For the CMS dataset, 
precompute additional features: - log of pt and energy @@ -131,6 +132,8 @@ def call(self, X): - sin, cos of phi angles - scale layer and depth values (small integers) to a larger dynamic range """ + + class InputEncodingCMS(tf.keras.layers.Layer): def __init__(self, num_input_classes): super(InputEncodingCMS, self).__init__() @@ -138,9 +141,10 @@ def __init__(self, num_input_classes): """ X: [Nbatch, Nelem, Nfeat] array of all the input detector element feature data - """ + """ + def call(self, X): - #X[:, :, 0] - categorical index of the element type + # X[:, :, 0] - categorical index of the element type Xid = tf.cast(tf.one_hot(tf.cast(X[:, :, 0], tf.int32), self.num_input_classes), dtype=X.dtype) Xpt = tf.expand_dims(tf.math.log(X[:, :, 1] + 1.0), axis=-1) Xe = tf.expand_dims(tf.math.log(X[:, :, 4] + 1.0), axis=-1) @@ -162,18 +166,30 @@ def call(self, X): Xphi_hcal1 = tf.expand_dims(tf.sin(X[:, :, 12]), axis=-1) Xphi_hcal2 = tf.expand_dims(tf.cos(X[:, :, 12]), axis=-1) - return tf.concat([ - Xid, - Xpt, Xpt_0p5, Xpt_2, - Xeta1, Xeta2, - Xabs_eta, - Xphi1, Xphi2, - Xe, Xe_0p5, Xe_2, - Xphi_ecal1, Xphi_ecal2, - Xphi_hcal1, Xphi_hcal2, - X], axis=-1 + return tf.concat( + [ + Xid, + Xpt, + Xpt_0p5, + Xpt_2, + Xeta1, + Xeta2, + Xabs_eta, + Xphi1, + Xphi2, + Xe, + Xe_0p5, + Xe_2, + Xphi_ecal1, + Xphi_ecal2, + Xphi_hcal1, + Xphi_hcal2, + X, + ], + axis=-1, ) + class GHConvDense(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): self.activation = getattr(tf.keras.activations, kwargs.pop("activation")) @@ -185,40 +201,66 @@ def __init__(self, *args, **kwargs): def build(self, input_shape): self.hidden_dim = input_shape[0][-1] self.nelem = input_shape[0][-2] - self.W_t = self.add_weight(shape=(self.hidden_dim, self.output_dim), name="w_t", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - self.b_t = self.add_weight(shape=(self.output_dim,), name="b_t", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - self.W_h = self.add_weight(shape=(self.hidden_dim, self.output_dim), name="w_h", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - self.theta = self.add_weight(shape=(self.hidden_dim, self.output_dim), name="theta", initializer="random_normal", trainable=True, regularizer=tf.keras.regularizers.L1(regularizer_weight)) - + self.W_t = self.add_weight( + shape=(self.hidden_dim, self.output_dim), + name="w_t", + initializer="random_normal", + trainable=True, + regularizer=tf.keras.regularizers.L1(regularizer_weight), + ) + self.b_t = self.add_weight( + shape=(self.output_dim,), + name="b_t", + initializer="random_normal", + trainable=True, + regularizer=tf.keras.regularizers.L1(regularizer_weight), + ) + self.W_h = self.add_weight( + shape=(self.hidden_dim, self.output_dim), + name="w_h", + initializer="random_normal", + trainable=True, + regularizer=tf.keras.regularizers.L1(regularizer_weight), + ) + self.theta = self.add_weight( + shape=(self.hidden_dim, self.output_dim), + name="theta", + initializer="random_normal", + trainable=True, + regularizer=tf.keras.regularizers.L1(regularizer_weight), + ) + """ x: [batches, bins, elements, features] adj: [batches, bins, elements, elements] msk: [batches, bins, elements] """ + def call(self, inputs): x, adj, msk = inputs adj = tf.squeeze(adj) - - #compute the normalization of the adjacency matrix + + # compute the normalization of the adjacency matrix if self.normalize_degrees: - 
#in_degrees = tf.clip_by_value(tf.reduce_sum(tf.abs(adj), axis=-1), 0, 1000) + # in_degrees = tf.clip_by_value(tf.reduce_sum(tf.abs(adj), axis=-1), 0, 1000) in_degrees = tf.reduce_sum(tf.abs(adj), axis=-1) - #add epsilon to prevent numerical issues from 1/sqrt(x) - norm = tf.expand_dims(tf.pow(in_degrees + 1e-6, -0.5), -1)*msk + # add epsilon to prevent numerical issues from 1/sqrt(x) + norm = tf.expand_dims(tf.pow(in_degrees + 1e-6, -0.5), -1) * msk - f_hom = tf.linalg.matmul(x*msk, self.theta)*msk + f_hom = tf.linalg.matmul(x * msk, self.theta) * msk if self.normalize_degrees: - f_hom = tf.linalg.matmul(adj, f_hom*norm)*norm + f_hom = tf.linalg.matmul(adj, f_hom * norm) * norm else: f_hom = tf.linalg.matmul(adj, f_hom) - f_het = tf.linalg.matmul(x*msk, self.W_h) + f_het = tf.linalg.matmul(x * msk, self.W_h) gate = tf.nn.sigmoid(tf.linalg.matmul(x, self.W_t) + self.b_t) - out = gate*f_hom + (1.0-gate)*f_het - return self.activation(out)*msk + out = gate * f_hom + (1.0 - gate) * f_het + return self.activation(out) * msk + class NodeMessageLearnable(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): @@ -232,27 +274,30 @@ def __init__(self, *args, **kwargs): self.hidden_dim, num_layers=self.num_layers, activation=self.activation, - name=kwargs.get("name")+"_ffn" + name=kwargs.get("name") + "_ffn", ) super(NodeMessageLearnable, self).__init__(*args, **kwargs) def call(self, inputs): x, adj, msk = inputs - #collect incoming messages (batch, bins, elems, elems, msg_dim) -> (batch, bins, elems, msg_dim) + # collect incoming messages (batch, bins, elems, elems, msg_dim) -> (batch, bins, elems, msg_dim) max_message_dst = tf.reduce_max(adj, axis=-2) - #collect outgoing messages (batch, bins, elems, elems, msg_dim) -> (batch, bins, elems, msg_dim) + # collect outgoing messages (batch, bins, elems, elems, msg_dim) -> (batch, bins, elems, msg_dim) max_message_src = tf.reduce_max(adj, axis=-3) - #node update (batch, bins, elems, elems, elem_dim + msg_dim + msg_dim) + # node update (batch, bins, elems, elems, elem_dim + msg_dim + msg_dim) x2 = tf.concat([x, max_message_dst, max_message_src], axis=-1) return tf.cast(self.activation(self.ffn(x2)), x.dtype) -def point_wise_feed_forward_network(d_model, dff, name, num_layers=1, activation='elu', dtype=tf.dtypes.float32, dim_decrease=False, dropout=0.0): + +def point_wise_feed_forward_network( + d_model, dff, name, num_layers=1, activation="elu", dtype=tf.dtypes.float32, dim_decrease=False, dropout=0.0 +): if regularizer_weight > 0: - bias_regularizer = tf.keras.regularizers.L1(regularizer_weight) + bias_regularizer = tf.keras.regularizers.L1(regularizer_weight) kernel_regularizer = tf.keras.regularizers.L1(regularizer_weight) else: bias_regularizer = None @@ -262,30 +307,35 @@ def point_wise_feed_forward_network(d_model, dff, name, num_layers=1, activation for ilayer in range(num_layers): _name = name + "_dense_{}".format(ilayer) - layers.append(tf.keras.layers.Dense( - dff, activation=activation, bias_regularizer=bias_regularizer, - kernel_regularizer=kernel_regularizer, name=_name)) + layers.append( + tf.keras.layers.Dense( + dff, + activation=activation, + bias_regularizer=bias_regularizer, + kernel_regularizer=kernel_regularizer, + name=_name, + ) + ) - if dropout>0.0: + if dropout > 0.0: layers.append(tf.keras.layers.Dropout(dropout)) if dim_decrease: dff = dff // 2 - layers.append(tf.keras.layers.Dense(d_model, dtype=dtype, name="{}_dense_{}".format(name, ilayer+1))) + layers.append(tf.keras.layers.Dense(d_model, dtype=dtype, 
name="{}_dense_{}".format(name, ilayer + 1))) return tf.keras.Sequential(layers, name=name) + def get_message_layer(config_dict, name): config_dict = config_dict.copy() class_name = config_dict.pop("type") - classes = { - "NodeMessageLearnable": NodeMessageLearnable, - "GHConvDense": GHConvDense - } + classes = {"NodeMessageLearnable": NodeMessageLearnable, "GHConvDense": GHConvDense} conv_cls = classes[class_name] return conv_cls(name=name, **config_dict) + class NodePairGaussianKernel(tf.keras.layers.Layer): def __init__(self, **kwargs): self.clip_value_low = kwargs.pop("clip_value_low", 0.0) @@ -305,13 +355,15 @@ def __init__(self, **kwargs): returns: (n_batch, n_bins, n_points, n_points, 1) message matrix """ + def call(self, x_msg_binned, msk, training=False): - x = x_msg_binned*msk + x = x_msg_binned * msk dm = tf.expand_dims(self.dist_norm(x, x), axis=-1) - dm = tf.exp(-self.dist_mult*dm) + dm = tf.exp(-self.dist_mult * dm) dm = tf.clip_by_value(dm, self.clip_value_low, 1) return dm + class NodePairTrainableKernel(tf.keras.layers.Layer): def __init__(self, output_dim=4, hidden_dim_node=128, hidden_dim_pair=32, num_layers=1, activation="elu", **kwargs): self.output_dim = output_dim @@ -325,7 +377,7 @@ def __init__(self, output_dim=4, hidden_dim_node=128, hidden_dim_pair=32, num_la self.hidden_dim_node, kwargs.get("name") + "_" + "node", num_layers=self.num_layers, - activation=self.activation + activation=self.activation, ) self.pair_kernel = point_wise_feed_forward_network( @@ -333,7 +385,7 @@ def __init__(self, output_dim=4, hidden_dim_node=128, hidden_dim_pair=32, num_la self.hidden_dim_pair, kwargs.get("name") + "_" + "pair_kernel", num_layers=self.num_layers, - activation=self.activation + activation=self.activation, ) super(NodePairTrainableKernel, self).__init__(**kwargs) @@ -343,6 +395,7 @@ def __init__(self, output_dim=4, hidden_dim_node=128, hidden_dim_pair=32, num_la returns: (n_batch, n_bins, n_points, n_points, output_dim) message matrix """ + def call(self, x_msg_binned, msk, training=False): node_proj = self.activation(self.ffn_node(x_msg_binned)) @@ -350,17 +403,16 @@ def call(self, x_msg_binned, msk, training=False): dm = tf.cast(pairwise_learnable_dist(node_proj, node_proj, self.pair_kernel, training=training), x_msg_binned.dtype) return dm + def build_kernel_from_conf(kernel_dict, name): kernel_dict = kernel_dict.copy() cls_type = kernel_dict.pop("type") - clss = { - "NodePairGaussianKernel": NodePairGaussianKernel, - "NodePairTrainableKernel": NodePairTrainableKernel - } + clss = {"NodePairGaussianKernel": NodePairGaussianKernel, "NodePairTrainableKernel": NodePairTrainableKernel} return clss[cls_type](name=name, **kernel_dict) + class MessageBuildingLayerLSH(tf.keras.layers.Layer): def __init__(self, distance_dim=128, max_num_bins=200, bin_size=128, kernel=NodePairGaussianKernel(), **kwargs): self.distance_dim = distance_dim @@ -371,43 +423,44 @@ def __init__(self, distance_dim=128, max_num_bins=200, bin_size=128, kernel=Node super(MessageBuildingLayerLSH, self).__init__(**kwargs) def build(self, input_shape): - #(n_batch, n_points, n_features) - - #generate the LSH codebook for random rotations (num_features, max_num_bins/2) + # (n_batch, n_points, n_features) + + # generate the LSH codebook for random rotations (num_features, max_num_bins/2) self.codebook_random_rotations = self.add_weight( - shape=(self.distance_dim, self.max_num_bins//2), initializer="random_normal", - trainable=False, name="lsh_projections" + shape=(self.distance_dim, self.max_num_bins // 
2), + initializer="random_normal", + trainable=False, + name="lsh_projections", ) - + """ x_msg: (n_batch, n_points, n_msg_features) x_node: (n_batch, n_points, n_node_features) """ + def call(self, x_msg, x_node, msk, training=False): msk_f = tf.expand_dims(tf.cast(msk, x_msg.dtype), -1) shp = tf.shape(x_msg) - n_batches = shp[0] n_points = shp[1] - n_message_features = shp[2] - #compute the number of LSH bins to divide the input points into on the fly - #n_points must be divisible by bin_size exactly due to the use of reshape + # compute the number of LSH bins to divide the input points into on the fly + # n_points must be divisible by bin_size exactly due to the use of reshape n_bins = tf.math.floordiv(n_points, self.bin_size) - #put each input item into a bin defined by the argmax output across the LSH embedding - #FIXME: this needs n_bins to be at least 2 to work correctly! - mul = tf.linalg.matmul(x_msg, self.codebook_random_rotations[:, :n_bins//2]) + # put each input item into a bin defined by the argmax output across the LSH embedding + # FIXME: this needs n_bins to be at least 2 to work correctly! + mul = tf.linalg.matmul(x_msg, self.codebook_random_rotations[:, : n_bins // 2]) cmul = tf.concat([mul, -mul], axis=-1) bins_split = split_indices_to_bins_batch(cmul, n_bins, self.bin_size, msk) x_msg_binned = tf.gather(x_msg, bins_split, batch_dims=1) x_features_binned = tf.gather(x_node, bins_split, batch_dims=1) msk_f_binned = tf.gather(msk_f, bins_split, batch_dims=1) - #Run the node-to-node kernel (distance computation / graph building / attention) + # Run the node-to-node kernel (distance computation / graph building / attention) dm = self.kernel(x_msg_binned, msk_f_binned, training=training) - #remove the masked points row-wise and column-wise + # remove the masked points row-wise and column-wise msk_f_binned_squeeze = tf.squeeze(msk_f_binned, axis=-1) shp_dm = tf.shape(dm) rshp_row = [shp_dm[0], shp_dm[1], shp_dm[2], 1, 1] @@ -419,73 +472,64 @@ def call(self, x_msg, x_node, msk, training=False): return bins_split, x_features_binned, dm, msk_f_binned + class MessageBuildingLayerFull(tf.keras.layers.Layer): def __init__(self, distance_dim=128, kernel=NodePairGaussianKernel(), **kwargs): self.distance_dim = distance_dim self.kernel = kernel super(MessageBuildingLayerFull, self).__init__(**kwargs) - + """ x_msg: (n_batch, n_points, n_msg_features) """ + def call(self, x_msg, msk, training=False): msk_f = tf.expand_dims(tf.cast(msk, x_msg.dtype), -1) - shp = tf.shape(x_msg) - n_batches = shp[0] - n_points = shp[1] - n_message_features = shp[2] - - #Run the node-to-node kernel (distance computation / graph building / attention) + # Run the node-to-node kernel (distance computation / graph building / attention) dm = self.kernel(x_msg, training=training) - #remove the masked points row-wise and column-wise + # remove the masked points row-wise and column-wise dm = tf.einsum("bijk,bi->bijk", dm, tf.squeeze(msk_f, axis=-1)) dm = tf.einsum("bijk,bj->bijk", dm, tf.squeeze(msk_f, axis=-1)) return dm + class OutputDecoding(tf.keras.Model): - def __init__(self, + def __init__( + self, activation="elu", regression_use_classification=True, num_output_classes=8, schema="cms", dropout=0.0, - - pt_skip_gate=True, - eta_skip_gate=True, - phi_skip_gate=True, energy_skip_gate=True, - id_dim_decrease=True, charge_dim_decrease=True, pt_dim_decrease=False, eta_dim_decrease=False, phi_dim_decrease=False, energy_dim_decrease=False, - id_hidden_dim=128, charge_hidden_dim=128, pt_hidden_dim=128, 
eta_hidden_dim=128, phi_hidden_dim=128, energy_hidden_dim=128, - id_num_layers=4, charge_num_layers=2, pt_num_layers=3, eta_num_layers=3, phi_num_layers=3, energy_num_layers=3, - layernorm=False, mask_reg_cls0=True, energy_multimodal=True, - event_set_output=False, - **kwargs): + **kwargs + ): super(OutputDecoding, self).__init__(**kwargs) @@ -493,10 +537,6 @@ def __init__(self, self.schema = schema self.dropout = dropout - self.pt_skip_gate = pt_skip_gate - self.eta_skip_gate = eta_skip_gate - self.phi_skip_gate = phi_skip_gate - self.mask_reg_cls0 = mask_reg_cls0 self.energy_multimodal = energy_multimodal @@ -508,58 +548,71 @@ def __init__(self, self.event_set_output = event_set_output self.ffn_id = point_wise_feed_forward_network( - num_output_classes, id_hidden_dim, + num_output_classes, + id_hidden_dim, "ffn_cls", num_layers=id_num_layers, activation=activation, dim_decrease=id_dim_decrease, - dropout=dropout + dropout=dropout, ) self.ffn_charge = point_wise_feed_forward_network( - 1, charge_hidden_dim, + 1, + charge_hidden_dim, "ffn_charge", num_layers=charge_num_layers, activation=activation, dim_decrease=charge_dim_decrease, - dropout=dropout + dropout=dropout, ) - + self.ffn_pt = point_wise_feed_forward_network( - 2, pt_hidden_dim, "ffn_pt", + 2, + pt_hidden_dim, + "ffn_pt", num_layers=pt_num_layers, activation=activation, dim_decrease=pt_dim_decrease, - dropout=dropout + dropout=dropout, ) self.ffn_eta = point_wise_feed_forward_network( - 2, eta_hidden_dim, "ffn_eta", + 1, + eta_hidden_dim, + "ffn_eta", num_layers=eta_num_layers, activation=activation, dim_decrease=eta_dim_decrease, - dropout=dropout + dropout=dropout, ) + # sin_phi, cos_phi outputs self.ffn_phi = point_wise_feed_forward_network( - 4, phi_hidden_dim, "ffn_phi", + 2, + phi_hidden_dim, + "ffn_phi", num_layers=phi_num_layers, activation=activation, dim_decrease=phi_dim_decrease, - dropout=dropout + dropout=dropout, ) self.ffn_energy = point_wise_feed_forward_network( - num_output_classes if self.energy_multimodal else 1, energy_hidden_dim, "ffn_energy", + num_output_classes if self.energy_multimodal else 1, + energy_hidden_dim, + "ffn_energy", num_layers=energy_num_layers, activation=activation, dim_decrease=energy_dim_decrease, - dropout=dropout) + dropout=dropout, + ) """ X_input: (n_batch, n_elements, n_input_features) raw node input features X_encoded: (n_batch, n_elements, n_encoded_features) encoded/transformed node features msk_input: (n_batch, n_elements) boolean mask of active nodes """ + def call(self, args, training=False): X_input, X_encoded, X_encoded_energy, msk_input = args @@ -572,104 +625,94 @@ def call(self, args, training=False): msk_input_outtype = tf.cast(msk_input, out_id_logits.dtype) out_id_softmax = tf.nn.softmax(out_id_logits, axis=-1) - out_id_hard_softmax = tf.stop_gradient(tf.nn.softmax(100*out_id_logits, axis=-1)) + out_id_hard_softmax = tf.stop_gradient(tf.nn.softmax(100 * out_id_logits, axis=-1)) out_charge = self.ffn_charge(X_encoded, training=training) out_charge = out_charge * msk_input_outtype orig_eta = tf.cast(X_input[:, :, 2:3], out_id_logits.dtype) - #FIXME: better schema propagation between hep_tfds - #skip connection from raw input values + # FIXME: better schema propagation between hep_tfds + # skip connection from raw input values if self.schema == "cms": - orig_sin_phi = tf.cast(tf.math.sin(X_input[:, :, 3:4])*msk_input, out_id_logits.dtype) - orig_cos_phi = tf.cast(tf.math.cos(X_input[:, :, 3:4])*msk_input, out_id_logits.dtype) - orig_energy = tf.cast(X_input[:, :, 
4:5]*msk_input, out_id_logits.dtype) + orig_sin_phi = tf.cast(tf.math.sin(X_input[:, :, 3:4]) * msk_input, out_id_logits.dtype) + orig_cos_phi = tf.cast(tf.math.cos(X_input[:, :, 3:4]) * msk_input, out_id_logits.dtype) + orig_energy = tf.cast(X_input[:, :, 4:5] * msk_input, out_id_logits.dtype) elif self.schema == "delphes": - orig_sin_phi = tf.cast(X_input[:, :, 3:4]*msk_input, out_id_logits.dtype) - orig_cos_phi = tf.cast(X_input[:, :, 4:5]*msk_input, out_id_logits.dtype) - orig_energy = tf.cast(X_input[:, :, 5:6]*msk_input, out_id_logits.dtype) + orig_sin_phi = tf.cast(X_input[:, :, 3:4] * msk_input, out_id_logits.dtype) + orig_cos_phi = tf.cast(X_input[:, :, 4:5] * msk_input, out_id_logits.dtype) + orig_energy = tf.cast(X_input[:, :, 5:6] * msk_input, out_id_logits.dtype) if self.regression_use_classification: X_encoded = tf.concat([X_encoded, tf.cast(tf.stop_gradient(out_id_logits), X_encoded.dtype)], axis=-1) pred_eta_corr = self.ffn_eta(X_encoded, training=training) - pred_eta_corr = pred_eta_corr*msk_input_outtype + pred_eta_corr = pred_eta_corr * msk_input_outtype pred_phi_corr = self.ffn_phi(X_encoded, training=training) - pred_phi_corr = pred_phi_corr*msk_input_outtype + pred_phi_corr = pred_phi_corr * msk_input_outtype - if self.eta_skip_gate: - eta_gate = tf.keras.activations.sigmoid(pred_eta_corr[:, :, 0:1]) - pred_eta = orig_eta + pred_eta_corr[:, :, 1:2] - else: - pred_eta = orig_eta*pred_eta_corr[:, :, 0:1] + pred_eta_corr[:, :, 1:2] - - if self.phi_skip_gate: - sin_phi_gate = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 0:1]) - cos_phi_gate = tf.keras.activations.sigmoid(pred_phi_corr[:, :, 2:3]) - pred_sin_phi = orig_sin_phi + pred_phi_corr[:, :, 1:2] - pred_cos_phi = orig_cos_phi + pred_phi_corr[:, :, 3:4] - else: - pred_sin_phi = orig_sin_phi*pred_phi_corr[:, :, 0:1] + pred_phi_corr[:, :, 1:2] - pred_cos_phi = orig_cos_phi*pred_phi_corr[:, :, 2:3] + pred_phi_corr[:, :, 3:4] + pred_eta = orig_eta + pred_eta_corr[:, :, 0:1] + pred_sin_phi = orig_sin_phi + pred_phi_corr[:, :, 0:1] + pred_cos_phi = orig_cos_phi + pred_phi_corr[:, :, 1:2] X_encoded_energy = tf.concat([X_encoded, X_encoded_energy], axis=-1) if self.regression_use_classification: - X_encoded_energy = tf.concat([X_encoded_energy, tf.cast(tf.stop_gradient(out_id_logits), X_encoded.dtype)], axis=-1) + X_encoded_energy = tf.concat( + [X_encoded_energy, tf.cast(tf.stop_gradient(out_id_logits), X_encoded.dtype)], axis=-1 + ) pred_energy_corr = self.ffn_energy(X_encoded_energy, training=training) - pred_energy_corr = pred_energy_corr*msk_input_outtype + pred_energy_corr = pred_energy_corr * msk_input_outtype - #In case of a multimodal prediction, weight the per-class energy predictions by the approximately one-hot vector + # In case of a multimodal prediction, weight the per-class energy predictions by the approximately one-hot vector if self.energy_multimodal: - pred_energy = orig_energy+tf.reduce_sum(out_id_hard_softmax*pred_energy_corr, axis=-1, keepdims=True) + pred_energy = orig_energy + tf.reduce_sum(out_id_hard_softmax * pred_energy_corr, axis=-1, keepdims=True) else: - pred_energy = orig_energy+pred_energy_corr + pred_energy = orig_energy + pred_energy_corr pred_energy = tf.abs(pred_energy) - #compute pt=E/cosh(eta) + # compute pt=E/cosh(eta) + # FIXME: check if this is actually useful orig_pt = tf.stop_gradient(pred_energy / tf.math.cosh(tf.clip_by_value(pred_eta, -8, 8))) pred_pt_corr = self.ffn_pt(X_encoded_energy, training=training) - pred_pt_corr = pred_pt_corr*msk_input_outtype + pred_pt_corr = 
pred_pt_corr * msk_input_outtype + pred_pt = orig_pt * pred_pt_corr[:, :, 0:1] + pred_pt_corr[:, :, 1:2] - if self.pt_skip_gate: - pt_gate = tf.keras.activations.sigmoid(pred_pt_corr[:, :, 0:1]) - pred_pt = orig_pt + pt_gate*pred_pt_corr[:, :, 1:2] - else: - pred_pt = orig_pt*pred_pt_corr[:, :, 0:1] + pred_pt_corr[:, :, 1:2] - pred_pt = tf.abs(pred_pt) - #mask the regression outputs for the nodes with a class prediction 0 - msk_output = tf.expand_dims(tf.cast(tf.argmax(out_id_hard_softmax, axis=-1)!=0, tf.float32), axis=-1) + # mask the regression outputs for the nodes with a class prediction 0 + msk_output = tf.expand_dims(tf.cast(tf.argmax(out_id_hard_softmax, axis=-1) != 0, tf.float32), axis=-1) if self.mask_reg_cls0: - out_charge = out_charge*msk_output - pred_pt = pred_pt*msk_output - pred_eta = pred_eta*msk_output - pred_sin_phi = pred_sin_phi*msk_output - pred_cos_phi = pred_cos_phi*msk_output - pred_energy = pred_energy*msk_output + out_charge = out_charge * msk_output + pred_pt = pred_pt * msk_output + pred_eta = pred_eta * msk_output + pred_sin_phi = pred_sin_phi * msk_output + pred_cos_phi = pred_cos_phi * msk_output + pred_energy = pred_energy * msk_output ret = { "cls": out_id_softmax, - "charge": out_charge*msk_input_outtype, - "pt": pred_pt*msk_input_outtype, - "eta": pred_eta*msk_input_outtype, - "sin_phi": pred_sin_phi*msk_input_outtype, - "cos_phi": pred_cos_phi*msk_input_outtype, - "energy": pred_energy*msk_input_outtype, + "charge": out_charge * msk_input_outtype, + "pt": pred_pt * msk_input_outtype, + "eta": pred_eta * msk_input_outtype, + "sin_phi": pred_sin_phi * msk_input_outtype, + "cos_phi": pred_cos_phi * msk_input_outtype, + "energy": pred_energy * msk_input_outtype, } if self.event_set_output: - pt_e_eta_phi = tf.concat([ - pred_pt*msk_input_outtype, - pred_energy*msk_input_outtype, - pred_eta*msk_input_outtype, - pred_sin_phi*msk_input_outtype, - pred_cos_phi*msk_input_outtype - ], axis=-1) + pt_e_eta_phi = tf.concat( + [ + pred_pt * msk_input_outtype, + pred_energy * msk_input_outtype, + pred_eta * msk_input_outtype, + pred_sin_phi * msk_input_outtype, + pred_cos_phi * msk_input_outtype, + ], + axis=-1, + ) ret["pt_e_eta_phi"] = pt_e_eta_phi return ret @@ -690,9 +733,10 @@ def set_trainable_classification(self): self.ffn_pt.trainable = False self.ffn_energy.trainable = False + class CombinedGraphLayer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): - + self.max_num_bins = kwargs.pop("max_num_bins") self.bin_size = kwargs.pop("bin_size") self.distance_dim = kwargs.pop("distance_dim") @@ -708,31 +752,34 @@ def __init__(self, *args, **kwargs): self.dist_activation = getattr(tf.keras.activations, kwargs.pop("dist_activation", "linear")) if self.do_layernorm: - self.layernorm1 = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-6, name=kwargs.get("name")+"_layernorm1") + self.layernorm1 = tf.keras.layers.LayerNormalization( + axis=-1, epsilon=1e-6, name=kwargs.get("name") + "_layernorm1" + ) - #self.gaussian_noise = tf.keras.layers.GaussianNoise(0.01) + # self.gaussian_noise = tf.keras.layers.GaussianNoise(0.01) self.ffn_dist = point_wise_feed_forward_network( self.distance_dim, self.ffn_dist_hidden_dim, kwargs.get("name") + "_ffn_dist", - num_layers=self.ffn_dist_num_layers, activation=self.activation, - dropout=self.dropout + num_layers=self.ffn_dist_num_layers, + activation=self.activation, + dropout=self.dropout, ) if self.do_lsh: self.message_building_layer = MessageBuildingLayerLSH( distance_dim=self.distance_dim, 
max_num_bins=self.max_num_bins, bin_size=self.bin_size, - kernel=build_kernel_from_conf(self.kernel, kwargs.get("name")+"_kernel") + kernel=build_kernel_from_conf(self.kernel, kwargs.get("name") + "_kernel"), ) else: self.message_building_layer = MessageBuildingLayerFull( - distance_dim=self.distance_dim, - kernel=build_kernel_from_conf(self.kernel, kwargs.get("name")+"_kernel") + distance_dim=self.distance_dim, kernel=build_kernel_from_conf(self.kernel, kwargs.get("name") + "_kernel") ) self.message_passing_layers = [ - get_message_layer(self.node_message, "{}_msg_{}".format(kwargs.get("name"), iconv)) for iconv in range(self.num_node_messages) + get_message_layer(self.node_message, "{}_msg_{}".format(kwargs.get("name"), iconv)) + for iconv in range(self.num_node_messages) ] self.dropout_layer = None if self.dropout: @@ -745,64 +792,66 @@ def call(self, x, msk, training=False): if self.do_layernorm: x = self.layernorm1(x, training=training) - #compute node features for graph building + # compute node features for graph building x_dist = self.dist_activation(self.ffn_dist(x, training=training)) - #compute the element-to-element messages / distance matrix / graph structure + # compute the element-to-element messages / distance matrix / graph structure if self.do_lsh: bins_split, x, dm, msk_f = self.message_building_layer(x_dist, x, msk) - #bins_split: (FIXME) - #x: (batch, bin, elem, node_feature) - #dm: (batch, bin, elem, elem, pair_feature) - #msk_f: (batch, bin, elem, elem, 1) + # bins_split: (FIXME) + # x: (batch, bin, elem, node_feature) + # dm: (batch, bin, elem, elem, pair_feature) + # msk_f: (batch, bin, elem, elem, 1) else: dm = self.message_building_layer(x_dist, msk) msk_f = tf.expand_dims(tf.cast(msk, x.dtype), axis=-1) bins_split = None - #dm: (batch, elem, elem, pair_feature) + # dm: (batch, elem, elem, pair_feature) - #run the node update with message passing + # run the node update with message passing for msg in self.message_passing_layers: x = msg((x, dm, msk_f)) if self.dropout_layer: x = self.dropout_layer(x, training=training) - #undo the binning according to the element-to-bin indices + # undo the binning according to the element-to-bin indices if self.do_lsh: x = reverse_lsh(bins_split, x) return {"enc": x, "dist": x_dist, "bins": bins_split, "dm": dm} + class PFNetDense(tf.keras.Model): - def __init__(self, - do_node_encoding=False, - node_encoding_hidden_dim=128, - dropout=0.0, - activation="gelu", - multi_output=False, - num_input_classes=8, - num_output_classes=3, - num_graph_layers_id=1, - num_graph_layers_reg=1, - input_encoding="cms", - skip_connection=True, - graph_kernel={}, - combined_graph_layer={}, - node_message={}, - output_decoding={}, - debug=False, - schema="cms", - node_update_mode="concat", - event_set_output=False, - **kwargs - ): + def __init__( + self, + do_node_encoding=False, + node_encoding_hidden_dim=128, + dropout=0.0, + activation="gelu", + multi_output=False, + num_input_classes=8, + num_output_classes=3, + num_graph_layers_id=1, + num_graph_layers_reg=1, + input_encoding="cms", + skip_connection=True, + graph_kernel={}, + combined_graph_layer={}, + node_message={}, + output_decoding={}, + debug=False, + schema="cms", + node_update_mode="concat", + event_set_output=False, + **kwargs + ): super(PFNetDense, self).__init__() self.multi_output = multi_output self.debug = debug self.skip_connection = skip_connection - + self.do_node_encoding = do_node_encoding self.node_encoding_hidden_dim = node_encoding_hidden_dim self.dropout = dropout @@ 
-816,7 +865,7 @@ def __init__(self, "node_encoding", num_layers=1, activation=self.activation, - dropout=self.dropout + dropout=self.dropout, ) if input_encoding == "cms": @@ -824,8 +873,12 @@ def __init__(self, elif input_encoding == "default": self.enc = InputEncoding(num_input_classes) - self.cg_id = [CombinedGraphLayer(name="cg_id_{}".format(i), **combined_graph_layer) for i in range(num_graph_layers_id)] - self.cg_reg = [CombinedGraphLayer(name="cg_reg_{}".format(i), **combined_graph_layer) for i in range(num_graph_layers_reg)] + self.cg_id = [ + CombinedGraphLayer(name="cg_id_{}".format(i), **combined_graph_layer) for i in range(num_graph_layers_id) + ] + self.cg_reg = [ + CombinedGraphLayer(name="cg_reg_{}".format(i), **combined_graph_layer) for i in range(num_graph_layers_reg) + ] output_decoding["schema"] = schema output_decoding["num_output_classes"] = num_output_classes @@ -836,10 +889,10 @@ def call(self, inputs, training=False): X = inputs debugging_data = {} - #encode the elements for classification (id) + # encode the elements for classification (id) X_enc = self.enc(X) - #mask padded elements + # mask padded elements msk = X[:, :, 0] != 0 msk_input = tf.expand_dims(tf.cast(msk, X_enc.dtype), -1) @@ -863,9 +916,9 @@ def call(self, inputs, training=False): if self.debug: debugging_data[cg.name] = enc_all - + if self.node_update_mode == "concat": - dec_output_id = tf.concat(encs_id, axis=-1)*msk_input + dec_output_id = tf.concat(encs_id, axis=-1) * msk_input elif self.node_update_mode == "additive": dec_output_id = X_enc_cg @@ -890,7 +943,7 @@ def call(self, inputs, training=False): encs_reg.append(X_enc_cg) if self.node_update_mode == "concat": - dec_output_reg = tf.concat(encs_reg, axis=-1)*msk_input + dec_output_reg = tf.concat(encs_reg, axis=-1) * msk_input elif self.node_update_mode == "additive": dec_output_reg = X_enc_cg @@ -907,7 +960,9 @@ def call(self, inputs, training=False): if self.multi_output: return ret else: - return tf.concat([ret["cls"], ret["charge"], ret["pt"], ret["eta"], ret["sin_phi"], ret["cos_phi"], ret["energy"]], axis=-1) + return tf.concat( + [ret["cls"], ret["charge"], ret["pt"], ret["eta"], ret["sin_phi"], ret["cos_phi"], ret["energy"]], axis=-1 + ) def set_trainable_named(self, layer_names): self.trainable = True @@ -961,14 +1016,20 @@ def set_trainable_named(self, layer_names): # self.step += 1 # return {m.name: m.result() for m in self.metrics} + class KernelEncoder(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): from official.nlp.modeling.layers.kernel_attention import KernelAttention + self.key_dim = kwargs.pop("key_dim") num_heads = 2 - self.attn = KernelAttention(feature_transform="elu", num_heads=num_heads, key_dim=self.key_dim, name=kwargs.get("name") + "_attention") - self.ffn = point_wise_feed_forward_network(self.key_dim, self.key_dim, kwargs.get("name") + "_ffn", num_layers=1, activation="elu") + self.attn = KernelAttention( + feature_transform="elu", num_heads=num_heads, key_dim=self.key_dim, name=kwargs.get("name") + "_attention" + ) + self.ffn = point_wise_feed_forward_network( + self.key_dim, self.key_dim, kwargs.get("name") + "_ffn", num_layers=1, activation="elu" + ) self.norm1 = tf.keras.layers.LayerNormalization(axis=-1, name=kwargs.get("name") + "_ln0") self.norm2 = tf.keras.layers.LayerNormalization(axis=-1, name=kwargs.get("name") + "_ln1") super(KernelEncoder, self).__init__(*args, **kwargs) @@ -977,7 +1038,7 @@ def call(self, args, training=False): X, mask = args msk_input = tf.expand_dims(tf.cast(mask, 
tf.float32), -1) - attn_output = self.attn(query=X, value=X, key=X, training=training, attention_mask=mask)*msk_input + attn_output = self.attn(query=X, value=X, key=X, training=training, attention_mask=mask) * msk_input out1 = self.norm1(X + attn_output) ffn_output = self.ffn(out1) @@ -985,16 +1046,24 @@ def call(self, args, training=False): return out2 + class KernelDecoder(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): from official.nlp.modeling.layers.kernel_attention import KernelAttention + self.key_dim = kwargs.pop("key_dim") num_heads = 2 - self.attn1 = KernelAttention(feature_transform="elu", num_heads=num_heads, key_dim=self.key_dim, name=kwargs.get("name") + "_attention1") - self.attn2 = KernelAttention(feature_transform="elu", num_heads=num_heads, key_dim=self.key_dim, name=kwargs.get("name") + "_attention2") + self.attn1 = KernelAttention( + feature_transform="elu", num_heads=num_heads, key_dim=self.key_dim, name=kwargs.get("name") + "_attention1" + ) + self.attn2 = KernelAttention( + feature_transform="elu", num_heads=num_heads, key_dim=self.key_dim, name=kwargs.get("name") + "_attention2" + ) - self.ffn = point_wise_feed_forward_network(self.key_dim, self.key_dim, kwargs.get("name") + "_ffn", num_layers=1, activation="elu") + self.ffn = point_wise_feed_forward_network( + self.key_dim, self.key_dim, kwargs.get("name") + "_ffn", num_layers=1, activation="elu" + ) self.norm1 = tf.keras.layers.LayerNormalization(axis=-1, name=kwargs.get("name") + "_ln0") self.norm2 = tf.keras.layers.LayerNormalization(axis=-1, name=kwargs.get("name") + "_ln1") @@ -1005,10 +1074,10 @@ def call(self, args, training=False): X, enc_output, mask = args msk_input = tf.expand_dims(tf.cast(mask, tf.float32), -1) - attn1 = self.attn1(query=X, value=X, key=X, training=training, attention_mask=mask)*msk_input + attn1 = self.attn1(query=X, value=X, key=X, training=training, attention_mask=mask) * msk_input out1 = self.norm1(attn1 + X, training=training) - attn2 = self.attn2(query=enc_output, value=enc_output, key=out1, training=training, attention_mask=mask)*msk_input + attn2 = self.attn2(query=enc_output, value=enc_output, key=out1, training=training, attention_mask=mask) * msk_input out2 = self.norm2(attn2 + out1) ffn_output = self.ffn(out2) # (batch_size, target_seq_len, d_model) @@ -1016,6 +1085,7 @@ def call(self, args, training=False): return out3 + class Transformer(tf.keras.layers.Layer): def __init__(self, *args, **kwargs): self.encoders = [] @@ -1036,17 +1106,18 @@ def call(self, inputs, training=False): msk_input = tf.expand_dims(tf.cast(mask, tf.float32), -1) for enc in self.encoders: - X = enc([X, mask], training=training)*msk_input + X = enc([X, mask], training=training) * msk_input X_dec = X for dec in self.decoders: - X_dec = dec([X_dec, X, mask], training=training)*msk_input + X_dec = dec([X_dec, X, mask], training=training) * msk_input return X_dec class PFNetTransformer(tf.keras.Model): - def __init__(self, + def __init__( + self, num_input_classes=8, num_output_classes=3, input_encoding="cms", @@ -1054,7 +1125,7 @@ def __init__(self, output_decoding={}, multi_output=True, event_set_output=False, - ): + ): super(PFNetTransformer, self).__init__() self.multi_output = multi_output @@ -1077,9 +1148,8 @@ def __init__(self, def call(self, inputs, training=False): X = inputs - debugging_data = {} - #mask padded elements + # mask padded elements msk = tf.cast(X[:, :, 0] != 0, tf.float32) msk_input = tf.expand_dims(tf.cast(msk, tf.float32), -1) @@ -1094,4 +1164,6 @@ def call(self, 
inputs, training=False): if self.multi_output: return ret else: - return tf.concat([ret["cls"], ret["charge"], ret["pt"], ret["eta"], ret["sin_phi"], ret["cos_phi"], ret["energy"]], axis=-1) + return tf.concat( + [ret["cls"], ret["charge"], ret["pt"], ret["eta"], ret["sin_phi"], ret["cos_phi"], ret["energy"]], axis=-1 + ) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 75b542a3c..1e5861e30 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -3,99 +3,55 @@ except ModuleNotFoundError: print("hvd not enabled, ignoring") -from .model import PFNetTransformer, PFNetDense - -import tensorflow as tf -import tensorflow_addons as tfa -import pickle -import numpy as np -import os -import io -import os -import yaml -import uuid -import matplotlib -import matplotlib.pyplot as plt -from argparse import Namespace -import time +import glob import json -import random -import math -import platform -import mplhep -from tqdm import tqdm +import os +import pickle from pathlib import Path -import glob - -import tf2onnx -import sklearn -import sklearn.metrics +import awkward import fastjet +import matplotlib.pyplot as plt +import numpy as np +import tensorflow as tf +import tensorflow_addons as tfa +import tf2onnx import vector -import awkward - -from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler +from tensorflow.keras.metrics import Recall from tfmodel.callbacks import CustomTensorBoard -from tfmodel.utils import get_lr_schedule, get_optimizer, make_weight_function, targets_multi_output from tfmodel.datasets.BaseDatasetFactory import unpack_target -import tensorflow_datasets as tfds - - -from tensorflow.keras.metrics import Recall, CategoricalAccuracy -import keras - -def plot_confusion_matrix(cm): - fig = plt.figure(figsize=(5,5)) - plt.imshow(cm, cmap="Blues") - plt.xlabel("Predicted PID") - plt.ylabel("Target PID") - plt.colorbar() - plt.tight_layout() - return fig - -def plot_to_image(figure): - """ - Converts the matplotlib plot specified by 'figure' to a PNG image and - returns it. The supplied figure is closed and inaccessible after this call. - """ - - buf = io.BytesIO() - - # Use plt.savefig to save the plot to a PNG in memory. 
- plt.savefig(buf, format='png') - plt.close(figure) - buf.seek(0) - - image = tf.image.decode_png(buf.getvalue(), channels=4) - image = tf.expand_dims(image, 0) - - return image +from tqdm import tqdm + +from .model import PFNetDense, PFNetTransformer + class ModelOptimizerCheckpoint(tf.keras.callbacks.ModelCheckpoint): def on_epoch_end(self, epoch, logs=None): super(ModelOptimizerCheckpoint, self).on_epoch_end(epoch, logs=logs) - weightfile_path = self.opt_path.format(epoch=epoch+1, **logs) - try: - #PCGrad is derived from the legacy optimizer + weightfile_path = self.opt_path.format(epoch=epoch + 1, **logs) + try: + # PCGrad is derived from the legacy optimizer if self.model.optimizer.__class__.__module__ == "keras.optimizers.optimizer_v1": - #lr = self.model.optimizer.optimizer.optimizer.lr + # lr = self.model.optimizer.optimizer.optimizer.lr weights = self.model.optimizer.optimizer.optimizer.get_weights() else: - #lr = self.model.optimizer.lr + # lr = self.model.optimizer.lr weights = self.model.optimizer.get_weights() with open(weightfile_path, "wb") as fi: - pickle.dump({ - #"lr": lr, - "weights": weights - }, fi + pickle.dump( + { + # "lr": lr, + "weights": weights + }, + fi, ) except Exception as e: print("Could not save optimizer state: {}".format(e)) if os.path.isfile(weightfile_path): os.remove(weightfile_path) + class CustomCallback(tf.keras.callbacks.Callback): def __init__(self, outpath, dataset, config, plot_freq=1, horovod_enabled=False): super(CustomCallback, self).__init__() @@ -108,115 +64,98 @@ def __init__(self, outpath, dataset, config, plot_freq=1, horovod_enabled=False) self.writer = tf.summary.create_file_writer(outpath) def on_epoch_end(self, epoch, logs=None): - if not self.horovod_enabled or hvd.rank()==0: + if not self.horovod_enabled or hvd.rank() == 0: epoch_end(self, epoch, logs) + def epoch_end(self, epoch, logs): - #first epoch is 1, not 0 + # first epoch is 1, not 0 epoch = epoch + 1 - #save the training logs (losses) for this epoch + # save the training logs (losses) for this epoch with open("{}/history_{}.json".format(self.outpath, epoch), "w") as fi: json.dump(logs, fi) - if self.plot_freq<=0: + if self.plot_freq <= 0: return - - if self.plot_freq>=1: - if epoch%self.plot_freq!=0: + + if self.plot_freq >= 1: + if epoch % self.plot_freq != 0: return cp_dir = Path(self.outpath) / "epoch_{}".format(epoch) cp_dir.mkdir(parents=True, exist_ok=True) - #run the model inference on the validation dataset + # run the model inference on the validation dataset eval_model(self.model, self.dataset, self.config, cp_dir) - + yvals = {} - for fi in glob.glob(str(cp_dir/"*.npz")): + for fi in glob.glob(str(cp_dir / "*.npz")): dd = np.load(fi) keys_in_file = list(dd.keys()) for k in keys_in_file: - if k=="X": + if k == "X": continue if not (k in yvals): yvals[k] = [] yvals[k].append(dd[k]) yvals = {k: np.concatenate(v) for k, v in yvals.items()} - gen_px = yvals["gen_pt"]*yvals["gen_cos_phi"] - gen_py = yvals["gen_pt"]*yvals["gen_sin_phi"] - pred_px = yvals["pred_pt"]*yvals["pred_cos_phi"] - pred_py = yvals["pred_pt"]*yvals["pred_sin_phi"] - cand_px = yvals["cand_pt"]*yvals["cand_cos_phi"] - cand_py = yvals["cand_pt"]*yvals["cand_sin_phi"] + gen_px = yvals["gen_pt"] * yvals["gen_cos_phi"] + gen_py = yvals["gen_pt"] * yvals["gen_sin_phi"] + pred_px = yvals["pred_pt"] * yvals["pred_cos_phi"] + pred_py = yvals["pred_pt"] * yvals["pred_sin_phi"] + cand_px = yvals["cand_pt"] * yvals["cand_cos_phi"] + cand_py = yvals["cand_pt"] * yvals["cand_sin_phi"] - gen_met = 
np.sqrt(np.sum(gen_px**2+gen_py**2, axis=1)) - pred_met = np.sqrt(np.sum(pred_px**2+pred_py**2, axis=1)) - cand_met = np.sqrt(np.sum(cand_px**2+cand_py**2, axis=1)) + gen_met = np.sqrt(np.sum(gen_px**2 + gen_py**2, axis=1)) + pred_met = np.sqrt(np.sum(pred_px**2 + pred_py**2, axis=1)) + cand_met = np.sqrt(np.sum(cand_px**2 + cand_py**2, axis=1)) with self.writer.as_default(): - jet_ratio = yvals["jets_pt_gen_to_pred"][:, 1]/yvals["jets_pt_gen_to_pred"][:, 0] + jet_ratio = yvals["jets_pt_gen_to_pred"][:, 1] / yvals["jets_pt_gen_to_pred"][:, 0] plt.figure() - b = np.linspace(0,5,100) - plt.hist(yvals["jets_pt_gen_to_cand"][:, 1]/yvals["jets_pt_gen_to_cand"][:, 0], bins=b, histtype="step", lw=2) - plt.hist(yvals["jets_pt_gen_to_pred"][:, 1]/yvals["jets_pt_gen_to_pred"][:, 0], bins=b, histtype="step", lw=2) - plt.savefig(str(cp_dir/"jet_res.png"), bbox_inches="tight", dpi=100) + b = np.linspace(0, 5, 100) + plt.hist(yvals["jets_pt_gen_to_cand"][:, 1] / yvals["jets_pt_gen_to_cand"][:, 0], bins=b, histtype="step", lw=2) + plt.hist(yvals["jets_pt_gen_to_pred"][:, 1] / yvals["jets_pt_gen_to_pred"][:, 0], bins=b, histtype="step", lw=2) + plt.savefig(str(cp_dir / "jet_res.png"), bbox_inches="tight", dpi=100) plt.clf() plt.figure() - b = np.linspace(0,5,100) - plt.hist(cand_met/gen_met, bins=b, histtype="step", lw=2) - plt.hist(pred_met/gen_met, bins=b, histtype="step", lw=2) - plt.savefig(str(cp_dir/"met_res.png"), bbox_inches="tight", dpi=100) + b = np.linspace(0, 5, 100) + plt.hist(cand_met / gen_met, bins=b, histtype="step", lw=2) + plt.hist(pred_met / gen_met, bins=b, histtype="step", lw=2) + plt.savefig(str(cp_dir / "met_res.png"), bbox_inches="tight", dpi=100) plt.clf() - tf.summary.histogram( - "jet_pt_pred_over_gen", jet_ratio, - step=epoch-1, - buckets=None, - description=None - ) - tf.summary.scalar( - "jet_pt_pred_over_gen_mean", np.mean(jet_ratio), step=epoch-1, description=None - ) - tf.summary.scalar( - "jet_pt_pred_over_gen_std", np.std(jet_ratio), step=epoch-1, description=None - ) - - - tf.summary.histogram( - "met_pred_over_gen", pred_met/gen_met, - step=epoch-1, - buckets=None, - description=None - ) - tf.summary.scalar( - "met_pred_over_gen_mean", np.mean(pred_met/gen_met), step=epoch-1, description=None - ) - tf.summary.scalar( - "met_pred_over_gen_std", np.std(pred_met/gen_met), step=epoch-1, description=None - ) + tf.summary.histogram("jet_pt_pred_over_gen", jet_ratio, step=epoch - 1, buckets=None, description=None) + tf.summary.scalar("jet_pt_pred_over_gen_mean", np.mean(jet_ratio), step=epoch - 1, description=None) + tf.summary.scalar("jet_pt_pred_over_gen_std", np.std(jet_ratio), step=epoch - 1, description=None) + + tf.summary.histogram("met_pred_over_gen", pred_met / gen_met, step=epoch - 1, buckets=None, description=None) + tf.summary.scalar("met_pred_over_gen_mean", np.mean(pred_met / gen_met), step=epoch - 1, description=None) + tf.summary.scalar("met_pred_over_gen_std", np.std(pred_met / gen_met), step=epoch - 1, description=None) def prepare_callbacks( - config, - outdir, - dataset, - comet_experiment=None, - horovod_enabled=False, - ): + config, + outdir, + dataset, + comet_experiment=None, + horovod_enabled=False, +): callbacks = [] terminate_cb = tf.keras.callbacks.TerminateOnNaN() callbacks += [terminate_cb] - if not horovod_enabled or hvd.rank()==0: + if not horovod_enabled or hvd.rank() == 0: callbacks += get_checkpoint_history_callback(outdir, config, dataset, comet_experiment, horovod_enabled) return callbacks + def get_checkpoint_history_callback(outdir, 
config, dataset, comet_experiment, horovod_enabled): callbacks = [] cp_dir = Path(outdir) / "weights" @@ -233,22 +172,23 @@ def get_checkpoint_history_callback(outdir, config, dataset, comet_experiment, h history_path = Path(outdir) / "history" history_path.mkdir(parents=True, exist_ok=True) - history_path = str(history_path) + history_path = str(history_path) cb = CustomCallback( history_path, dataset.take(config["setup"]["num_events_validation"]), config, plot_freq=config["callbacks"]["plot_freq"], - horovod_enabled=horovod_enabled + horovod_enabled=horovod_enabled, ) callbacks += [cb] tb = CustomTensorBoard( log_dir=outdir + "/logs", histogram_freq=config["callbacks"]["tensorboard"]["hist_freq"], - write_graph=False, write_images=False, + write_graph=False, + write_images=False, update_freq="epoch", - #profile_batch=(10,90), + # profile_batch=(10,90), profile_batch=0, dump_history=config["callbacks"]["tensorboard"]["dump_history"], ) @@ -258,7 +198,8 @@ def get_checkpoint_history_callback(outdir, config, dataset, comet_experiment, h return callbacks -def get_rundir(base='experiments'): + +def get_rundir(base="experiments"): if not os.path.exists(base): os.makedirs(base) @@ -266,28 +207,23 @@ def get_rundir(base='experiments'): if len(previous_runs) == 0: run_number = 1 else: - run_number = max([int(s.split('run_')[1]) for s in previous_runs]) + 1 - - logdir = 'run_%02d' % run_number - return '{}/{}'.format(base, logdir) + run_number = max([int(s.split("run_")[1]) for s in previous_runs]) + 1 - -def scale_outputs(X,y,w): - ynew = y-out_m - ynew = ynew/out_s - return X, ynew, w + logdir = "run_%02d" % run_number + return "{}/{}".format(base, logdir) def make_model(config, dtype): - model = config['parameters']['model'] + model = config["parameters"]["model"] - if model == 'transformer': + if model == "transformer": return make_transformer(config, dtype) - elif model == 'gnn_dense': + elif model == "gnn_dense": return make_gnn_dense(config, dtype) raise KeyError("Unknown model type {}".format(model)) + def make_gnn_dense(config, dtype): parameters = [ @@ -302,50 +238,50 @@ def make_gnn_dense(config, dtype): "skip_connection", "output_decoding", "combined_graph_layer", - "debug" + "debug", ] kwargs = {} for par in parameters: - if par in config['parameters'].keys(): - kwargs[par] = config['parameters'][par] + if par in config["parameters"].keys(): + kwargs[par] = config["parameters"][par] model = PFNetDense( multi_output=config["setup"]["multi_output"], num_input_classes=config["dataset"]["num_input_classes"], num_output_classes=config["dataset"]["num_output_classes"], schema=config["dataset"]["schema"], - event_set_output=config["loss"]["event_loss"]!="none", + event_set_output=config["loss"]["event_loss"] != "none", **kwargs ) return model + def make_transformer(config, dtype): - parameters = [ - "input_encoding", - "output_decoding" - ] + parameters = ["input_encoding", "output_decoding"] kwargs = {} for par in parameters: - if par in config['parameters'].keys(): - kwargs[par] = config['parameters'][par] + if par in config["parameters"].keys(): + kwargs[par] = config["parameters"][par] model = PFNetTransformer( multi_output=config["setup"]["multi_output"], num_input_classes=config["dataset"]["num_input_classes"], num_output_classes=config["dataset"]["num_output_classes"], schema=config["dataset"]["schema"], - event_set_output=config["loss"]["event_loss"]!="none", + event_set_output=config["loss"]["event_loss"] != "none", **kwargs ) return model -def deltar(a,b): + +def deltar(a, b): 
return a.deltaR(b) -#Given a model, evaluates it on each batch of the validation dataset -#For each batch, save the inputs, the generator-level target, the candidate-level target, and the prediction + +# Given a model, evaluates it on each batch of the validation dataset +# For each batch, save the inputs, the generator-level target, the candidate-level target, and the prediction def eval_model(model, dataset, config, outdir): ibatch = 0 @@ -371,14 +307,13 @@ def eval_model(model, dataset, config, outdir): jets_const = {} for typ in ["gen", "cand", "pred"]: cls_id = np.argmax(outs["{}_cls".format(typ)], axis=-1) - valid = cls_id!=0 + valid = cls_id != 0 pt = awkward.from_iter([y[m][:, 0] for y, m in zip(outs["{}_pt".format(typ)], valid)]) eta = awkward.from_iter([y[m][:, 0] for y, m in zip(outs["{}_eta".format(typ)], valid)]) phi = np.arctan2(outs["{}_sin_phi".format(typ)], outs["{}_cos_phi".format(typ)]) phi = awkward.from_iter([y[m][:, 0] for y, m in zip(phi, valid)]) e = awkward.from_iter([y[m][:, 0] for y, m in zip(outs["{}_energy".format(typ)], valid)]) - idx_to_elem = awkward.from_iter([np.arange(len(m))[m] for m in valid]) vec = vector.arr({"pt": pt, "eta": eta, "phi": phi, "e": e}) @@ -394,108 +329,113 @@ def eval_model(model, dataset, config, outdir): outs["jets_cand_{}".format(key)] = awkward.to_numpy(awkward.flatten(getattr(jets_coll["cand"], key))) outs["jets_pred_{}".format(key)] = awkward.to_numpy(awkward.flatten(getattr(jets_coll["pred"], key))) - #DeltaR match between genjets and PF/MLPF jets + # DeltaR match between genjets and PF/MLPF jets cart = awkward.cartesian([jets_coll["gen"], jets_coll["pred"]], nested=True) jets_a, jets_b = awkward.unzip(cart) drs = deltar(jets_a, jets_b) - match_gen_to_pred = [awkward.where(d<0.1) for d in drs] + match_gen_to_pred = [awkward.where(d < 0.1) for d in drs] m0 = awkward.from_iter([m[0] for m in match_gen_to_pred]) m1 = awkward.from_iter([m[1] for m in match_gen_to_pred]) j1s = jets_coll["gen"][m0] j2s = jets_coll["pred"][m1] - outs["jets_pt_gen_to_pred"] = np.stack([awkward.to_numpy(awkward.flatten(j1s.pt)), awkward.to_numpy(awkward.flatten(j2s.pt))], axis=-1) + outs["jets_pt_gen_to_pred"] = np.stack( + [awkward.to_numpy(awkward.flatten(j1s.pt)), awkward.to_numpy(awkward.flatten(j2s.pt))], axis=-1 + ) cart = awkward.cartesian([jets_coll["gen"], jets_coll["cand"]], nested=True) jets_a, jets_b = awkward.unzip(cart) drs = deltar(jets_a, jets_b) - match_gen_to_pred = [awkward.where(d<0.1) for d in drs] + match_gen_to_pred = [awkward.where(d < 0.1) for d in drs] m0 = awkward.from_iter([m[0] for m in match_gen_to_pred]) m1 = awkward.from_iter([m[1] for m in match_gen_to_pred]) j1s = jets_coll["gen"][m0] j2s = jets_coll["cand"][m1] - outs["jets_pt_gen_to_cand"] = np.stack([awkward.to_numpy(awkward.flatten(j1s.pt)), awkward.to_numpy(awkward.flatten(j2s.pt))], axis=-1) - - np.savez( - np_outfile, - X=elem["X"], - **outs + outs["jets_pt_gen_to_cand"] = np.stack( + [awkward.to_numpy(awkward.flatten(j1s.pt)), awkward.to_numpy(awkward.flatten(j2s.pt))], axis=-1 ) - + + np.savez(np_outfile, X=elem["X"], **outs) + ibatch += 1 + def freeze_model(model, config, outdir): - bin_size = config["parameters"]["combined_graph_layer"]["bin_size"] num_features = config["dataset"]["num_input_features"] - num_out_classes = config["dataset"]["num_output_classes"] def model_output(ret): - return tf.concat([ret["cls"], ret["charge"], ret["pt"], ret["eta"], ret["sin_phi"], ret["cos_phi"], ret["energy"]], axis=-1) + return tf.concat( + [ret["cls"], ret["charge"], 
ret["pt"], ret["eta"], ret["sin_phi"], ret["cos_phi"], ret["energy"]], axis=-1 + ) + full_model = tf.function(lambda x: model_output(model(x, training=False))) - #we need to use opset 12 for the version of ONNXRuntime in CMSSW - #the warnings "RuntimeError: Opset (12) must be >= 13 for operator 'batch_dot'." do not seem to be critical + # we need to use opset 12 for the version of ONNXRuntime in CMSSW + # the warnings "RuntimeError: Opset (12) must be >= 13 for operator 'batch_dot'." do not seem to be critical model_proto, _ = tf2onnx.convert.from_function( full_model, opset=12, - input_signature=(tf.TensorSpec((None, None, num_features), tf.float32, name="x:0"), ), - output_path=str(Path(outdir) / "model.onnx") + input_signature=(tf.TensorSpec((None, None, num_features), tf.float32, name="x:0"),), + output_path=str(Path(outdir) / "model.onnx"), ) + class FlattenedCategoricalAccuracy(tf.keras.metrics.CategoricalAccuracy): def __init__(self, use_weights=False, **kwargs): super(FlattenedCategoricalAccuracy, self).__init__(**kwargs) self.use_weights = use_weights def update_state(self, y_true, y_pred, sample_weight=None): - #flatten the batch dimension - _y_true = tf.reshape(y_true, (tf.shape(y_true)[0]*tf.shape(y_true)[1], tf.shape(y_true)[2])) - _y_pred = tf.reshape(y_pred, (tf.shape(y_pred)[0]*tf.shape(y_pred)[1], tf.shape(y_pred)[2])) + # flatten the batch dimension + _y_true = tf.reshape(y_true, (tf.shape(y_true)[0] * tf.shape(y_true)[1], tf.shape(y_true)[2])) + _y_pred = tf.reshape(y_pred, (tf.shape(y_pred)[0] * tf.shape(y_pred)[1], tf.shape(y_pred)[2])) sample_weights = None if self.use_weights: - sample_weights = _y_true*tf.reduce_sum(_y_true, axis=0) - sample_weights = 1.0/sample_weights[sample_weights!=0] + sample_weights = _y_true * tf.reduce_sum(_y_true, axis=0) + sample_weights = 1.0 / sample_weights[sample_weights != 0] super(FlattenedCategoricalAccuracy, self).update_state(_y_true, _y_pred, sample_weights) + class SingleClassRecall(Recall): def __init__(self, icls, **kwargs): super(SingleClassRecall, self).__init__(**kwargs) self.icls = icls def update_state(self, y_true, y_pred, sample_weight=None): - #flatten the batch dimension - _y_true = tf.reshape(y_true, (tf.shape(y_true)[0]*tf.shape(y_true)[1], tf.shape(y_true)[2])) - _y_pred = tf.argmax(tf.reshape(y_pred, (tf.shape(y_pred)[0]*tf.shape(y_pred)[1], tf.shape(y_pred)[2])), axis=-1) - super(SingleClassRecall, self).update_state( - _y_true[:, self.icls], - tf.cast(_y_pred==self.icls, tf.float32) - ) + # flatten the batch dimension + _y_true = tf.reshape(y_true, (tf.shape(y_true)[0] * tf.shape(y_true)[1], tf.shape(y_true)[2])) + _y_pred = tf.argmax(tf.reshape(y_pred, (tf.shape(y_pred)[0] * tf.shape(y_pred)[1], tf.shape(y_pred)[2])), axis=-1) + super(SingleClassRecall, self).update_state(_y_true[:, self.icls], tf.cast(_y_pred == self.icls, tf.float32)) + class FlattenedMeanIoU(tf.keras.metrics.MeanIoU): def __init__(self, use_weights=False, **kwargs): super(FlattenedMeanIoU, self).__init__(**kwargs) def update_state(self, y_true, y_pred, sample_weight=None): - #flatten the batch dimension - _y_true = tf.reshape(y_true, (tf.shape(y_true)[0]*tf.shape(y_true)[1], tf.shape(y_true)[2])) - _y_pred = tf.reshape(y_pred, (tf.shape(y_pred)[0]*tf.shape(y_pred)[1], tf.shape(y_pred)[2])) + # flatten the batch dimension + _y_true = tf.reshape(y_true, (tf.shape(y_true)[0] * tf.shape(y_true)[1], tf.shape(y_true)[2])) + _y_pred = tf.reshape(y_pred, (tf.shape(y_pred)[0] * tf.shape(y_pred)[1], tf.shape(y_pred)[2])) super(FlattenedMeanIoU, 
self).update_state(_y_true, _y_pred, None) + class LearningRateLoggingCallback(tf.keras.callbacks.Callback): def on_epoch_end(self, epoch, numpy_logs): try: lr = self.model.optimizer._decayed_lr(tf.float32).numpy() - tf.summary.scalar('learning rate', data=lr, step=epoch) + tf.summary.scalar("learning rate", data=lr, step=epoch) except AttributeError as e: + print(e) pass + def configure_model_weights(model, trainable_layers): print("setting trainable layers: {}".format(trainable_layers)) - if (trainable_layers is None): + if trainable_layers is None: trainable_layers = "all" if trainable_layers == "all": @@ -522,11 +462,15 @@ def configure_model_weights(model, trainable_layers): non_trainable_count = sum([np.prod(tf.keras.backend.get_value(w).shape) for w in model.non_trainable_weights]) print("trainable={} non_trainable={}".format(trainable_count, non_trainable_count)) + def make_focal_loss(config): - def loss(x,y): - return tfa.losses.sigmoid_focal_crossentropy(x,y, + def loss(x, y): + return tfa.losses.sigmoid_focal_crossentropy( + x, + y, alpha=float(config["setup"].get("focal_loss_alpha", 0.25)), gamma=float(config["setup"].get("focal_loss_gamma", 2.0)), - from_logits=bool(config["setup"].get("focal_loss_from_logits", False)) + from_logits=bool(config["setup"].get("focal_loss_from_logits", False)), ) + return loss diff --git a/mlpf/tfmodel/mpnn.py b/mlpf/tfmodel/mpnn.py deleted file mode 100644 index 9d1477bb5..000000000 --- a/mlpf/tfmodel/mpnn.py +++ /dev/null @@ -1,291 +0,0 @@ -import tensorflow as tf - -class EdgeNetwork(tf.keras.Model): - """EdgeNetwork is a choice for message function that allow vector valued edge features. - - M(h_v, h_w, e_{vw}) = A(e_{vw})h_w. where A is a neural network which maps the edge vector e_{vw} to - a d x d matrix. where d is the dimension of node state vector. - - Here we have the simplest nn - relu(linear) - """ - def __init__(self, state_dim, name='edgenetwork'): - super(EdgeNetwork, self).__init__(name=name) - self.state_dim = state_dim - self.nn = tf.keras.layers.Dense(units=state_dim ** 2, activation=tf.nn.relu) - - def call(self, states, edges): - """ - Input - ----- - states: bs x #nodes^2 x state_dim - edges: bs x #nodes^2 x #edge_features - - Output - ------ - messages: bs x #nodes^2 x state_dim - - Map edge vectors to d x d matrices. Reshape both states and edges to do matrix mulltiplication. 
- the matrix mutltiplication is doing dot products between: - and - The output message tensor represents: - [ - [ - - for i in range(n_nodes) - for j in range(n_nodes) - ] for k in range(n_graph) - ] - """ - total_edges = tf.shape(edges)[1] - state_dim = self.state_dim - - Ae_vw = self.nn(edges) # bs x #nodes^2 x state_dim^2 - Ae_vw = tf.reshape(Ae_vw, [-1, state_dim, state_dim]) # bs * #nodes^2 x state_dim x state_dim - states = tf.reshape(states, [-1, state_dim, 1]) # bs * #nodes^2 x state_dim x 1 - messages = tf.matmul(Ae_vw, states) # bs * #nodes^2 x state_dim x 1 - messages = tf.reshape(messages, [-1, total_edges, state_dim]) # bs x #nodes^2 x state_dim - return messages - - -class Aggregation(tf.keras.Model): - def __init__(self, method='sum', axis=2, name='aggregation'): - assert method in ['sum', 'mean'], 'Unsupported aggregation method' - super(Aggregation, self).__init__(name=name) - self.method = method - self.axis = axis - - def call(self, x, keepdims= False, *args , **kwargs): #KYR: added keepdims - if self.method == 'sum': - return tf.reduce_sum(x, self.axis, keepdims) - else: - return tf.reduce_mean(messages, self.axis) - - -class UpdateFunction(tf.keras.Model): - """Node states update function via GRU. - - U_t = GRU(h_v^t, m_v^{t+1}) - - The same update function is used at each time step t. - """ - def __init__(self, state_dim, name='message_update_function'): - super(UpdateFunction, self).__init__(name=name) - self.state_dim = state_dim - self.concat = tf.keras.layers.Concatenate(axis=1) - self.GRU = tf.keras.layers.GRU(units=state_dim) - - def call(self, states, messages): - """ - Input - ----- - states: bs x #nodes x state_dim - messages: bs x #nodes x state_dim - """ - num_nodes = tf.shape(states)[1] - state_dim = self.state_dim - states = tf.reshape(states, [-1, 1, state_dim]) - messages = tf.reshape(messages, [-1, 1, state_dim]) - concat = self.concat([states, messages]) - updated_messages = self.GRU(concat) - updated_messages = tf.reshape(updated_messages, [-1, num_nodes, state_dim]) - return updated_messages - - -class MessagePassing(tf.keras.Model): - """ - > The message passing phrase runs for T time steps and is defined in terms of - 1. message function M_t - 2. vertex update function U_t - during the message passing phase, hidden states h_v^t at each node in the graph are updated based on - messages m_v^{t+1} according to: - 1. m_v^{t+1} = \Sigma_{w \in N(v)}{M_t(h_v^t, h_w^t, e_{vw})} - 2. h_v^{t+1} = U_t(h_v^t, m_v^{t+1}) - - To generalize a bit, we can use other aggregation function instead of summation. 
- """ - def __init__(self, state_dim, name='message_passing'): - super(MessagePassing, self).__init__(self, name=name) - self.state_dim = state_dim - self.message_function = EdgeNetwork(state_dim=state_dim, name=name + '/message_func') - self.message_aggregation = Aggregation(name=name + '/message_agg') - self.update_function = UpdateFunction(state_dim=state_dim, name=name + '/state_update') - - def call(self, states, edges, masks, training=False): - """ - Input - ----- - nodes: bs x #nodes x state_dim - edges: bs x #nodes^2 x #edge_features - masks: bs x #nodes^2 x 1 binary matrix indicating whether edge exist or not - - """ - num_nodes = tf.shape(states)[1] - state_dim = tf.shape(states)[2] - masks = tf.reshape(masks, [-1, num_nodes ** 2, 1]) - states_j = tf.tile(states, [1, num_nodes, 1]) - messages = self.message_function(states_j, edges) - masked_messages = tf.multiply(messages, masks) - # reshape to batch, from_node, to_node, message - masked_messages = tf.reshape(masked_messages, [-1, num_nodes, num_nodes, state_dim]) - aggregated_messages = self.message_aggregation(masked_messages) - updated_messages = self.update_function(states, aggregated_messages) - return updated_messages - - -class ReadoutEdge(tf.keras.Model): - def __init__(self, hidden_sizes, num_outputs, name='readout_edges'): - super(ReadoutEdge, self).__init__(name=name) - self.concat = tf.keras.layers.Concatenate() - self.hidden_layers = tf.keras.Sequential([ - tf.keras.layers.Dense(units=hidden_size, activation='relu', name=name + '/hidden_{}'.format(i)) - for i, hidden_size in enumerate(hidden_sizes)]) - self.last_linear = tf.keras.layers.Dense(units=num_outputs, name=name + '/last_linear') - - - def call(self, states, edges, training=False): - num_nodes = tf.shape(states)[1] - state_dim = tf.shape(states)[2] - states_i = tf.reshape(tf.tile(states, [1, 1, num_nodes]), [-1, num_nodes ** 2, state_dim]) # - states_j = tf.tile(states, [1, num_nodes, 1]) # - concat = self.concat([states_i, edges, states_j]) - features = self.hidden_layers(concat) - output = self.last_linear(features) - return output - - -class ReadoutNodes(tf.keras.Model): - def __init__(self, hidden_sizes, num_outputs, name='readout_nodes'): - super(ReadoutNodes, self).__init__(name=name) - self.hidden_layers = tf.keras.Sequential([ - tf.keras.layers.Dense(units=hidden_size, activation='relu', name=name + '/hidden_{}'.format(i)) - for i, hidden_size in enumerate(hidden_sizes)]) - self.last_linear = tf.keras.layers.Dense(units=num_outputs, name=name + '/last_linear') - - def call(self, states, training=False): - features = self.hidden_layers(states) - output = self.last_linear(features) - return output - - -class ReadoutGraph(tf.keras.Model): - def __init__(self, hidden_sizes, num_outputs, agg_function, name='readout_graph'): - super(ReadoutGraph, self).__init__(name=name) - self.agg_function = agg_function - self.hidden_layers = tf.keras.Sequential([ - tf.keras.layers.Dense(units=hidden_size, activation='relu', name=name + '/hidden_{}'.format(i)) - for i, hidden_size in enumerate(hidden_sizes)]) - self.last_linear = tf.keras.layers.Dense(units=num_outputs, name=name + '/last_linear') - - - def call(self, states, masks, training=False): - num_nodes = tf.shape(states)[1] - masks = tf.reshape(masks, [-1, num_nodes, 1]) - masked_states = tf.multiply(states, masks) - graph_states = self.agg_function(masked_states, keepdims = True) #KEPT DIMENSIONS - features = self.hidden_layers(graph_states) - output = self.last_linear(features) - return output - - -class 
MPNN(tf.keras.Model): - """Implementation of Message Passing Neural Network. - - reference: https://arxiv.org/abs/1704.01212i - """ - def __init__(self, hidden_sizes, num_outputs, state_dim, update_steps, name='mpnn'): - super(MPNN, self).__init__(name=name) - self.update_steps = int(update_steps) - self.node_embedding = tf.keras.layers.Dense(units=state_dim, activation='relu') - self.message_passing = MessagePassing(state_dim=state_dim) - self.readout_func = ReadoutGraph(hidden_sizes, num_outputs, Aggregation('sum', 1)) - - def call(self, nodes, edges, node_masks=None, edge_masks=None, training=False): - states = self.node_embedding(nodes) - for time_step in range(self.update_steps): - states = self.message_passing(states, edges, edge_masks, training=training) - readout = self.readout_func(states, node_masks, training=training) - return readout - - -def _test_edgenetwork(): - """testcase for edgenetwork forward pass - a batch of 32 graphs, each with 3 nodes, include self pointing eage - 9 edges per graph, each node has 5 features. - """ - edges = tf.random.uniform((32, 9, 5)) - states = tf.tile(tf.random.uniform((32, 3, 3)), [1, 3, 1]) - m = EdgeNetwork(state_dim=3) - o = m(states, edges) - assert o.shape == (32, 9, 3) - - -def _test_message_update(): - states = tf.random.uniform((32, 9, 3)) - messages = tf.random.uniform((32, 9, 3)) - m = UpdateFunction(3) - o = m(states, messages) - assert o.shape == (32, 9, 3) - - -def _test_message_passing(): - """test case for message passing. - - a batch of 2 graphs, each has 2 nodes, each node has a state vector of size 2, each edge has 3 features. - """ - states = tf.convert_to_tensor([[[1, 2], [2, 1]], [[3, 4], [4, 3]]], dtype='float') - edges = tf.convert_to_tensor([ - [[0, 0, 0], [1, 2, 3], [3, 2, 1], [0, 0, 0]], [[0, 0, 0], [3, 4, 2], [2, 4, 3], [0, 0, 0]] - ], dtype='float') - masks = tf.expand_dims(tf.convert_to_tensor([[[0], [1], [1], [0]], [[0], [1], [1], [0]]], dtype='float'), axis=-1) - m = MessagePassing(2) - o = m(states, edges, masks) - assert o.shape == (2, 2, 2) - - -def _test_edge_readout(): - states = tf.random.uniform((32, 3, 3)) - edges = tf.random.uniform((32, 9, 2)) - m = ReadoutEdge([3, 2], 1) - o = m(states, edges) - assert o.shape == (32, 9, 1) - - -def _test_node_readout(): - states = tf.random.uniform((32, 3, 3)) - m = ReadoutNodes([3, 2], 1) - o = m(states) - assert o.shape == (32, 3, 1) - - -def _test_graph_readout(): - states = tf.random.uniform((32, 3, 3)) - masks = tf.expand_dims( - tf.convert_to_tensor([[1, 1, 0]] * 8 + [[1, 0, 1]] * 8 + [[0, 1, 1]] * 16, dtype='float'), - axis=-1) - agg_func = Aggregation(method='sum', axis=1) - m = ReadoutGraph([3, 3], 1, agg_func) - o = m(states, masks) - assert o.shape == (32, 1) - - -def _test_mpnn(): - nodes = tf.random.uniform((32, 3, 3)) - edges = tf.random.uniform((32, 3 * 3, 2)) - node_masks = tf.expand_dims( - tf.convert_to_tensor([[1, 1, 0]] * 8 + [[1, 0, 1]] * 10 + [[0, 1, 1]] * 14, dtype='float'), - axis=-1) - edge_masks = tf.expand_dims( - tf.convert_to_tensor([[0, 1, 0, 1, 0, 1, 0, 1, 0]] * 16 + [[0, 1, 1, 1, 0, 0, 1, 0, 0]] * 16, dtype='float'), - axis=-1) - m = MPNN([5, 5,], 1, 8, 3) - o = m(nodes, edges, node_masks=node_masks, edge_masks=edge_masks) - print("-----THIS WORKS ------") - assert o.shape == (32, 1) - -if __name__ == '__main__': - _test_edgenetwork() - _test_message_update() - _test_message_passing() - _test_edge_readout() - _test_node_readout() - _test_graph_readout() - _test_mpnn() diff --git a/mlpf/tfmodel/opt.py b/mlpf/tfmodel/opt.py deleted file 
mode 100644 index 26ca0e86e..000000000 --- a/mlpf/tfmodel/opt.py +++ /dev/null @@ -1,91 +0,0 @@ -from tensorboard.plugins.hparams import api as hp -import tensorflow as tf -from tf_model import load_dataset_ttbar, my_loss_full, num_max_elems, weight_schemes, PFNet -from tf_model import cls_130, cls_211, cls_22, energy_resolution, eta_resolution, phi_resolution -from argparse import Namespace -import kerastuner as kt - -args = Namespace() -args.datapath = "./data/TTbar_14TeV_TuneCUETP8M1_cfi" -args.ntrain = 10000 -args.ntest = 1000 -args.weights = "inverse" -args.convlayer = "ghconv" -args.batch_size = 1 -args.nepochs = 20 -args.target = "cand" -args.lr = 0.0001 -args.outdir = "testout" - -def model_builder(hp): - args.hidden_dim_id = hp.Choice('hidden_dim_id', values = [16, 32, 64, 128, 256]) - args.hidden_dim_reg = hp.Choice('hidden_dim_reg', values = [16, 32, 64, 128, 256]) - args.num_hidden_id_enc = hp.Choice('hidden_dim_id_enc', values = [0, 1, 2, 3]) - args.num_hidden_id_dec = hp.Choice('hidden_dim_id_dec', values = [0, 1, 2, 3]) - args.num_hidden_reg_enc = hp.Choice('hidden_dim_reg_enc', values = [0, 1, 2, 3]) - args.num_hidden_reg_dec = hp.Choice('hidden_dim_reg_dec', values = [0, 1, 2, 3]) - args.num_convs_id = hp.Choice('num_convs_id', values = [1, 2, 3, 4]) - args.num_convs_reg = hp.Choice('num_convs_reg', values = [1, 2, 3, 4]) - args.distance_dim = hp.Choice('distance_dim', values = [16, 32, 64, 128, 256]) - args.num_neighbors = hp.Choice('num_neighbors', [2, 3, 4, 5, 6, 7, 8, 9, 10]) - args.dropout = hp.Choice('dropout', values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) - args.bin_size = hp.Choice('bin_size', values = [100, 200, 500, 1000]) - args.dist_mult = hp.Choice('dist_mult', values = [0.1, 1.0, 10.0]) - args.cosine_dist = hp.Choice('cosine_dist', values = [True, False]) - - model = PFNet( - num_hidden_id_enc=args.num_hidden_id_enc, - num_hidden_id_dec=args.num_hidden_id_dec, - hidden_dim_id=args.hidden_dim_id, - num_hidden_reg_enc=args.num_hidden_reg_enc, - num_hidden_reg_dec=args.num_hidden_reg_dec, - hidden_dim_reg=args.hidden_dim_reg, - num_convs_id=args.num_convs_id, - num_convs_reg=args.num_convs_reg, - distance_dim=args.distance_dim, - convlayer=args.convlayer, - dropout=args.dropout, - bin_size=args.bin_size, - num_neighbors=args.num_neighbors, - dist_mult=args.dist_mult, - cosine_dist=args.cosine_dist - ) - loss_fn = my_loss_full - opt = tf.keras.optimizers.Adam(learning_rate=args.lr) - print(args) - - model.compile(optimizer=opt, loss=loss_fn, - metrics=[cls_130, cls_211, cls_22, energy_resolution, eta_resolution, phi_resolution], - sample_weight_mode="temporal") - return model - -if __name__ == "__main__": - global_batch_size = args.batch_size - dataset = load_dataset_ttbar(args.datapath, args.target) - - ps = (tf.TensorShape([num_max_elems, 15]), tf.TensorShape([num_max_elems, 5]), tf.TensorShape([num_max_elems, ])) - ds_train = dataset.take(args.ntrain).map(weight_schemes[args.weights]).padded_batch(global_batch_size, padded_shapes=ps) - ds_test = dataset.skip(args.ntrain).take(args.ntest).map(weight_schemes[args.weights]).padded_batch(global_batch_size, padded_shapes=ps) - - ds_train_r = ds_train.repeat() - ds_test_r = ds_test.repeat() - - tuner = kt.Hyperband( - model_builder, - objective = 'val_loss', - max_epochs = args.nepochs, - factor = 3, - hyperband_iterations = 3, - directory = '/scratch/joosep/kerastuner_out', - project_name = 'mlpf') - - #tuner.search( - # ds_train_r, - # validation_data=ds_test_r, - # steps_per_epoch=args.ntrain/args.batch_size, - # 
validation_steps=args.ntest/args.batch_size, - # #callbacks=[tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss')] - #) - tuner.results_summary() - for trial in tuner.oracle.get_best_trials(num_trials=10): - print(trial.hyperparameters.values, trial.score) diff --git a/mlpf/tfmodel/pred_tf_model.py b/mlpf/tfmodel/pred_tf_model.py deleted file mode 100644 index 7f2a0bd5f..000000000 --- a/mlpf/tfmodel/pred_tf_model.py +++ /dev/null @@ -1,156 +0,0 @@ -import os -import time -import glob -import numpy as np -import json - -from tf_model import parse_args - -def get_X(X,y,w): - return X - -def parse_args(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str, default="PFNet", help="type of model to train", choices=["PFNet"]) - parser.add_argument("--weights", type=str, default=None, help="model weights to load") - parser.add_argument("--hidden-dim-id", type=int, default=256, help="hidden dimension") - parser.add_argument("--hidden-dim-reg", type=int, default=256, help="hidden dimension") - parser.add_argument("--batch-size", type=int, default=1, help="number of events in training batch") - parser.add_argument("--num-convs-id", type=int, default=1, help="number of convolution layers") - parser.add_argument("--num-convs-reg", type=int, default=1, help="number of convolution layers") - parser.add_argument("--num-hidden-id-enc", type=int, default=2, help="number of encoder layers for multiclass") - parser.add_argument("--num-hidden-id-dec", type=int, default=2, help="number of decoder layers for multiclass") - parser.add_argument("--num-hidden-reg-enc", type=int, default=2, help="number of encoder layers for regression") - parser.add_argument("--num-hidden-reg-dec", type=int, default=2, help="number of decoder layers for regression") - parser.add_argument("--num-neighbors", type=int, default=5, help="number of knn neighbors") - parser.add_argument("--distance-dim", type=int, default=256, help="distance dimension") - parser.add_argument("--bin-size", type=int, default=100, help="number of points per LSH bin") - parser.add_argument("--dist-mult", type=float, default=1.0, help="Exponential multiplier") - parser.add_argument("--num-conv", type=int, default=1, help="number of convolution layers (powers)") - parser.add_argument("--attention-layer-cutoff", type=float, default=0.2, help="Sparsify attention matrix by masking values below this threshold") - parser.add_argument("--nthreads", type=int, default=-1, help="number of threads to use") - parser.add_argument("--ntrain", type=int, default=80, help="number of training events") - parser.add_argument("--ntest", type=int, default=20, help="number of testing events") - parser.add_argument("--gpu", action="store_true", help="use GPU") - parser.add_argument("--synthetic-timing", action="store_true", help="run a synthetic timing check, which is time consuming") - parser.add_argument("--convlayer", type=str, default="sgconv", choices=["sgconv", "ghconv"], help="Type of graph convolutional layer") - parser.add_argument("--datapath", type=str, help="Input data path", required=True) - parser.add_argument("--target", type=str, choices=["cand", "gen"], help="Regress to PFCandidates or GenParticles", default="gen") - args = parser.parse_args() - return args - -if __name__ == "__main__": - args = parse_args() - - if args.gpu: - import setGPU - else: - os.environ["CUDA_VISIBLE_DEVICES"] = "" - - import tensorflow as tf - - physical_devices = tf.config.list_physical_devices('GPU') - if len(physical_devices) > 0: 
- tf.config.experimental.set_memory_growth(physical_devices[0], True) - tf.config.experimental_run_functions_eagerly(False) - - from tf_model import num_max_elems - - tf.gfile = tf.io.gfile - from tf_model import PFNet, prepare_df - from tf_data import _parse_tfr_element - tfr_files = glob.glob("{}/tfr/{}/*.tfrecords".format(args.datapath, args.target)) - assert(len(tfr_files)>0) - tf.config.optimizer.set_jit(False) - - if args.nthreads > 0: - tf.config.threading.set_inter_op_parallelism_threads(args.nthreads) - tf.config.threading.set_intra_op_parallelism_threads(args.nthreads) - if not args.gpu: - tf.config.set_visible_devices([], 'GPU') - - nev = args.ntest - ps = (tf.TensorShape([num_max_elems, 15]), tf.TensorShape([num_max_elems, 5]), tf.TensorShape([num_max_elems, ])) - dataset = tf.data.TFRecordDataset(tfr_files).map( - _parse_tfr_element, num_parallel_calls=tf.data.experimental.AUTOTUNE).skip(args.ntrain).take(nev).padded_batch(args.batch_size, padded_shapes=ps) - dataset_X = dataset.map(get_X) - - base_model = PFNet( - hidden_dim_id=args.hidden_dim_id, - hidden_dim_reg=args.hidden_dim_reg, - num_convs_id=args.num_convs_id, - num_convs_reg=args.num_convs_reg, - num_hidden_id_enc=args.num_hidden_id_enc, - num_hidden_id_dec=args.num_hidden_id_dec, - num_hidden_reg_enc=args.num_hidden_reg_enc, - num_hidden_reg_dec=args.num_hidden_reg_dec, - distance_dim=args.distance_dim, - convlayer=args.convlayer, - dropout=0.0, - bin_size=args.bin_size, - num_neighbors=args.num_neighbors, - dist_mult=args.dist_mult - ) - model = base_model.create_model(num_max_elems, training=False) - - #load the weights - model.load_weights(args.weights) - model_dir = os.path.dirname(args.weights) - - #prepare the dataframe - prepare_df(model, dataset, model_dir, args.target, save_raw=False) - - print("now timing") - neval = 0 - t0 = time.time() - for X in dataset_X: - ret = model(X) - print(".", end="") - neval += 1 - print() - t1 = time.time() - time_per_dsrow = (t1-t0)/neval - time_per_event = time_per_dsrow/args.batch_size - print("prediction time per event: {:.2f} ms".format(1000.0*time_per_event)) - - if args.synthetic_timing: - synthetic_timing_data = [] - for iteration in range(3): - numev = 500 - for evsize in [1000, 5000, 10000, 20000]: - for batch_size in [1,2,4,10,20] if args.gpu else [1, ]: - t0 = time.time() - for i in range(numev//batch_size): - x = np.random.randn(batch_size, evsize, 15) - model(x) - t1 = time.time() - dt = t1 - t0 - time_per_event = 1000.0*(dt / numev) - synthetic_timing_data.append( - [{"iteration": iteration, "batch_size": batch_size, "event_size": evsize, "time_per_event": time_per_event}]) - print("Synthetic random data: batch_size={} event_size={}, time={:.2f} ms/ev".format(batch_size, evsize, time_per_event)) - - with open("{}/synthetic_timing_gpu{}.json".format(model_dir, int(args.gpu)), "w") as fi: - json.dump(synthetic_timing_data, fi) - - #https://leimao.github.io/blog/Save-Load-Inference-From-TF2-Frozen-Graph/ - # Get frozen ConcreteFunction - full_model = tf.function(lambda x: base_model(x, training=False)) - full_model = full_model.get_concrete_function( - tf.TensorSpec((None, None, 15), tf.float32)) - from tensorflow.python.framework import convert_to_constants - frozen_func = convert_to_constants.convert_variables_to_constants_v2(full_model) - frozen_func.graph.as_graph_def() - print(full_model.graph.inputs) - print(full_model.graph.outputs) - - tf.io.write_graph(graph_or_graph_def=frozen_func.graph, - logdir="{}/model_frozen".format(model_dir), - 
name="frozen_graph.pb", - as_text=False) - tf.io.write_graph(graph_or_graph_def=frozen_func.graph, - logdir="{}/model_frozen".format(model_dir), - name="frozen_graph.pbtxt", - as_text=True) - #model.save('model', overwrite=True, include_optimizer=False) diff --git a/mlpf/tfmodel/tf_data.py b/mlpf/tfmodel/tf_data.py deleted file mode 100644 index a1d0d3939..000000000 --- a/mlpf/tfmodel/tf_data.py +++ /dev/null @@ -1,128 +0,0 @@ -import numpy as np -import glob -import multiprocessing -import os - -import tensorflow as tf -from tf_model import load_one_file - -def parse_args(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--target", type=str, choices=["cand", "gen"], help="Regress to PFCandidates or GenParticles", default="cand") - parser.add_argument("--datapath", type=str, required=True, help="Input data path") - parser.add_argument("--num-files-per-tfr", type=int, default=100, help="Number of pickle files to merge to one TFRecord file") - args = parser.parse_args() - return args - -def chunks(lst, n): - """Yield successive n-sized chunks from lst.""" - for i in range(0, len(lst), n): - yield lst[i:i + n] - -#https://stackoverflow.com/questions/47861084/how-to-store-numpy-arrays-as-tfrecord -def _bytes_feature(value): - """Returns a bytes_list from a string / byte.""" - if isinstance(value, type(tf.constant(0))): # if value ist tensor - value = value.numpy() # get value of tensor - return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) - -def _parse_tfr_element(element): - parse_dic = { - 'X': tf.io.FixedLenFeature([], tf.string), - 'y': tf.io.FixedLenFeature([], tf.string), - 'w': tf.io.FixedLenFeature([], tf.string), - } - example_message = tf.io.parse_single_example(element, parse_dic) - - X = example_message['X'] - arr_X = tf.io.parse_tensor(X, out_type=tf.float32) - y = example_message['y'] - arr_y = tf.io.parse_tensor(y, out_type=tf.float32) - w = example_message['w'] - arr_w = tf.io.parse_tensor(w, out_type=tf.float32) - - #https://github.com/tensorflow/tensorflow/issues/24520#issuecomment-577325475 - arr_X.set_shape(tf.TensorShape((None, 15))) - arr_y.set_shape(tf.TensorShape((None, 5))) - arr_w.set_shape(tf.TensorShape((None, ))) - #inds = tf.stack([arr_dm_row, arr_dm_col], axis=-1) - #dm_sparse = tf.SparseTensor(values=arr_dm_data, indices=inds, dense_shape=[tf.shape(arr_X)[0], tf.shape(arr_X)[0]]) - - return arr_X, arr_y, arr_w - -def serialize_X_y_w(writer, X, y, w): - feature = { - 'X': _bytes_feature(tf.io.serialize_tensor(X)), - 'y': _bytes_feature(tf.io.serialize_tensor(y)), - 'w': _bytes_feature(tf.io.serialize_tensor(w)), - } - sample = tf.train.Example(features=tf.train.Features(feature=feature)) - writer.write(sample.SerializeToString()) - -def serialize_chunk(args): - path, files, ichunk, target = args - out_filename = os.path.join(path, "chunk_{}.tfrecords".format(ichunk)) - writer = tf.io.TFRecordWriter(out_filename) - Xs = [] - ys = [] - ws = [] - dms = [] - - for fi in files: - X, y, ycand = load_one_file(fi) - - Xs += X - if target == "cand": - ys += ycand - elif target == "gen": - ys += y - else: - raise Exception("Unknown target") - - #set weights for each sample to be equal to the number of samples of this type - #in the training script, this can be used to compute either inverse or class-balanced weights - uniq_vals, uniq_counts = np.unique(np.concatenate([y[:, 0] for y in ys]), return_counts=True) - for i in range(len(ys)): - w = np.ones(len(ys[i]), dtype=np.float32) - for uv, uc in zip(uniq_vals, 
uniq_counts): - w[ys[i][:, 0]==uv] = uc - ws += [w] - - for X, y, w in zip(Xs, ys, ws): - serialize_X_y_w(writer, X, y, w) - - writer.close() - -if __name__ == "__main__": - args = parse_args() - tf.config.experimental_run_functions_eagerly(True) - - datapath = args.datapath - - filelist = sorted(glob.glob("{}/raw/*.pkl".format(datapath))) - print("found {} files".format(len(filelist))) - #means, stds = extract_means_stds(filelist) - outpath = "{}/tfr/{}".format(datapath, args.target) - - if not os.path.isdir(outpath): - os.makedirs(outpath) - - pars = [] - for ichunk, files in enumerate(chunks(filelist, args.num_files_per_tfr)): - pars += [(outpath, files, ichunk, args.target)] - #serialize_chunk(pars[0]) - pool = multiprocessing.Pool(20) - pool.map(serialize_chunk, pars) - - #Load and test the dataset - tfr_dataset = tf.data.TFRecordDataset(glob.glob(outpath + "/*.tfrecords")) - dataset = tfr_dataset.map(_parse_tfr_element) - num_ev = 0 - num_particles = 0 - for X, y, w in dataset: - num_ev += 1 - num_particles += len(X) - - print("Created TFRecords dataset in {} with {} events, {} particles".format( - datapath, num_ev, num_particles)) diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index b568e3691..ca401f47c 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -1,21 +1,16 @@ -import os -import yaml -from pathlib import Path import datetime +import logging +import os import platform -import random -import glob -import numpy as np -from tqdm import tqdm import re -import logging +from pathlib import Path +import numpy as np import tensorflow as tf import tensorflow_addons as tfa - -from tfmodel.data import Dataset -from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler +import yaml from tfmodel.datasets import CMSDatasetFactory, DelphesDatasetFactory +from tfmodel.onecycle_scheduler import MomentumOneCycleScheduler, OneCycleScheduler @tf.function @@ -32,9 +27,13 @@ def histogram_2d(eta, phi, weights_px, weights_py, eta_range, phi_range, nbins, hist_pt = tf.sqrt(hist_px**2 + hist_py**2) return hist_pt + @tf.function def batched_histogram_2d(eta, phi, w_px, w_py, x_range, y_range, nbins, bin_dtype=tf.float32): - return tf.vectorized_map(lambda a: histogram_2d(a[0], a[1], a[2], a[3], x_range, y_range, nbins, bin_dtype), (eta, phi, w_px, w_py)) + return tf.vectorized_map( + lambda a: histogram_2d(a[0], a[1], a[2], a[3], x_range, y_range, nbins, bin_dtype), (eta, phi, w_px, w_py) + ) + def load_config(config_file_path): with open(config_file_path, "r") as ymlfile: @@ -47,8 +46,7 @@ def parse_config(config, ntrain=None, ntest=None, nepochs=None, weights=None): config = load_config(config) tf.config.run_functions_eagerly(config["tensorflow"]["eager"]) - n_epochs = config["setup"]["num_epochs"] - + if ntrain: config["setup"]["num_events_train"] = ntrain @@ -110,8 +108,10 @@ def delete_all_but_best_checkpoint(train_dir, dry_run): def get_strategy(): if isinstance(os.environ.get("CUDA_VISIBLE_DEVICES"), type(None)) or len(os.environ.get("CUDA_VISIBLE_DEVICES")) == 0: gpus = [-1] - print("WARNING: CUDA_VISIBLE_DEVICES variable is empty. \ - If you don't have or intend to use GPUs, this message can be ignored.") + print( + "WARNING: CUDA_VISIBLE_DEVICES variable is empty. \ + If you don't have or intend to use GPUs, this message can be ignored." 
+ ) else: gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "-1").split(",")] if gpus[0] == -1: @@ -174,7 +174,7 @@ def get_lr_schedule(config, steps): else: lr_schedule = None callbacks = [] - return lr_schedule, callbacks,lr + return lr_schedule, callbacks, lr def get_optimizer(config, lr_schedule=None): @@ -188,6 +188,7 @@ def get_optimizer(config, lr_schedule=None): opt = tf.keras.optimizers.Adam(learning_rate=lr, amsgrad=cfg_adam["amsgrad"]) if cfg_adam["pcgrad"]: from tfmodel.PCGrad_tf import PCGrad + opt = PCGrad(opt) return opt if config["setup"]["optimizer"] == "adamw": @@ -197,11 +198,14 @@ def get_optimizer(config, lr_schedule=None): cfg_sgd = config["optimizer"]["sgd"] return tf.keras.optimizers.SGD(learning_rate=lr, momentum=cfg_sgd["momentum"], nesterov=cfg_sgd["nesterov"]) else: - raise ValueError("Only 'adam', 'adamw' and 'sgd' are supported optimizers, got {}".format(config["setup"]["optimizer"])) + raise ValueError( + "Only 'adam', 'adamw' and 'sgd' are supported optimizers, got {}".format(config["setup"]["optimizer"]) + ) def get_tuner(cfg_hypertune, model_builder, outdir, recreate, strategy): import keras_tuner as kt + if cfg_hypertune["algorithm"] == "random": print("Keras Tuner: Using RandomSearch") cfg_rand = cfg_hypertune["random"] @@ -254,145 +258,63 @@ def compute_weights_none(X, y, w): def make_weight_function(config): - def weight_func(X,y,w): + def weight_func(X, y, w): - w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0) - w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32) + w_signal_only = tf.where(y[:, 0] == 0, 0.0, 1.0) + w_signal_only *= tf.cast(X[:, 0] != 0, tf.float32) w_none = tf.ones_like(w) - w_none *= tf.cast(X[:, 0]!=0, tf.float32) + w_none *= tf.cast(X[:, 0] != 0, tf.float32) - w_invsqrt = tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w) - w_invsqrt *= tf.cast(X[:, 0]!=0, tf.float32) + w_invsqrt = tf.cast(tf.shape(w)[-1], tf.float32) / tf.sqrt(w) + w_invsqrt *= tf.cast(X[:, 0] != 0, tf.float32) - w_signal_only_invsqrt = tf.where(y[:, 0]==0, 0.0, tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)) - w_signal_only_invsqrt *= tf.cast(X[:, 0]!=0, tf.float32) + w_signal_only_invsqrt = tf.where(y[:, 0] == 0, 0.0, tf.cast(tf.shape(w)[-1], tf.float32) / tf.sqrt(w)) + w_signal_only_invsqrt *= tf.cast(X[:, 0] != 0, tf.float32) weight_d = { "none": w_none, "signal_only": w_signal_only, "signal_only_inverse_sqrt": w_signal_only_invsqrt, - "inverse_sqrt": w_invsqrt + "inverse_sqrt": w_invsqrt, } ret_w = {} for loss_component, weight_type in config["sample_weights"].items(): ret_w[loss_component] = weight_d[weight_type] - return X,y,ret_w + return X, y, ret_w + return weight_func def targets_multi_output(num_output_classes): def func(X, y, w): - msk = tf.expand_dims(tf.cast(y[:, :, 0]!=0, tf.float32), axis=-1) + msk = tf.expand_dims(tf.cast(y[:, :, 0] != 0, tf.float32), axis=-1) return ( X, { "cls": tf.one_hot(tf.cast(y[:, :, 0], tf.int32), num_output_classes), - "charge": y[:, :, 1:2]*msk, - "pt": y[:, :, 2:3]*msk, - "eta": y[:, :, 3:4]*msk, - "sin_phi": y[:, :, 4:5]*msk, - "cos_phi": y[:, :, 5:6]*msk, - "energy": y[:, :, 6:7]*msk, + "charge": y[:, :, 1:2] * msk, + "pt": y[:, :, 2:3] * msk, + "eta": y[:, :, 3:4] * msk, + "sin_phi": y[:, :, 4:5] * msk, + "cos_phi": y[:, :, 5:6] * msk, + "energy": y[:, :, 6:7] * msk, }, w, ) return func -def get_dataset_def(config): - cds = config["dataset"] - - return Dataset( - num_input_features=int(cds["num_input_features"]), - num_output_features=int(cds["num_output_features"]), - 
padded_num_elem_size=int(cds["padded_num_elem_size"]), - schema=cds["schema"], - ) - - -def get_train_val_datasets(config, global_batch_size, n_train, n_test, repeat=True): - dataset_def = get_dataset_def(config) - - tfr_files = sorted(glob.glob(dataset_def.processed_path)) - if len(tfr_files) == 0: - raise Exception("Could not find any files in {}".format(dataset_def.processed_path)) - - random.shuffle(tfr_files) - dataset = tf.data.TFRecordDataset(tfr_files).map( - dataset_def.parse_tfr_element, num_parallel_calls=tf.data.experimental.AUTOTUNE - ) - - # Due to TFRecords format, the length of the dataset is not known beforehand - num_events = 0 - for _ in dataset: - num_events += 1 - print("dataset loaded, len={}".format(num_events)) - - weight_func = make_weight_function(config) - assert(n_train + n_test <= num_events) - - # Padded shapes - ps = ( - tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_input_features]), - tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_output_features]), - { - "cls": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "charge": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "energy": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "pt": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "eta": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "sin_phi": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - "cos_phi": tf.TensorShape([dataset_def.padded_num_elem_size, ]), - } - ) - - ds_train = dataset.take(n_train).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps) - ds_test = dataset.skip(n_train).take(n_test).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps) - - if config["setup"]["multi_output"]: - dataset_transform = targets_multi_output(config["dataset"]["num_output_classes"]) - ds_train = ds_train.map(dataset_transform) - ds_test = ds_test.map(dataset_transform) - else: - dataset_transform = None - - return ds_train, ds_test, dataset_transform - -def prepare_val_data(config, dataset_def, single_file=False): - if single_file: - val_filelist = dataset_def.val_filelist[:1] - else: - val_filelist = dataset_def.val_filelist - if config["setup"]["num_val_files"] > 0: - val_filelist = val_filelist[: config["setup"]["num_val_files"]] - - Xs = [] - ygens = [] - ycands = [] - for fi in tqdm(val_filelist, desc="Preparing validation data"): - X, ygen, ycand = dataset_def.prepare_data(fi) - Xs.append(np.concatenate(X)) - ygens.append(np.concatenate(ygen)) - ycands.append(np.concatenate(ycand)) - - assert(len(Xs) > 0, "Xs is empty") - X_val = np.concatenate(Xs) - ygen_val = np.concatenate(ygens) - ycand_val = np.concatenate(ycands) - - return X_val, ygen_val, ycand_val - def get_heptfds_dataset(dataset_name, config, num_gpus, split, num_events=None, supervised=True): cds = config["dataset"] - if cds['schema'] == "cms": + if cds["schema"] == "cms": dsf = CMSDatasetFactory(config) - elif cds['schema'] == "delphes": + elif cds["schema"] == "delphes": dsf = DelphesDatasetFactory(config) else: raise ValueError("Only supported datasets are 'cms' and 'delphes'.") @@ -407,6 +329,7 @@ def get_heptfds_dataset(dataset_name, config, num_gpus, split, num_events=None, return ds, ds_info + def load_and_interleave(dataset_names, config, num_gpus, split, batch_size): datasets = [] steps = [] @@ -415,17 +338,17 @@ def load_and_interleave(dataset_names, config, num_gpus, split, batch_size): ds, _ = get_heptfds_dataset(ds_name, config, num_gpus, split) num_steps = 
ds.cardinality().numpy() total_num_steps += num_steps - assert(num_steps > 0) + assert num_steps > 0 print("Loaded {}:{} with {} steps".format(ds_name, split, num_steps)) datasets.append(ds) steps.append(num_steps) - #Now interleave elements from the datasets randomly + # Now interleave elements from the datasets randomly ids = 0 indices = [] for ds, num_steps in zip(datasets, steps): - indices += num_steps*[ids] + indices += num_steps * [ids] ids += 1 indices = np.array(indices, np.int64) np.random.shuffle(indices) @@ -435,18 +358,19 @@ def load_and_interleave(dataset_names, config, num_gpus, split, batch_size): ds = tf.data.experimental.choose_from_datasets(datasets, choice_dataset) bs = batch_size if not config["setup"]["horovod_enabled"]: - if num_gpus>1: - bs = bs*num_gpus + if num_gpus > 1: + bs = bs * num_gpus ds = ds.batch(bs) total_num_steps = total_num_steps // bs - #num_steps = 0 - #for _ in ds: + # num_steps = 0 + # for _ in ds: # num_steps += 1 - #assert(total_num_steps == num_steps) + # assert(total_num_steps == num_steps) return ds, total_num_steps -#Load multiple datasets and mix them together + +# Load multiple datasets and mix them together def get_datasets(datasets_to_interleave, config, num_gpus, split): datasets = [] steps = [] @@ -455,16 +379,18 @@ def get_datasets(datasets_to_interleave, config, num_gpus, split): if ds_conf["datasets"] is None: logging.warning("No datasets in {} list.".format(joint_dataset_name)) else: - interleaved_ds, num_steps = load_and_interleave(ds_conf["datasets"], config, num_gpus, split, ds_conf["batch_per_gpu"]) + interleaved_ds, num_steps = load_and_interleave( + ds_conf["datasets"], config, num_gpus, split, ds_conf["batch_per_gpu"] + ) print("Interleaved joint dataset {} with {} steps".format(joint_dataset_name, num_steps)) datasets.append(interleaved_ds) steps.append(num_steps) - + ids = 0 indices = [] total_num_steps = 0 for ds, num_steps in zip(datasets, steps): - indices += num_steps*[ids] + indices += num_steps * [ids] total_num_steps += num_steps ids += 1 indices = np.array(indices, np.int64) @@ -472,14 +398,15 @@ def get_datasets(datasets_to_interleave, config, num_gpus, split): choice_dataset = tf.data.Dataset.from_tensor_slices(indices) ds = tf.data.experimental.choose_from_datasets(datasets, choice_dataset) - #num_steps = 0 - #for elem in ds: + # num_steps = 0 + # for elem in ds: # num_steps += 1 - #assert(total_num_steps == num_steps) + # assert(total_num_steps == num_steps) print("Final dataset with {} steps".format(total_num_steps)) return ds, total_num_steps + def set_config_loss(config, trainable): if trainable == "classification": config["dataset"]["pt_loss_coef"] = 0.0 @@ -501,7 +428,9 @@ def set_config_loss(config, trainable): def get_class_loss(config): if config["setup"]["classification_loss_type"] == "categorical_cross_entropy": - cls_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=config["setup"].get("classification_label_smoothing", 0.0)) + cls_loss = tf.keras.losses.CategoricalCrossentropy( + from_logits=False, label_smoothing=config["setup"].get("classification_label_smoothing", 0.0) + ) elif config["setup"]["classification_loss_type"] == "sigmoid_focal_crossentropy": cls_loss = tfa.losses.sigmoid_focal_crossentropy else: @@ -515,28 +444,30 @@ def get_loss_from_params(input_dict): loss_cls = getattr(tf.keras.losses, loss_type) return loss_cls(**input_dict) -#batched version of https://github.com/VinAIResearch/DSW/blob/master/gsw.py#L19 + +# batched version of 
https://github.com/VinAIResearch/DSW/blob/master/gsw.py#L19 @tf.function def sliced_wasserstein_loss(y_true, y_pred, num_projections=1000): - - #take everything but the jet_idx + + # take everything but the jet_idx y_true = y_true[..., :5] y_pred = y_pred[..., :5] - #create normalized random basis vectors + # create normalized random basis vectors theta = tf.random.normal((num_projections, y_true.shape[-1])) theta = theta / tf.sqrt(tf.reduce_sum(theta**2, axis=1, keepdims=True)) - #project the features with the random basis + # project the features with the random basis A = tf.linalg.matmul(y_true, theta, False, True) B = tf.linalg.matmul(y_pred, theta, False, True) A_sorted = tf.sort(A, axis=-2) B_sorted = tf.sort(B, axis=-2) - ret = tf.math.sqrt(tf.reduce_sum(tf.math.pow(A_sorted - B_sorted, 2), axis=[-1,-2])) + ret = tf.math.sqrt(tf.reduce_sum(tf.math.pow(A_sorted - B_sorted, 2), axis=[-1, -2])) return ret + @tf.function def hist_loss_2d(y_true, y_pred): @@ -548,44 +479,48 @@ def hist_loss_2d(y_true, y_pred): pt_true = y_true[..., 0] pt_pred = y_pred[..., 0] - px_true = pt_true*y_true[..., 4] - py_true = pt_true*y_true[..., 3] - px_pred = pt_pred*y_pred[..., 4] - py_pred = pt_pred*y_pred[..., 3] + px_true = pt_true * y_true[..., 4] + py_true = pt_true * y_true[..., 3] + px_pred = pt_pred * y_pred[..., 4] + py_pred = pt_pred * y_pred[..., 3] pt_hist_true = batched_histogram_2d( - eta_true, - phi_true, - px_true, - py_true, - tf.cast([-6.0,6.0], tf.float32), tf.cast([-4.0,4.0], tf.float32), 20 + eta_true, phi_true, px_true, py_true, tf.cast([-6.0, 6.0], tf.float32), tf.cast([-4.0, 4.0], tf.float32), 20 ) pt_hist_pred = batched_histogram_2d( - eta_pred, - phi_pred, - px_pred, - py_pred, - tf.cast([-6.0,6.0], tf.float32), tf.cast([-4.0,4.0], tf.float32), 20 + eta_pred, phi_pred, px_pred, py_pred, tf.cast([-6.0, 6.0], tf.float32), tf.cast([-4.0, 4.0], tf.float32), 20 ) - mse = tf.math.sqrt(tf.reduce_mean((pt_hist_true-pt_hist_pred)**2, axis=[-1,-2])) + mse = tf.math.sqrt(tf.reduce_mean((pt_hist_true - pt_hist_pred) ** 2, axis=[-1, -2])) return mse @tf.function def jet_reco(px, py, jet_idx, max_jets): - tf.debugging.assert_shapes([ - (px, ('N')), - (py, ('N')), - (jet_idx, ('N')), - ]) + tf.debugging.assert_shapes( + [ + (px, ("N")), + (py, ("N")), + (jet_idx, ("N")), + ] + ) jet_idx_capped = tf.where(jet_idx <= max_jets, jet_idx, 0) - jet_px = tf.zeros([max_jets, ], dtype=px.dtype) - jet_py = tf.zeros([max_jets, ], dtype=py.dtype) + jet_px = tf.zeros( + [ + max_jets, + ], + dtype=px.dtype, + ) + jet_py = tf.zeros( + [ + max_jets, + ], + dtype=py.dtype, + ) jet_px_new = tf.tensor_scatter_nd_add(jet_px, indices=tf.expand_dims(jet_idx_capped, axis=-1), updates=px) jet_py_new = tf.tensor_scatter_nd_add(jet_py, indices=tf.expand_dims(jet_idx_capped, axis=-1), updates=py) @@ -597,17 +532,26 @@ def jet_reco(px, py, jet_idx, max_jets): @tf.function def batched_jet_reco(px, py, jet_idx, max_jets): - tf.debugging.assert_shapes([ - (px, ('B', 'N')), - (py, ('B', 'N')), - (jet_idx, ('B', 'N')), - ]) + tf.debugging.assert_shapes( + [ + (px, ("B", "N")), + (py, ("B", "N")), + (jet_idx, ("B", "N")), + ] + ) return tf.map_fn( - lambda a: jet_reco(a[0], a[1], a[2], max_jets), (px, py, jet_idx), - fn_output_signature=tf.TensorSpec([max_jets, ], dtype=tf.float32) + lambda a: jet_reco(a[0], a[1], a[2], max_jets), + (px, py, jet_idx), + fn_output_signature=tf.TensorSpec( + [ + max_jets, + ], + dtype=tf.float32, + ), ) + @tf.function def gen_jet_loss(y_true, y_pred): y = {} @@ -618,11 +562,11 @@ def 
gen_jet_loss(y_true, y_pred): max_jets = 201 jet_idx = tf.cast(y["true"][..., 5], dtype=tf.int32) for typ in ["true", "pred"]: - px = y[typ][..., 0]*y[typ][..., 4] - py = y[typ][..., 0]*y[typ][..., 3] + px = y[typ][..., 0] * y[typ][..., 4] + py = y[typ][..., 0] * y[typ][..., 3] jet_pt[typ] = batched_jet_reco(px, py, jet_idx, max_jets) - mse = tf.math.sqrt(tf.reduce_mean((jet_pt['true']-jet_pt['pred'])**2, axis=[-1,-2])) + mse = tf.math.sqrt(tf.reduce_mean((jet_pt["true"] - jet_pt["pred"]) ** 2, axis=[-1, -2])) return mse diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml index 46c61994b..ea66f094f 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/cms-gen.yaml @@ -141,10 +141,6 @@ parameters: regression_use_classification: yes dropout: 0.0 - pt_skip_gate: no - eta_skip_gate: yes - phi_skip_gate: yes - id_dim_decrease: yes charge_dim_decrease: yes pt_dim_decrease: yes @@ -221,7 +217,7 @@ raytune: n_random_steps: 10 train_test_datasets: - physical: + physical: batch_per_gpu: 5 datasets: - cms_pf_ttbar @@ -232,7 +228,7 @@ train_test_datasets: validation_datasets: - cms_pf_ttbar -datasets: +datasets: cms_pf_ttbar: version: 1.4.0 data_dir: diff --git a/parameters/cms.yaml b/parameters/cms.yaml index ff5ef7cac..a343a367c 100644 --- a/parameters/cms.yaml +++ b/parameters/cms.yaml @@ -124,10 +124,6 @@ parameters: regression_use_classification: yes dropout: 0.016312 # Set to 0 in future training - pt_skip_gate: no - eta_skip_gate: yes - phi_skip_gate: yes - id_dim_decrease: yes charge_dim_decrease: yes pt_dim_decrease: yes @@ -205,7 +201,7 @@ raytune: n_random_steps: 10 train_test_datasets: - physical: + physical: batch_per_gpu: 32 datasets: - cms_pf_ttbar @@ -216,7 +212,7 @@ train_test_datasets: validation_datasets: - cms_pf_ttbar -datasets: +datasets: cms_pf_ttbar: version: 1.4.0 data_dir: diff --git a/parameters/delphes.yaml b/parameters/delphes.yaml index 54effde2b..a15b5ab2f 100644 --- a/parameters/delphes.yaml +++ b/parameters/delphes.yaml @@ -7,7 +7,7 @@ dataset: num_output_features: 7 #(none=0, track=1, cluster=2) num_input_classes: 3 - #(none=0, charged hadron=1, neutral hadron=2, photon=3, electron=4, muon=5) + #(none=0, charged hadron=1, neutral hadron=2, photon=3, electron=4, muon=5) num_output_classes: 6 num_momentum_outputs: 5 padded_num_elem_size: 6400 @@ -126,10 +126,6 @@ parameters: regression_use_classification: yes dropout: 0.0 - pt_skip_gate: yes - eta_skip_gate: yes - phi_skip_gate: yes - id_dim_decrease: yes charge_dim_decrease: yes pt_dim_decrease: yes @@ -160,11 +156,6 @@ timing: num_ev: 100 num_iter: 3 -exponentialdecay: - decay_steps: 10000 - decay_rate: 0.99 - staircase: yes - callbacks: checkpoint: monitor: "val_loss" @@ -210,7 +201,7 @@ raytune: n_random_steps: 10 train_test_datasets: - delphes: + delphes: batch_per_gpu: 5 datasets: - delphes_pf diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..8e1cc106e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,35 @@ +awkward +boost_histogram +click +fastjet +keras +keras-tuner +matplotlib +mplhep +networkx +nevergrad +notebook +onnxruntime +pandas +papermill +pre-commit +pyarrow +ray[default]==1.6.0 +ray[tune]==1.6.0 +scikit-optimize +scipy +seaborn +setGPU +sklearn +tensorflow==2.9 +tensorflow-addons +tensorflow-datasets +tensorflow-estimator +tensorflow-probability +tensorflow-text +tf-models-official +tf2onnx +tqdm +uproot +vector +zenodo_get diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh index 722c7f469..02e9e9c0e 100755 --- 
a/scripts/local_test_cms_pipeline.sh +++ b/scripts/local_test_cms_pipeline.sh @@ -33,7 +33,7 @@ python mlpf/pipeline.py train -c parameters/cms-gen.yaml --nepochs 1 --customize ls ./experiments/cms*/weights/ #Generate the pred.npz file of predictions -python mlpf/pipeline.py evaluate --customize pipeline_test -t ./experiments/cms* -w ./experiments/cms*/weights/weights-01-*.hdf5 +python mlpf/pipeline.py evaluate --customize pipeline_test --nevents 10 -t ./experiments/cms* -w ./experiments/cms*/weights/weights-01-*.hdf5 #Evaluate the notebook papermill --inject-output-path --log-output -p path ./experiments/cms*/evaluation/epoch_1/cms_pf_ttbar/ notebooks/cms-mlpf.ipynb ./out.ipynb diff --git a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh index 132facf3d..e95b78c02 100755 --- a/scripts/local_test_delphes_pipeline.sh +++ b/scripts/local_test_delphes_pipeline.sh @@ -19,4 +19,4 @@ python mlpf/pipeline.py train -c parameters/delphes.yaml --nepochs 1 --ntrain 5 ls ./experiments/delphes_*/weights/ #Generate the pred.npz file of predictions -python mlpf/pipeline.py evaluate -t ./experiments/delphes_* +python mlpf/pipeline.py evaluate --nevents 10 -t ./experiments/delphes_*
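The notes below are illustrative sketches of techniques that appear in this patch; the helper names, toy shapes, and example values are made up and are not part of the diff.

The sliced Wasserstein loss added to mlpf/tfmodel/utils.py compares two particle sets by projecting their first five features onto random unit directions and matching the sorted projections. A minimal NumPy sketch of the same idea, assuming unbatched inputs of shape (num_particles, num_features):

import numpy as np

def sliced_wasserstein(y_true, y_pred, num_projections=128, seed=None):
    """Approximate sliced Wasserstein distance between two particle sets."""
    rng = np.random.default_rng(seed)
    # random unit vectors in feature space
    theta = rng.normal(size=(num_projections, y_true.shape[-1]))
    theta /= np.sqrt((theta**2).sum(axis=1, keepdims=True))
    # project both sets: shape (num_particles, num_projections)
    a = y_true @ theta.T
    b = y_pred @ theta.T
    # 1D optimal transport along each direction = compare sorted projections
    a_sorted = np.sort(a, axis=0)
    b_sorted = np.sort(b, axis=0)
    return np.sqrt(((a_sorted - b_sorted) ** 2).sum())

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 5))
print(sliced_wasserstein(x, x, seed=1))                                   # 0 for identical sets
print(sliced_wasserstein(x, x + 0.1 * rng.normal(size=x.shape), seed=1))  # small positive value

In the TensorFlow version from the diff, the projection is the matmul against theta, tf.sort along axis -2 orders the projected values per direction, and the loss is the root of the summed squared differences over projections and particles.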
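The gen-jet loss in the same file sums particle px and py into per-jet totals with tf.tensor_scatter_nd_add, using a jet index carried alongside the regression targets. A small self-contained sketch of that scatter-add pattern (the function name, max_jets value, and toy inputs are illustrative only):

import tensorflow as tf

def jet_pt_from_particles(px, py, jet_idx, max_jets):
    """Sum particle px/py into per-jet momenta and return the jet pT.

    px, py: float tensors of shape (num_particles,)
    jet_idx: int tensor of shape (num_particles,), values in [0, max_jets)
    """
    idx = tf.expand_dims(jet_idx, axis=-1)  # (num_particles, 1) scatter indices
    jet_px = tf.tensor_scatter_nd_add(tf.zeros([max_jets]), idx, px)
    jet_py = tf.tensor_scatter_nd_add(tf.zeros([max_jets]), idx, py)
    return tf.sqrt(jet_px**2 + jet_py**2)

# two particles belong to jet 0, one particle to jet 1
px = tf.constant([1.0, 2.0, 3.0])
py = tf.constant([0.0, 0.0, 4.0])
jet_idx = tf.constant([0, 0, 1], dtype=tf.int32)
print(jet_pt_from_particles(px, py, jet_idx, max_jets=4))  # [3. 5. 0. 0.]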
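load_and_interleave and get_datasets mix several samples by building an index vector with one entry per element of each dataset, shuffling it, and driving tf.data.experimental.choose_from_datasets with it, so every sample is consumed exactly once in random order. A toy sketch with two stand-in datasets (the real pipeline uses the cms_pf_* / delphes_pf tfds builders instead):

import numpy as np
import tensorflow as tf

# stand-ins for two physics samples of different size
ds_a = tf.data.Dataset.from_tensor_slices(tf.zeros([6], tf.int32))  # 6 "events" labelled 0
ds_b = tf.data.Dataset.from_tensor_slices(tf.ones([3], tf.int32))   # 3 "events" labelled 1

# one index per element, shuffled, so the mix is random but exhaustive
indices = np.array([0] * 6 + [1] * 3, np.int64)
np.random.shuffle(indices)
choice = tf.data.Dataset.from_tensor_slices(indices)

mixed = tf.data.experimental.choose_from_datasets([ds_a, ds_b], choice)
print([int(x) for x in mixed])  # six 0s and three 1s in shuffled order

Batching is applied after the interleave, which is why the step count in the diff is divided by the per-GPU batch size scaled by the number of GPUs.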