From b7f59ea966af78cb67eaecaeb8da0adb320d2f7d Mon Sep 17 00:00:00 2001
From: Eric Wulff
Date: Thu, 24 Jun 2021 22:23:07 +0200
Subject: [PATCH 01/23] feat: OneCycle learning rate scheduler

This commit also includes
- Custom tensorboard callback logging learning rate & momentum
- A utils.py file collecting utilities used in more than one file
- Clean-up of how output files are organized
- Configuration files using the OneCycle scheduler
---
 mlpf/tfmodel/callbacks.py                     |  76 +++++++++
 mlpf/tfmodel/model_setup.py                   |  59 +++----
 mlpf/tfmodel/onecycle_scheduler.py            | 144 ++++++++++++++++++
 mlpf/tfmodel/utils.py                         | 101 ++++++++++++
 parameters/cms-gnn-dense-onecycle.yaml        |  79 ++++++++++
 parameters/delphes-gnn-skipconn-onecycle.yaml |  79 ++++++++++
 6 files changed, 512 insertions(+), 26 deletions(-)
 create mode 100644 mlpf/tfmodel/callbacks.py
 create mode 100644 mlpf/tfmodel/onecycle_scheduler.py
 create mode 100644 mlpf/tfmodel/utils.py
 create mode 100644 parameters/cms-gnn-dense-onecycle.yaml
 create mode 100644 parameters/delphes-gnn-skipconn-onecycle.yaml

diff --git a/mlpf/tfmodel/callbacks.py b/mlpf/tfmodel/callbacks.py
new file mode 100644
index 000000000..6edfddcda
--- /dev/null
+++ b/mlpf/tfmodel/callbacks.py
@@ -0,0 +1,76 @@
+import pickle
+import tensorflow as tf
+from tensorflow.keras.callbacks import TensorBoard
+from tensorflow.keras.callbacks import ModelCheckpoint
+from pathlib import Path
+import numpy as np
+
+
+class CustomTensorBoard(TensorBoard):
+    """
+    Extends tensorflow.keras.callbacks.TensorBoard
+
+    Custom tensorboard class to make logging of the learning rate possible when using
+    keras.optimizers.schedules.LearningRateSchedule.
+    See https://github.com/tensorflow/tensorflow/pull/37552
+
+    Also logs momentum for supported optimizers that use momentum.
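+
+    A minimal usage sketch (the log directory and epoch count below are illustrative, not part of this patch's setup):
+
+        tb = CustomTensorBoard(log_dir="experiments/run_01/tensorboard_logs", update_freq="epoch")
+        model.fit(dataset, epochs=10, callbacks=[tb])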
+ """ + + def _collect_learning_rate(self, logs): + logs = logs or {} + lr_schedule = getattr(self.model.optimizer, "lr", None) + if isinstance(lr_schedule, tf.keras.optimizers.schedules.LearningRateSchedule): + logs["learning_rate"] = np.float64(tf.keras.backend.get_value(lr_schedule(self.model.optimizer.iterations))) + else: + logs.update({"learning_rate": np.float64(tf.keras.backend.eval(self.model.optimizer.lr))}) + + # Log momentum if the optimizer has it + try: + logs.update({"momentum": np.float64(tf.keras.backend.eval(self.model.optimizer.momentum))}) + except AttributeError: + pass + + # In Adam, the momentum parameter is called beta_1 + if isinstance(self.model.optimizer, tf.keras.optimizers.Adam): + logs.update({"adam_beta_1": np.float64(tf.keras.backend.eval(self.model.optimizer.beta_1))}) + + return logs + + def on_epoch_end(self, epoch, logs): + logs = logs or {} + logs.update(self._collect_learning_rate(logs)) + super().on_epoch_end(epoch, logs) + + def on_train_batch_end(self, batch, logs): + logs = logs or {} + if isinstance(self.update_freq, int) and batch % self.update_freq == 0: + logs.update(self._collect_learning_rate(logs)) + super().on_train_batch_end(batch, logs) + + +class CustomModelCheckpoint(ModelCheckpoint): + """Extends tensorflow.keras.callbacks.ModelCheckpoint to also save optimizer""" + + def __init__(self, *args, **kwargs): + # Added arguments + self.optimizer_to_save = kwargs.pop("optimizer_to_save") + self.optimizer_filepath = kwargs.pop("optimizer_save_filepath") + super().__init__(*args, **kwargs) + + Path(self.filepath).parent.mkdir(parents=True, exist_ok=True) + + def on_epoch_end(self, epoch, logs=None): + super().on_epoch_end(epoch, logs) + + # If a checkpoint was saved, also save the optimizer + filepath = str(self.optimizer_filepath).format(epoch=epoch + 1, **logs) + if self.epochs_since_last_save == 0: + if self.save_best_only: + current = logs.get(self.monitor) + if current == self.best: + with open(filepath, "wb") as f: + pickle.dump(self.optimizer_to_save, f) + else: + with open(filepath, "wb") as f: + pickle.dump(self.optimizer_to_save, f) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 7a6e43231..77cc94a78 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -22,6 +22,12 @@ import json import random import platform +from tqdm import tqdm +from pathlib import Path +from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler +from tfmodel.callbacks import CustomTensorBoard +from tfmodel.utils import get_lr_schedule, get_weights_func + def plot_confusion_matrix(cm): fig = plt.figure(figsize=(5,5)) @@ -174,8 +180,8 @@ def on_epoch_end(self, epoch, logs=None): def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output_classes): callbacks = [] - tb = tf.keras.callbacks.TensorBoard( - log_dir=outdir, histogram_freq=1, write_graph=False, write_images=False, + tb = CustomTensorBoard( + log_dir=outdir + "/tensorboard_logs", histogram_freq=1, write_graph=False, write_images=False, update_freq='epoch', #profile_batch=(10,90), profile_batch=0, @@ -186,15 +192,20 @@ def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output terminate_cb = tf.keras.callbacks.TerminateOnNaN() callbacks += [terminate_cb] + cp_dir = Path(outdir) / "weights" + cp_dir.mkdir(parents=True, exist_ok=True) cp_callback = tf.keras.callbacks.ModelCheckpoint( - filepath=outdir + "/weights-{epoch:02d}-{val_loss:.6f}.hdf5", + filepath=str(cp_dir / 
"weights-{epoch:02d}-{val_loss:.6f}.hdf5"), save_weights_only=True, verbose=0 ) cp_callback.set_model(model) callbacks += [cp_callback] - cb = CustomCallback(outdir, X_val, y_val, dataset_transform, num_output_classes) + history_path = Path(outdir) / "history" + history_path.mkdir() + history_path = str(history_path) + cb = CustomCallback(history_path, X_val, y_val, dataset_transform, num_output_classes) cb.set_model(model) callbacks += [cb] @@ -214,22 +225,6 @@ def get_rundir(base='experiments'): logdir = 'run_%02d' % run_number return '{}/{}'.format(base, logdir) -def compute_weights_invsqrt(X, y, w): - wn = tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w) - wn *= tf.cast(X[:, 0]!=0, tf.float32) - #wn /= tf.reduce_sum(wn) - return X, y, wn - -def compute_weights_none(X, y, w): - wn = tf.ones_like(w) - wn *= tf.cast(X[:, 0]!=0, tf.float32) - return X, y, wn - -weight_functions = { - "inverse_sqrt": compute_weights_invsqrt, - "none": compute_weights_none, -} - def scale_outputs(X,y,w): ynew = y-out_m ynew = ynew/out_s @@ -344,12 +339,15 @@ def make_dense(config, dtype): def eval_model(X, ygen, ycand, model, config, outdir, global_batch_size): import scipy - for ibatch in range(X.shape[0]//global_batch_size): + for ibatch in tqdm(range(X.shape[0]//global_batch_size), desc="Evaluating model"): nb1 = ibatch*global_batch_size nb2 = (ibatch+1)*global_batch_size y_pred = model.predict(X[nb1:nb2], batch_size=global_batch_size) - y_pred_raw_ids = y_pred[:, :, :config["dataset"]["num_output_classes"]] + if type(y_pred) is dict: # for e.g. when the model is multi_output + y_pred_raw_ids = y_pred['cls'] + else: + y_pred_raw_ids = y_pred[:, :, :config["dataset"]["num_output_classes"]] #softmax score must be over a threshold 0.6 to call it a particle (prefer low fake rate to high efficiency) # y_pred_id_sm = scipy.special.softmax(y_pred_raw_ids, axis=-1) @@ -364,7 +362,12 @@ def eval_model(X, ygen, ycand, model, config, outdir, global_batch_size): y_pred_id = np.argmax(y_pred_raw_ids, axis=-1) - y_pred_id = np.concatenate([np.expand_dims(y_pred_id, axis=-1), y_pred[:, :, config["dataset"]["num_output_classes"]:]], axis=-1) + if type(y_pred) is dict: + y_pred_rest = np.concatenate([y_pred["charge"], y_pred["pt"], y_pred["eta"], y_pred["sin_phi"], y_pred["cos_phi"], y_pred["energy"]], axis=-1) + y_pred_id = np.concatenate([np.expand_dims(y_pred_id, axis=-1), y_pred_rest], axis=-1) + else: + y_pred_id = np.concatenate([np.expand_dims(y_pred_id, axis=-1), y_pred[:, :, config["dataset"]["num_output_classes"]:]], axis=-1) + np_outfile = "{}/pred_{}.npz".format(outdir, ibatch) np.savez( np_outfile, @@ -501,7 +504,7 @@ def main(args, yaml_path, config): n_test = args.ntest n_epochs = config['setup']['num_epochs'] - weight_func = weight_functions[config['setup']['sample_weights']] + weight_func = get_weights_func(config) assert(n_train + n_test <= num_events) ps = ( @@ -584,6 +587,8 @@ def main(args, yaml_path, config): decay_rate=0.99, staircase=True ) + total_steps = n_epochs * n_train // global_batch_size + lr_schedule, optim_callbacks = get_lr_schedule(config, actual_lr, steps=total_steps) opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule) if config['setup']['dtype'] == 'float16': model_dtype = tf.dtypes.float16 @@ -666,14 +671,16 @@ def main(args, yaml_path, config): model, outdir, X_val[:config['setup']['batch_size']], ycand_val[:config['setup']['batch_size']], dataset_transform, config["dataset"]["num_output_classes"] ) - callbacks.append(LearningRateLoggingCallback()) + 
callbacks.append(optim_callbacks) fit_result = model.fit( ds_train_r, validation_data=ds_test_r, epochs=initial_epoch+n_epochs, callbacks=callbacks, steps_per_epoch=n_train//global_batch_size, validation_steps=n_test//global_batch_size, initial_epoch=initial_epoch ) - with open("{}/history.json".format(outdir), "w") as fi: + history_path = Path(outdir) / "history" + history_path = str(history_path) + with open("{}/history.json".format(history_path), "w") as fi: json.dump(fit_result.history, fi) model.save(outdir + "/model_full", save_format="tf") diff --git a/mlpf/tfmodel/onecycle_scheduler.py b/mlpf/tfmodel/onecycle_scheduler.py new file mode 100644 index 000000000..35d2a240f --- /dev/null +++ b/mlpf/tfmodel/onecycle_scheduler.py @@ -0,0 +1,144 @@ +import numpy as np +import logging + +import tensorflow as tf +from tensorflow.python.framework import ops +from tensorflow.keras.optimizers.schedules import LearningRateSchedule +from tensorflow.keras.callbacks import Callback + +logging.getLogger("tensorflow").setLevel(logging.ERROR) + + +class CosineAnnealer: + def __init__(self, start, end, steps): + self.start = start + self.end = end + self.steps = steps + self.n = 0 + + def step(self): + cos = np.cos(np.pi * (self.n / self.steps)) + 1 + self.n += 1 + return self.end + (self.start - self.end) / 2.0 * cos + + +class OneCycleScheduler(LearningRateSchedule): + """`Callback` that schedules the learning rate on a 1cycle policy as per Leslie Smith's paper + (https://arxiv.org/pdf/1803.09820.pdf). + + The implementation adopts additional improvements as per the fastai library: + https://docs.fast.ai/callbacks.one_cycle.html, where only two phases are used and the adaptation is done using + cosine annealing. In the warm-up phase the LR increases from `lr_max / div_factor` to `lr_max` and momentum + decreases from `mom_max` to `mom_min`. In the second phase the LR decreases from `lr_max` to `lr_max / final_div` + and momemtum from `mom_max` to `mom_min`. By default the phases are not of equal length, with the warm-up phase + controlled by the parameter `warmup_ratio`. + + NOTE: The momentum is not controlled through this class. This class is intended to be used together with the + `MomentumOneCycleScheduler` callback defined below. 
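+
+    A minimal wiring sketch, following how `get_lr_schedule` in `mlpf/tfmodel/utils.py` uses this class (the
+    learning rate value below is illustrative):
+
+        lr_schedule = OneCycleScheduler(lr_max=1e-3, steps=total_steps)
+        opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
+        model.fit(ds_train, epochs=n_epochs, callbacks=[MomentumOneCycleScheduler(steps=total_steps)])
+
+    where `total_steps` is the total number of optimizer steps, computed in this patch as
+    `n_epochs * n_train // global_batch_size`.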
+ """ + + def __init__( + self, + lr_max, + steps, + mom_min=0.85, + mom_max=0.95, + warmup_ratio=0.3, + div_factor=25.0, + final_div=100000.0, + name=None, + ): + super(OneCycleScheduler, self).__init__() + lr_min = lr_max / div_factor + + if final_div is None: + final_lr = lr_max / (div_factor * 1e4) + else: + final_lr = lr_max / (final_div) + + phase_1_steps = steps * warmup_ratio + phase_2_steps = steps - phase_1_steps + + self.lr_max = lr_max + self.steps = steps + self.mom_min = mom_min + self.mom_max = mom_max + self.warmup_ratio = warmup_ratio + self.div_factor = div_factor + self.final_div = final_div + self.name = name + + phases = [CosineAnnealer(lr_min, lr_max, phase_1_steps), CosineAnnealer(lr_max, final_lr, phase_2_steps)] + + step = 0 + phase = 0 + full_lr_schedule = np.zeros(int(steps)) + for ii in np.arange(np.floor(steps), dtype=int): + step += 1 + if step >= phase_1_steps: + phase = 1 + full_lr_schedule[ii] = phases[phase].step() + + self.full_lr_schedule = tf.convert_to_tensor(full_lr_schedule) + + def __call__(self, step): + with ops.name_scope(self.name or "OneCycleScheduler"): + return self.full_lr_schedule[tf.cast(step, "int32") - 1] + + def get_config(self): + return { + "lr_max": self.lr_max, + "steps": self.steps, + "mom_min": self.mom_min, + "mom_max": self.mom_max, + "warmup_ratio": self.warmup_ratio, + "div_factor": self.div_factor, + "final_div": self.final_div, + "name": self.name, + } + + +class MomentumOneCycleScheduler(Callback): + """`Callback` that schedules the momentun according to the 1cycle policy as per Leslie Smith's paper + (https://arxiv.org/pdf/1803.09820.pdf). + NOTE: This callback only schedules the momentum parameter, not the learning rate. It is intended to be used with the + KerasOneCycle learning rate scheduler above or similar. 
+ """ + + def __init__(self, steps, mom_min=0.85, mom_max=0.95, warmup_ratio=0.3): + super(MomentumOneCycleScheduler, self).__init__() + + phase_1_steps = steps * warmup_ratio + phase_2_steps = steps - phase_1_steps + + self.phase_1_steps = phase_1_steps + self.phase_2_steps = phase_2_steps + self.phase = 0 + self.step = 0 + + self.phases = [CosineAnnealer(mom_max, mom_min, phase_1_steps), CosineAnnealer(mom_min, mom_max, phase_2_steps)] + + def on_train_begin(self, logs=None): + self.set_momentum(self.mom_schedule().step()) + + def on_train_batch_end(self, batch, logs=None): + self.step += 1 + if self.step >= self.phase_1_steps: + self.phase = 1 + + self.set_momentum(self.mom_schedule().step()) + + def set_momentum(self, mom): + # In Adam, the momentum parameter is called beta_1 + if isinstance(self.model.optimizer, tf.keras.optimizers.Adam): + tf.keras.backend.set_value(self.model.optimizer.beta_1, mom) + # In SDG, the momentum parameter is called momentum + elif isinstance(self.model.optimizer, tf.keras.optimizers.SGD): + tf.keras.backend.set_value(self.model.optimizer.momentum, mom) + else: + raise NotImplementedError( + "Only SGD and Adam are supported by MomentumOneCycleScheduler: {}".format(type(self.model.optimizer)) + ) + + def mom_schedule(self): + return self.phases[self.phase] diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py new file mode 100644 index 000000000..24471504a --- /dev/null +++ b/mlpf/tfmodel/utils.py @@ -0,0 +1,101 @@ +import os +import yaml +from pathlib import Path +import datetime +import platform + +import tensorflow as tf + +from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler + + +def load_config(config_file_path): + with open(config_file_path, "r") as ymlfile: + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) + return cfg + + +def create_experiment_dir(prefix=None, suffix=None): + if prefix is None: + train_dir = Path("experiments") / datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + else: + train_dir = Path("experiments") / (prefix + datetime.datetime.now().strftime("%Y%m%d_%H%M%S")) + + if suffix is not None: + train_dir = train_dir.with_name(train_dir.name + "." 
+ platform.node()) + + train_dir.mkdir(parents=True) + return str(train_dir) + + +def get_strategy(): + global_batch_size = None + try: + gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")] + num_gpus = len(gpus) + print("num_gpus=", num_gpus) + if num_gpus > 1: + strategy = tf.distribute.MirroredStrategy() + global_batch_size = num_gpus * global_batch_size + else: + strategy = tf.distribute.OneDeviceStrategy("gpu:0") + except Exception as e: + print("fallback to CPU", e) + strategy = tf.distribute.OneDeviceStrategy("cpu") + num_gpus = 0 + return strategy, global_batch_size + + +def get_lr_schedule(config, lr, steps): + callbacks = [] + schedule = config["setup"]["lr_schedule"] + if schedule == "onecycle": + onecycle_cfg = config["onecycle"] + lr_schedule = OneCycleScheduler( + lr_max=lr, + steps=steps, + mom_min=onecycle_cfg["mom_min"], + mom_max=onecycle_cfg["mom_max"], + warmup_ratio=onecycle_cfg["warmup_ratio"], + div_factor=onecycle_cfg["div_factor"], + final_div=onecycle_cfg["final_div"], + ) + callbacks.append( + MomentumOneCycleScheduler( + steps=steps, + mom_min=onecycle_cfg["mom_min"], + mom_max=onecycle_cfg["mom_max"], + warmup_ratio=onecycle_cfg["warmup_ratio"], + ) + ) + elif schedule == "exponentialdecay": + lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( + lr, + decay_steps=steps, + decay_rate=0.99, + staircase=True, + ) + return lr_schedule, callbacks + + +def compute_weights_invsqrt(X, y, w): + wn = tf.cast(tf.shape(w)[-1], tf.float32) / tf.sqrt(w) + wn *= tf.cast(X[:, 0] != 0, tf.float32) + # wn /= tf.reduce_sum(wn) + return X, y, wn + + +def compute_weights_none(X, y, w): + wn = tf.ones_like(w) + wn *= tf.cast(X[:, 0] != 0, tf.float32) + return X, y, wn + + +def get_weights_func(config): + sampling = config["setup"]["sample_weights"] + if sampling == "inverse_sqrt": + return compute_weights_invsqrt + elif sampling == "none": + return compute_weights_none + else: + raise ValueError("Only supported weight samplings are 'inverse_sqrt' and 'none'.") diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml new file mode 100644 index 000000000..92c36bc5f --- /dev/null +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -0,0 +1,79 @@ +backend: tensorflow + +dataset: + schema: cms + target_particles: cand + num_input_features: 15 + num_output_features: 7 +# NONE = 0, +# TRACK = 1, +# PS1 = 2, +# PS2 = 3, +# ECAL = 4, +# HCAL = 5, +# GSF = 6, +# BREM = 7, +# HFEM = 8, +# HFHAD = 9, +# SC = 10, +# HO = 11, + num_input_classes: 12 + #(none=0, ch.had=1, n.had=2, hfem=3, hfhad=4, gamma=5, e=6, mu=7) + num_output_classes: 8 + padded_num_elem_size: 6400 + #(pt, eta, sin phi, cos phi, E) + num_momentum_outputs: 5 + classification_loss_coef: 1.0 + charge_loss_coef: 0.1 + pt_loss_coef: 1.0 + eta_loss_coef: 0.1 + sin_phi_loss_coef: 1.0 + cos_phi_loss_coef: 1.0 + energy_loss_coef: 0.1 + raw_path: ../data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 + processed_path: ../data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords + num_files_per_chunk: 1 + validation_file_path: ../data/TTbar_14TeV_TuneCUETP8M1_cfi/val/*.pkl.bz2 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: all + lr: 1e-4 + batch_size: 16 + num_events_train: 80000 + num_events_test: 10000 + num_epochs: 400 + num_val_files: 100 + dtype: float32 + sample_weights: inverse_sqrt + trainable: all + classification_loss_type: categorical_cross_entropy # categorical_cross_entropy, sigmoid_focal_crossentropy + lr_schedule: onecycle # 
exponentialdecay, onecycle + +parameters: + model: gnn_dense + activation: elu + layernorm: no + hidden_dim: 256 + bin_size: 640 + clip_value_low: 0.0 + num_conv: 2 + num_gsl: 2 + normalize_degrees: yes + distance_dim: 128 + dropout: 0.0 + +timing: + num_ev: 100 + num_iter: 3 + +onecycle: + mom_min: 0.85 + mom_max: 0.95 + warmup_ratio: 0.3 + div_factor: 25.0 + final_div: 100000.0 \ No newline at end of file diff --git a/parameters/delphes-gnn-skipconn-onecycle.yaml b/parameters/delphes-gnn-skipconn-onecycle.yaml new file mode 100644 index 000000000..b398f10c1 --- /dev/null +++ b/parameters/delphes-gnn-skipconn-onecycle.yaml @@ -0,0 +1,79 @@ +backend: tensorflow + +dataset: + schema: delphes + target_particles: gen + num_input_features: 12 + num_output_features: 7 + #(none=0, track=1, cluster=2) + num_input_classes: 3 + num_output_classes: 6 + num_momentum_outputs: 5 + padded_num_elem_size: 6400 + classification_loss_coef: 1.0 + momentum_loss_coef: 1.0 + charge_loss_coef: 1.0 + pt_loss_coef: 1.0 + eta_loss_coef: 1.0 + sin_phi_loss_coef: 1.0 + cos_phi_loss_coef: 1.0 + energy_loss_coef: 0.001 + momentum_loss_coefs: + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 0.001 + raw_path: ../data/mlpf_zenodo/pythia8_ttbar/raw/*.pkl.bz2 + processed_path: ../data/mlpf_zenodo/pythia8_ttbar/tfr/*.tfrecords + num_files_per_chunk: 5 + validation_file_path: ../data/mlpf_zenodo/pythia8_qcd/val/*.pkl.bz2 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: all + lr: 1e-5 + batch_size: 16 + num_events_train: 40000 + num_events_test: 5000 + num_epochs: 250 + num_val_files: -1 + dtype: float32 + sample_weights: none + trainable: all + multi_output: yes + classification_loss_type: categorical_cross_entropy + lr_schedule: onecycle # exponentialdecay, onecycle + +parameters: + model: gnn + bin_size: 128 + num_convs_id: 2 + num_convs_reg: 2 + num_hidden_id_enc: 2 + num_hidden_id_dec: 2 + num_hidden_reg_enc: 2 + num_hidden_reg_dec: 2 + num_neighbors: 16 + hidden_dim_id: 256 + hidden_dim_reg: 256 + distance_dim: 256 + dropout: 0.2 + dist_mult: 1.0 + activation: elu + skip_connection: True + +timing: + num_ev: 100 + num_iter: 3 + +onecycle: + mom_min: 0.85 + mom_max: 0.95 + warmup_ratio: 0.3 + div_factor: 25.0 + final_div: 100000.0 \ No newline at end of file From cf5f776fff77b053df95dcb8f044451d3a59964d Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Thu, 24 Jun 2021 22:32:04 +0200 Subject: [PATCH 02/23] feat: pipeline - my alternative to the launcher `mlpf/pipeline.py` is the beginning of a `click` based alternative to the `mlpf/launcher.py`. 
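A hypothetical invocation from the repository root could look like

    python mlpf/pipeline.py train -c parameters/delphes-gnn-skipconn-onecycle.yaml --ntrain 1000 --ntest 100

where the chosen config file and the event-count overrides are only illustrative; `train` is the first
subcommand defined below.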
--- mlpf/pipeline.py | 366 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 366 insertions(+) create mode 100644 mlpf/pipeline.py diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py new file mode 100644 index 000000000..e4b85d70c --- /dev/null +++ b/mlpf/pipeline.py @@ -0,0 +1,366 @@ +import sys +import os +import yaml +import json +import datetime +import glob +import random +import platform +import numpy as np +from pathlib import Path +import click +from tqdm import tqdm + +import tensorflow as tf +from tensorflow.keras import mixed_precision +import tensorflow_addons as tfa + +from tfmodel.data import Dataset +from tfmodel.model_setup import ( + targets_multi_output, + make_model, + configure_model_weights, + LearningRateLoggingCallback, + prepare_callbacks, + FlattenedCategoricalAccuracy, + eval_model, + freeze_model, +) + +from tfmodel.utils import ( + get_lr_schedule, + create_experiment_dir, + get_strategy, + get_weights_func, + load_config, + compute_weights_invsqrt, + compute_weights_none, +) + +from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler + + +@click.group() +@click.help_option("-h", "--help") +def main(): + pass + + +@main.command() +@click.help_option("-h", "--help") +@click.option("-c", "--config", help="configuration file", type=click.Path()) +@click.option("-w", "--weights", default=None, help="trained weights to load", type=click.Path()) +@click.option("--ntrain", default=None, help="override the number of training events", type=int) +@click.option("--ntest", default=None, help="override the number of testing events", type=int) +@click.option("-r", "--recreate", help="force creation of new experiment dir", is_flag=True) +def train(config, weights, ntrain, ntest, recreate): + """Train a model defined by config + """ + config_file_stem = Path(config).stem + config = load_config(config) + tf.config.run_functions_eagerly(config["tensorflow"]["eager"]) + + cds = config["dataset"] + dataset_def = Dataset( + num_input_features=int(cds["num_input_features"]), + num_output_features=int(cds["num_output_features"]), + padded_num_elem_size=int(cds["padded_num_elem_size"]), + raw_path=cds.get("raw_path", None), + raw_files=cds.get("raw_files", None), + processed_path=cds["processed_path"], + validation_file_path=cds["validation_file_path"], + schema=cds["schema"], + ) + + global_batch_size = config["setup"]["batch_size"] + if "multi_output" in config["setup"]: + multi_output = config["setup"]["multi_output"] + else: + multi_output = True + config["setup"]["multi_output"] = True + n_train = config["setup"]["num_events_train"] + n_test = config["setup"]["num_events_test"] + if ntrain: + n_train = ntrain + if ntest: + n_test = ntest + + n_epochs = config["setup"]["num_epochs"] + total_steps = n_epochs * n_train // global_batch_size + + tfr_files = sorted(glob.glob(dataset_def.processed_path)) + if len(tfr_files) == 0: + raise Exception("Could not find any files in {}".format(dataset_def.processed_path)) + + random.shuffle(tfr_files) + dataset = tf.data.TFRecordDataset(tfr_files).map( + dataset_def.parse_tfr_element, num_parallel_calls=tf.data.experimental.AUTOTUNE + ) + + # Due to TFRecords format, the length of the dataset is not known beforehand + num_events = 0 + for i in dataset: + num_events += 1 + print("dataset loaded, len={}".format(num_events)) + + weight_func = get_weights_func(config) + assert n_train + n_test <= num_events + + # Padded shapes + ps = ( + tf.TensorShape([dataset_def.padded_num_elem_size, 
dataset_def.num_input_features]), + tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_output_features]), + tf.TensorShape( + [ + dataset_def.padded_num_elem_size, + ] + ), + ) + + ds_train = dataset.take(n_train).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps) + ds_test = dataset.skip(n_train).take(n_test).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps) + + if multi_output: + dataset_transform = targets_multi_output(config["dataset"]["num_output_classes"]) + ds_train = ds_train.map(dataset_transform) + ds_test = ds_test.map(dataset_transform) + else: + dataset_transform = None + + ds_train_r = ds_train.repeat(n_epochs) + ds_test_r = ds_test.repeat(n_epochs) + + if weights is None: + weights = config["setup"]["weights"] + + if recreate or (weights is None): + outdir = create_experiment_dir(prefix=config_file_stem + "_", suffix=platform.node()) + else: + outdir = str(Path(weights).parent) + + # Decide tf.distribute.strategy depending on number of available GPUs + strategy, maybe_global_batch_size = get_strategy() + + # If using more than 1 GPU, we scale the batch size by the number of GPUs + if maybe_global_batch_size is not None: + global_batch_size = maybe_global_batch_size + actual_lr = global_batch_size * float(config["setup"]["lr"]) + + val_filelist = dataset_def.val_filelist + if config["setup"]["num_val_files"] > 0: + val_filelist = val_filelist[: config["setup"]["num_val_files"]] + + Xs = [] + ygens = [] + ycands = [] + for fi in tqdm(val_filelist[:1], desc="Preparing validation data"): + X, ygen, ycand = dataset_def.prepare_data(fi) + Xs.append(np.concatenate(X)) + ygens.append(np.concatenate(ygen)) + ycands.append(np.concatenate(ycand)) + + assert len(Xs) > 0, "Xs is empty" + X_val = np.concatenate(Xs) + ygen_val = np.concatenate(ygens) + ycand_val = np.concatenate(ycands) + + with strategy.scope(): + lr_schedule, optim_callbacks = get_lr_schedule(config, lr=actual_lr, steps=total_steps) + opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + if config["setup"]["dtype"] == "float16": + model_dtype = tf.dtypes.float16 + policy = mixed_precision.Policy("mixed_float16") + mixed_precision.set_global_policy(policy) + opt = mixed_precision.LossScaleOptimizer(opt) + else: + model_dtype = tf.dtypes.float32 + + model = make_model(config, model_dtype) + + # Run model once to build the layers + print(X_val.shape) + model(tf.cast(X_val[:1], model_dtype)) + + initial_epoch = 0 + if weights: + # We need to load the weights in the same trainable configuration as the model was set up + configure_model_weights(model, config["setup"].get("weights_config", "all")) + model.load_weights(weights, by_name=True) + initial_epoch = int(weights.split("/")[-1].split("-")[1]) + model(tf.cast(X_val[:1], model_dtype)) + + if config["setup"]["trainable"] == "classification": + config["dataset"]["pt_loss_coef"] = 0.0 + config["dataset"]["eta_loss_coef"] = 0.0 + config["dataset"]["sin_phi_loss_coef"] = 0.0 + config["dataset"]["cos_phi_loss_coef"] = 0.0 + config["dataset"]["energy_loss_coef"] = 0.0 + elif config["setup"]["trainable"] == "regression": + config["dataset"]["classification_loss_coef"] = 0.0 + config["dataset"]["charge_loss_coef"] = 0.0 + + configure_model_weights(model, config["setup"]["trainable"]) + model(tf.cast(X_val[:1], model_dtype)) + + if config["setup"]["classification_loss_type"] == "categorical_cross_entropy": + cls_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False) + elif config["setup"]["classification_loss_type"] == 
"sigmoid_focal_crossentropy": + cls_loss = tfa.losses.sigmoid_focal_crossentropy + else: + raise KeyError("Unknown classification loss type: {}".format(config["setup"]["classification_loss_type"])) + + model.compile( + loss={ + "cls": cls_loss, + "charge": getattr(tf.keras.losses, config["dataset"].get("charge_loss", "MeanSquaredError"))(), + "pt": getattr(tf.keras.losses, config["dataset"].get("pt_loss", "MeanSquaredError"))(), + "eta": getattr(tf.keras.losses, config["dataset"].get("eta_loss", "MeanSquaredError"))(), + "sin_phi": getattr(tf.keras.losses, config["dataset"].get("sin_phi_loss", "MeanSquaredError"))(), + "cos_phi": getattr(tf.keras.losses, config["dataset"].get("cos_phi_loss", "MeanSquaredError"))(), + "energy": getattr(tf.keras.losses, config["dataset"].get("energy_loss", "MeanSquaredError"))(), + }, + optimizer=opt, + sample_weight_mode="temporal", + loss_weights={ + "cls": config["dataset"]["classification_loss_coef"], + "charge": config["dataset"]["charge_loss_coef"], + "pt": config["dataset"]["pt_loss_coef"], + "eta": config["dataset"]["eta_loss_coef"], + "sin_phi": config["dataset"]["sin_phi_loss_coef"], + "cos_phi": config["dataset"]["cos_phi_loss_coef"], + "energy": config["dataset"]["energy_loss_coef"], + }, + metrics={ + "cls": [ + FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), + FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), + ] + }, + ) + model.summary() + + callbacks = prepare_callbacks( + model, + outdir, + X_val[: config["setup"]["batch_size"]], + ycand_val[: config["setup"]["batch_size"]], + dataset_transform, + config["dataset"]["num_output_classes"], + ) + callbacks.append(LearningRateLoggingCallback()) + callbacks.append(optim_callbacks) + + fit_result = model.fit( + ds_train_r, + validation_data=ds_test_r, + epochs=initial_epoch + n_epochs, + callbacks=callbacks, + steps_per_epoch=n_train // global_batch_size, + validation_steps=n_test // global_batch_size, + initial_epoch=initial_epoch, + ) + history_path = Path(outdir) / "history" + history_path = str(history_path) + with open("{}/history.json".format(history_path), "w") as fi: + json.dump(fit_result.history, fi) + model.save(outdir + "/model_full", save_format="tf") + + print("Training done.") + + print("Starting evaluation...") + eval_dir = Path(outdir) / "evaluation" + eval_dir.mkdir() + eval_dir = str(eval_dir) + # TODO: change to use the evaluate() function below instead of eval_model() + eval_model(X_val, ygen_val, ycand_val, model, config, eval_dir, global_batch_size) + print("Evaluation done.") + + freeze_model(model, config, outdir) + + +@main.command() +@click.help_option("-h", "--help") +@click.option("-t", "--train_dir", required=True, help="directory containing a completed training", type=click.Path()) +@click.option("-c", "--config", help="configuration file", type=click.Path()) +@click.option("-w", "--weights", default=None, help="trained weights to load", type=click.Path()) +@click.option("-e", "--evaluation_dir", help="force creation of new experiment dir", type=click.Path()) +def evaluate(config, train_dir, weights, evaluation_dir): + config = load_config(config) + # Switch off multi-output for the evaluation for backwards compatibility + config["setup"]["multi_output"] = False + + if evaluation_dir is None: + eval_dir = str(Path(train_dir) / "evaluation") + else: + eval_dir = evaluation_dir + Path(eval_dir).mkdir(parents=True, exist_ok=True) + + tf.config.run_functions_eagerly(config["tensorflow"]["eager"]) + + if weights 
is None: + weights = config["setup"]["weights"] + + if config["setup"]["dtype"] == "float16": + model_dtype = tf.dtypes.float16 + policy = mixed_precision.Policy("mixed_float16") + mixed_precision.set_global_policy(policy) + opt = mixed_precision.LossScaleOptimizer(opt) + else: + model_dtype = tf.dtypes.float32 + + cds = config["dataset"] + dataset_def = Dataset( + num_input_features=int(cds["num_input_features"]), + num_output_features=int(cds["num_output_features"]), + padded_num_elem_size=int(cds["padded_num_elem_size"]), + raw_path=cds.get("raw_path", None), + raw_files=cds.get("raw_files", None), + processed_path=cds["processed_path"], + validation_file_path=cds["validation_file_path"], + schema=cds["schema"], + ) + + Xs = [] + ygens = [] + ycands = [] + val_filelist = dataset_def.val_filelist + if config["setup"]["num_val_files"] > 0: + val_filelist = val_filelist[: config["setup"]["num_val_files"]] + + for fi in tqdm(val_filelist, desc="Preparing validation data"): + X, ygen, ycand = dataset_def.prepare_data(fi) + Xs.append(np.concatenate(X)) + ygens.append(np.concatenate(ygen)) + ycands.append(np.concatenate(ycand)) + assert len(Xs) > 0 + X_val = np.concatenate(Xs) + ygen_val = np.concatenate(ygens) + ycand_val = np.concatenate(ycands) + + global_batch_size = config["setup"]["batch_size"] + + strategy, maybe_global_batch_size = get_strategy() + if maybe_global_batch_size is not None: + global_batch_size = maybe_global_batch_size + + with strategy.scope(): + + model = make_model(config, model_dtype) + + # Evaluate model once to build the layers + print(X_val.shape) + model(tf.cast(X_val[:1], model_dtype)) + + if weights: + # need to load the weights in the same trainable configuration as the model was set up + configure_model_weights(model, config["setup"].get("weights_config", "all")) + model.load_weights(weights, by_name=True) + model(tf.cast(X_val[:1], model_dtype)) + + model.compile() + eval_model(X_val, ygen_val, ycand_val, model, config, eval_dir, global_batch_size) + freeze_model(model, config, eval_dir) + + +if __name__ == "__main__": + main() From 636dab601de1ee10eb0556d9211769de069fcb24 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Mon, 28 Jun 2021 11:24:26 +0200 Subject: [PATCH 03/23] fix: correct setting of global batch size Also add option to give a prefix to the name of the training directory --- mlpf/pipeline.py | 9 +++++---- mlpf/tfmodel/utils.py | 3 +-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index e4b85d70c..e278b3ea6 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -53,7 +53,8 @@ def main(): @click.option("--ntrain", default=None, help="override the number of training events", type=int) @click.option("--ntest", default=None, help="override the number of testing events", type=int) @click.option("-r", "--recreate", help="force creation of new experiment dir", is_flag=True) -def train(config, weights, ntrain, ntest, recreate): +@click.option("-p", "--prefix", default="", help="prefix to put at beginning of training dir name", type=str) +def train(config, weights, ntrain, ntest, recreate, prefix): """Train a model defined by config """ config_file_stem = Path(config).stem @@ -134,12 +135,12 @@ def train(config, weights, ntrain, ntest, recreate): weights = config["setup"]["weights"] if recreate or (weights is None): - outdir = create_experiment_dir(prefix=config_file_stem + "_", suffix=platform.node()) + outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node()) else: 
outdir = str(Path(weights).parent) # Decide tf.distribute.strategy depending on number of available GPUs - strategy, maybe_global_batch_size = get_strategy() + strategy, maybe_global_batch_size = get_strategy(global_batch_size) # If using more than 1 GPU, we scale the batch size by the number of GPUs if maybe_global_batch_size is not None: @@ -339,7 +340,7 @@ def evaluate(config, train_dir, weights, evaluation_dir): global_batch_size = config["setup"]["batch_size"] - strategy, maybe_global_batch_size = get_strategy() + strategy, maybe_global_batch_size = get_strategy(global_batch_size) if maybe_global_batch_size is not None: global_batch_size = maybe_global_batch_size diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 24471504a..798ca128f 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -28,8 +28,7 @@ def create_experiment_dir(prefix=None, suffix=None): return str(train_dir) -def get_strategy(): - global_batch_size = None +def get_strategy(global_batch_size): try: gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")] num_gpus = len(gpus) From b896392a2d3e3754f26dbe53c5d8812c6e4f4441 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Mon, 28 Jun 2021 14:54:56 +0200 Subject: [PATCH 04/23] fix: do not silently scale learning rate with batch size Also add lr_schedule parameter to configuration files --- mlpf/tfmodel/model_setup.py | 11 ++--------- parameters/cms-gnn-dense-big.yaml | 1 + parameters/cms-gnn-dense-focal.yaml | 1 + parameters/cms-gnn-dense-transfer.yaml | 1 + parameters/cms-gnn-dense.yaml | 1 + parameters/cms-gnn-skipconn-v2.yaml | 1 + parameters/cms-gnn-skipconn.yaml | 1 + parameters/cms-transformer-skipconn-gun.yaml | 1 + parameters/cms-transformer-skipconn.yaml | 1 + parameters/delphes-gnn-skipconn.yaml | 1 + parameters/delphes-transformer-skipconn.yaml | 1 + parameters/test-cms-v2.yaml | 1 + parameters/test-cms.yaml | 1 + parameters/test-delphes.yaml | 1 + 14 files changed, 15 insertions(+), 9 deletions(-) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 77cc94a78..64e5d7b96 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -554,8 +554,6 @@ def main(args, yaml_path, config): print("fallback to CPU", e) strategy = tf.distribute.OneDeviceStrategy("cpu") num_gpus = 0 - - actual_lr = global_batch_size*float(config['setup']['lr']) Xs = [] ygens = [] @@ -580,15 +578,10 @@ def main(args, yaml_path, config): ygen_val = np.concatenate(ygens) ycand_val = np.concatenate(ycands) + lr = global_batch_size*float(config['setup']['lr']) with strategy.scope(): - lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( - actual_lr, - decay_steps=10000, - decay_rate=0.99, - staircase=True - ) total_steps = n_epochs * n_train // global_batch_size - lr_schedule, optim_callbacks = get_lr_schedule(config, actual_lr, steps=total_steps) + lr_schedule, optim_callbacks = get_lr_schedule(config, lr, steps=total_steps) opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule) if config['setup']['dtype'] == 'float16': model_dtype = tf.dtypes.float16 diff --git a/parameters/cms-gnn-dense-big.yaml b/parameters/cms-gnn-dense-big.yaml index 3a03e8f20..6c4c62059 100644 --- a/parameters/cms-gnn-dense-big.yaml +++ b/parameters/cms-gnn-dense-big.yaml @@ -51,6 +51,7 @@ setup: sample_weights: inverse_sqrt trainable: all classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: gnn_dense diff --git 
a/parameters/cms-gnn-dense-focal.yaml b/parameters/cms-gnn-dense-focal.yaml index 2f715fd2c..15ce45126 100644 --- a/parameters/cms-gnn-dense-focal.yaml +++ b/parameters/cms-gnn-dense-focal.yaml @@ -52,6 +52,7 @@ setup: sample_weights: none trainable: all classification_loss_type: sigmoid_focal_crossentropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: gnn_dense diff --git a/parameters/cms-gnn-dense-transfer.yaml b/parameters/cms-gnn-dense-transfer.yaml index e55cc9407..688922a84 100644 --- a/parameters/cms-gnn-dense-transfer.yaml +++ b/parameters/cms-gnn-dense-transfer.yaml @@ -51,6 +51,7 @@ setup: sample_weights: inverse_sqrt trainable: transfer classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: gnn_dense diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index 6089456a9..ab0087373 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -53,6 +53,7 @@ setup: sample_weights: inverse_sqrt trainable: all classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: gnn_dense diff --git a/parameters/cms-gnn-skipconn-v2.yaml b/parameters/cms-gnn-skipconn-v2.yaml index c13f7d854..0bb9c9220 100644 --- a/parameters/cms-gnn-skipconn-v2.yaml +++ b/parameters/cms-gnn-skipconn-v2.yaml @@ -51,6 +51,7 @@ setup: sample_weights: inverse_sqrt trainable: all classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: gnn diff --git a/parameters/cms-gnn-skipconn.yaml b/parameters/cms-gnn-skipconn.yaml index f0c9aa51e..1f23797b7 100644 --- a/parameters/cms-gnn-skipconn.yaml +++ b/parameters/cms-gnn-skipconn.yaml @@ -51,6 +51,7 @@ setup: sample_weights: none trainable: all classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: gnn diff --git a/parameters/cms-transformer-skipconn-gun.yaml b/parameters/cms-transformer-skipconn-gun.yaml index d079d71f2..180cb513d 100644 --- a/parameters/cms-transformer-skipconn-gun.yaml +++ b/parameters/cms-transformer-skipconn-gun.yaml @@ -52,6 +52,7 @@ setup: sample_weights: inverse_sqrt trainable: all multi_output: yes + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: transformer diff --git a/parameters/cms-transformer-skipconn.yaml b/parameters/cms-transformer-skipconn.yaml index 767f34416..f8ea796b9 100644 --- a/parameters/cms-transformer-skipconn.yaml +++ b/parameters/cms-transformer-skipconn.yaml @@ -50,6 +50,7 @@ setup: sample_weights: none trainable: cls multi_output: yes + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: transformer diff --git a/parameters/delphes-gnn-skipconn.yaml b/parameters/delphes-gnn-skipconn.yaml index 88fd5f189..d73066042 100644 --- a/parameters/delphes-gnn-skipconn.yaml +++ b/parameters/delphes-gnn-skipconn.yaml @@ -41,6 +41,7 @@ setup: trainable: all multi_output: no classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: gnn diff --git a/parameters/delphes-transformer-skipconn.yaml b/parameters/delphes-transformer-skipconn.yaml index f687fd63e..9f3113c34 100644 --- a/parameters/delphes-transformer-skipconn.yaml +++ b/parameters/delphes-transformer-skipconn.yaml @@ -39,6 +39,7 @@ setup: sample_weights: none trainable: all 
multi_output: no + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: transformer diff --git a/parameters/test-cms-v2.yaml b/parameters/test-cms-v2.yaml index 25f9a0a5e..763b5e0ca 100644 --- a/parameters/test-cms-v2.yaml +++ b/parameters/test-cms-v2.yaml @@ -39,6 +39,7 @@ setup: sample_weights: none trainable: all classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: gnn_dense diff --git a/parameters/test-cms.yaml b/parameters/test-cms.yaml index 939e37fc8..175c84686 100644 --- a/parameters/test-cms.yaml +++ b/parameters/test-cms.yaml @@ -39,6 +39,7 @@ setup: sample_weights: none trainable: all classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: gnn diff --git a/parameters/test-delphes.yaml b/parameters/test-delphes.yaml index 058836b7e..1c36387a5 100644 --- a/parameters/test-delphes.yaml +++ b/parameters/test-delphes.yaml @@ -38,6 +38,7 @@ setup: sample_weights: none trainable: all classification_loss_type: categorical_cross_entropy + lr_schedule: exponentialdecay # exponentialdecay, onecycle parameters: model: gnn From 9fcba84f0d94141ed9ca6399d93a9fc3a3f1a1f8 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Mon, 28 Jun 2021 15:47:13 +0200 Subject: [PATCH 05/23] fix: do not silently scale learning rate with batch size The previous commit still scaled the LR, this one fixes it. --- mlpf/tfmodel/model_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 64e5d7b96..9c3a14a0d 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -578,7 +578,7 @@ def main(args, yaml_path, config): ygen_val = np.concatenate(ygens) ycand_val = np.concatenate(ycands) - lr = global_batch_size*float(config['setup']['lr']) + lr = float(config['setup']['lr']) with strategy.scope(): total_steps = n_epochs * n_train // global_batch_size lr_schedule, optim_callbacks = get_lr_schedule(config, lr, steps=total_steps) From 2ea5c8b5e527acebb189fde176379387fdfe80cf Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Mon, 28 Jun 2021 16:47:41 +0200 Subject: [PATCH 06/23] refactoring to make pipeline.py simpler - create get_train_val_datasets() function to get datasets for training - move targets_multi_output() from model_setup.py to utils.py for more flexible access (solving import loop issue) --- mlpf/pipeline.py | 78 ++++++------------------------------- mlpf/tfmodel/model_setup.py | 14 +------ mlpf/tfmodel/utils.py | 76 ++++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 79 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index e278b3ea6..3f1162f1a 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -17,7 +17,6 @@ from tfmodel.data import Dataset from tfmodel.model_setup import ( - targets_multi_output, make_model, configure_model_weights, LearningRateLoggingCallback, @@ -35,6 +34,8 @@ load_config, compute_weights_invsqrt, compute_weights_none, + get_train_val_datasets, + targets_multi_output, ) from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler @@ -60,76 +61,21 @@ def train(config, weights, ntrain, ntest, recreate, prefix): config_file_stem = Path(config).stem config = load_config(config) tf.config.run_functions_eagerly(config["tensorflow"]["eager"]) - - cds = config["dataset"] - dataset_def = Dataset( - num_input_features=int(cds["num_input_features"]), - 
num_output_features=int(cds["num_output_features"]), - padded_num_elem_size=int(cds["padded_num_elem_size"]), - raw_path=cds.get("raw_path", None), - raw_files=cds.get("raw_files", None), - processed_path=cds["processed_path"], - validation_file_path=cds["validation_file_path"], - schema=cds["schema"], - ) - global_batch_size = config["setup"]["batch_size"] - if "multi_output" in config["setup"]: - multi_output = config["setup"]["multi_output"] - else: - multi_output = True - config["setup"]["multi_output"] = True - n_train = config["setup"]["num_events_train"] - n_test = config["setup"]["num_events_test"] + n_epochs = config["setup"]["num_epochs"] if ntrain: n_train = ntrain + else: + n_train = config["setup"]["num_events_train"] if ntest: n_test = ntest - - n_epochs = config["setup"]["num_epochs"] - total_steps = n_epochs * n_train // global_batch_size - - tfr_files = sorted(glob.glob(dataset_def.processed_path)) - if len(tfr_files) == 0: - raise Exception("Could not find any files in {}".format(dataset_def.processed_path)) - - random.shuffle(tfr_files) - dataset = tf.data.TFRecordDataset(tfr_files).map( - dataset_def.parse_tfr_element, num_parallel_calls=tf.data.experimental.AUTOTUNE - ) - - # Due to TFRecords format, the length of the dataset is not known beforehand - num_events = 0 - for i in dataset: - num_events += 1 - print("dataset loaded, len={}".format(num_events)) - - weight_func = get_weights_func(config) - assert n_train + n_test <= num_events - - # Padded shapes - ps = ( - tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_input_features]), - tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_output_features]), - tf.TensorShape( - [ - dataset_def.padded_num_elem_size, - ] - ), - ) - - ds_train = dataset.take(n_train).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps) - ds_test = dataset.skip(n_train).take(n_test).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps) - - if multi_output: - dataset_transform = targets_multi_output(config["dataset"]["num_output_classes"]) - ds_train = ds_train.map(dataset_transform) - ds_test = ds_test.map(dataset_transform) else: - dataset_transform = None + n_test = config["setup"]["num_events_test"] + total_steps = n_epochs * n_train // global_batch_size - ds_train_r = ds_train.repeat(n_epochs) - ds_test_r = ds_test.repeat(n_epochs) + if "multi_output" not in config["setup"]: + config["setup"]["multi_output"] = True + dataset_def, ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) if weights is None: weights = config["setup"]["weights"] @@ -145,7 +91,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix): # If using more than 1 GPU, we scale the batch size by the number of GPUs if maybe_global_batch_size is not None: global_batch_size = maybe_global_batch_size - actual_lr = global_batch_size * float(config["setup"]["lr"]) + lr = float(config["setup"]["lr"]) val_filelist = dataset_def.val_filelist if config["setup"]["num_val_files"] > 0: @@ -166,7 +112,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix): ycand_val = np.concatenate(ycands) with strategy.scope(): - lr_schedule, optim_callbacks = get_lr_schedule(config, lr=actual_lr, steps=total_steps) + lr_schedule, optim_callbacks = get_lr_schedule(config, lr=lr, steps=total_steps) opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule) if config["setup"]["dtype"] == "float16": model_dtype = tf.dtypes.float16 diff --git a/mlpf/tfmodel/model_setup.py 
b/mlpf/tfmodel/model_setup.py index 9c3a14a0d..52aae79a6 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -26,7 +26,7 @@ from pathlib import Path from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler from tfmodel.callbacks import CustomTensorBoard -from tfmodel.utils import get_lr_schedule, get_weights_func +from tfmodel.utils import get_lr_schedule, get_weights_func, targets_multi_output def plot_confusion_matrix(cm): @@ -230,18 +230,6 @@ def scale_outputs(X,y,w): ynew = ynew/out_s return X, ynew, w -def targets_multi_output(num_output_classes): - def func(X, y, w): - return X, { - "cls": tf.one_hot(tf.cast(y[:, :, 0], tf.int32), num_output_classes), - "charge": y[:, :, 1:2], - "pt": y[:, :, 2:3], - "eta": y[:, :, 3:4], - "sin_phi": y[:, :, 4:5], - "cos_phi": y[:, :, 5:6], - "energy": y[:, :, 6:7], - }, w - return func def make_model(config, dtype): model = config['parameters']['model'] diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 798ca128f..4631ce838 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -3,9 +3,12 @@ from pathlib import Path import datetime import platform +import random +import glob import tensorflow as tf +from tfmodel.data import Dataset from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler @@ -98,3 +101,76 @@ def get_weights_func(config): return compute_weights_none else: raise ValueError("Only supported weight samplings are 'inverse_sqrt' and 'none'.") + + +def targets_multi_output(num_output_classes): + def func(X, y, w): + return X, { + "cls": tf.one_hot(tf.cast(y[:, :, 0], tf.int32), num_output_classes), + "charge": y[:, :, 1:2], + "pt": y[:, :, 2:3], + "eta": y[:, :, 3:4], + "sin_phi": y[:, :, 4:5], + "cos_phi": y[:, :, 5:6], + "energy": y[:, :, 6:7], + }, w + return func + + +def get_train_val_datasets(config, global_batch_size, n_train, n_test): + cds = config["dataset"] + + dataset_def = Dataset( + num_input_features=int(cds["num_input_features"]), + num_output_features=int(cds["num_output_features"]), + padded_num_elem_size=int(cds["padded_num_elem_size"]), + raw_path=cds.get("raw_path", None), + raw_files=cds.get("raw_files", None), + processed_path=cds["processed_path"], + validation_file_path=cds["validation_file_path"], + schema=cds["schema"], + ) + + tfr_files = sorted(glob.glob(dataset_def.processed_path)) + if len(tfr_files) == 0: + raise Exception("Could not find any files in {}".format(dataset_def.processed_path)) + + random.shuffle(tfr_files) + dataset = tf.data.TFRecordDataset(tfr_files).map( + dataset_def.parse_tfr_element, num_parallel_calls=tf.data.experimental.AUTOTUNE + ) + + # Due to TFRecords format, the length of the dataset is not known beforehand + num_events = 0 + for _ in dataset: + num_events += 1 + print("dataset loaded, len={}".format(num_events)) + + weight_func = get_weights_func(config) + assert n_train + n_test <= num_events + + # Padded shapes + ps = ( + tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_input_features]), + tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_output_features]), + tf.TensorShape( + [ + dataset_def.padded_num_elem_size, + ] + ), + ) + + ds_train = dataset.take(n_train).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps) + ds_test = dataset.skip(n_train).take(n_test).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps) + + if config["setup"]["multi_output"]: + dataset_transform = 
targets_multi_output(config["dataset"]["num_output_classes"]) + ds_train = ds_train.map(dataset_transform) + ds_test = ds_test.map(dataset_transform) + else: + dataset_transform = None + + ds_train_r = ds_train.repeat(config["setup"]["num_epochs"]) + ds_test_r = ds_test.repeat(config["setup"]["num_epochs"]) + + return dataset_def, ds_train_r, ds_test_r, dataset_transform From 99721c56ad4d2e680a81c69afe90cad69c2aff11 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Mon, 28 Jun 2021 18:28:52 +0200 Subject: [PATCH 07/23] feat: Learning Rate Finder The learning rate finder implements a technique to easily estimate a range of learning rates that should perform well given the current model setup. When the model architecture or other hyperparameters are changed, the learning rate finder can be run in order to find a new suitable LR range. The learning rate finder starts training the model at a very low LR, increasing it every batch. The batch loss is plotted vs training steps and a figure is created from which a suitable LR range can be determined. This technique was first introduced by Leslie Smith in https://arxiv.org/abs/1506.01186. --- mlpf/pipeline.py | 112 +++++++++++++++++++++++++++++++++++++- mlpf/tfmodel/lr_finder.py | 71 ++++++++++++++++++++++++ 2 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 mlpf/tfmodel/lr_finder.py diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 3f1162f1a..ad67d9eb3 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -39,7 +39,7 @@ ) from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler - +from tfmodel.lr_finder import LRFinder @click.group() @click.help_option("-h", "--help") @@ -309,5 +309,115 @@ def evaluate(config, train_dir, weights, evaluation_dir): freeze_model(model, config, eval_dir) +@main.command() +@click.help_option("-h", "--help") +@click.option("-c", "--config", help="configuration file", type=click.Path()) +@click.option("-o", "--outdir", help="output directory", type=click.Path(), default=".") +@click.option("-n", "--figname", help="name of saved figure", type=click.Path(), default="lr_finder.jpg") +@click.option("-l", "--log_scale", help="use log scale on y-axis in figure", type=click.Path(), default=False, is_flag=True) +def find_lr(config, outdir, figname, log_scale): + config = load_config(config) + tf.config.run_functions_eagerly(config["tensorflow"]["eager"]) + global_batch_size = config["setup"]["batch_size"] + + if "multi_output" not in config["setup"]: + config["setup"]["multi_output"] = True + n_train = config["setup"]["num_events_train"] + dataset_def, ds_train_r, _, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test=0) + + # Decide tf.distribute.strategy depending on number of available GPUs + strategy, maybe_global_batch_size = get_strategy(global_batch_size) + + # If using more than 1 GPU, we scale the batch size by the number of GPUs + if maybe_global_batch_size is not None: + global_batch_size = maybe_global_batch_size + lr = float(config["setup"]["lr"]) + + Xs = [] + for fi in dataset_def.val_filelist[:1]: + X, ygen, ycand = dataset_def.prepare_data(fi) + Xs.append(np.concatenate(X)) + + assert len(Xs) > 0, "Xs is empty" + X_val = np.concatenate(Xs) + + with strategy.scope(): + opt = tf.keras.optimizers.Adam(learning_rate=1e-7) # This learning rate will be changed by the lr_finder + if config["setup"]["dtype"] == "float16": + model_dtype = tf.dtypes.float16 + policy = mixed_precision.Policy("mixed_float16") + 
mixed_precision.set_global_policy(policy) + opt = mixed_precision.LossScaleOptimizer(opt) + else: + model_dtype = tf.dtypes.float32 + + model = make_model(config, model_dtype) + + if config["setup"]["trainable"] == "classification": + config["dataset"]["pt_loss_coef"] = 0.0 + config["dataset"]["eta_loss_coef"] = 0.0 + config["dataset"]["sin_phi_loss_coef"] = 0.0 + config["dataset"]["cos_phi_loss_coef"] = 0.0 + config["dataset"]["energy_loss_coef"] = 0.0 + elif config["setup"]["trainable"] == "regression": + config["dataset"]["classification_loss_coef"] = 0.0 + config["dataset"]["charge_loss_coef"] = 0.0 + + # Run model once to build the layers + model(tf.cast(X_val[:1], model_dtype)) + + configure_model_weights(model, config["setup"]["trainable"]) + + if config["setup"]["classification_loss_type"] == "categorical_cross_entropy": + cls_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False) + elif config["setup"]["classification_loss_type"] == "sigmoid_focal_crossentropy": + cls_loss = tfa.losses.sigmoid_focal_crossentropy + else: + raise KeyError("Unknown classification loss type: {}".format(config["setup"]["classification_loss_type"])) + + model.compile( + loss={ + "cls": cls_loss, + "charge": getattr(tf.keras.losses, config["dataset"].get("charge_loss", "MeanSquaredError"))(), + "pt": getattr(tf.keras.losses, config["dataset"].get("pt_loss", "MeanSquaredError"))(), + "eta": getattr(tf.keras.losses, config["dataset"].get("eta_loss", "MeanSquaredError"))(), + "sin_phi": getattr(tf.keras.losses, config["dataset"].get("sin_phi_loss", "MeanSquaredError"))(), + "cos_phi": getattr(tf.keras.losses, config["dataset"].get("cos_phi_loss", "MeanSquaredError"))(), + "energy": getattr(tf.keras.losses, config["dataset"].get("energy_loss", "MeanSquaredError"))(), + }, + optimizer=opt, + sample_weight_mode="temporal", + loss_weights={ + "cls": config["dataset"]["classification_loss_coef"], + "charge": config["dataset"]["charge_loss_coef"], + "pt": config["dataset"]["pt_loss_coef"], + "eta": config["dataset"]["eta_loss_coef"], + "sin_phi": config["dataset"]["sin_phi_loss_coef"], + "cos_phi": config["dataset"]["cos_phi_loss_coef"], + "energy": config["dataset"]["energy_loss_coef"], + }, + metrics={ + "cls": [ + FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), + FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64), + ] + }, + ) + model.summary() + + max_steps = 200 + lr_finder = LRFinder(max_steps=max_steps) + callbacks = [lr_finder] + + fit_result = model.fit( + ds_train_r, + epochs=max_steps, + callbacks=callbacks, + steps_per_epoch=1, + ) + + lr_finder.plot(save_dir=outdir, figname=figname, log_scale=log_scale) + + if __name__ == "__main__": main() diff --git a/mlpf/tfmodel/lr_finder.py b/mlpf/tfmodel/lr_finder.py new file mode 100644 index 000000000..152b69417 --- /dev/null +++ b/mlpf/tfmodel/lr_finder.py @@ -0,0 +1,71 @@ +import tensorflow as tf +from tensorflow.keras.callbacks import Callback +import matplotlib.pyplot as plt +from pathlib import Path + + +class LRFinder(Callback): + """`Callback` that exponentially adjusts the learning rate after each training batch between `start_lr` and + `end_lr` for a maximum number of batches: `max_step`. The loss and learning rate are recorded at each step allowing + visually finding a good learning rate as per https://sgugger.github.io/how-do-you-find-a-good-learning-rate.html via + the `plot` method. 
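+
+    A usage sketch mirroring the `find_lr` command added to `mlpf/pipeline.py` in this commit (the step count
+    and output paths are illustrative):
+
+        lr_finder = LRFinder(max_steps=200)
+        model.fit(ds_train, epochs=200, steps_per_epoch=1, callbacks=[lr_finder])
+        lr_finder.plot(save_dir="experiments", figname="lr_finder.jpg", log_scale=True)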
+ + A version of this learning rate finder technique is also described under the name 'LR range test' in Leslie Smith's + paper: https://arxiv.org/pdf/1803.09820.pdf. + """ + + def __init__(self, start_lr: float = 1e-7, end_lr: float = 3, max_steps: int = 200, smoothing=0.9): + super(LRFinder, self).__init__() + self.start_lr, self.end_lr = start_lr, end_lr + self.max_steps = max_steps + self.smoothing = smoothing + self.step, self.best_loss, self.avg_loss, self.lr = 0, 0, 0, 0 + self.lrs, self.losses = [], [] + + def on_train_begin(self, logs=None): + self.step, self.best_loss, self.avg_loss, self.lr = 0, 0, 0, 0 + self.lrs, self.losses = [], [] + + def on_train_batch_begin(self, batch, logs=None): + self.lr = self.exp_annealing(self.step) + tf.keras.backend.set_value(self.model.optimizer.lr, self.lr) + + def on_train_batch_end(self, batch, logs=None): + print("lr:", self.lr) + print("step", self.step) + logs = logs or {} + loss = logs.get("loss") + step = self.step + if loss: + print("loss", loss) + self.avg_loss = self.smoothing * self.avg_loss + (1 - self.smoothing) * loss + smooth_loss = self.avg_loss / (1 - self.smoothing ** (self.step + 1)) + self.losses.append(smooth_loss) + self.lrs.append(self.lr) + + if step == 0 or loss < self.best_loss: + self.best_loss = loss + + if smooth_loss > 4 * self.best_loss or tf.math.is_nan(smooth_loss): + self.model.stop_training = True + print("Loss reached predefined maximum... stopping") + if step >= self.max_steps: + print("STOPPING") + self.model.stop_training = True + self.step += 1 + + def exp_annealing(self, step): + return self.start_lr * (self.end_lr / self.start_lr) ** (step * 1.0 / self.max_steps) + + def plot(self, save_dir=None, figname="lr_finder.jpg", log_scale=False): + fig, ax = plt.subplots(1, 1) + ax.set_ylabel("Loss") + ax.set_xlabel("Learning Rate") + ax.set_xscale("log") + ax.xaxis.set_major_formatter(plt.FormatStrFormatter("%.0e")) + ax.plot(self.lrs, self.losses) + if log_scale: + ax.set_yscale("log") + if save_dir is not None: + Path(save_dir).mkdir(parents=True, exist_ok=True) + plt.savefig(str(Path(save_dir) / Path(figname))) From 47db0d57fb52876915587a5e3854c773b83aa368 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 29 Jun 2021 10:18:03 +0200 Subject: [PATCH 08/23] fix: typo in OneCycleScheduler docstring --- mlpf/tfmodel/onecycle_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlpf/tfmodel/onecycle_scheduler.py b/mlpf/tfmodel/onecycle_scheduler.py index 35d2a240f..fefc65c78 100644 --- a/mlpf/tfmodel/onecycle_scheduler.py +++ b/mlpf/tfmodel/onecycle_scheduler.py @@ -23,7 +23,7 @@ def step(self): class OneCycleScheduler(LearningRateSchedule): - """`Callback` that schedules the learning rate on a 1cycle policy as per Leslie Smith's paper + """`LearningRateSchedule` that schedules the learning rate on a 1cycle policy as per Leslie Smith's paper (https://arxiv.org/pdf/1803.09820.pdf). 
The implementation adopts additional improvements as per the fastai library: From bb6fbddfa26f83aa153d7e56df537edf6778500e Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 29 Jun 2021 13:35:16 +0200 Subject: [PATCH 09/23] add installation of tqdm to github test --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7fb27e281..386083be0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,7 @@ jobs: sudo apt install python3 python3-pip wget sudo python3 -m pip install --upgrade pip sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 + sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm - name: Run delphes TF model run: ./scripts/local_test_delphes_tf.sh @@ -31,7 +31,7 @@ jobs: sudo apt install python3 python3-pip wget sudo python3 -m pip install --upgrade pip sudo python3 -m pip install --upgrade setuptools - sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 + sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm - name: Run CMS TF model run: ./scripts/local_test_cms_tf.sh From 771cc6f2ab101c012e53ee9c6079629ee85611d5 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Tue, 29 Jun 2021 15:58:45 +0200 Subject: [PATCH 10/23] chore: reduce code duplication in mlpf/pipeline.py --- mlpf/pipeline.py | 155 ++++--------------------- mlpf/tfmodel/utils.py | 108 +++++++++++++++-- parameters/cms-gnn-dense-onecycle.yaml | 2 +- 3 files changed, 121 insertions(+), 144 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index ad67d9eb3..5d9cd1ade 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -36,6 +36,10 @@ compute_weights_none, get_train_val_datasets, targets_multi_output, + get_dataset_def, + prepare_val_data, + set_config_loss, + get_loss_dict, ) from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler @@ -71,11 +75,13 @@ def train(config, weights, ntrain, ntest, recreate, prefix): n_test = ntest else: n_test = config["setup"]["num_events_test"] - total_steps = n_epochs * n_train // global_batch_size if "multi_output" not in config["setup"]: config["setup"]["multi_output"] = True - dataset_def, ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) + + dataset_def = get_dataset_def(config) + ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) + X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=True) if weights is None: weights = config["setup"]["weights"] @@ -87,33 +93,17 @@ def train(config, weights, ntrain, ntest, recreate, prefix): # Decide tf.distribute.strategy depending on number of available GPUs strategy, maybe_global_batch_size = get_strategy(global_batch_size) + total_steps = n_epochs * n_train // global_batch_size # If 
using more than 1 GPU, we scale the batch size by the number of GPUs if maybe_global_batch_size is not None: global_batch_size = maybe_global_batch_size lr = float(config["setup"]["lr"]) - val_filelist = dataset_def.val_filelist - if config["setup"]["num_val_files"] > 0: - val_filelist = val_filelist[: config["setup"]["num_val_files"]] - - Xs = [] - ygens = [] - ycands = [] - for fi in tqdm(val_filelist[:1], desc="Preparing validation data"): - X, ygen, ycand = dataset_def.prepare_data(fi) - Xs.append(np.concatenate(X)) - ygens.append(np.concatenate(ygen)) - ycands.append(np.concatenate(ycand)) - - assert len(Xs) > 0, "Xs is empty" - X_val = np.concatenate(Xs) - ygen_val = np.concatenate(ygens) - ycand_val = np.concatenate(ycands) - with strategy.scope(): lr_schedule, optim_callbacks = get_lr_schedule(config, lr=lr, steps=total_steps) opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + if config["setup"]["dtype"] == "float16": model_dtype = tf.dtypes.float16 policy = mixed_precision.Policy("mixed_float16") @@ -136,47 +126,16 @@ def train(config, weights, ntrain, ntest, recreate, prefix): initial_epoch = int(weights.split("/")[-1].split("-")[1]) model(tf.cast(X_val[:1], model_dtype)) - if config["setup"]["trainable"] == "classification": - config["dataset"]["pt_loss_coef"] = 0.0 - config["dataset"]["eta_loss_coef"] = 0.0 - config["dataset"]["sin_phi_loss_coef"] = 0.0 - config["dataset"]["cos_phi_loss_coef"] = 0.0 - config["dataset"]["energy_loss_coef"] = 0.0 - elif config["setup"]["trainable"] == "regression": - config["dataset"]["classification_loss_coef"] = 0.0 - config["dataset"]["charge_loss_coef"] = 0.0 - + config = set_config_loss(config, config["setup"]["trainable"]) configure_model_weights(model, config["setup"]["trainable"]) model(tf.cast(X_val[:1], model_dtype)) - if config["setup"]["classification_loss_type"] == "categorical_cross_entropy": - cls_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False) - elif config["setup"]["classification_loss_type"] == "sigmoid_focal_crossentropy": - cls_loss = tfa.losses.sigmoid_focal_crossentropy - else: - raise KeyError("Unknown classification loss type: {}".format(config["setup"]["classification_loss_type"])) - + loss_dict, loss_weights = get_loss_dict(config) model.compile( - loss={ - "cls": cls_loss, - "charge": getattr(tf.keras.losses, config["dataset"].get("charge_loss", "MeanSquaredError"))(), - "pt": getattr(tf.keras.losses, config["dataset"].get("pt_loss", "MeanSquaredError"))(), - "eta": getattr(tf.keras.losses, config["dataset"].get("eta_loss", "MeanSquaredError"))(), - "sin_phi": getattr(tf.keras.losses, config["dataset"].get("sin_phi_loss", "MeanSquaredError"))(), - "cos_phi": getattr(tf.keras.losses, config["dataset"].get("cos_phi_loss", "MeanSquaredError"))(), - "energy": getattr(tf.keras.losses, config["dataset"].get("energy_loss", "MeanSquaredError"))(), - }, + loss=loss_dict, optimizer=opt, sample_weight_mode="temporal", - loss_weights={ - "cls": config["dataset"]["classification_loss_coef"], - "charge": config["dataset"]["charge_loss_coef"], - "pt": config["dataset"]["pt_loss_coef"], - "eta": config["dataset"]["eta_loss_coef"], - "sin_phi": config["dataset"]["sin_phi_loss_coef"], - "cos_phi": config["dataset"]["cos_phi_loss_coef"], - "energy": config["dataset"]["energy_loss_coef"], - }, + loss_weights=loss_weights, metrics={ "cls": [ FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), @@ -194,7 +153,6 @@ def train(config, weights, ntrain, ntest, recreate, prefix): dataset_transform, 
config["dataset"]["num_output_classes"], ) - callbacks.append(LearningRateLoggingCallback()) callbacks.append(optim_callbacks) fit_result = model.fit( @@ -255,43 +213,15 @@ def evaluate(config, train_dir, weights, evaluation_dir): else: model_dtype = tf.dtypes.float32 - cds = config["dataset"] - dataset_def = Dataset( - num_input_features=int(cds["num_input_features"]), - num_output_features=int(cds["num_output_features"]), - padded_num_elem_size=int(cds["padded_num_elem_size"]), - raw_path=cds.get("raw_path", None), - raw_files=cds.get("raw_files", None), - processed_path=cds["processed_path"], - validation_file_path=cds["validation_file_path"], - schema=cds["schema"], - ) - - Xs = [] - ygens = [] - ycands = [] - val_filelist = dataset_def.val_filelist - if config["setup"]["num_val_files"] > 0: - val_filelist = val_filelist[: config["setup"]["num_val_files"]] - - for fi in tqdm(val_filelist, desc="Preparing validation data"): - X, ygen, ycand = dataset_def.prepare_data(fi) - Xs.append(np.concatenate(X)) - ygens.append(np.concatenate(ygen)) - ycands.append(np.concatenate(ycand)) - assert len(Xs) > 0 - X_val = np.concatenate(Xs) - ygen_val = np.concatenate(ygens) - ycand_val = np.concatenate(ycands) + dataset_def = get_dataset_def(config) + X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def) global_batch_size = config["setup"]["batch_size"] - strategy, maybe_global_batch_size = get_strategy(global_batch_size) if maybe_global_batch_size is not None: global_batch_size = maybe_global_batch_size with strategy.scope(): - model = make_model(config, model_dtype) # Evaluate model once to build the layers @@ -323,7 +253,7 @@ def find_lr(config, outdir, figname, log_scale): if "multi_output" not in config["setup"]: config["setup"]["multi_output"] = True n_train = config["setup"]["num_events_train"] - dataset_def, ds_train_r, _, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test=0) + ds_train_r, _, _ = get_train_val_datasets(config, global_batch_size, n_train, n_test=0) # Decide tf.distribute.strategy depending on number of available GPUs strategy, maybe_global_batch_size = get_strategy(global_batch_size) @@ -331,15 +261,9 @@ def find_lr(config, outdir, figname, log_scale): # If using more than 1 GPU, we scale the batch size by the number of GPUs if maybe_global_batch_size is not None: global_batch_size = maybe_global_batch_size - lr = float(config["setup"]["lr"]) - - Xs = [] - for fi in dataset_def.val_filelist[:1]: - X, ygen, ycand = dataset_def.prepare_data(fi) - Xs.append(np.concatenate(X)) - assert len(Xs) > 0, "Xs is empty" - X_val = np.concatenate(Xs) + dataset_def = get_dataset_def(config) + X_val, _, _ = prepare_val_data(config, dataset_def) with strategy.scope(): opt = tf.keras.optimizers.Adam(learning_rate=1e-7) # This learning rate will be changed by the lr_finder @@ -352,50 +276,19 @@ def find_lr(config, outdir, figname, log_scale): model_dtype = tf.dtypes.float32 model = make_model(config, model_dtype) - - if config["setup"]["trainable"] == "classification": - config["dataset"]["pt_loss_coef"] = 0.0 - config["dataset"]["eta_loss_coef"] = 0.0 - config["dataset"]["sin_phi_loss_coef"] = 0.0 - config["dataset"]["cos_phi_loss_coef"] = 0.0 - config["dataset"]["energy_loss_coef"] = 0.0 - elif config["setup"]["trainable"] == "regression": - config["dataset"]["classification_loss_coef"] = 0.0 - config["dataset"]["charge_loss_coef"] = 0.0 + config = set_config_loss(config, config["setup"]["trainable"]) # Run model once to build the layers 
model(tf.cast(X_val[:1], model_dtype)) configure_model_weights(model, config["setup"]["trainable"]) - if config["setup"]["classification_loss_type"] == "categorical_cross_entropy": - cls_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False) - elif config["setup"]["classification_loss_type"] == "sigmoid_focal_crossentropy": - cls_loss = tfa.losses.sigmoid_focal_crossentropy - else: - raise KeyError("Unknown classification loss type: {}".format(config["setup"]["classification_loss_type"])) - + loss_dict, loss_weights = get_loss_dict(config) model.compile( - loss={ - "cls": cls_loss, - "charge": getattr(tf.keras.losses, config["dataset"].get("charge_loss", "MeanSquaredError"))(), - "pt": getattr(tf.keras.losses, config["dataset"].get("pt_loss", "MeanSquaredError"))(), - "eta": getattr(tf.keras.losses, config["dataset"].get("eta_loss", "MeanSquaredError"))(), - "sin_phi": getattr(tf.keras.losses, config["dataset"].get("sin_phi_loss", "MeanSquaredError"))(), - "cos_phi": getattr(tf.keras.losses, config["dataset"].get("cos_phi_loss", "MeanSquaredError"))(), - "energy": getattr(tf.keras.losses, config["dataset"].get("energy_loss", "MeanSquaredError"))(), - }, + loss=loss_dict, optimizer=opt, sample_weight_mode="temporal", - loss_weights={ - "cls": config["dataset"]["classification_loss_coef"], - "charge": config["dataset"]["charge_loss_coef"], - "pt": config["dataset"]["pt_loss_coef"], - "eta": config["dataset"]["eta_loss_coef"], - "sin_phi": config["dataset"]["sin_phi_loss_coef"], - "cos_phi": config["dataset"]["cos_phi_loss_coef"], - "energy": config["dataset"]["energy_loss_coef"], - }, + loss_weights=loss_weights, metrics={ "cls": [ FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64), @@ -409,7 +302,7 @@ def find_lr(config, outdir, figname, log_scale): lr_finder = LRFinder(max_steps=max_steps) callbacks = [lr_finder] - fit_result = model.fit( + model.fit( ds_train_r, epochs=max_steps, callbacks=callbacks, diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 4631ce838..07ed8bb50 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -5,6 +5,8 @@ import platform import random import glob +import numpy as np +from tqdm import tqdm import tensorflow as tf @@ -105,22 +107,27 @@ def get_weights_func(config): def targets_multi_output(num_output_classes): def func(X, y, w): - return X, { - "cls": tf.one_hot(tf.cast(y[:, :, 0], tf.int32), num_output_classes), - "charge": y[:, :, 1:2], - "pt": y[:, :, 2:3], - "eta": y[:, :, 3:4], - "sin_phi": y[:, :, 4:5], - "cos_phi": y[:, :, 5:6], - "energy": y[:, :, 6:7], - }, w + return ( + X, + { + "cls": tf.one_hot(tf.cast(y[:, :, 0], tf.int32), num_output_classes), + "charge": y[:, :, 1:2], + "pt": y[:, :, 2:3], + "eta": y[:, :, 3:4], + "sin_phi": y[:, :, 4:5], + "cos_phi": y[:, :, 5:6], + "energy": y[:, :, 6:7], + }, + w, + ) + return func -def get_train_val_datasets(config, global_batch_size, n_train, n_test): +def get_dataset_def(config): cds = config["dataset"] - dataset_def = Dataset( + return Dataset( num_input_features=int(cds["num_input_features"]), num_output_features=int(cds["num_output_features"]), padded_num_elem_size=int(cds["padded_num_elem_size"]), @@ -131,6 +138,10 @@ def get_train_val_datasets(config, global_batch_size, n_train, n_test): schema=cds["schema"], ) + +def get_train_val_datasets(config, global_batch_size, n_train, n_test): + dataset_def = get_dataset_def(config) + tfr_files = sorted(glob.glob(dataset_def.processed_path)) if len(tfr_files) == 0: raise Exception("Could not find any 
files in {}".format(dataset_def.processed_path)) @@ -173,4 +184,77 @@ def get_train_val_datasets(config, global_batch_size, n_train, n_test): ds_train_r = ds_train.repeat(config["setup"]["num_epochs"]) ds_test_r = ds_test.repeat(config["setup"]["num_epochs"]) - return dataset_def, ds_train_r, ds_test_r, dataset_transform + return ds_train_r, ds_test_r, dataset_transform + + +def prepare_val_data(config, dataset_def, single_file=False): + if single_file: + val_filelist = dataset_def.val_filelist[:1] + else: + val_filelist = dataset_def.val_filelist + if config["setup"]["num_val_files"] > 0: + val_filelist = val_filelist[: config["setup"]["num_val_files"]] + + Xs = [] + ygens = [] + ycands = [] + for fi in tqdm(val_filelist[:1], desc="Preparing validation data"): + X, ygen, ycand = dataset_def.prepare_data(fi) + Xs.append(np.concatenate(X)) + ygens.append(np.concatenate(ygen)) + ycands.append(np.concatenate(ycand)) + + assert len(Xs) > 0, "Xs is empty" + X_val = np.concatenate(Xs) + ygen_val = np.concatenate(ygens) + ycand_val = np.concatenate(ycands) + + return X_val, ygen_val, ycand_val + + +def set_config_loss(config, trainable): + if trainable == "classification": + config["dataset"]["pt_loss_coef"] = 0.0 + config["dataset"]["eta_loss_coef"] = 0.0 + config["dataset"]["sin_phi_loss_coef"] = 0.0 + config["dataset"]["cos_phi_loss_coef"] = 0.0 + config["dataset"]["energy_loss_coef"] = 0.0 + elif trainable == "regression": + config["dataset"]["classification_loss_coef"] = 0.0 + config["dataset"]["charge_loss_coef"] = 0.0 + elif trainable == "all": + pass + return config + + +def get_class_loss(config): + if config["setup"]["classification_loss_type"] == "categorical_cross_entropy": + cls_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False) + elif config["setup"]["classification_loss_type"] == "sigmoid_focal_crossentropy": + cls_loss = tfa.losses.sigmoid_focal_crossentropy + else: + raise KeyError("Unknown classification loss type: {}".format(config["setup"]["classification_loss_type"])) + return cls_loss + + +def get_loss_dict(config): + cls_loss = get_class_loss(config) + loss_dict = { + "cls": cls_loss, + "charge": getattr(tf.keras.losses, config["dataset"].get("charge_loss", "MeanSquaredError"))(), + "pt": getattr(tf.keras.losses, config["dataset"].get("pt_loss", "MeanSquaredError"))(), + "eta": getattr(tf.keras.losses, config["dataset"].get("eta_loss", "MeanSquaredError"))(), + "sin_phi": getattr(tf.keras.losses, config["dataset"].get("sin_phi_loss", "MeanSquaredError"))(), + "cos_phi": getattr(tf.keras.losses, config["dataset"].get("cos_phi_loss", "MeanSquaredError"))(), + "energy": getattr(tf.keras.losses, config["dataset"].get("energy_loss", "MeanSquaredError"))(), + } + loss_weights = { + "cls": config["dataset"]["classification_loss_coef"], + "charge": config["dataset"]["charge_loss_coef"], + "pt": config["dataset"]["pt_loss_coef"], + "eta": config["dataset"]["eta_loss_coef"], + "sin_phi": config["dataset"]["sin_phi_loss_coef"], + "cos_phi": config["dataset"]["cos_phi_loss_coef"], + "energy": config["dataset"]["energy_loss_coef"], + } + return loss_dict, loss_weights diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index 92c36bc5f..f194d2432 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -43,7 +43,7 @@ setup: weights: weights_config: all lr: 1e-4 - batch_size: 16 + batch_size: 32 num_events_train: 80000 num_events_test: 10000 num_epochs: 400 From 
71b353408973e5814339af1164a2d9b4b4e30dde Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Thu, 1 Jul 2021 15:22:47 +0200 Subject: [PATCH 11/23] chore: Reduction of code duplication --- mlpf/pipeline.py | 46 ++++++++----------------------------------- mlpf/tfmodel/utils.py | 24 ++++++++++++++++++++++ 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 5d9cd1ade..ba290d44e 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -40,6 +40,7 @@ prepare_val_data, set_config_loss, get_loss_dict, + parse_config, ) from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler @@ -62,30 +63,12 @@ def main(): def train(config, weights, ntrain, ntest, recreate, prefix): """Train a model defined by config """ - config_file_stem = Path(config).stem - config = load_config(config) - tf.config.run_functions_eagerly(config["tensorflow"]["eager"]) - global_batch_size = config["setup"]["batch_size"] - n_epochs = config["setup"]["num_epochs"] - if ntrain: - n_train = ntrain - else: - n_train = config["setup"]["num_events_train"] - if ntest: - n_test = ntest - else: - n_test = config["setup"]["num_events_test"] - - if "multi_output" not in config["setup"]: - config["setup"]["multi_output"] = True + config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = parse_config(config, ntrain, ntest, weights) dataset_def = get_dataset_def(config) ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=True) - if weights is None: - weights = config["setup"]["weights"] - if recreate or (weights is None): outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node()) else: @@ -93,11 +76,10 @@ def train(config, weights, ntrain, ntest, recreate, prefix): # Decide tf.distribute.strategy depending on number of available GPUs strategy, maybe_global_batch_size = get_strategy(global_batch_size) - total_steps = n_epochs * n_train // global_batch_size - # If using more than 1 GPU, we scale the batch size by the number of GPUs if maybe_global_batch_size is not None: global_batch_size = maybe_global_batch_size + total_steps = n_epochs * n_train // global_batch_size lr = float(config["setup"]["lr"]) with strategy.scope(): @@ -190,7 +172,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix): @click.option("-w", "--weights", default=None, help="trained weights to load", type=click.Path()) @click.option("-e", "--evaluation_dir", help="force creation of new experiment dir", type=click.Path()) def evaluate(config, train_dir, weights, evaluation_dir): - config = load_config(config) + config, _, global_batch_size, _, _, _, weights = parse_config(config, weights=weights) # Switch off multi-output for the evaluation for backwards compatibility config["setup"]["multi_output"] = False @@ -200,11 +182,6 @@ def evaluate(config, train_dir, weights, evaluation_dir): eval_dir = evaluation_dir Path(eval_dir).mkdir(parents=True, exist_ok=True) - tf.config.run_functions_eagerly(config["tensorflow"]["eager"]) - - if weights is None: - weights = config["setup"]["weights"] - if config["setup"]["dtype"] == "float16": model_dtype = tf.dtypes.float16 policy = mixed_precision.Policy("mixed_float16") @@ -216,7 +193,6 @@ def evaluate(config, train_dir, weights, evaluation_dir): dataset_def = get_dataset_def(config) X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def) - 
global_batch_size = config["setup"]["batch_size"] strategy, maybe_global_batch_size = get_strategy(global_batch_size) if maybe_global_batch_size is not None: global_batch_size = maybe_global_batch_size @@ -244,15 +220,9 @@ def evaluate(config, train_dir, weights, evaluation_dir): @click.option("-c", "--config", help="configuration file", type=click.Path()) @click.option("-o", "--outdir", help="output directory", type=click.Path(), default=".") @click.option("-n", "--figname", help="name of saved figure", type=click.Path(), default="lr_finder.jpg") -@click.option("-l", "--log_scale", help="use log scale on y-axis in figure", type=click.Path(), default=False, is_flag=True) -def find_lr(config, outdir, figname, log_scale): - config = load_config(config) - tf.config.run_functions_eagerly(config["tensorflow"]["eager"]) - global_batch_size = config["setup"]["batch_size"] - - if "multi_output" not in config["setup"]: - config["setup"]["multi_output"] = True - n_train = config["setup"]["num_events_train"] +@click.option("-l", "--logscale", help="use log scale on y-axis in figure", default=False, is_flag=True) +def find_lr(config, outdir, figname, logscale): + config, _, global_batch_size, n_train, _, _, _ = parse_config(config) ds_train_r, _, _ = get_train_val_datasets(config, global_batch_size, n_train, n_test=0) # Decide tf.distribute.strategy depending on number of available GPUs @@ -309,7 +279,7 @@ def find_lr(config, outdir, figname, log_scale): steps_per_epoch=1, ) - lr_finder.plot(save_dir=outdir, figname=figname, log_scale=log_scale) + lr_finder.plot(save_dir=outdir, figname=figname, log_scale=logscale) if __name__ == "__main__": diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 07ed8bb50..9f97f9024 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -20,6 +20,30 @@ def load_config(config_file_path): return cfg +def parse_config(config, ntrain=None, ntest=None, weights=None): + config_file_stem = Path(config).stem + config = load_config(config) + tf.config.run_functions_eagerly(config["tensorflow"]["eager"]) + global_batch_size = config["setup"]["batch_size"] + n_epochs = config["setup"]["num_epochs"] + if ntrain: + n_train = ntrain + else: + n_train = config["setup"]["num_events_train"] + if ntest: + n_test = ntest + else: + n_test = config["setup"]["num_events_test"] + + if "multi_output" not in config["setup"]: + config["setup"]["multi_output"] = True + + if weights is None: + weights = config["setup"]["weights"] + + return config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights + + def create_experiment_dir(prefix=None, suffix=None): if prefix is None: train_dir = Path("experiments") / datetime.datetime.now().strftime("%Y%m%d_%H%M%S") From b6075d27a3e32d525dc91db93f4cf524cfa2b40c Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Thu, 1 Jul 2021 15:54:07 +0200 Subject: [PATCH 12/23] feat: evaluate loads best weights in the /weights/ When running `python mlpf/pipeline.py evaluate -t ` without specifying which weights to use explicitly the pipeline will load the weights with the smallest loss in /weights/ that it can find. 
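In practice this means picking the checkpoint whose filename encodes the smallest
validation loss. A minimal sketch of the idea with hypothetical filenames (the actual
helper added in this patch is get_best_checkpoint in mlpf/tfmodel/utils.py):

    import re
    from pathlib import Path

    # hypothetical checkpoint files as written during training
    ckpts = [Path("weights/weights-01-13.742512.hdf5"), Path("weights/weights-02-9.381004.hdf5")]
    # sort by the loss value encoded at the end of each filename; the first entry is then the best one
    ckpts.sort(key=lambda p: float(re.search(r"\d+-(\d+\.\d+)", p.name).group(1)))
    best = str(ckpts[0])  # -> "weights/weights-02-9.381004.hdf5"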
--- mlpf/pipeline.py | 5 +++++ mlpf/tfmodel/utils.py | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index ba290d44e..c18be507b 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -41,6 +41,7 @@ set_config_loss, get_loss_dict, parse_config, + get_best_checkpoint, ) from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler @@ -208,6 +209,10 @@ def evaluate(config, train_dir, weights, evaluation_dir): # need to load the weights in the same trainable configuration as the model was set up configure_model_weights(model, config["setup"].get("weights_config", "all")) model.load_weights(weights, by_name=True) + else: + weights = get_best_checkpoint(train_dir) + print("Loading best weights that could be found from {}".format(weights)) + model.load_weights(weights, by_name=True) model(tf.cast(X_val[:1], model_dtype)) model.compile() diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 9f97f9024..c074724b3 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -7,6 +7,7 @@ import glob import numpy as np from tqdm import tqdm +import re import tensorflow as tf @@ -57,6 +58,14 @@ def create_experiment_dir(prefix=None, suffix=None): return str(train_dir) +def get_best_checkpoint(train_dir): + checkpoint_list = list(Path(Path(train_dir) / "weights").glob("weights*.hdf5")) + # Sort the checkpoints according to the loss in their filenames + checkpoint_list.sort(key=lambda x: float(re.search("\d+-\d+.\d+", str(x))[0].split("-")[-1])) + # Return the checkpoint with smallest loss + return str(checkpoint_list[0]) + + def get_strategy(global_batch_size): try: gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")] From 73dcb016b1a62a3d2b71269d1489804eab706283 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Fri, 2 Jul 2021 13:04:46 +0200 Subject: [PATCH 13/23] fix: Bugfix in loading of val data --- mlpf/pipeline.py | 6 +++--- mlpf/tfmodel/utils.py | 2 +- parameters/cms-gnn-dense-onecycle.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index c18be507b..d08d10d21 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -68,7 +68,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix): dataset_def = get_dataset_def(config) ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) - X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=True) + X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=False) if recreate or (weights is None): outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node()) @@ -192,7 +192,7 @@ def evaluate(config, train_dir, weights, evaluation_dir): model_dtype = tf.dtypes.float32 dataset_def = get_dataset_def(config) - X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def) + X_val, ygen_val, ycand_val = prepare_val_data(config, dataset_def, single_file=False) strategy, maybe_global_batch_size = get_strategy(global_batch_size) if maybe_global_batch_size is not None: @@ -238,7 +238,7 @@ def find_lr(config, outdir, figname, logscale): global_batch_size = maybe_global_batch_size dataset_def = get_dataset_def(config) - X_val, _, _ = prepare_val_data(config, dataset_def) + X_val, _, _ = prepare_val_data(config, dataset_def, single_file=True) with strategy.scope(): opt = tf.keras.optimizers.Adam(learning_rate=1e-7) # This learning rate will be changed by the 
lr_finder diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index c074724b3..d518fb2c8 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -231,7 +231,7 @@ def prepare_val_data(config, dataset_def, single_file=False): Xs = [] ygens = [] ycands = [] - for fi in tqdm(val_filelist[:1], desc="Preparing validation data"): + for fi in tqdm(val_filelist, desc="Preparing validation data"): X, ygen, ycand = dataset_def.prepare_data(fi) Xs.append(np.concatenate(X)) ygens.append(np.concatenate(ygen)) diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index f194d2432..b44e197e5 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -29,7 +29,7 @@ dataset: eta_loss_coef: 0.1 sin_phi_loss_coef: 1.0 cos_phi_loss_coef: 1.0 - energy_loss_coef: 0.1 + energy_loss_coef: 1.0 raw_path: ../data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/*.pkl.bz2 processed_path: ../data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr_cand/*.tfrecords num_files_per_chunk: 1 From 5147dde4cefdb878d79439e71f6b1a14b9a6c9d7 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Fri, 2 Jul 2021 15:16:40 +0200 Subject: [PATCH 14/23] fix: Bug in path handling --- mlpf/tfmodel/model_setup.py | 2 +- scripts/local_test_cms_tf.sh | 4 ++-- scripts/local_test_delphes_tf.sh | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py index 52aae79a6..a5797d717 100644 --- a/mlpf/tfmodel/model_setup.py +++ b/mlpf/tfmodel/model_setup.py @@ -527,7 +527,7 @@ def main(args, yaml_path, config): print("Output directory exists: {}".format(outdir), file=sys.stderr) sys.exit(1) else: - outdir = os.path.dirname(weights) + outdir = str(Path(weights).parent.parent) try: gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")] diff --git a/scripts/local_test_cms_tf.sh b/scripts/local_test_cms_tf.sh index 02b3cd9d6..877cc2595 100755 --- a/scripts/local_test_cms_tf.sh +++ b/scripts/local_test_cms_tf.sh @@ -38,9 +38,9 @@ python3 mlpf/launcher.py --model-spec parameters/test-cms.yaml --action data python3 mlpf/launcher.py --model-spec parameters/test-cms.yaml --action train #Generate the pred.npz file of predictions -python3 mlpf/launcher.py --model-spec parameters/test-cms.yaml --action eval --weights ./experiments/test-cms-*/weights-01-*.hdf5 +python3 mlpf/launcher.py --model-spec parameters/test-cms.yaml --action eval --weights ./experiments/test-cms-*/weights/weights-01-*.hdf5 python3 scripts/test_load_tfmodel.py ./experiments/test-cms-*/model_frozen/frozen_graph.pb python3 mlpf/launcher.py --model-spec parameters/test-cms-v2.yaml --action train -python3 mlpf/launcher.py --model-spec parameters/test-cms-v2.yaml --action eval --weights ./experiments/test-cms-v2-*/weights-01-*.hdf5 +python3 mlpf/launcher.py --model-spec parameters/test-cms-v2.yaml --action eval --weights ./experiments/test-cms-v2-*/weights/weights-01-*.hdf5 diff --git a/scripts/local_test_delphes_tf.sh b/scripts/local_test_delphes_tf.sh index ef8fb0117..bb41f072c 100755 --- a/scripts/local_test_delphes_tf.sh +++ b/scripts/local_test_delphes_tf.sh @@ -23,8 +23,8 @@ python3 mlpf/launcher.py --model-spec parameters/test-delphes.yaml --action data python3 mlpf/launcher.py --model-spec parameters/test-delphes.yaml --action train #Generate the pred.npz file of predictions -python3 mlpf/launcher.py --model-spec parameters/test-delphes.yaml --action eval --weights ./experiments/test-*/weights-01-*.hdf5 +python3 mlpf/launcher.py 
--model-spec parameters/test-delphes.yaml --action eval --weights ./experiments/test-*/weights/weights-01-*.hdf5 #Generate the timing file -python3 mlpf/launcher.py --model-spec parameters/test-delphes.yaml --action time --weights ./experiments/test-*/weights-01-*.hdf5 +python3 mlpf/launcher.py --model-spec parameters/test-delphes.yaml --action time --weights ./experiments/test-*/weights/weights-01-*.hdf5 From 4b3171742ad2c3a03bb06bfe73a074b56e35e4f8 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Fri, 2 Jul 2021 16:29:59 +0200 Subject: [PATCH 15/23] feat: Add tests of pipeline for cms and delphes --- .github/workflows/test.yml | 28 +++++++++++++++- mlpf/pipeline.py | 2 +- scripts/local_test_cms_pipeline.sh | 46 ++++++++++++++++++++++++++ scripts/local_test_delphes_pipeline.sh | 30 +++++++++++++++++ 4 files changed, 104 insertions(+), 2 deletions(-) create mode 100755 scripts/local_test_cms_pipeline.sh create mode 100755 scripts/local_test_delphes_pipeline.sh diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 386083be0..e870ef5d2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -34,7 +34,33 @@ jobs: sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm - name: Run CMS TF model run: ./scripts/local_test_cms_tf.sh - + + delphes-pipeline: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install python deps + run: | + sudo apt install python3 python3-pip wget + sudo python3 -m pip install --upgrade pip + sudo python3 -m pip install --upgrade setuptools + sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click + - name: Run delphes TF model + run: ./scripts/local_test_delphes_pipeline.sh + + cms-pipeline: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install python deps + run: | + sudo apt install python3 python3-pip wget + sudo python3 -m pip install --upgrade pip + sudo python3 -m pip install --upgrade setuptools + sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click + - name: Run CMS TF model using the pipeline + run: ./scripts/local_test_cms_pipeline.sh + delphes-pytorch: runs-on: ubuntu-latest steps: diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index d08d10d21..2114758a5 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -217,7 +217,7 @@ def evaluate(config, train_dir, weights, evaluation_dir): model.compile() eval_model(X_val, ygen_val, ycand_val, model, config, eval_dir, global_batch_size) - freeze_model(model, config, eval_dir) + freeze_model(model, config, train_dir) @main.command() diff --git a/scripts/local_test_cms_pipeline.sh b/scripts/local_test_cms_pipeline.sh new file mode 100755 index 000000000..2f10ec7e2 --- /dev/null +++ b/scripts/local_test_cms_pipeline.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -e + +rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi + +mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/root +cd data/TTbar_14TeV_TuneCUETP8M1_cfi/root + +#Only CMS-internal use is permitted by CMS rules +wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_1.root 
+wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_2.root +wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_3.root + +cd ../../.. + +#Create the ntuples +rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi/raw +mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/raw +for file in `\ls -1 data/TTbar_14TeV_TuneCUETP8M1_cfi/root/*.root`; do + python3 mlpf/data/postprocessing2.py \ + --input $file \ + --outpath data/TTbar_14TeV_TuneCUETP8M1_cfi/raw \ + --save-normalized-table --events-per-file 5 +done + +#Set aside some data for validation +mkdir -p data/TTbar_14TeV_TuneCUETP8M1_cfi/val +mv data/TTbar_14TeV_TuneCUETP8M1_cfi/raw/pfntuple_3_0.pkl data/TTbar_14TeV_TuneCUETP8M1_cfi/val/ + +mkdir -p experiments +rm -Rf experiments/test-* + +#Run a simple training on a few events +rm -Rf data/TTbar_14TeV_TuneCUETP8M1_cfi/tfr +python3 mlpf/launcher.py --model-spec parameters/test-cms.yaml --action data + +#Run a simple training on a few events +python3 mlpf/pipeline.py train -c parameters/test-cms.yaml -p test-cms- + +#Generate the pred.npz file of predictions +python3 mlpf/pipeline.py evaluate -c parameters/test-cms.yaml -t ./experiments/test-cms-* + +python3 scripts/test_load_tfmodel.py ./experiments/test-cms-*/model_frozen/frozen_graph.pb + +python3 mlpf/pipeline.py train -c parameters/test-cms-v2.yaml -p test-cms-v2- +python3 mlpf/pipeline.py evaluate -c parameters/test-cms-v2.yaml -t ./experiments/test-cms-v2-* diff --git a/scripts/local_test_delphes_pipeline.sh b/scripts/local_test_delphes_pipeline.sh new file mode 100755 index 000000000..3117f8033 --- /dev/null +++ b/scripts/local_test_delphes_pipeline.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +mkdir -p data/pythia8_ttbar +mkdir -p data/pythia8_ttbar/val +cd data/pythia8_ttbar + +#download a test input file (you can also download everything from Zenodo at 10.5281/zenodo.4559324) +wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_0.pkl.bz2 +wget -q --no-check-certificate -nc https://zenodo.org/record/4559324/files/tev14_pythia8_ttbar_0_1.pkl.bz2 +mv tev14_pythia8_ttbar_0_1.pkl.bz2 val/ + +cd ../.. + +mkdir -p experiments +rm -Rf experiments/test-* + +#Run a simple training on a few events +rm -Rf data/pythia8_ttbar/tfr +python3 mlpf/launcher.py --model-spec parameters/test-delphes.yaml --action data + +#Run a simple training on a few events +python3 mlpf/pipeline.py train -c parameters/test-delphes.yaml -p test-delphes- + +#Generate the pred.npz file of predictions +python3 mlpf/pipeline.py evaluate -c parameters/test-delphes.yaml -t ./experiments/test-delphes-* + +#Generate the timing file +python3 mlpf/launcher.py --model-spec parameters/test-delphes.yaml --action time --weights ./experiments/test-delphes-*/weights/weights-01-*.hdf5 + From c32694726a93aae6371848330da5f28372676b63 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Mon, 5 Jul 2021 09:58:41 +0200 Subject: [PATCH 16/23] feat: Add command to delete all but best chekpoint weights This can be useful when many large checkpoint files take up too much storage space. 
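For example, to keep only the best checkpoint of a finished training run, the underlying
helper can be called roughly as below; the training directory name is hypothetical and
dry_run=True does not delete anything:

    from tfmodel.utils import delete_all_but_best_checkpoint

    # with dry_run=True nothing is removed; run again with dry_run=False to delete the other checkpoints
    delete_all_but_best_checkpoint("experiments/cms-gnn-dense-onecycle_example", dry_run=True)
    delete_all_but_best_checkpoint("experiments/cms-gnn-dense-onecycle_example", dry_run=False)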
--- mlpf/pipeline.py | 18 +++++++++++++++--- mlpf/tfmodel/utils.py | 19 +++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index 2114758a5..a1ad93586 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -42,11 +42,13 @@ get_loss_dict, parse_config, get_best_checkpoint, + delete_all_but_best_checkpoint, ) from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler from tfmodel.lr_finder import LRFinder + @click.group() @click.help_option("-h", "--help") def main(): @@ -62,9 +64,10 @@ def main(): @click.option("-r", "--recreate", help="force creation of new experiment dir", is_flag=True) @click.option("-p", "--prefix", default="", help="prefix to put at beginning of training dir name", type=str) def train(config, weights, ntrain, ntest, recreate, prefix): - """Train a model defined by config - """ - config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = parse_config(config, ntrain, ntest, weights) + """Train a model defined by config""" + config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = parse_config( + config, ntrain, ntest, weights + ) dataset_def = get_dataset_def(config) ds_train_r, ds_test_r, dataset_transform = get_train_val_datasets(config, global_batch_size, n_train, n_test) @@ -287,5 +290,14 @@ def find_lr(config, outdir, figname, logscale): lr_finder.plot(save_dir=outdir, figname=figname, log_scale=logscale) +@main.command() +@click.help_option("-h", "--help") +@click.option("-t", "--train_dir", help="training directory", type=click.Path()) +@click.option("-d", "--dry_run", help="do not delete anything", is_flag=True, default=False) +def delete_all_but_best_ckpt(train_dir, dry_run): + """Delete all checkpoint weights in /weights/ except the one with lowest loss in its filename.""" + delete_all_but_best_checkpoint(train_dir, dry_run) + + if __name__ == "__main__": main() diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index d518fb2c8..26bde655c 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -10,6 +10,7 @@ import re import tensorflow as tf +import tensorflow_addons as tfa from tfmodel.data import Dataset from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler @@ -66,6 +67,24 @@ def get_best_checkpoint(train_dir): return str(checkpoint_list[0]) +def delete_all_but_best_checkpoint(train_dir, dry_run): + checkpoint_list = list(Path(Path(train_dir) / "weights").glob("weights*.hdf5")) + # Don't remove the checkpoint with smallest loss + if len(checkpoint_list) == 1: + raise UserWarning("There is only one checkpoint. No deletion was made.") + elif len(checkpoint_list) == 0: + raise UserWarning("Couldn't find ant checkpoints. 
No deletion was made.") + else: + # Sort the checkpoints according to the loss in their filenames + checkpoint_list.sort(key=lambda x: float(re.search("\d+-\d+.\d+", str(x))[0].split("-")[-1])) + best_ckpt = checkpoint_list.pop(0) + for ckpt in checkpoint_list: + if not dry_run: + ckpt.unlink() + + print("Removed all checkpoints in {} except {}".format(train_dir, best_ckpt)) + + def get_strategy(global_batch_size): try: gpus = [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")] From fa71baac70d2c833c278177e801850fc1c442b50 Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Mon, 5 Jul 2021 10:05:08 +0200 Subject: [PATCH 17/23] fix: Use MeanSquaredLogarithmicError for pt and energy in OneCycle config --- parameters/cms-gnn-dense-onecycle.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index b44e197e5..ce1c86714 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -23,6 +23,8 @@ dataset: padded_num_elem_size: 6400 #(pt, eta, sin phi, cos phi, E) num_momentum_outputs: 5 + pt_loss: MeanSquaredLogarithmicError + energy_loss: MeanSquaredLogarithmicError classification_loss_coef: 1.0 charge_loss_coef: 0.1 pt_loss_coef: 1.0 From 456847c839acd3878e8ccbd4dd915d546b54c37a Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Mon, 5 Jul 2021 10:07:36 +0200 Subject: [PATCH 18/23] chore: Add description to find-lr command --- mlpf/pipeline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index a1ad93586..f287b347f 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -174,8 +174,9 @@ def train(config, weights, ntrain, ntest, recreate, prefix): @click.option("-t", "--train_dir", required=True, help="directory containing a completed training", type=click.Path()) @click.option("-c", "--config", help="configuration file", type=click.Path()) @click.option("-w", "--weights", default=None, help="trained weights to load", type=click.Path()) -@click.option("-e", "--evaluation_dir", help="force creation of new experiment dir", type=click.Path()) +@click.option("-e", "--evaluation_dir", help="optionally specify evaluation output dir", type=click.Path()) def evaluate(config, train_dir, weights, evaluation_dir): + """Evaluate the trained model in train_dir""" config, _, global_batch_size, _, _, _, weights = parse_config(config, weights=weights) # Switch off multi-output for the evaluation for backwards compatibility config["setup"]["multi_output"] = False @@ -230,6 +231,8 @@ def evaluate(config, train_dir, weights, evaluation_dir): @click.option("-n", "--figname", help="name of saved figure", type=click.Path(), default="lr_finder.jpg") @click.option("-l", "--logscale", help="use log scale on y-axis in figure", default=False, is_flag=True) def find_lr(config, outdir, figname, logscale): + """Run the Learning Rate Finder to produce a batch loss vs. 
LR plot from + which an appropriate LR-range can be determined""" config, _, global_batch_size, n_train, _, _, _ = parse_config(config) ds_train_r, _, _ = get_train_val_datasets(config, global_batch_size, n_train, n_test=0) From 79c8f95b5fcf9c1dc658704b6f3ccaf8c0e1783d Mon Sep 17 00:00:00 2001 From: Eric Wulff Date: Thu, 8 Jul 2021 11:53:11 +0200 Subject: [PATCH 19/23] feat: Add ability to configure expdecay parameters in config The default parameters for expdecay added to the config files in this commit are the same as those used on the jpata/particleflow master branch at the time of writing. --- mlpf/tfmodel/utils.py | 8 +++++--- parameters/cms-gnn-dense-big.yaml | 5 +++++ parameters/cms-gnn-dense-focal.yaml | 5 +++++ parameters/cms-gnn-dense-onecycle.yaml | 5 +++++ parameters/cms-gnn-dense-transfer.yaml | 5 +++++ parameters/cms-gnn-dense.yaml | 5 +++++ parameters/cms-gnn-skipconn-v2.yaml | 5 +++++ parameters/cms-gnn-skipconn.yaml | 5 +++++ parameters/cms-transformer-skipconn-gun.yaml | 5 +++++ parameters/cms-transformer-skipconn.yaml | 5 +++++ parameters/delphes-gnn-skipconn-onecycle.yaml | 5 +++++ parameters/delphes-gnn-skipconn.yaml | 5 +++++ parameters/delphes-transformer-skipconn.yaml | 5 +++++ parameters/test-cms-v2.yaml | 5 +++++ parameters/test-cms.yaml | 5 +++++ parameters/test-delphes.yaml | 5 +++++ 16 files changed, 80 insertions(+), 3 deletions(-) diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py index 26bde655c..d0cf0655d 100644 --- a/mlpf/tfmodel/utils.py +++ b/mlpf/tfmodel/utils.py @@ -127,10 +127,12 @@ def get_lr_schedule(config, lr, steps): elif schedule == "exponentialdecay": lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( lr, - decay_steps=steps, - decay_rate=0.99, - staircase=True, + decay_steps=config["exponentialdecay"]["decay_steps"], + decay_rate=config["exponentialdecay"]["decay_rate"], + staircase=config["exponentialdecay"]["staircase"], ) + else: + raise ValueError("Only supported LR schedules are 'exponentialdecay' and 'onecycle'.") return lr_schedule, callbacks diff --git a/parameters/cms-gnn-dense-big.yaml b/parameters/cms-gnn-dense-big.yaml index 6c4c62059..aa3de4a6f 100644 --- a/parameters/cms-gnn-dense-big.yaml +++ b/parameters/cms-gnn-dense-big.yaml @@ -69,3 +69,8 @@ parameters: timing: num_ev: 100 num_iter: 3 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/cms-gnn-dense-focal.yaml b/parameters/cms-gnn-dense-focal.yaml index 15ce45126..e23a9bc7a 100644 --- a/parameters/cms-gnn-dense-focal.yaml +++ b/parameters/cms-gnn-dense-focal.yaml @@ -70,3 +70,8 @@ parameters: timing: num_ev: 100 num_iter: 3 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml index ce1c86714..9a54fd4f8 100644 --- a/parameters/cms-gnn-dense-onecycle.yaml +++ b/parameters/cms-gnn-dense-onecycle.yaml @@ -73,6 +73,11 @@ timing: num_ev: 100 num_iter: 3 +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes + onecycle: mom_min: 0.85 mom_max: 0.95 diff --git a/parameters/cms-gnn-dense-transfer.yaml b/parameters/cms-gnn-dense-transfer.yaml index 688922a84..8b735f859 100644 --- a/parameters/cms-gnn-dense-transfer.yaml +++ b/parameters/cms-gnn-dense-transfer.yaml @@ -69,3 +69,8 @@ parameters: timing: num_ev: 100 num_iter: 3 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/cms-gnn-dense.yaml b/parameters/cms-gnn-dense.yaml index 
ab0087373..355243371 100644 --- a/parameters/cms-gnn-dense.yaml +++ b/parameters/cms-gnn-dense.yaml @@ -71,3 +71,8 @@ parameters: timing: num_ev: 100 num_iter: 3 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/cms-gnn-skipconn-v2.yaml b/parameters/cms-gnn-skipconn-v2.yaml index 0bb9c9220..e69919342 100644 --- a/parameters/cms-gnn-skipconn-v2.yaml +++ b/parameters/cms-gnn-skipconn-v2.yaml @@ -74,3 +74,8 @@ parameters: timing: num_ev: 100 num_iter: 3 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/cms-gnn-skipconn.yaml b/parameters/cms-gnn-skipconn.yaml index 1f23797b7..b1d2e50f0 100644 --- a/parameters/cms-gnn-skipconn.yaml +++ b/parameters/cms-gnn-skipconn.yaml @@ -74,3 +74,8 @@ parameters: timing: num_ev: 100 num_iter: 3 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/cms-transformer-skipconn-gun.yaml b/parameters/cms-transformer-skipconn-gun.yaml index 180cb513d..f1fdd39e9 100644 --- a/parameters/cms-transformer-skipconn-gun.yaml +++ b/parameters/cms-transformer-skipconn-gun.yaml @@ -67,3 +67,8 @@ parameters: timing: num_ev: 100 num_iter: 3 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/cms-transformer-skipconn.yaml b/parameters/cms-transformer-skipconn.yaml index f8ea796b9..0cb6eeb31 100644 --- a/parameters/cms-transformer-skipconn.yaml +++ b/parameters/cms-transformer-skipconn.yaml @@ -65,3 +65,8 @@ parameters: timing: num_ev: 100 num_iter: 3 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/delphes-gnn-skipconn-onecycle.yaml b/parameters/delphes-gnn-skipconn-onecycle.yaml index b398f10c1..7972a524c 100644 --- a/parameters/delphes-gnn-skipconn-onecycle.yaml +++ b/parameters/delphes-gnn-skipconn-onecycle.yaml @@ -71,6 +71,11 @@ timing: num_ev: 100 num_iter: 3 +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes + onecycle: mom_min: 0.85 mom_max: 0.95 diff --git a/parameters/delphes-gnn-skipconn.yaml b/parameters/delphes-gnn-skipconn.yaml index d73066042..5b0ae1fa8 100644 --- a/parameters/delphes-gnn-skipconn.yaml +++ b/parameters/delphes-gnn-skipconn.yaml @@ -64,3 +64,8 @@ parameters: timing: num_ev: 100 num_iter: 3 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/delphes-transformer-skipconn.yaml b/parameters/delphes-transformer-skipconn.yaml index 9f3113c34..e2d435936 100644 --- a/parameters/delphes-transformer-skipconn.yaml +++ b/parameters/delphes-transformer-skipconn.yaml @@ -50,3 +50,8 @@ parameters: support: 32 skip_connection: yes dropout: 0.2 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/test-cms-v2.yaml b/parameters/test-cms-v2.yaml index 763b5e0ca..e5fe3528c 100644 --- a/parameters/test-cms-v2.yaml +++ b/parameters/test-cms-v2.yaml @@ -57,3 +57,8 @@ parameters: timing: num_ev: 1 num_iter: 1 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/test-cms.yaml b/parameters/test-cms.yaml index 175c84686..8f55d8d67 100644 --- a/parameters/test-cms.yaml +++ b/parameters/test-cms.yaml @@ -62,3 +62,8 @@ parameters: timing: num_ev: 1 num_iter: 1 + +exponentialdecay: + decay_steps: 10000 + decay_rate: 0.99 + staircase: yes diff --git a/parameters/test-delphes.yaml b/parameters/test-delphes.yaml index 1c36387a5..e93d50038 100644 --- 
a/parameters/test-delphes.yaml
+++ b/parameters/test-delphes.yaml
@@ -61,3 +61,8 @@ parameters:
 timing:
   num_ev: 1
   num_iter: 1
+
+exponentialdecay:
+  decay_steps: 10000
+  decay_rate: 0.99
+  staircase: yes

From b1b6e2ea18dbb627e1beb5fcc4264b9854b9aae1 Mon Sep 17 00:00:00 2001
From: Eric Wulff
Date: Fri, 9 Jul 2021 14:18:45 +0200
Subject: [PATCH 20/23] Make eval_model() work for multi-output

Also:
- Add missing parameters to config files.
- Move make_weight_function to utils.py
---
 mlpf/pipeline.py                              |  2 +-
 mlpf/tfmodel/model_setup.py                   | 34 +++----
 mlpf/tfmodel/utils.py                         | 48 +++++++++++++------
 parameters/cms-gnn-dense-onecycle.yaml        | 15 +++++-
 parameters/delphes-gnn-skipconn-onecycle.yaml | 10 +++-
 parameters/delphes-gnn-skipconn.yaml          | 10 +++-
 parameters/delphes-transformer-skipconn.yaml  | 10 +++-
 7 files changed, 82 insertions(+), 47 deletions(-)

diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py
index f287b347f..157f3d542 100644
--- a/mlpf/pipeline.py
+++ b/mlpf/pipeline.py
@@ -30,7 +30,7 @@
     get_lr_schedule,
     create_experiment_dir,
     get_strategy,
-    get_weights_func,
+    make_weight_function,
     load_config,
     compute_weights_invsqrt,
     compute_weights_none,
diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py
index 5cc166cf7..ee29b95af 100644
--- a/mlpf/tfmodel/model_setup.py
+++ b/mlpf/tfmodel/model_setup.py
@@ -26,7 +26,7 @@
 from pathlib import Path
 from tfmodel.onecycle_scheduler import OneCycleScheduler, MomentumOneCycleScheduler
 from tfmodel.callbacks import CustomTensorBoard
-from tfmodel.utils import get_lr_schedule, get_weights_func, targets_multi_output
+from tfmodel.utils import get_lr_schedule, make_weight_function, targets_multi_output
 from tensorflow.keras.metrics import Recall, CategoricalAccuracy


@@ -231,31 +231,6 @@ def get_rundir(base='experiments'):
     logdir = 'run_%02d' % run_number
     return '{}/{}'.format(base, logdir)


-def make_weight_function(config):
-    def weight_func(X,y,w):
-
-        w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0)
-        w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32)
-
-        w_none = tf.ones_like(w)
-        w_none *= tf.cast(X[:, 0]!=0, tf.float32)
-
-        w_invsqrt = tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)
-        w_invsqrt *= tf.cast(X[:, 0]!=0, tf.float32)
-
-        weight_d = {
-            "none": w_none,
-            "signal_only": w_signal_only,
-            "inverse_sqrt": w_invsqrt
-        }
-
-        ret_w = {}
-        for loss_component, weight_type in config["sample_weights"].items():
-            ret_w[loss_component] = weight_d[weight_type]
-
-        return X,y,ret_w
-    return weight_func
-
 def scale_outputs(X,y,w):
     ynew = y-out_m
@@ -385,7 +360,12 @@ def eval_model(X, ygen, ycand, model, config, outdir, global_batch_size):

         y_pred_id = np.argmax(y_pred_raw_ids, axis=-1)

-        y_pred_id = np.concatenate([np.expand_dims(y_pred_id, axis=-1), y_pred[:, :, config["dataset"]["num_output_classes"]:]], axis=-1)
+        if type(y_pred) is dict:
+            y_pred_rest = np.concatenate([y_pred["charge"], y_pred["pt"], y_pred["eta"], y_pred["sin_phi"], y_pred["cos_phi"], y_pred["energy"]], axis=-1)
+            y_pred_id = np.concatenate([np.expand_dims(y_pred_id, axis=-1), y_pred_rest], axis=-1)
+        else:
+            y_pred_id = np.concatenate([np.expand_dims(y_pred_id, axis=-1), y_pred[:, :, config["dataset"]["num_output_classes"]:]], axis=-1)
+
         np_outfile = "{}/pred_batch{}.npz".format(outdir, ibatch)
         np.savez(
             np_outfile,
diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py
index d0cf0655d..909e8c2f3 100644
--- a/mlpf/tfmodel/utils.py
+++ b/mlpf/tfmodel/utils.py
@@ -149,14 +149,30 @@ def compute_weights_none(X, y, w):
     return X, y, wn


-def get_weights_func(config):
-    sampling = config["setup"]["sample_weights"]
-    if sampling == "inverse_sqrt":
-        return compute_weights_invsqrt
-    elif sampling == "none":
-        return compute_weights_none
-    else:
-        raise ValueError("Only supported weight samplings are 'inverse_sqrt' and 'none'.")
+def make_weight_function(config):
+    def weight_func(X,y,w):
+
+        w_signal_only = tf.where(y[:, 0]==0, 0.0, 1.0)
+        w_signal_only *= tf.cast(X[:, 0]!=0, tf.float32)
+
+        w_none = tf.ones_like(w)
+        w_none *= tf.cast(X[:, 0]!=0, tf.float32)
+
+        w_invsqrt = tf.cast(tf.shape(w)[-1], tf.float32)/tf.sqrt(w)
+        w_invsqrt *= tf.cast(X[:, 0]!=0, tf.float32)
+
+        weight_d = {
+            "none": w_none,
+            "signal_only": w_signal_only,
+            "inverse_sqrt": w_invsqrt
+        }
+
+        ret_w = {}
+        for loss_component, weight_type in config["sample_weights"].items():
+            ret_w[loss_component] = weight_d[weight_type]
+
+        return X,y,ret_w
+    return weight_func


 def targets_multi_output(num_output_classes):
@@ -211,18 +227,22 @@ def get_train_val_datasets(config, global_batch_size, n_train, n_test):
             num_events += 1
     print("dataset loaded, len={}".format(num_events))

-    weight_func = get_weights_func(config)
+    weight_func = make_weight_function(config)
     assert n_train + n_test <= num_events

     # Padded shapes
     ps = (
         tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_input_features]),
         tf.TensorShape([dataset_def.padded_num_elem_size, dataset_def.num_output_features]),
-        tf.TensorShape(
-            [
-                dataset_def.padded_num_elem_size,
-            ]
-        ),
+        {
+            "cls": tf.TensorShape([dataset_def.padded_num_elem_size, ]),
+            "charge": tf.TensorShape([dataset_def.padded_num_elem_size, ]),
+            "energy": tf.TensorShape([dataset_def.padded_num_elem_size, ]),
+            "pt": tf.TensorShape([dataset_def.padded_num_elem_size, ]),
+            "eta": tf.TensorShape([dataset_def.padded_num_elem_size, ]),
+            "sin_phi": tf.TensorShape([dataset_def.padded_num_elem_size, ]),
+            "cos_phi": tf.TensorShape([dataset_def.padded_num_elem_size, ]),
+        }
     )

     ds_train = dataset.take(n_train).map(weight_func).padded_batch(global_batch_size, padded_shapes=ps)
diff --git a/parameters/cms-gnn-dense-onecycle.yaml b/parameters/cms-gnn-dense-onecycle.yaml
index 9a54fd4f8..ce6fcc2fb 100644
--- a/parameters/cms-gnn-dense-onecycle.yaml
+++ b/parameters/cms-gnn-dense-onecycle.yaml
@@ -44,18 +44,26 @@ setup:
   train: yes
   weights:
   weights_config: all
-  lr: 1e-4
+  lr: 3e-4
   batch_size: 32
   num_events_train: 80000
   num_events_test: 10000
   num_epochs: 400
   num_val_files: 100
   dtype: float32
-  sample_weights: inverse_sqrt
   trainable: all
   classification_loss_type: categorical_cross_entropy # categorical_cross_entropy, sigmoid_focal_crossentropy
   lr_schedule: onecycle # exponentialdecay, onecycle

+sample_weights:
+  cls: inverse_sqrt
+  charge: signal_only
+  pt: signal_only
+  eta: signal_only
+  sin_phi: signal_only
+  cos_phi: signal_only
+  energy: signal_only
+
 parameters:
   model: gnn_dense
   activation: elu
@@ -68,6 +76,9 @@ parameters:
   normalize_degrees: yes
   distance_dim: 128
   dropout: 0.0
+  separate_momentum: yes
+  input_encoding: cms
+  debug: no

 timing:
   num_ev: 100
diff --git a/parameters/delphes-gnn-skipconn-onecycle.yaml b/parameters/delphes-gnn-skipconn-onecycle.yaml
index 7972a524c..16259d6b6 100644
--- a/parameters/delphes-gnn-skipconn-onecycle.yaml
+++ b/parameters/delphes-gnn-skipconn-onecycle.yaml
@@ -43,12 +43,20 @@ setup:
   num_epochs: 250
   num_val_files: -1
   dtype: float32
-  sample_weights: none
   trainable: all
   multi_output: yes
   classification_loss_type: categorical_cross_entropy
   lr_schedule: onecycle # exponentialdecay, onecycle

+sample_weights:
+  cls: none
+  charge: none
+  pt: none
+  eta: none
+  sin_phi: none
+  cos_phi: none
+  energy: none
+
 parameters:
   model: gnn
   bin_size: 128
diff --git a/parameters/delphes-gnn-skipconn.yaml b/parameters/delphes-gnn-skipconn.yaml
index 5b0ae1fa8..0f83160a2 100644
--- a/parameters/delphes-gnn-skipconn.yaml
+++ b/parameters/delphes-gnn-skipconn.yaml
@@ -37,12 +37,20 @@ setup:
   num_epochs: 400
   num_val_files: -1
   dtype: float32
-  sample_weights: none
   trainable: all
   multi_output: no
   classification_loss_type: categorical_cross_entropy
   lr_schedule: exponentialdecay # exponentialdecay, onecycle

+sample_weights:
+  cls: none
+  charge: none
+  pt: none
+  eta: none
+  sin_phi: none
+  cos_phi: none
+  energy: none
+
 parameters:
   model: gnn
   bin_size: 128
diff --git a/parameters/delphes-transformer-skipconn.yaml b/parameters/delphes-transformer-skipconn.yaml
index e2d435936..9874e5289 100644
--- a/parameters/delphes-transformer-skipconn.yaml
+++ b/parameters/delphes-transformer-skipconn.yaml
@@ -36,11 +36,19 @@ setup:
   num_epochs: 300
   num_val_files: -1
   dtype: float16
-  sample_weights: none
   trainable: all
   multi_output: no
   lr_schedule: exponentialdecay # exponentialdecay, onecycle

+sample_weights:
+  cls: none
+  charge: none
+  pt: none
+  eta: none
+  sin_phi: none
+  cos_phi: none
+  energy: none
+
 parameters:
   model: transformer
   num_layers: 4

From cd8ee25743c12aa80a1c32375a69723a73220d9f Mon Sep 17 00:00:00 2001
From: Eric Wulff
Date: Thu, 22 Jul 2021 10:16:15 +0200
Subject: [PATCH 21/23] fix: Creation of history dir now using parents=True, exist_ok=True

---
 mlpf/tfmodel/model_setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py
index ee29b95af..65bb4aec6 100644
--- a/mlpf/tfmodel/model_setup.py
+++ b/mlpf/tfmodel/model_setup.py
@@ -209,7 +209,7 @@ def prepare_callbacks(model, outdir, X_val, y_val, dataset_transform, num_output
     callbacks += [cp_callback]

     history_path = Path(outdir) / "history"
-    history_path.mkdir()
+    history_path.mkdir(parents=True, exist_ok=True)
     history_path = str(history_path)
     cb = CustomCallback(history_path, X_val, y_val, dataset_transform, num_output_classes)
     cb.set_model(model)

From e4c85d004627ac085645ddeb99c762b83dd0c5fa Mon Sep 17 00:00:00 2001
From: Eric Wulff
Date: Thu, 22 Jul 2021 11:57:56 +0200
Subject: [PATCH 22/23] fix: Always configure model weights before loading saved weights

---
 mlpf/pipeline.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py
index 157f3d542..ebdcf9fbc 100644
--- a/mlpf/pipeline.py
+++ b/mlpf/pipeline.py
@@ -209,9 +209,9 @@ def evaluate(config, train_dir, weights, evaluation_dir):
     print(X_val.shape)
     model(tf.cast(X_val[:1], model_dtype))

+    # need to load the weights in the same trainable configuration as the model was set up
+    configure_model_weights(model, config["setup"].get("weights_config", "all"))
     if weights:
-        # need to load the weights in the same trainable configuration as the model was set up
-        configure_model_weights(model, config["setup"].get("weights_config", "all"))
         model.load_weights(weights, by_name=True)
     else:
         weights = get_best_checkpoint(train_dir)

From ac3c944eface397d4376114bc4201f70ee0bae2d Mon Sep 17 00:00:00 2001
From: Eric Wulff
Date: Thu, 22 Jul 2021 14:16:49 +0200
Subject: [PATCH 23/23] feat: Pipeline train copies config file to outdir for reference

---
 mlpf/pipeline.py            | 6 ++++++
 mlpf/tfmodel/model_setup.py | 1 -
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py
index ebdcf9fbc..742e9b99b 100644
--- a/mlpf/pipeline.py
+++ b/mlpf/pipeline.py
@@ -10,6 +10,7 @@
 from pathlib import Path
 import click
 from tqdm import tqdm
+import shutil

 import tensorflow as tf
 from tensorflow.keras import mixed_precision
@@ -65,6 +66,7 @@ def main():
 @click.option("-p", "--prefix", default="", help="prefix to put at beginning of training dir name", type=str)
 def train(config, weights, ntrain, ntest, recreate, prefix):
     """Train a model defined by config"""
+    config_file_path = config
     config, config_file_stem, global_batch_size, n_train, n_test, n_epochs, weights = parse_config(
         config, ntrain, ntest, weights
     )
@@ -77,6 +79,7 @@ def train(config, weights, ntrain, ntest, recreate, prefix):
         outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node())
     else:
         outdir = str(Path(weights).parent)
+    shutil.copy(config_file_path, outdir + "/config.yaml")  # Copy the config file to the train dir for later reference

     # Decide tf.distribute.strategy depending on number of available GPUs
     strategy, maybe_global_batch_size = get_strategy(global_batch_size)
@@ -177,6 +180,9 @@ def train(config, weights, ntrain, ntest, recreate, prefix):
 @click.option("-e", "--evaluation_dir", help="optionally specify evaluation output dir", type=click.Path())
 def evaluate(config, train_dir, weights, evaluation_dir):
     """Evaluate the trained model in train_dir"""
+    if config is None:
+        config = Path(train_dir) / "config.yaml"
+        assert config.exists(), "Could not find config file in train_dir, please provide one with -c "
     config, _, global_batch_size, _, _, _, weights = parse_config(config, weights=weights)
     # Switch off multi-output for the evaluation for backwards compatibility
     config["setup"]["multi_output"] = False
diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py
index 65bb4aec6..e2a7a4af4 100644
--- a/mlpf/tfmodel/model_setup.py
+++ b/mlpf/tfmodel/model_setup.py
@@ -16,7 +16,6 @@
 import matplotlib
 import matplotlib.pyplot as plt
 import sklearn
-import kerastuner as kt
 from argparse import Namespace
 import time
 import json
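
Note (editor's illustration, not part of the patches above): with PATCH 20, the single
setup.sample_weights switch is replaced by a top-level sample_weights mapping that assigns one
weighting scheme per loss component, and make_weight_function in mlpf/tfmodel/utils.py turns
that mapping into a per-event dict of weight vectors. The sketch below shows this behaviour on
toy tensors; the config values, feature counts, and element counts are invented for illustration.

    import tensorflow as tf
    from tfmodel.utils import make_weight_function

    # Hypothetical config fragment; the real values live in the parameters/*.yaml files
    config = {"sample_weights": {"cls": "inverse_sqrt", "energy": "signal_only"}}
    weight_func = make_weight_function(config)

    # Toy per-event tensors: 3 elements, the last one is padding (feature 0 == 0)
    X = tf.constant([[1.0, 0.5], [2.0, 0.1], [0.0, 0.0]])
    y = tf.constant([[0.0, 1.0], [3.0, 2.0], [0.0, 0.0]])  # column 0 holds the target class id
    w = tf.constant([2.0, 4.0, 1.0])

    X, y, ret_w = weight_func(X, y, w)
    # ret_w maps each configured loss component to its own weight vector:
    #   ret_w["cls"]    -> inverse-sqrt weights, zeroed on the padded element
    #   ret_w["energy"] -> 1.0 where the target class is non-zero, 0.0 for noise and padding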
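
Note (editor's illustration, not part of the patches above): the eval_model change in PATCH 20
handles models that return a dict with one array per output head rather than a single
concatenated tensor. A minimal NumPy sketch of the same reassembly, assuming the classification
scores sit under a "cls" key and each regression head carries a trailing dimension of 1:

    import numpy as np

    num_output_classes = 2
    y_pred = {  # toy predictions: 1 event, 4 elements
        "cls": np.random.rand(1, 4, num_output_classes),
        "charge": np.random.rand(1, 4, 1),
        "pt": np.random.rand(1, 4, 1),
        "eta": np.random.rand(1, 4, 1),
        "sin_phi": np.random.rand(1, 4, 1),
        "cos_phi": np.random.rand(1, 4, 1),
        "energy": np.random.rand(1, 4, 1),
    }

    y_pred_id = np.argmax(y_pred["cls"], axis=-1)
    y_pred_rest = np.concatenate(
        [y_pred["charge"], y_pred["pt"], y_pred["eta"], y_pred["sin_phi"], y_pred["cos_phi"], y_pred["energy"]],
        axis=-1,
    )
    # per element: [class id, charge, pt, eta, sin_phi, cos_phi, energy]
    y_pred_id = np.concatenate([np.expand_dims(y_pred_id, axis=-1), y_pred_rest], axis=-1)
    print(y_pred_id.shape)  # (1, 4, 7)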