diff --git a/mlpf/flatiron/pipeline_evaluate_1GPU.slurm b/mlpf/flatiron/pipeline_evaluate_1GPU.slurm
index 165690a7a..2ec8fe882 100644
--- a/mlpf/flatiron/pipeline_evaluate_1GPU.slurm
+++ b/mlpf/flatiron/pipeline_evaluate_1GPU.slurm
@@ -1,13 +1,13 @@
 #!/bin/sh

 # Walltime limit
-#SBATCH -t 03:00:00
+#SBATCH -t 1-00:00:00
 #SBATCH -N 1
 #SBATCH --exclusive
 #SBATCH --tasks-per-node=1
 #SBATCH -p gpu
-#SBATCH --gpus 1
-#SBATCH --constraint=a100
+#SBATCH --gpus-per-task=1
+#SBATCH --constraint=a100-80gb,ib

 # Job name
 #SBATCH -J pipeeval
@@ -21,14 +21,26 @@ echo "#################### Job submission script. #############################"
 cat $0
 echo "################# End of job submission script. #########################"

-module --force purge; module load modules/1.49-20211101
-module load slurm gcc nccl cuda/11.3.1 cudnn/8.2.0.53-11.3 openmpi/4.0.6
+module --force purge; module load modules/2.1.1-20230405
+module load slurm gcc cmake nccl cuda/11.8.0 cudnn/8.4.0.27-11.6 openmpi/4.0.7
+
 nvidia-smi

 source ~/miniconda3/bin/activate tf2
 which python3
 python3 --version

+# make tensorflow find cupti (needed for profiling)
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/mnt/sw/nix/store/3xpm36w2kcri3j1m5j15hg025my1p4kx-cuda-11.8.0/extras/CUPTI/lib64/
+
+train_dir="experiments/hits_bs16_clic-hits_20230508_064411_129925_RESUMED2_clic-hits_20230522_170633_350485.workergpu064"
+
 echo 'Starting evaluation.'
-CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate -c $1 -t $2
+CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate \
+    --train-dir $train_dir
 echo 'Evaluation done.'
+
+echo 'Starting plotting.'
+CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py plots \
+    --train-dir $train_dir
+echo 'Plotting done.'
diff --git a/mlpf/flatiron/pipeline_train_4GPUs.slurm b/mlpf/flatiron/pipeline_train_4GPUs.slurm
index c8298c7ad..88ea5f711 100644
--- a/mlpf/flatiron/pipeline_train_4GPUs.slurm
+++ b/mlpf/flatiron/pipeline_train_4GPUs.slurm
@@ -1,13 +1,13 @@
 #!/bin/sh

 # Walltime limit
-#SBATCH -t 168:00:00
+#SBATCH -t 7-0:00:00
 #SBATCH -N 1
 #SBATCH --exclusive
 #SBATCH --tasks-per-node=1
 #SBATCH -p gpu
 #SBATCH --gpus-per-task=4
-#SBATCH --constraint=a100-80gb
+#SBATCH --constraint=a100-80gb,ib

 # Job name
 #SBATCH -J pipetrain
@@ -21,19 +21,26 @@ echo "#################### Job submission script. #############################"
 cat $0
 echo "################# End of job submission script. #########################"

-# module --force purge; module load modules/2.1-alpha1
-# module load slurm gcc/11.3.0 nccl cuda/11.8.0 cudnn/8.4.0.27-11.6
-module --force purge; module load modules/2.0-20220630
-module load slurm gcc cmake/3.22.3 nccl cuda/11.4.4 cudnn/8.2.4.15-11.4 openmpi/4.0.7
+# module --force purge; module load modules/2.0-20220630
+# module load slurm gcc cmake/3.22.3 nccl cuda/11.4.4 cudnn/8.2.4.15-11.4 openmpi/4.0.7
+
+module --force purge; module load modules/2.1.1-20230405
+module load slurm gcc cmake nccl cuda/11.8.0 cudnn/8.4.0.27-11.6 openmpi/4.0.7
+
 nvidia-smi

 source ~/miniconda3/bin/activate tf2
 which python3
 python3 --version

+# make tensorflow find cupti (needed for profiling)
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/mnt/sw/nix/store/3xpm36w2kcri3j1m5j15hg025my1p4kx-cuda-11.8.0/extras/CUPTI/lib64/
+
+export TF_GPU_THREAD_MODE=gpu_private
+export TF_GPU_THREAD_COUNT=2

 echo 'Starting training.'
 # Run the training of the base GNN model using e.g. 4 GPUs in a data-parallel mode
 CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py train -c $1 -p $2 \
-    --seeds --comet-exp-name particleflow-tf-gen
+    --seeds --comet-exp-name particleflow-tf-clic
 echo 'Training done.'
diff --git a/mlpf/flatiron/pipeline_train_8GPUs.slurm b/mlpf/flatiron/pipeline_train_8GPUs.slurm
new file mode 100644
index 000000000..7474edba3
--- /dev/null
+++ b/mlpf/flatiron/pipeline_train_8GPUs.slurm
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+# Walltime limit
+#SBATCH -t 2-00:00:00
+#SBATCH -N 1
+#SBATCH --exclusive
+#SBATCH --tasks-per-node=1
+#SBATCH -p gpu
+#SBATCH --gpus-per-task=8
+#SBATCH --constraint=h100,ib
+
+# Job name
+#SBATCH -J pipetrain
+
+# Output and error logs
+#SBATCH -o logs_slurm/log_%x_%j.out
+#SBATCH -e logs_slurm/log_%x_%j.err
+
+# Add jobscript to job output
+echo "#################### Job submission script. #############################"
+cat $0
+echo "################# End of job submission script. #########################"
+
+# module --force purge; module load modules/2.0-20220630
+# module load slurm gcc cmake/3.22.3 nccl cuda/11.4.4 cudnn/8.2.4.15-11.4 openmpi/4.0.7
+
+module --force purge; module load modules/2.1.1-20230405
+module load slurm gcc cmake nccl cuda/12.0.0 cudnn/8.4.0.27-11.6 openmpi/4.0.7
+
+nvidia-smi
+
+source ~/miniconda3/bin/activate tf2
+which python3
+python3 --version
+
+
+echo 'Starting training.'
+# Run the training of the base GNN model using e.g. 8 GPUs in a data-parallel mode
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 mlpf/pipeline.py train -c $1 -p $2 \
+    --seeds --comet-exp-name particleflow-tf-clic
+echo 'Training done.'
diff --git a/mlpf/flatiron/pipeline_train_8GPUs_singularity.slurm b/mlpf/flatiron/pipeline_train_8GPUs_singularity.slurm
new file mode 100644
index 000000000..38aad26cd
--- /dev/null
+++ b/mlpf/flatiron/pipeline_train_8GPUs_singularity.slurm
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+# Walltime limit
+#SBATCH -t 0-04:00:00
+#SBATCH -N 1
+#SBATCH --exclusive
+#SBATCH --tasks-per-node=1
+#SBATCH -p gpu
+#SBATCH --gpus-per-task=8
+#SBATCH --constraint=h100,ib
+# #SBATCH --mem 256G
+
+# Job name
+#SBATCH -J pipetrain
+
+# Output and error logs
+#SBATCH -o logs_slurm/log_%x_%j.out
+#SBATCH -e logs_slurm/log_%x_%j.err
+
+# Add jobscript to job output
+echo "#################### Job submission script. #############################"
+cat $0
+echo "################# End of job submission script. #########################"
+
+export MODULEPATH=/mnt/home/gkrawezik/modules/rocky8:$MODULEPATH
+module load cuda/12 cudnn/cuda12 nccl/cuda12 singularity # these names are specific to gkrawezik's modules
+nvidia-smi
+
+# ensure CPU is keeping private threads for scheduling operations on the GPUs
+# https://www.tensorflow.org/guide/gpu_performance_analysis#2_gpu_host_thread_contention
+export TF_GPU_THREAD_MODE=gpu_private
+export TF_GPU_THREAD_COUNT=2
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+nvidia-smi
+
+echo 'Starting training.'
+singularity run --nv -B /mnt/ceph/users/ewulff/tensorflow_datasets,/mnt/ceph/users/ewulff/particleflow \
+    tensorflow_23.05-tf2-py3.sif \
+    python3 $PWD/mlpf/pipeline.py train -c $1 -p $2 \
+    --seeds --comet-exp-name particleflow-tf-clic --benchmark_dir exp_dir
+echo 'Training done.'
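Note on the TF_GPU_THREAD_MODE / TF_GPU_THREAD_COUNT exports above: they follow TensorFlow's GPU performance guide, which recommends reserving dedicated host threads for launching GPU kernels; the same settings are also applied inside mlpf/pipeline.py further below, together with on-demand GPU memory growth. The following is a minimal illustrative sketch of how the two settings combine, assuming TensorFlow 2.x (it is not code from this repository):

```python
import os

# Dedicated host threads for GPU kernel launches, per
# https://www.tensorflow.org/guide/gpu_performance_analysis#2_gpu_host_thread_contention
os.environ["TF_GPU_THREAD_MODE"] = "gpu_private"
os.environ["TF_GPU_THREAD_COUNT"] = "2"

import tensorflow as tf  # imported after the env vars so they take effect

# Allocate GPU memory on demand rather than reserving the whole device up front
for gpu in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)
```

Exporting the variables in the job script, as the training scripts above do, guarantees they are in place before TensorFlow initializes the GPUs.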
diff --git a/mlpf/flatiron/raytune.sh b/mlpf/flatiron/raytune.sh
index 2dd709530..f8e1ad187 100755
--- a/mlpf/flatiron/raytune.sh
+++ b/mlpf/flatiron/raytune.sh
@@ -1,7 +1,7 @@
 #!/bin/bash

 #SBATCH -t 168:00:00
-#SBATCH -N 12
+#SBATCH -N 8
 #SBATCH --tasks-per-node=1
 #SBATCH -p gpu
 #SBATCH --constraint=a100-80gb,ib
@@ -78,5 +78,5 @@ echo All Ray workers started.

 #### call your code below
 python3 mlpf/pipeline.py raytune -c $1 -n $2 --cpus "${SLURM_CPUS_PER_TASK}" \
-    --gpus $num_gpus --seeds --comet-exp-name particleflow-raytune
+    --gpus $num_gpus --seeds --comet-exp-name particleflow-raytune # --comet-online
 exit
diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py
index 3ab0e2e32..239e984af 100644
--- a/mlpf/pipeline.py
+++ b/mlpf/pipeline.py
@@ -17,6 +17,7 @@ from datetime import datetime
 from functools import partial
 from pathlib import Path
+import ctypes

 import boost_histogram as bh
 import click
@@ -168,6 +169,15 @@ def train(
     # tf.debugging.enable_check_numerics()

+    # Configure GPU threads according to TensorFlow's best practices for optimal model performance
+    os.environ["TF_GPU_THREAD_MODE"] = "gpu_private"
+    os.environ["TF_GPU_THREAD_COUNT"] = "2"
+
+    # According to TensorFlow's best practices for optimal model performance, set GPU memory growth to True
+    physical_devices = tf.config.list_physical_devices("GPU")
+    for pd in physical_devices:
+        tf.config.experimental.set_memory_growth(pd, True)
+
     if seeds:
         random.seed(1234)
         np.random.seed(1234)
@@ -261,6 +271,17 @@ def train(
     with strategy.scope():
         model, optim_callbacks, initial_epoch = model_scope(config, total_steps, weights)

+    if num_gpus > 0:
+        # According to TensorFlow's best practices for optimal model performance,
+        # max out the L2 fetch granularity to 128 bytes when using NVIDIA GPUs
+        _libcudart = ctypes.CDLL("libcudart.so")
+        # Set device limit on the current device
+        # cudaLimitMaxL2FetchGranularity = 0x05
+        pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
+        _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
+        _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
+        assert pValue.contents.value == 128
+
     with strategy.scope():
         callbacks = prepare_callbacks(
             config,
@@ -304,8 +325,8 @@ def train(
             model.normalizer.variance = tf.convert_to_tensor(cache["variance"])

         model.fit(
-            ds_train.tensorflow_dataset.repeat(),
-            validation_data=ds_test.tensorflow_dataset.repeat(),
+            ds_train.tensorflow_dataset.repeat().prefetch(tf.data.AUTOTUNE),
+            validation_data=ds_test.tensorflow_dataset.repeat().prefetch(tf.data.AUTOTUNE),
             epochs=config["setup"]["num_epochs"],
             callbacks=callbacks,
             steps_per_epoch=ds_train.num_steps(),
@@ -705,6 +726,8 @@ def raytune(
     from raytune.search_space import raytune_num_samples, search_space
     from raytune.utils import get_raytune_schedule, get_raytune_search_alg

+    os.environ["TUNE_DISABLE_STRICT_METRIC_CHECKING"] = "1" # don't crash if a metric is missing
+
     if seeds:
         # Set seeds for reproducibility
         random.seed(1234)
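For context on the ctypes block added to train() above: 0x05 is CUDA's cudaLimitMaxL2FetchGranularity, so the call asks the driver for a 128-byte L2 fetch granularity on the current device and then reads the limit back. Below is a standalone sketch of the same technique that also checks the CUDA return codes; it assumes libcudart.so is resolvable through the loaded CUDA module and is an illustration, not the repository's code:

```python
import ctypes

CUDA_SUCCESS = 0
CUDA_LIMIT_MAX_L2_FETCH_GRANULARITY = ctypes.c_int(0x05)


def set_l2_fetch_granularity(nbytes=128):
    # Request an L2 fetch granularity of `nbytes` on the current GPU and
    # return the value the driver actually reports back.
    libcudart = ctypes.CDLL("libcudart.so")
    err = libcudart.cudaDeviceSetLimit(CUDA_LIMIT_MAX_L2_FETCH_GRANULARITY, ctypes.c_size_t(nbytes))
    assert err == CUDA_SUCCESS, f"cudaDeviceSetLimit failed with error code {err}"
    value = ctypes.c_size_t()
    err = libcudart.cudaDeviceGetLimit(ctypes.byref(value), CUDA_LIMIT_MAX_L2_FETCH_GRANULARITY)
    assert err == CUDA_SUCCESS, f"cudaDeviceGetLimit failed with error code {err}"
    return value.value
```

Calling set_l2_fetch_granularity(128) on an NVIDIA GPU should return 128, mirroring the assert in the diff.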
diff --git a/mlpf/raytune/search_space.py b/mlpf/raytune/search_space.py
index c77cc88b4..8509ce9b2 100644
--- a/mlpf/raytune/search_space.py
+++ b/mlpf/raytune/search_space.py
@@ -1,7 +1,7 @@
-from ray.tune import grid_search # choice, loguniform, quniform
+from ray.tune import choice # grid_search, choice, loguniform, quniform

-raytune_num_samples = 1 # Number of random samples to draw from search space. Set to 1 for grid search.
-samp = grid_search
+raytune_num_samples = 300 # Number of random samples to draw from search space. Set to 1 for grid search.
+samp = choice

 # search_space = {
 #     # Optimizer parameters
 #     "lr": samp([1e-4, 1e-3]),
@@ -73,14 +73,48 @@
 #     # "mask_reg_cls0": choice([False, True]),
 # }

-# onecycle scan
+# # onecycle scan
+# search_space = {
+#     # "lr": samp([1e-4, 1e-3, 1e-2]),
+#     # "batch_size_physical": samp([24, 40]),
+#     "batch_multiplier": samp([1, 5, 10]),
+#     # "model": samp(["gnn_dense", "transformer"]),
+#     # "lr_schedule": samp(["none", "cosinedecay", "onecycle"]),
+#     # "optimizer": samp(["pcgrad_adam", "adam", "sgd"]),
+# }
+
+# transformer scan
+# search_space = {
+#     # optimizer parameters
+#     "lr": samp([1e-5, 1e-4, 1e-3]),
+#     "batch_multiplier": samp([10, 20, 40]),
+#     # model arch parameters
+#     "num_layers_encoder": samp([1, 2, 3, 4]), # default is 1
+#     "num_layers_decoder_reg": samp([1, 2, 3, 4]), # default is 1
+#     "num_layers_decoder_cls": samp([1, 2, 3, 4]), # default is 1
+#     "hidden_dim": samp([32, 64, 128]), # default is 64
+#     "num_heads": samp([8, 16, 32, 64]), # default is 16
+#     "num_random_features": samp([16, 32, 64, 128]), # default is 32
+#     # output_decoding parameters
+#     "out_hidden_dim": samp([128, 256, 512]), # default is ~256
+#     "out_num_layers": samp([1, 2, 3, 4]), # default is ~2
+# }
+
+# gnn scan
 search_space = {
-    # "lr": samp([1e-4, 1e-3, 1e-2]),
-    # "batch_size_physical": samp([24, 40]),
-    "batch_multiplier": samp([1, 5, 10]),
-    # "model": samp(["gnn_dense", "transformer"]),
-    # "lr_schedule": samp(["none", "cosinedecay", "onecycle"]),
-    # "optimizer": samp(["pcgrad_adam", "adam", "sgd"]),
+    # optimizer parameters
+    "lr": samp([1e-4, 1e-3, 1e-2]),
+    # "batch_multiplier": samp([10, 20, 40]),
+    # model arch parameters
+    "num_graph_layers_id": samp([1, 2, 3, 4, 5, 6]),
+    "num_graph_layers_reg": samp([1, 2, 3, 4, 5, 6]),
+    "bin_size": samp([16, 32, 64, 128, 256]),
+    "output_dim": samp([64, 128, 256]),
+    "ffn_dist_hidden_dim": samp([64, 128, 256]),
+    "ffn_dist_num_layers": samp([1, 2, 3, 4, 5]),
+    # output_decoding parameters
+    "out_hidden_dim": samp([64, 128, 256, 512]),
+    "out_num_layers": samp([1, 2, 3, 4, 5]),
 }
@@ -217,4 +251,18 @@ def set_raytune_search_parameters(search_space, config):
         config["parameters"]["output_decoding"]["phi_num_layers"] = search_space["out_num_layers"]
         config["parameters"]["output_decoding"]["energy_num_layers"] = search_space["out_num_layers"]

+    # transformer specific parameters
+    if "num_layers_encoder" in search_space.keys():
+        config["parameters"]["num_layers_encoder"] = search_space["num_layers_encoder"]
+    if "num_layers_decoder_reg" in search_space.keys():
+        config["parameters"]["num_layers_decoder_reg"] = search_space["num_layers_decoder_reg"]
+    if "num_layers_decoder_cls" in search_space.keys():
+        config["parameters"]["num_layers_decoder_cls"] = search_space["num_layers_decoder_cls"]
+    if "hidden_dim" in search_space.keys():
+        config["parameters"]["hidden_dim"] = search_space["hidden_dim"]
+    if "num_heads" in search_space.keys():
+        config["parameters"]["num_heads"] = search_space["num_heads"]
+    if "num_random_features" in search_space.keys():
+        config["parameters"]["num_random_features"] = search_space["num_random_features"]
+
     return config
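The search_space.py change above replaces the exhaustive grid_search sampler with choice, so Ray Tune now draws raytune_num_samples = 300 random configurations from the listed values instead of enumerating every grid point. A self-contained sketch of that sampling behaviour is shown below; the trainable is a stand-in objective, not the MLPF training pipeline, and the exact reporting API depends on the Ray version in use:

```python
from ray import tune

# every trial receives one random combination drawn from these lists
search_space = {
    "lr": tune.choice([1e-4, 1e-3, 1e-2]),
    "bin_size": tune.choice([16, 32, 64, 128, 256]),
}


def trainable(config):
    # stand-in objective; the real pipeline trains MLPF and reports validation metrics
    tune.report(val_loss=config["lr"] * config["bin_size"])


# with choice(), num_samples is the number of random draws (300 in this diff);
# with grid_search(), num_samples=1 enumerates each grid point exactly once
tune.run(trainable, config=search_space, num_samples=300)
```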
diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py
index 95ddbc437..3b634fdbf 100644
--- a/mlpf/tfmodel/model_setup.py
+++ b/mlpf/tfmodel/model_setup.py
@@ -166,7 +166,7 @@ def prepare_callbacks(
         benchmark_dir = outdir
         if config["dataset"]["schema"] == "delphes":
             bmk_bs = config["train_test_datasets"]["delphes"]["batch_per_gpu"]
-        elif config["dataset"]["schema"] == "cms":
+        elif (config["dataset"]["schema"] == "cms") or (config["dataset"]["schema"] == "clic"):
             assert (
                 len(config["train_test_datasets"]) == 1
             ), "Expected exactly 1 key, physical OR delphes, \
@@ -176,8 +176,8 @@
             bmk_bs = config["train_test_datasets"]["physical"]["batch_per_gpu"]
         else:
             raise ValueError(
-                "Benchmark callback only supports delphes or \
-                cms dataset schema. {}".format(
+                "Benchmark callback only supports delphes, \
+                cms or clic dataset schema. {}".format(
                     config["dataset"]["schema"]
                 )
             )
@@ -232,7 +232,9 @@ def get_checkpoint_history_callback(outdir, config, dataset, comet_experiment, h
         write_graph=False,
         write_images=False,
         update_freq="batch",
-        # profile_batch=(50,100),
+        profile_batch=config["callbacks"]["tensorboard"]["profile_batch"]
+        if "profile_batch" in config["callbacks"]["tensorboard"].keys()
+        else 0,
         dump_history=config["callbacks"]["tensorboard"]["dump_history"],
     )
     # Change the class name of CustomTensorBoard TensorBoard to make keras_tuner recognise it
diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py
index aa382a7e1..dd95416cb 100644
--- a/mlpf/tfmodel/utils.py
+++ b/mlpf/tfmodel/utils.py
@@ -221,7 +221,7 @@ def get_strategy(num_cpus=None):
     num_batches_multiplier = 1
     if num_gpus > 1:
         num_batches_multiplier = num_gpus
-        logging.info("Multiple GPUs detected, num_batces_multiplier={}".format(num_batches_multiplier))
+        logging.info("Multiple GPUs detected, num_batches_multiplier={}".format(num_batches_multiplier))

     return strategy, num_gpus, num_batches_multiplier
@@ -368,7 +368,7 @@ def load_and_interleave(
 ):
     datasets = [mlpf_dataset_from_config(ds_name, config, split, max_events) for ds_name in dataset_names]
     ds = interleave_datasets(joint_dataset_name, split, datasets)
-    tensorflow_dataset = ds.tensorflow_dataset.map(get_map_to_supervised(config))
+    tensorflow_dataset = ds.tensorflow_dataset.map(get_map_to_supervised(config), num_parallel_calls=tf.data.AUTOTUNE)

     # use dynamic batching depending on the sequence length
     if config["batching"]["bucket_by_sequence_length"]:
diff --git a/requirements.txt b/requirements.txt
index b3d2f1257..217fec984 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,6 +15,7 @@ networkx
 nevergrad
 notebook
 numba
+numpy==1.23.5 # later versions are incompatible with tf2onnx v1.14.0 (latest as of this commit)
 onnxruntime
 pandas
 papermill
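As a closing note, the num_parallel_calls=tf.data.AUTOTUNE added in utils.py and the .prefetch(tf.data.AUTOTUNE) added around model.fit in pipeline.py follow the standard tf.data input-pipeline recipe: parallelize the per-event mapping and keep the next batches ready while the GPUs work on the current one. A minimal sketch of that layout on a toy dataset (illustrative only, not the repository's dataset code):

```python
import tensorflow as tf

ds = tf.data.Dataset.range(1_000)
# parallelize the supervised mapping, as in load_and_interleave
ds = ds.map(lambda x: (x, 2 * x), num_parallel_calls=tf.data.AUTOTUNE)
# batch, repeat for multi-epoch training, and prefetch so host-side preparation
# overlaps with device compute, as in the model.fit call
ds = ds.batch(32).repeat().prefetch(tf.data.AUTOTUNE)
```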