From 7ad13f41060f01f47b136943745b4b71dd630be9 Mon Sep 17 00:00:00 2001
From: Eric Wulff <31319227+erwulff@users.noreply.github.com>
Date: Mon, 12 Aug 2024 10:18:56 +0200
Subject: [PATCH] Fix use of deprecated Ray Tune environment variable (#338)

* chore: update raytune search space, utils and startscript
* fix: raytune deprecated env var for storage_path
  Also add num samples to draw in HPO as cmd line arg
* chore: update clic config file for jureap57
* feat: script to build python env from scratch
* chore: update startscripts for raytrain and raytune
* fix CMS model path for ACAT2022
* MLPF datasets v2.0.0: track pythia-level genjets, genmet in datasets; add per-particle ispu flag (#332)
* generate ttbar nopu events
* up
* update postprocessing
* small sample generation
* v3_1 run
* updates for CMSSE 14 generation
* [skip ci] cleanup postprocessing
* [skip ci] update pu gen
* update postprocessing with new truth definition based only on caloparticles
* remove pdb, switch genjet to energy
* [skip ci] prepare for v3_3
* [skip ci] fix flag
* added time and mem limits
* pu files from scratch
* 20240702_cptruthdef submission
* ttbar nopu v2
* up
* added genjet, genmet to clic postprocessing
* remove delphes
* update tests
* add postprocessing jobs
* update torch
* update dataset version
* propagate genjets, genmet
* shared memory error
* training on v2.0.0 for cms
* fix occasional root file load bug
* add jmenano
* fix qq
* clic training
* up
* CMS training instructions (#336)
* CMS training instructions
* Update pyg-clic.yaml
* Update pyg-clic.yaml
* fix: black formatting
* Enable CI/CD test of HPO workflow
* fix: typo in test script

---------

Co-authored-by: Joosep Pata
---
 mlpf/pyg/training.py             |  26 ++------
 mlpf/pyg_pipeline.py             |   1 +
 mlpf/raytune/pt_search_space.py  |  57 +++++++---------
 mlpf/raytune/utils.py            |  16 -----
 parameters/pytorch/pyg-clic.yaml |   6 +-
 scripts/jureca/build_pip_env.sh  |  40 ++++++++++++
 scripts/jureca/pt_raytrain.sh    | 109 +++++++++++++++++++++++++++++++
 scripts/jureca/pt_raytune.sh     | 102 +++++++++++++++++++++++++++++
 scripts/local_test_pyg.sh        |   8 +--
 9 files changed, 290 insertions(+), 75 deletions(-)
 create mode 100644 scripts/jureca/build_pip_env.sh
 create mode 100644 scripts/jureca/pt_raytrain.sh
 create mode 100644 scripts/jureca/pt_raytune.sh
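For context on the headline fix: recent Ray releases deprecate the RAY_AIR_LOCAL_CACHE_DIR and TUNE_RESULT_DIR environment variables that the diff below removes, in favour of passing a storage location to the run configuration. A minimal sketch of that replacement pattern, assuming Ray >= 2.7; the trainable, metric and result directory are illustrative and not taken from this patch:

import os

from ray import train, tune


def trainable(config):
    # toy objective standing in for a full MLPF training trial
    return {"val_loss": (config["lr"] - 3e-4) ** 2}


# storage_path on RunConfig replaces the removed RAY_AIR_LOCAL_CACHE_DIR / TUNE_RESULT_DIR env vars
run_config = train.RunConfig(
    name="hpo_example",
    storage_path=os.path.abspath("./ray_results"),  # must be an absolute path or URI
)

tuner = tune.Tuner(
    trainable,
    param_space={"lr": tune.loguniform(1e-5, 1e-3)},
    tune_config=tune.TuneConfig(metric="val_loss", mode="min", num_samples=4),
    run_config=run_config,
)
print(tuner.fit().get_best_result().config)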
diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py
index 8419d3f9d..aa9e9d770 100644
--- a/mlpf/pyg/training.py
+++ b/mlpf/pyg/training.py
@@ -855,6 +855,8 @@ def train_ray_trial(config, args, outdir=None):
 
     if outdir is None:
         outdir = ray.train.get_context().get_trial_dir()
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
 
     use_cuda = args.gpus > 0
 
@@ -970,11 +972,6 @@ def run_ray_training(config, args, outdir):
     from ray import tune
     from ray.train.torch import TorchTrainer
 
-    # create ray cache for intermediate storage of trials
-    tmp_ray_cache = TemporaryDirectory()
-    os.environ["RAY_AIR_LOCAL_CACHE_DIR"] = tmp_ray_cache.name
-    _logger.info(f"RAY_AIR_LOCAL_CACHE_DIR: {os.environ['RAY_AIR_LOCAL_CACHE_DIR']}")
-
     if not args.local:
         ray.init(address="auto")
 
@@ -1027,9 +1024,6 @@ def run_ray_training(config, args, outdir):
     _logger.info("Final val_reg_loss: {}".format(result.metrics["val_reg_loss"]), color="bold")
     # _logger.info("Final val_charge_loss: {}".format(result.metrics["val_charge_loss"]), color="bold")
 
-    # clean up ray cache
-    tmp_ray_cache.cleanup()
-
 
 def set_searchspace_and_run_trial(search_space, config, args):
     import ray
@@ -1066,25 +1060,21 @@ def run_hpo(config, args):
     from raytune.pt_search_space import raytune_num_samples, search_space
     from raytune.utils import get_raytune_schedule, get_raytune_search_alg
 
-    # create ray cache for intermediate storage of trials
-    tmp_ray_cache = TemporaryDirectory()
-    os.environ["RAY_AIR_LOCAL_CACHE_DIR"] = tmp_ray_cache.name
-    _logger.info(f"RAY_AIR_LOCAL_CACHE_DIR: {os.environ['RAY_AIR_LOCAL_CACHE_DIR']}")
+    if args.raytune_num_samples:
+        raytune_num_samples = args.raytune_num_samples  # noqa: F811
 
     name = args.hpo  # name of Ray Tune experiment directory
     os.environ["TUNE_DISABLE_STRICT_METRIC_CHECKING"] = "1"  # don't crash if a metric is missing
 
     if isinstance(config["raytune"]["local_dir"], type(None)):
         raise TypeError("Please specify a local_dir in the raytune section of the config file.")
-    trd = config["raytune"]["local_dir"] + "/tune_result_dir"
-    os.environ["TUNE_RESULT_DIR"] = trd
 
     expdir = Path(config["raytune"]["local_dir"]) / name
     expdir.mkdir(parents=True, exist_ok=True)
     dirname = Path(config["raytune"]["local_dir"]) / name
     shutil.copy(
-        "mlpf/raytune/search_space.py",
-        str(dirname / "search_space.py"),
+        "mlpf/raytune/pt_search_space.py",
+        str(dirname / "pt_search_space.py"),
     )  # Copy the search space definition file to the train dir for later reference
     # Save config for later reference. Note that saving happens after parameters are overwritten by cmd line args.
     with open((dirname / "config.yaml"), "w") as file:
@@ -1095,7 +1085,6 @@ def run_hpo(config, args):
         ray.init(
             address=os.environ["ip_head"],
             _node_ip_address=os.environ["head_node_ip"],
-            # _temp_dir="/p/project/raise-ctp2/cern/tmp_ray",
         )
         _logger.info("Done.")
 
@@ -1158,6 +1147,3 @@ def run_hpo(config, args):
 
     logging.info("Total time of Tuner.fit(): {}".format(end - start))
     logging.info("Best hyperparameters found according to {} were: {}".format(config["raytune"]["default_metric"], best_config))
-
-    # clean up ray cache
-    tmp_ray_cache.cleanup()
diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py
index c68349c57..96a18d65c 100644
--- a/mlpf/pyg_pipeline.py
+++ b/mlpf/pyg_pipeline.py
@@ -63,6 +63,7 @@
 parser.add_argument("--local", action="store_true", default=None, help="perform HPO locally, without a Ray cluster")
 parser.add_argument("--ray-cpus", type=int, default=None, help="CPUs per trial for HPO")
 parser.add_argument("--ray-gpus", type=int, default=None, help="GPUs per trial for HPO")
+parser.add_argument("--raytune-num-samples", type=int, default=None, help="Number of samples to draw from search space")
 parser.add_argument("--comet", action="store_true", help="use comet ml logging")
 parser.add_argument("--comet-offline", action="store_true", help="save comet logs locally")
 parser.add_argument("--comet-step-freq", type=int, default=None, help="step frequency for saving comet metrics")
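The new --raytune-num-samples flag above ends up as Ray Tune's num_samples, which matters once the search space below switches from grid_search to choice: each sample then draws one random combination, so the sample count (raytune_num_samples, now 400 by default) sets the size of the scan. A rough sketch of the difference on a toy two-parameter space, not the MLPF one:

from ray import tune

# grid_search enumerates every combination once; num_samples=1 is enough
grid_space = {"lr": tune.grid_search([1e-4, 1e-3]), "num_convs": tune.grid_search([2, 4])}

# choice draws one random combination per sample; num_samples controls how many are tried
random_space = {"lr": tune.choice([1e-4, 1e-3]), "num_convs": tune.choice([2, 4])}


def objective(config):
    # toy objective standing in for a training run
    return {"val_loss": config["lr"] * config["num_convs"]}


grid_tuner = tune.Tuner(objective, param_space=grid_space, tune_config=tune.TuneConfig(num_samples=1))
random_tuner = tune.Tuner(objective, param_space=random_space, tune_config=tune.TuneConfig(num_samples=8))
print(len(grid_tuner.fit()), len(random_tuner.fit()))  # 4 grid trials vs. 8 random draws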
+samp = choice
 
 # gnn scan
 search_space = {
     # dataset parameters
-    "ntrain": samp([500]),
+    # "ntrain": samp([500]),
     # "ntest": samp([10000]),
-    "nvalid": samp([500]),
-    "num_epochs": samp([10]),
+    # "nvalid": samp([500]),
+    # "num_epochs": samp([10]),
     # optimizer parameters
-    "lr": samp([1e-4, 3e-4, 1e-3, 3e-3]),
-    "lr_schedule": samp(["onecycle"]),
-    "pct_start": samp([0.05]),
-    # "gpu_batch_multiplier": samp([1, 4, 8, 16]),
+    "lr": samp([1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3]),
+    # "lr_schedule": samp(["onecycle"]),
+    # "pct_start": samp([0.0, 0.05, 0.1]),
+    "gpu_batch_multiplier": samp([1, 4, 8, 16]),
     # "patience": samp([9999]),
     # model arch parameters
-    # "activation": samp(["elu", "relu", "relu6", "leakyrelu"]),
-    "conv_type": samp(["attention"]),  # can be "gnn_lsh", "gravnet", "attention"
+    "activation": samp(["elu", "relu", "relu6", "leakyrelu"]),
+    # "conv_type": samp(["attention"]),  # can be "gnn_lsh", "gravnet", "attention"
     # "embedding_dim": samp([32, 64, 128, 252, 512, 1024]),
     # "width": samp([32, 64, 128, 256, 512, 1024]),
-    # "num_convs": samp([1, 2, 3, 4, 5, 6]),
+    "num_convs": samp([1, 2, 3, 4, 5]),
     # "dropout": samp([0.0, 0.01, 0.1, 0.4]),
-    # only for gravnet
-    # "k": samp([8, 16, 32]),
-    # "propagate_dimensions": samp([8, 16, 32, 64, 128]),
-    # "space_dimensions": samp([4]),
     # only for gnn-lsh
-    # "bin_size": samp([160, 320, 640]),
+    # "bin_size": samp([80, 160, 320, 640]),
     # "max_num_bins": samp([200]),
-    # "distance_dim": samp([16, 32, 64, 128, 256]),
+    # "distance_dim": samp([128]),
     # "layernorm": samp([True, False]),
-    # "num_node_messages": samp([1, 2, 3, 4, 5]),
-    # "ffn_dist_hidden_dim": samp([16, 32, 64, 128, 256]),
-    # "ffn_dist_num_layers": samp([1, 2, 3, 4, 5, 6]),
+    # "num_node_messages": samp([2]),
+    # "ffn_dist_hidden_dim": samp([64]),
+    # "ffn_dist_num_layers": samp([3]),
     # mamba specific parameters
     # "d_state": samp([16]),
     # "d_conv": samp([4]),
     # "expand": samp([2]),
     # "num_heads": samp([2, 4, 6, 8, 10, 12]),
     # attention specifica parameters
-    "num_heads": samp([2, 4, 8, 16]),
+    "num_heads": samp([4, 8, 16, 32, 64]),
+    "head_dim": samp([4, 8, 16, 32, 64]),
     # "attention_type": samp(["flash"]),  # flash, efficient, math
 }
@@ -56,31 +53,27 @@ def set_hps_from_search_space(search_space, config):
         conv_type = search_space["conv_type"]
         config["conv_type"] = conv_type
 
-    common_varaible_names = ["embedding_dim", "width", "num_convs", "activation"]
+    common_varaible_names = ["num_convs", "activation"]
     if conv_type == "gnn_lsh" or conv_type == "gravnet" or conv_type == "attention":
         for var in common_varaible_names:
             if var in search_space.keys():
                 config["model"][conv_type][var] = search_space[var]
 
-    gravnet_variable_names = ["k", "propagate_dimensions", "space_dimensions"]
-    if conv_type == "gravnet":
-        for var in gravnet_variable_names:
-            if var in search_space.keys():
-                config["model"][conv_type][var] = search_space[var]
-
-    attention_variables = ["num_heads"]
+    attention_variables = ["head_dim", "num_heads"]
     if conv_type == "attention":
         for var in attention_variables:
             if var in search_space.keys():
                 config["model"][conv_type][var] = search_space[var]
 
-    mamba_variables = ["num_heads", "d_state", "d_conv", "expand"]
+    mamba_variables = ["width", "embedding_dim", "num_heads", "d_state", "d_conv", "expand"]
     if conv_type == "mamba":
         for var in mamba_variables:
             if var in search_space.keys():
                 config["model"][conv_type][var] = search_space[var]
 
     gnn_lsh_varaible_names = [
+        "width",
+        "embedding_dim",
         "bin_size",
         "max_num_bins",
         "distance_dim",
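set_hps_from_search_space above copies each sampled value into the nested model configuration before a trial starts. A stripped-down sketch of that mapping on a toy config dict, to show where the flat search-space keys land; the real function additionally dispatches on per-architecture key lists:

# one sampled point, as Ray Tune would hand it to a trial
sampled = {"lr": 3e-4, "num_convs": 3, "num_heads": 16, "head_dim": 8}

# minimal stand-in for the YAML config loaded by the pipeline
config = {
    "lr": None,
    "conv_type": "attention",
    "model": {"attention": {"num_convs": None, "num_heads": None, "head_dim": None}},
}

# model-specific keys go under config["model"][conv_type], everything else stays top-level
model_cfg = config["model"][config["conv_type"]]
for key, value in sampled.items():
    if key in model_cfg:
        model_cfg[key] = value
    else:
        config[key] = value

print(config["lr"], config["model"]["attention"])
# 0.0003 {'num_convs': 3, 'num_heads': 16, 'head_dim': 8}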
diff --git a/mlpf/raytune/utils.py b/mlpf/raytune/utils.py
index 91ba51c7f..fdcc55da3 100644
--- a/mlpf/raytune/utils.py
+++ b/mlpf/raytune/utils.py
@@ -8,7 +8,6 @@
 from ray.tune.search.bayesopt import BayesOptSearch
 from ray.tune.search.bohb import TuneBOHB
 from ray.tune.search.hyperopt import HyperOptSearch
-from ray.tune.search.skopt import SkOptSearch
 
 # from ray.tune.search.hebo import HEBOSearch  # HEBO is not yet supported
 
@@ -50,21 +49,6 @@ def get_raytune_search_alg(raytune_cfg, seeds=False):
             n_initial_points=raytune_cfg["hyperopt"]["n_random_steps"],
             # points_to_evaluate=,
         )
-    if raytune_cfg["search_alg"] == "scikit":
-        print("INFO: Using bayesian optimization from scikit-learn")
-        return SkOptSearch(
-            metric=raytune_cfg["default_metric"],
-            mode=raytune_cfg["default_mode"],
-            convert_to_python=True,
-        )
-    # HEBO is not yet supported
-    # if (raytune_cfg["search_alg"] == "hebo") or (raytune_cfg["search_alg"] == "HEBO"):
-    #     print("Using HEBOSearch")
-    #     return HEBOSearch(
-    #         metric=raytune_cfg["default_metric"],
-    #         mode=raytune_cfg["default_mode"],
-    #         # max_concurrent=8,
-    #     )
     else:
         print("INFO: Not using any Ray Tune search algorithm")
         return None
diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml
index 1cc9479bc..93774b632 100644
--- a/parameters/pytorch/pyg-clic.yaml
+++ b/parameters/pytorch/pyg-clic.yaml
@@ -10,7 +10,7 @@ num_epochs: 100
 patience: 20
 lr: 0.0001
 lr_schedule: cosinedecay # constant, cosinedecay, onecycle
-conv_type: gnn_lsh
+conv_type: gnn_lsh # gnn_lsh, attention, mamba, flashattention
 ntrain:
 ntest:
 nvalid:
@@ -80,8 +80,8 @@ lr_schedule_config:
     pct_start: 0.3
 
 raytune:
-  local_dir: # Note: please specify an absolute path
-  sched: asha # asha, hyperband
+  local_dir: # Note: please specify an absolute path
+  sched: # asha, hyperband
   search_alg: # bayes, bohb, hyperopt, nevergrad, scikit
   default_metric: "val_loss"
   default_mode: "min"
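With the scikit/SkOptSearch branch dropped from mlpf/raytune/utils.py, hyperopt remains the Bayesian-style option behind the search_alg key in the raytune block above. A minimal sketch of what that wiring amounts to, assuming the hyperopt package is installed (the environment script below pulls it in); the objective and parameter values are illustrative:

from ray import tune
from ray.tune.search.hyperopt import HyperOptSearch


def objective(config):
    # toy objective standing in for a training run
    return {"val_loss": (config["lr"] - 3e-4) ** 2}


search_alg = HyperOptSearch(metric="val_loss", mode="min", n_initial_points=10)
tuner = tune.Tuner(
    objective,
    param_space={"lr": tune.loguniform(1e-5, 1e-3)},
    tune_config=tune.TuneConfig(search_alg=search_alg, num_samples=20),
)
print(tuner.fit().get_best_result(metric="val_loss", mode="min").config)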
diff --git a/scripts/jureca/build_pip_env.sh b/scripts/jureca/build_pip_env.sh
new file mode 100644
index 000000000..e1fc240fb
--- /dev/null
+++ b/scripts/jureca/build_pip_env.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# 2023-12-14
+# Author: E. Wulff
+
+
+module --force purge
+ml Stages/2024 GCC/12.3.0 Python/3.11.3
+ml CUDA/12 cuDNN/8.9.5.29-CUDA-12 NCCL/default-CUDA-12 Apptainer-Tools/2024
+
+jutil env activate -p jureap57
+
+python3 -m venv ray_tune_env
+
+source ray_tune_env/bin/activate
+
+pip3 install --upgrade pip
+pip3 install "numpy<1.25"
+pip3 install "pandas<1.6.0dev0"
+pip3 install scikit-learn
+pip3 install matplotlib
+pip3 install tqdm
+pip3 install autopep8
+pip3 install mplhep
+pip3 install awkward
+pip3 install fastjet
+pip3 install comet-ml
+pip3 install tensorflow_datasets==4.9.3
+pip3 install torch torchvision
+pip3 install "hls4ml[profiling]"
+pip3 install torch_geometric
+pip3 install "ray[data,train,tune,serve]"
+pip3 install async_timeout
+pip3 install numba
+pip3 install hyperopt
+pip3 install causal-conv1d==1.0.2
+pip3 install mamba-ssm
+pip3 install comet-ml
+
+deactivate
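A quick sanity check that could be run once the environment above is built, to confirm that the packages the HPO workflow imports resolve inside ray_tune_env; the package list is taken from the script, the snippet itself is not part of the patch:

import importlib

# core packages installed by build_pip_env.sh that the training/HPO code imports
for name in ["ray", "torch", "torch_geometric", "tensorflow_datasets", "hyperopt", "numba", "awkward"]:
    module = importlib.import_module(name)
    print(f"{name:22s} {getattr(module, '__version__', 'unknown')}")

# the ray[tune] extra is needed for the search algorithms in mlpf/raytune/utils.py
from ray import tune  # noqa: E402,F401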
diff --git a/scripts/jureca/pt_raytrain.sh b/scripts/jureca/pt_raytrain.sh
new file mode 100644
index 000000000..b0458e962
--- /dev/null
+++ b/scripts/jureca/pt_raytrain.sh
@@ -0,0 +1,109 @@
+#!/bin/sh
+
+#SBATCH --account=jureap57
+#SBATCH --partition=dc-gpu-devel
+#SBATCH --time 2:00:00
+#SBATCH --nodes 1
+#SBATCH --tasks-per-node=1
+#SBATCH --gres=gpu:4
+#SBATCH --gpus-per-task=4
+#SBATCH --cpus-per-task=128
+
+# Job name
+#SBATCH -J raytrain
+
+# Output and error logs
+#SBATCH -o logs_slurm/log_%x_%j.out
+#SBATCH -e logs_slurm/log_%x_%j.err
+
+# Add jobscript to job output
+echo "#################### Job submission script. #############################"
+cat $0
+echo "################# End of job submission script. #########################"
+
+
+module --force purge
+ml Stages/2024 GCC/12.3.0 Python/3.11.3
+ml CUDA/12 cuDNN/8.9.5.29-CUDA-12 NCCL/default-CUDA-12 Apptainer-Tools/2024
+
+jutil env activate -p jureap57
+
+source ray_tune_env/bin/activate
+
+echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID"
+echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
+echo "DEBUG: SLURM_NNODES: $SLURM_NNODES"
+echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS"
+echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE"
+echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST"
+echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME"
+echo "DEBUG: SLURM_NODEID: $SLURM_NODEID"
+echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID"
+echo "DEBUG: SLURM_PROCID: $SLURM_PROCID"
+echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
+echo "DEBUG: SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES"
+echo "DEBUG: SLURM_CPUS_PER_TASK: $SLURM_CPUS_PER_TASK"
+echo "DEBUG: SLURM_GPUS_PER_TASK: $SLURM_GPUS_PER_TASK"
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+num_gpus=${SLURM_GPUS_PER_TASK} # gpus per compute node
+export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} # necessary on JURECA for Ray to work
+
+## Limit number of max pending trials
+export TUNE_MAX_PENDING_TRIALS_PG=$(($SLURM_NNODES * 4))
+
+## Disable Ray Usage Stats
+export RAY_USAGE_STATS_DISABLE=1
+
+
+################# DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ###############
+# if [ "$SLURM_JOB_NUM_NODES" -gt 1 ]; then
+nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+nodes_array=($nodes)
+
+head_node=${nodes_array[0]}
+
+port=7639
+
+export ip_head="$head_node"i:"$port"
+export head_node_ip="$head_node"i
+
+echo "Starting HEAD at $head_node"
+# apptainer exec --nv -B /p/project/jureap57/cern \
+#     apptainer/images/jureca_torch2307.sif \
+srun --nodes=1 --ntasks=1 -w "$head_node" \
+    ray start --head --node-ip-address="$head_node"i --port=$port \
+    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block &
+sleep 20
+
+# number of nodes other than the head node
+worker_num=$((SLURM_JOB_NUM_NODES - 1))
+for ((i = 1; i <= worker_num; i++)); do
+    node_i=${nodes_array[$i]}
+    echo "Starting WORKER $i at $node_i"
+    srun --nodes=1 --ntasks=1 -w "$node_i" \
+        ray start --address "$head_node"i:"$port" --redis-password='5241580000000000' \
+        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block &
+    sleep 10
+done
+echo All Ray workers started.
+# fi
+##############################################################################################
+
+echo 'Starting training.'
+# when training with Ray Train, --gpus should be equal to the total number of GPUs across the Ray cluster
+# apptainer exec --nv -B /p/project/jureap57/cern/data/tensorflow_datasets,/p/project/jureap57/cern/particleflow \
+#     apptainer/images/jureca_torch2307.sif \
+python3 -u $PWD/mlpf/pyg_pipeline.py --train --ray-train \
+    --config $1 \
+    --prefix $2 \
+    --ray-cpus $((SLURM_CPUS_PER_TASK*SLURM_JOB_NUM_NODES)) \
+    --gpus $((SLURM_GPUS_PER_TASK*SLURM_JOB_NUM_NODES)) \
+    --gpu-batch-multiplier 8 \
+    --num-workers 8 \
+    --prefetch-factor 8 \
+    --experiments-dir /p/project/jureap57/cern/particleflow/experiments \
+    --local \
+    --ntrain 50000
+
+echo 'Training done.'
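The job script above and pt_raytune.sh below both start the Ray head and workers with srun and export ip_head and head_node_ip, which run_hpo in mlpf/pyg/training.py then consumes via ray.init(address=os.environ["ip_head"]). A small sketch of that same connection step, useful for checking from inside the allocation that every node and GPU joined the cluster; the variable names are as exported by the scripts:

import os

import ray

# the job scripts export ip_head as "<head-node>i:<port>" before launching the pipeline
ray.init(address=os.environ["ip_head"], _node_ip_address=os.environ["head_node_ip"])

resources = ray.cluster_resources()
print("nodes:", len(ray.nodes()))
print("CPUs:", resources.get("CPU", 0))
print("GPUs:", resources.get("GPU", 0))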
diff --git a/scripts/jureca/pt_raytune.sh b/scripts/jureca/pt_raytune.sh
new file mode 100644
index 000000000..f876d7bff
--- /dev/null
+++ b/scripts/jureca/pt_raytune.sh
@@ -0,0 +1,102 @@
+#!/bin/sh
+
+#SBATCH --account=jureap57
+#SBATCH --partition=dc-gpu-devel
+#SBATCH --time 0:20:00
+#SBATCH --nodes=2
+#SBATCH --tasks-per-node=1
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=128
+#SBATCH --gpus-per-task=4
+#SBATCH --exclusive
+
+# Job name
+#SBATCH -J raytune
+
+# Output and error logs
+#SBATCH -o logs_slurm/log_%x_%j.out
+#SBATCH -e logs_slurm/log_%x_%j.err
+
+# Add jobscript to job output
+echo "#################### Job submission script. #############################"
+cat $0
+echo "################# End of job submission script. #########################"
+
+
+module --force purge
+ml Stages/2024 GCC/12.3.0 Python/3.11.3
+ml CUDA/12 cuDNN/8.9.5.29-CUDA-12 NCCL/default-CUDA-12 Apptainer-Tools/2024
+
+jutil env activate -p jureap57
+
+source ray_tune_env/bin/activate
+
+echo "Python used:"
+which python3
+python3 --version
+
+sleep 1
+# make sure CUDA devices are visible
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} # necessary on JURECA for Ray to work
+
+num_gpus=4
+
+## Limit number of max pending trials
+export TUNE_MAX_PENDING_TRIALS_PG=$(($SLURM_NNODES * 4))
+
+## Disable Ray Usage Stats
+export RAY_USAGE_STATS_DISABLE=1
+
+
+################# DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ###############
+nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+nodes_array=($nodes)
+
+head_node=${nodes_array[0]}
+
+port=7639
+
+export ip_head="$head_node"i:"$port"
+export head_node_ip="$head_node"i
+
+echo "Starting HEAD at $head_node"
+srun --nodes=1 --ntasks=1 -w "$head_node" \
+    ray start --head --node-ip-address="$head_node"i --port=$port \
+    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block &
+sleep 10
+
+# number of nodes other than the head node
+worker_num=$((SLURM_JOB_NUM_NODES - 1))
+for ((i = 1; i <= worker_num; i++)); do
+    node_i=${nodes_array[$i]}
+    echo "Starting WORKER $i at $node_i"
+    srun --nodes=1 --ntasks=1 -w "$node_i" \
+        ray start --address "$head_node"i:"$port" --redis-password='5241580000000000' \
+        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block &
+    sleep 5
+done
+echo All Ray workers started.
+##############################################################################################
+
+# echo "Starting test..."
+# python3 -u $PWD/mlpf/raytune/rayinit.py
+# echo "Exited test."
+
+echo 'Starting HPO.'
+# when training with Ray Train, --gpus should be equal to the total number of GPUs across the Ray cluster
+python3 -u $PWD/mlpf/pyg_pipeline.py --train \
+    --data-dir /p/project/jureap57/cern/tensorflow_datasets/clusters \
+    --config $1 \
+    --hpo $2 \
+    --ray-cpus 64 \
+    --gpus $num_gpus \
+    --num-workers 8 \
+    --prefetch-factor 8 \
+    --gpu-batch-multiplier 8 \
+    --num-epochs 2 \
+    --ntrain 5000 \
+    --nvalid 5000 \
+    --raytune-num-samples 2
+
+echo 'HPO done.'
diff --git a/scripts/local_test_pyg.sh b/scripts/local_test_pyg.sh
index 2a0b9997b..00fd3cbd0 100755
--- a/scripts/local_test_pyg.sh
+++ b/scripts/local_test_pyg.sh
@@ -33,7 +33,7 @@ python mlpf/pyg_pipeline.py --config parameters/pytorch/pyg-cms.yaml --dataset c
     --prefix MLPF_test_ --num-epochs 2 --nvalid 1 --gpus 0 --train --test --make-plots --conv-type attention \
     --export-onnx --pipeline --dtype float32 --attention-type math --num-convs 1
 
-#test Ray Train training
-# python mlpf/pyg_pipeline.py --config parameters/pytorch/pyg-cms.yaml --dataset cms --data-dir ${PWD}/tensorflow_datasets/ \
-#     --prefix MLPF_test_ --num-epochs 2 --nvalid 1 --gpus 0 --train --ray-train --ray-cpus 2 --local --conv-type attention \
-#     --pipeline --dtype float32 --attention-type math --num-convs 1 --experiments-dir ${PWD}/experiments
+# test Ray Train training
+python mlpf/pyg_pipeline.py --config parameters/pytorch/pyg-cms.yaml --dataset cms --data-dir ${PWD}/tensorflow_datasets/ \
+    --prefix MLPF_test_ --num-epochs 2 --nvalid 1 --gpus 0 --train --ray-train --ray-cpus 2 --local --conv-type attention \
+    --pipeline --dtype float32 --attention-type math --num-convs 1 --experiments-dir ${PWD}/experiments