
Commit

Added todi scripts
TJ-Solergibert committed Jul 30, 2024
1 parent a8f979d commit c026422
Showing 6 changed files with 112 additions and 10 deletions.
18 changes: 9 additions & 9 deletions examples/config_llama8b_sft.yaml
@@ -20,7 +20,7 @@ general:
   benchmark_csv_path: null
   consumed_train_samples: null
   ignore_sanity_checks: true
-  project: SFT
+  project: SFT-Todi
   run: Llama3-8B
   seed: 42
   step: null
@@ -33,25 +33,25 @@ model:
   ddp_bucket_cap_mb: 25
   dtype: bfloat16
   init_method:
-    path: /mloscratch/homes/solergib/converter/nanotron/nanotron_checkpoints/NanotronLlama38B
+    path: /store/swissai/a06/models/nanotron_checkpoints/Meta-Llama-3.1-8B-Instruct
   make_vocab_size_divisible_by: 1
   model_config:
-    bos_token_id: 1
-    eos_token_id: 2
+    bos_token_id: 128000
+    eos_token_id: 128001
     hidden_act: silu
     hidden_size: 4096
     initializer_range: 0.02
     intermediate_size: 14336
     is_llama_config: true
-    max_position_embeddings: 4096
+    max_position_embeddings: 131072
     num_hidden_layers: 32
     num_attention_heads: 32
     num_key_value_heads: 8
     pad_token_id: null
     pretraining_tp: 1
     rope_interleaved: false
     rope_theta: 500000.0
-    rms_norm_eps: 1.0e-06
+    rms_norm_eps: 1.0e-05
     rope_scaling: null
     tie_word_embeddings: false
     use_cache: true
@@ -76,7 +76,7 @@ optimizer:
   weight_decay: 0.01
   zero_stage: 0
 parallelism:
-  dp: 1
+  dp: 4
   expert_parallel_size: 1
   pp: 1
   pp_engine: 1f1b
@@ -86,13 +86,13 @@ parallelism:
 profiler: null
 tokenizer:
   tokenizer_max_length: null
-  tokenizer_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+  tokenizer_name_or_path: /store/swissai/a06/models/nanotron_checkpoints/Meta-Llama-3.1-8B-Instruct
   tokenizer_revision: null
 tokens:
   batch_accumulation_per_replica: 1
   limit_test_batches: 0
   limit_val_batches: 0
-  micro_batch_size: 3
+  micro_batch_size: 4
   sequence_length: 4096
   train_steps: 250
   val_check_interval: -1
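
For reference, a rough sanity-check sketch of the effective batch size implied by the updated settings, assuming batch_accumulation_per_replica corresponds to the number of micro-batches per batch in the global_batch_size computation shown in src/nanotron/trainer.py below:

# Sketch: dp=4, micro_batch_size=4, batch_accumulation_per_replica=1, sequence_length=4096
DP=4; MBS=4; GRAD_ACC=1; SEQ_LEN=4096
GLOBAL_BATCH_SIZE=$((MBS * GRAD_ACC * DP))         # 16 sequences per optimizer step
TOKENS_PER_STEP=$((GLOBAL_BATCH_SIZE * SEQ_LEN))   # 65536 tokens per optimizer step
echo "global_batch_size=${GLOBAL_BATCH_SIZE}, tokens_per_step=${TOKENS_PER_STEP}"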
1 change: 1 addition & 0 deletions pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
     "safetensors",
     "dacite",
     "tqdm",
+    "wandb",
 ]
 
 [tool.setuptools.packages.find]
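
Since wandb is now a hard dependency, runs presumably need either a one-time login or offline mode; a minimal sketch using the standard wandb CLI (not part of this commit):

# Authenticate once on a node with network access...
wandb login
# ...or keep logging local for air-gapped compute nodes
export WANDB_MODE=offline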
2 changes: 1 addition & 1 deletion src/nanotron/trainer.py
@@ -263,7 +263,7 @@ def __init__(
             self.micro_batch_size * self.n_micro_batches_per_batch * self.parallel_context.dp_pg.size()
         )
         log_rank(
-            f"Flattening Batch dimension for SFT training. global_batch_size:{self.global_batch_size}, micro_batch_size: {self.micro_batch_size}, sequence_length: {self.sequence_length}",
+            f"Flattening Batch dimension for SFT training. global_batch_size: {self.global_batch_size}, micro_batch_size: {self.micro_batch_size}, sequence_length: {self.sequence_length}",
             logger=logger,
             level=logging.INFO,
             rank=0,
15 changes: 15 additions & 0 deletions tools/todi/Dockerfile
@@ -0,0 +1,15 @@
FROM nvcr.io/nvidia/pytorch:24.05-py3

# Setup
RUN apt-get update && apt-get install python3-pip python3-venv -y
RUN pip install --upgrade pip setuptools==69.5.1

RUN pip install flash-attn==2.5.8 --no-build-isolation

COPY nanotron/ /workspace/nanotron
WORKDIR /workspace/nanotron
RUN pip install -e '.[nanosets]'

# Instructions:
# 1. Build image: podman build -f /users/asolergi/SFT/nanotron/tools/todi/Dockerfile -t nanotron_sft /users/asolergi/SFT/   # NOTE: the build context /users/asolergi/SFT/ must contain nanotron/ (i.e. /users/asolergi/SFT/nanotron)
# 2. Export image: enroot import -o /store/swissai/a06/.sft_toni/nanotron_sft.sqsh podman://localhost/nanotron_sft:latest
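
A quick smoke test of the image before the enroot export could look like this (a sketch, assuming podman is available on the build host and reusing the tag from the instructions above):

podman build -f /users/asolergi/SFT/nanotron/tools/todi/Dockerfile -t nanotron_sft /users/asolergi/SFT/
# Check that nanotron and flash-attn import inside the container before exporting the image
podman run --rm localhost/nanotron_sft:latest python -c "import nanotron, flash_attn; print(nanotron.__file__)"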
15 changes: 15 additions & 0 deletions tools/todi/nanotron_sft.toml
@@ -0,0 +1,15 @@
image = "/store/swissai/a06/.sft_toni/nanotron_sft.sqsh"
mounts = [
"/capstor",
"/users",
"/store",
]
workdir = "/workspace/nanotron/"

[env]
FI_CXI_DISABLE_HOST_REGISTER = "1"
FI_MR_CACHE_MONITOR = "userfaultfd"

[annotations.com.hooks]
aws_ofi_nccl.enabled = "true"
aws_ofi_nccl.variant = "cuda12"
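
This TOML is the container environment definition that the submission script below passes to Slurm via --environment. For a quick interactive check that the image and mounts resolve, something like the following might work (a sketch, assuming the container hook also honors --environment on srun and that the todi reservation is accessible):

srun --reservation=todi --environment=/store/swissai/a06/.sft_toni/nanotron_sft.toml \
    python -c "import nanotron, torch; print(torch.__version__)"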
71 changes: 71 additions & 0 deletions tools/todi/submit_nanotron_sft.sh
@@ -0,0 +1,71 @@
#!/bin/bash

#SBATCH --job-name nanotron_sft
#SBATCH --chdir /users/asolergi/SFT/nanotron # TODO Set this path!!!
#SBATCH --output reports/R-%x.%j.out # Make sure this path exists, otherwise the job will fail silently
#SBATCH --error reports/R-%x.%j.err # Make sure this path exists, otherwise the job will fail silently
#SBATCH --nodes 4 # number of Nodes
#SBATCH --ntasks-per-node 1 # number of MP tasks. IMPORTANT: torchrun represents just 1 Slurm task
#SBATCH --gres gpu:4 # Number of GPUs
#SBATCH --cpus-per-task 288 # number of CPUs per task.
#SBATCH --time 11:59:59 # maximum execution time (HH:MM:SS). Mandatory field in MN5
#SBATCH --reservation todi
#SBATCH --environment /store/swissai/a06/.sft_toni/nanotron_sft.toml
#SBATCH --contiguous

echo "START TIME: $(date)"

# auto-fail on any errors in this script
set -eo pipefail

# logging script's variables/commands for future debug needs
set -x

######################
### Set environment ###
######################
GPUS_PER_NODE=4
echo "NODES: $SLURM_NNODES"
######################

######################
#### Set network #####
######################
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
######################

# note that we don't want to interpolate `\$SLURM_PROCID` till `srun` since otherwise all nodes will get
# 0 and the launcher will hang
#
# same goes for `\$(hostname -s|tr -dc '0-9')` - we want it to interpolate at `srun` time
LAUNCHER="torchrun \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $SLURM_NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
--node_rank \${SLURM_PROCID} \
"

PYTHON_FILE=/workspace/nanotron/run_train.py
NANOTRON_CONFIG=/users/asolergi/SFT/nanotron/examples/config_llama8b_sft.yaml # TODO Set this path!!!

export CMD="CUDA_DEVICE_MAX_CONNECTIONS=1 $LAUNCHER $PYTHON_FILE --config $NANOTRON_CONFIG"

echo $CMD

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
SRUN_ARGS=" \
--cpus-per-task $SLURM_CPUS_PER_TASK \
--jobid $SLURM_JOB_ID \
--wait 60 \
--unbuffered \
"

# bash -c is needed for the delayed interpolation of env vars to work
srun $SRUN_ARGS bash -c "$CMD"

echo "END TIME: $(date)"
