
Commit

Added todi scripts
TJ-Solergibert committed Jul 30, 2024
1 parent a8f979d commit c026422
Showing 6 changed files with 112 additions and 10 deletions.
18 changes: 9 additions & 9 deletions examples/config_llama8b_sft.yaml
@@ -20,7 +20,7 @@ general:
   benchmark_csv_path: null
   consumed_train_samples: null
   ignore_sanity_checks: true
-  project: SFT
+  project: SFT-Todi
   run: Llama3-8B
   seed: 42
   step: null
@@ -33,25 +33,25 @@ model:
   ddp_bucket_cap_mb: 25
   dtype: bfloat16
   init_method:
-    path: /mloscratch/homes/solergib/converter/nanotron/nanotron_checkpoints/NanotronLlama38B
+    path: /store/swissai/a06/models/nanotron_checkpoints/Meta-Llama-3.1-8B-Instruct
   make_vocab_size_divisible_by: 1
   model_config:
-    bos_token_id: 1
-    eos_token_id: 2
+    bos_token_id: 128000
+    eos_token_id: 128001
     hidden_act: silu
     hidden_size: 4096
     initializer_range: 0.02
     intermediate_size: 14336
     is_llama_config: true
-    max_position_embeddings: 4096
+    max_position_embeddings: 131072
     num_hidden_layers: 32
     num_attention_heads: 32
     num_key_value_heads: 8
     pad_token_id: null
     pretraining_tp: 1
     rope_interleaved: false
     rope_theta: 500000.0
-    rms_norm_eps: 1.0e-06
+    rms_norm_eps: 1.0e-05
     rope_scaling: null
     tie_word_embeddings: false
     use_cache: true
@@ -76,7 +76,7 @@ optimizer:
   weight_decay: 0.01
   zero_stage: 0
 parallelism:
-  dp: 1
+  dp: 4
   expert_parallel_size: 1
   pp: 1
   pp_engine: 1f1b
@@ -86,13 +86,13 @@ parallelism:
 profiler: null
 tokenizer:
   tokenizer_max_length: null
-  tokenizer_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+  tokenizer_name_or_path: /store/swissai/a06/models/nanotron_checkpoints/Meta-Llama-3.1-8B-Instruct
   tokenizer_revision: null
 tokens:
   batch_accumulation_per_replica: 1
   limit_test_batches: 0
   limit_val_batches: 0
-  micro_batch_size: 3
+  micro_batch_size: 4
   sequence_length: 4096
   train_steps: 250
   val_check_interval: -1
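
For reference, a rough sanity-check sketch of the effective batch size implied by the updated settings, assuming batch_accumulation_per_replica corresponds to the number of micro-batches per batch in the global_batch_size computation shown in src/nanotron/trainer.py below:

# Sketch: dp=4, micro_batch_size=4, batch_accumulation_per_replica=1, sequence_length=4096
DP=4; MBS=4; GRAD_ACC=1; SEQ_LEN=4096
GLOBAL_BATCH_SIZE=$((MBS * GRAD_ACC * DP))         # 16 sequences per optimizer step
TOKENS_PER_STEP=$((GLOBAL_BATCH_SIZE * SEQ_LEN))   # 65536 tokens per optimizer step
echo "global_batch_size=${GLOBAL_BATCH_SIZE}, tokens_per_step=${TOKENS_PER_STEP}"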
1 change: 1 addition & 0 deletions pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
     "safetensors",
     "dacite",
     "tqdm",
+    "wandb",
 ]
 
 [tool.setuptools.packages.find]
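
Since wandb is now a hard dependency, runs presumably need either a one-time login or offline mode; a minimal sketch using the standard wandb CLI (not part of this commit):

# Authenticate once on a node with network access...
wandb login
# ...or keep logging local for air-gapped compute nodes
export WANDB_MODE=offline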
2 changes: 1 addition & 1 deletion src/nanotron/trainer.py
@@ -263,7 +263,7 @@ def __init__(
             self.micro_batch_size * self.n_micro_batches_per_batch * self.parallel_context.dp_pg.size()
         )
         log_rank(
-            f"Flattening Batch dimension for SFT training. global_batch_size:{self.global_batch_size}, micro_batch_size: {self.micro_batch_size}, sequence_length: {self.sequence_length}",
+            f"Flattening Batch dimension for SFT training. global_batch_size: {self.global_batch_size}, micro_batch_size: {self.micro_batch_size}, sequence_length: {self.sequence_length}",
             logger=logger,
             level=logging.INFO,
             rank=0,
15 changes: 15 additions & 0 deletions tools/todi/Dockerfile
@@ -0,0 +1,15 @@
FROM nvcr.io/nvidia/pytorch:24.05-py3

# Setup
RUN apt-get update && apt-get install python3-pip python3-venv -y
RUN pip install --upgrade pip setuptools==69.5.1

RUN pip install flash-attn==2.5.8 --no-build-isolation

COPY nanotron/ /workspace/nanotron
WORKDIR /workspace/nanotron
RUN pip install -e '.[nanosets]'

# Instructions:
# 1. Build image: podman build -f /users/asolergi/SFT/nanotron/tools/todi/Dockerfile -t nanotron_sft /users/asolergi/SFT/   # NOTE: the build context /users/asolergi/SFT/ must contain nanotron/ (i.e. /users/asolergi/SFT/nanotron)
# 2. Export image: enroot import -o /store/swissai/a06/.sft_toni/nanotron_sft.sqsh podman://localhost/nanotron_sft:latest
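
A quick smoke test of the image before the enroot export could look like this (a sketch, assuming podman is available on the build host and reusing the tag from the instructions above):

podman build -f /users/asolergi/SFT/nanotron/tools/todi/Dockerfile -t nanotron_sft /users/asolergi/SFT/
# Check that nanotron and flash-attn import inside the container before exporting the image
podman run --rm localhost/nanotron_sft:latest python -c "import nanotron, flash_attn; print(nanotron.__file__)"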
15 changes: 15 additions & 0 deletions tools/todi/nanotron_sft.toml
@@ -0,0 +1,15 @@
image = "/store/swissai/a06/.sft_toni/nanotron_sft.sqsh"
mounts = [
"/capstor",
"/users",
"/store",
]
workdir = "/workspace/nanotron/"

[env]
FI_CXI_DISABLE_HOST_REGISTER = "1"
FI_MR_CACHE_MONITOR = "userfaultfd"

[annotations.com.hooks]
aws_ofi_nccl.enabled = "true"
aws_ofi_nccl.variant = "cuda12"
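
This TOML is the container environment definition that the submission script below passes to Slurm via --environment. For a quick interactive check that the image and mounts resolve, something like the following might work (a sketch, assuming the container hook also honors --environment on srun and that the todi reservation is accessible):

srun --reservation=todi --environment=/store/swissai/a06/.sft_toni/nanotron_sft.toml \
    python -c "import nanotron, torch; print(torch.__version__)"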
71 changes: 71 additions & 0 deletions tools/todi/submit_nanotron_sft.sh
@@ -0,0 +1,71 @@
#!/bin/bash

#SBATCH --job-name nanotron_sft
#SBATCH --chdir /users/asolergi/SFT/nanotron # TODO Set this path!!!
#SBATCH --output reports/R-%x.%j.out # Make sure this path exists, otherwise the job will fail silently
#SBATCH --error reports/R-%x.%j.err # Make sure this path exists, otherwise the job will fail silently
#SBATCH --nodes 4 # number of Nodes
#SBATCH --ntasks-per-node 1 # number of MP tasks. IMPORTANT: torchrun represents just 1 Slurm task
#SBATCH --gres gpu:4 # Number of GPUs
#SBATCH --cpus-per-task 288 # number of CPUs per task.
#SBATCH --time 11:59:59 # maximum execution time (HH:MM:SS). Mandatory field in MN5
#SBATCH --reservation todi
#SBATCH --environment /store/swissai/a06/.sft_toni/nanotron_sft.toml
#SBATCH --contiguous

echo "START TIME: $(date)"

# auto-fail on any errors in this script
set -eo pipefail

# logging script's variables/commands for future debug needs
set -x

######################
### Set environment ###
######################
GPUS_PER_NODE=4
echo "NODES: $SLURM_NNODES"
######################

######################
#### Set network #####
######################
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
######################

# note that we don't want to interpolate `\$SLURM_PROCID` till `srun` since otherwise all nodes will get
# 0 and the launcher will hang
#
# same goes for `\$(hostname -s|tr -dc '0-9')` - we want it to interpolate at `srun` time
LAUNCHER="torchrun \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $SLURM_NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
--node_rank \${SLURM_PROCID} \
"

PYTHON_FILE=/workspace/nanotron/run_train.py
NANOTRON_CONFIG=/users/asolergi/SFT/nanotron/examples/config_llama8b_sft.yaml # TODO Set this path!!!

export CMD="CUDA_DEVICE_MAX_CONNECTIONS=1 $LAUNCHER $PYTHON_FILE --config $NANOTRON_CONFIG"

echo $CMD

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
SRUN_ARGS=" \
--cpus-per-task $SLURM_CPUS_PER_TASK \
--jobid $SLURM_JOB_ID \
--wait 60 \
--unbuffered \
"

# bash -c is needed for the delayed interpolation of env vars to work
srun $SRUN_ARGS bash -c "$CMD"

echo "END TIME: $(date)"
