forked from swiss-ai/nanotron
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a8f979d
commit c026422
Showing
6 changed files
with
112 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ dependencies = [ | |
"safetensors", | ||
"dacite", | ||
"tqdm", | ||
"wandb", | ||
] | ||
|
||
[tool.setuptools.packages.find] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# syntax=docker/dockerfile:1
# Image for nanotron SFT runs on Todi; built with podman, exported via enroot (see bottom).
FROM nvcr.io/nvidia/pytorch:24.05-py3

# Setup: pip/venv tooling. Combine update+install in one layer and drop the apt
# lists in the same layer so the cache never persists in the image (DL3009/DL3015).
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        python3-pip \
        python3-venv \
    && rm -rf /var/lib/apt/lists/*

# setuptools pinned to 69.5.1 — presumably for build compatibility with the
# packages below; confirm before bumping. --no-cache-dir keeps layers slim (DL3042).
RUN pip install --no-cache-dir --upgrade pip setuptools==69.5.1

# flash-attn must compile against the torch preinstalled in the NGC base image,
# hence --no-build-isolation.
RUN pip install --no-cache-dir flash-attn==2.5.8 --no-build-isolation

# Install nanotron (editable) with the `nanosets` extra.
COPY nanotron/ /workspace/nanotron
WORKDIR /workspace/nanotron
RUN pip install --no-cache-dir -e '.[nanosets]'

# NOTE: no USER directive on purpose — the image is converted to a squashfs and
# run through enroot/pyxis, which maps the submitting user at runtime.
# Instructions:
# 1. Build image: podman build -f /users/asolergi/SFT/nanotron/tools/todi/Dockerfile -t nanotron_sft /users/asolergi/SFT/
#    NOTE: the build context /users/asolergi/SFT/ must contain nanotron/ (/users/asolergi/SFT/nanotron)
# 2. Export image: enroot import -o /store/swissai/a06/.sft_toni/nanotron_sft.sqsh podman://localhost/nanotron_sft:latest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# enroot/pyxis container environment for nanotron SFT jobs
# (referenced by sbatch via `#SBATCH --environment ...`).

# Squashfs image produced by `enroot import` (see the Dockerfile instructions).
image = "/store/swissai/a06/.sft_toni/nanotron_sft.sqsh"

# Host filesystems bind-mounted into the container.
mounts = [
    "/capstor",
    "/users",
    "/store",
]

# Default working directory inside the container (where nanotron is installed).
workdir = "/workspace/nanotron/"

[env]
# libfabric / CXI tuning for the interconnect.
FI_CXI_DISABLE_HOST_REGISTER = "1"
FI_MR_CACHE_MONITOR = "userfaultfd"

[annotations.com.hooks]
# Enable the AWS OFI NCCL plugin hook, CUDA 12 build.
aws_ofi_nccl.enabled = "true"
aws_ofi_nccl.variant = "cuda12"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
#!/bin/bash
# SLURM launcher for multi-node nanotron SFT via torchrun (one torchrun per node).

#SBATCH --job-name nanotron_sft
#SBATCH --chdir /users/asolergi/SFT/nanotron    # TODO Set this path!!!
#SBATCH --output reports/R-%x.%j.out            # Make sure this path exists, otherwise the job will fail silently
#SBATCH --error reports/R-%x.%j.err             # Make sure this path exists, otherwise the job will fail silently
#SBATCH --nodes 4                               # number of Nodes
#SBATCH --ntasks-per-node 1                     # number of MP tasks. IMPORTANT: torchrun represents just 1 Slurm task
#SBATCH --gres gpu:4                            # Number of GPUs
#SBATCH --cpus-per-task 288                     # number of CPUs per task.
#SBATCH --time 11:59:59                         # maximum execution time (DD-HH:MM:SS). Mandatory field in MN5
#SBATCH --reservation todi
#SBATCH --environment /store/swissai/a06/.sft_toni/nanotron_sft.toml
#SBATCH --contiguous

echo "START TIME: $(date)"

# auto-fail on any errors in this script
set -eo pipefail

# logging script's variables/commands for future debug needs
set -x

######################
### Set environment ###
######################
GPUS_PER_NODE=4
echo "NODES: $SLURM_NNODES"
######################

######################
#### Set network #####
######################
# First node in the allocation hosts the c10d rendezvous.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=6000
######################

# note that we don't want to interpolate `\$SLURM_PROCID` till `srun` since otherwise all nodes will get
# 0 and the launcher will hang
#
# same goes for `\$(hostname -s|tr -dc '0-9')` - we want it to interpolate at `srun` time
#
# FIX: `--node_rank` below is escaped (`\$SLURM_PROCID`); unescaped it would be
# expanded right here — inside this double-quoted string — to the batch step's
# rank (0 on every node), which is exactly the hang described above.
LAUNCHER="torchrun \
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $SLURM_NNODES \
    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
    --rdzv_backend c10d \
    --max_restarts 0 \
    --tee 3 \
    --node_rank \$SLURM_PROCID \
    "

PYTHON_FILE=/workspace/nanotron/run_train.py
NANOTRON_CONFIG=/users/asolergi/SFT/nanotron/examples/config_llama8b_sft.yaml # TODO Set this path!!!

export CMD="CUDA_DEVICE_MAX_CONNECTIONS=1 $LAUNCHER $PYTHON_FILE --config $NANOTRON_CONFIG"

echo "$CMD"

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
SRUN_ARGS=" \
    --cpus-per-task $SLURM_CPUS_PER_TASK \
    --jobid $SLURM_JOB_ID \
    --wait 60 \
    --unbuffered \
    "

# bash -c is needed for the delayed interpolation of env vars to work.
# $SRUN_ARGS is intentionally unquoted: it must word-split into separate flags.
srun $SRUN_ARGS bash -c "$CMD"

echo "END TIME: $(date)"