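# mcli-llama2-finetune.yaml
# Launch sketch: this run config can be submitted with the MosaicML CLI, e.g.
#   mcli run -f mcli-llama2-finetune.yaml
# (assumes mcli is installed and configured; cluster/gpu_type may need to be
# set under `compute` below)
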
integrations:
- integration_type: git_repo
  git_repo: mosaicml/llm-foundry
  git_branch: v0.10.0
  # git_commit:  # OR use your commit hash
  pip_install: .[gpu]
  ssh_clone: false  # Should be true if using a private repo

command: |
  cd llm-foundry/scripts
  composer train/train.py /mnt/config/parameters.yaml
image: mosaicml/llm-foundry:2.3.1_cu121-latest
name: llama2-finetune

compute:
  # Note: Finetuning the 70b model requires at least 16x80GB GPUs
  gpus: 8  # Number of GPUs to use
  ## These configurations are optional
  # cluster: TODO  # Name of the cluster to use for this run
  # gpu_type: a100_80gb  # Type of GPU to use. We use a100_80gb in our experiments
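  # For the 70b model (a sketch based on the note above; adjust to your cluster):
  # gpus: 16
  # gpu_type: a100_80gb
  # ...and point parameters.model.pretrained_model_name_or_path at the 70b
  # weights, e.g. meta-llama/Llama-2-70b-hf
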
# The below is injected as a YAML file: /mnt/config/parameters.yaml
parameters:
  tokenizer_name: meta-llama/Llama-2-7b-hf
  max_seq_len: 4096
  global_seed: 17

  # Run Name
  run_name:  # If left blank, will be read from env var $RUN_NAME

  max_split_size_mb: 512

  # Model
  model:
    name: hf_causal_lm
    init_device: mixed
    pretrained_model_name_or_path: meta-llama/Llama-2-7b-hf
    pretrained: true
    # Note: you must have set the HF_TOKEN environment variable and have access to the llama2 models
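    # One way to supply HF_TOKEN on the MosaicML platform is an env secret
    # (a sketch; check the mcli docs for your version):
    #   mcli create secret env HF_TOKEN=<your Hugging Face access token>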
    use_auth_token: true
    use_flash_attention_2: true

  # Tokenizer
  tokenizer:
    name: ${tokenizer_name}
    kwargs:
      model_max_length: ${max_seq_len}

  # Dataloaders
  train_loader:
    name: finetuning
    dataset:
      hf_name: mosaicml/dolly_hhrlhf
      split: train
      max_seq_len: ${max_seq_len}
      allow_pad_trimming: false
      decoder_only_format: true
      shuffle: true
      # # Use packing_ratio: 'auto' to automatically profile and select the highest observed
      # # packing ratio with zero waste. In practice, this may result in > 0 waste because
      # # profiling is done on only a portion of the dataset.
      # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
      # # to profile this run's optimal packing_ratio, which depends on GPU count,
      # # batch size, and sequence length.
      # packing_ratio: auto
    drop_last: true
    num_workers: 8
    pin_memory: false
    prefetch_factor: 2
    persistent_workers: true
    timeout: 0

  eval_loader:
    name: finetuning
    dataset:
      hf_name: mosaicml/dolly_hhrlhf
      split: test
      max_seq_len: ${max_seq_len}
      allow_pad_trimming: false
      decoder_only_format: true
      # packing_ratio:
      shuffle: false
    drop_last: true
    num_workers: 8
    pin_memory: false
    prefetch_factor: 2
    persistent_workers: true
    timeout: 0

  # Optimization
  scheduler:
    name: cosine_with_warmup
    t_warmup: 100ba  # Composer time units: "ba" = batches, "ep" = epochs
    alpha_f: 0.1

  # Note: You may want to change the learning rate, betas, and weight decay
  optimizer:
    name: decoupled_lionw
    lr: 5.0e-7
    betas:
    - 0.9
    - 0.95
    weight_decay: 0.0

  algorithms:
    gradient_clipping:
      clipping_type: norm
      clipping_threshold: 1.0

  max_duration: 1ep
  eval_first: false
  eval_interval: 1ep
  eval_subset_num_batches: -1  # -1 evaluates on the entire eval split
  global_train_batch_size: 64
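  # With compute.gpus: 8 above, global_train_batch_size: 64 works out to 8 samples
  # per GPU per optimizer step; device_train_microbatch_size: auto (below) then
  # picks a microbatch size that fits in memory and gradient-accumulates to reach it.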

  # System
  seed: ${global_seed}
  device_eval_batch_size: 8
  device_train_microbatch_size: auto
  precision: amp_bf16

  # FSDP
  fsdp_config:
    sharding_strategy: FULL_SHARD
    mixed_precision: PURE
    activation_checkpointing: true
    activation_checkpointing_reentrant: false
    activation_cpu_offload: false
    limit_all_gathers: true

  # Logging
  progress_bar: false
  log_to_console: true
  console_log_interval: 1ba

  callbacks:
    speed_monitor:
      window_size: 10
    lr_monitor: {}
    memory_monitor: {}
    runtime_estimator: {}

  load_weights_only: true  # Only load the weights, not the optimizer state, LR schedule, etc.

  # loggers:
  #   wandb: {}

  # Checkpoint to local filesystem or remote object store
  # save_interval: 2000ba
  # save_num_checkpoints_to_keep: 1  # Important: this cleans up checkpoints saved to DISK
  # save_folder: ./{run_name}/checkpoints
  # save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints

  # Load from local filesystem or remote object store
  # load_path: ./gpt-1b/checkpoints/latest-rank{rank}.pt
  # load_path: s3://my-bucket/my-folder/gpt-1b/checkpoints/latest-rank{rank}.pt
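
  # If checkpointing is enabled above, Composer can also resume interrupted runs
  # automatically (an assumption to verify against your llm-foundry version):
  # autoresume: true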