# llama2-7b_microadam_gsm8k.yaml
model_name_or_path: meta-llama/Llama-2-7b-hf
max_seq_len: 512
precision: amp_bf16

#data_local: # TODO
#data_remote: # If blank, files must be present in data_local

seed: 42
max_duration: 3ep
eval_interval: 1ep
eval_first: false

global_train_batch_size: 32
device_train_microbatch_size: 1
device_eval_batch_size: 1
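# Note: with global_train_batch_size: 32 and device_train_microbatch_size: 1,
# Composer accumulates gradients over 32 / num_gpus microbatches per optimizer
# step on each device (e.g. 4 microbatches per step per GPU on 8 GPUs).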
model:
  name: hf_causal_lm
  use_flash_attention_2: true
  pretrained: true
  pretrained_model_name_or_path: ${model_name_or_path}
  use_auth_token: true
  master_weights_dtype: bf16 # new, to convert model to bf16
tokenizer:
  name: ${model_name_or_path}
  kwargs:
    model_max_length: ${max_seq_len}
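# The ${...} references are OmegaConf-style interpolations, resolved against
# the top-level keys (model_name_or_path, max_seq_len, ...) when the config is
# loaded.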
train_loader:
  name: finetuning
  dataset:
    hf_name: gsm8k
    split: train
    hf_kwargs:
      name: main
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    shuffle: true
  drop_last: false
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0
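# hf_kwargs is forwarded to HuggingFace load_dataset; name: main selects
# GSM8K's "main" config (plain question/answer pairs) rather than "socratic".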
scheduler:
  name: linear_decay_with_warmup
  t_warmup: 20ba
  alpha_f: 0.0
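# Linear warmup for 20 batches (20ba), then linear decay to
# alpha_f * lr = 0 by the end of max_duration.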
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: DEFAULT
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
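# FULL_SHARD shards parameters, gradients, and optimizer state across ranks
# (ZeRO-3 style); together with activation checkpointing, this trades compute
# for the memory needed to fit a 7B model at microbatch size 1.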
# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba
callbacks:
  lr_monitor: {}
  speed_monitor:
    window_size: 1
  memory_monitor: {}
  runtime_estimator: {}
  hf_checkpointer: # added by ionut on eldar's suggestion
    overwrite: true
    huggingface_folder_name: ""
    precision: bfloat16
    save_folder: ${save_folder}
    save_interval: 1dur
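# hf_checkpointer exports the model in HuggingFace format; save_interval: 1dur
# fires once per full training duration, i.e. a single export at the end of
# the run.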
optimizer:
  name: to-be-filled-on-script-call
  defaults: # new: these are the default parameters for all optimizers
    lr: 1e-4
    betas:
    - 0.9
    - 0.999
    weight_decay: 0.0
    eps: 1.0e-8
  decoupled_adamw: # will use defaults
  microadam: # additional parameters for this particular optimizer
    m: 10
    quant_block_size: 100000
    k_init: 0.01
  adamw8b: # additional parameters for this particular optimizer
    optim_bits: 8
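# `name` is filled in at launch; each per-optimizer sub-block adds settings on
# top of `defaults`. A hypothetical invocation, assuming llm-foundry-style
# dotted-key overrides on the command line:
#   composer train.py llama2-7b_microadam_gsm8k.yaml \
#     optimizer.name=microadam task=gsm8k
# (train.py and the exact override syntax depend on the launch script.)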
loggers:
  wandb: # added group to wandb
    entity:
    project: MicroAdam_llama2-7b_${task}
    job_type: lr=${optimizer.defaults.lr}_wd=${optimizer.defaults.weight_decay}
    name: seed=${seed}
    group: # TODO
wandb_groups:
  decoupled_adamw:
    group: ${task}_${optimizer.name}_ep=${max_duration}_bs=${global_train_batch_size}
  adamw8b:
    group: ${task}_${optimizer.name}_ep=${max_duration}_bs=${global_train_batch_size}_bits=${optimizer.adamw8b.optim_bits}
  microadam:
    group: ${task}_${optimizer.name}_ep=${max_duration}_bs=${global_train_batch_size}_m=${optimizer.microadam.m}_k=${optimizer.microadam.k_init}_qbs=${optimizer.microadam.quant_block_size}
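# wandb_groups is presumably consumed by the launch script, which copies the
# entry matching optimizer.name into loggers.wandb.group (left as TODO above).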
# Checkpoint to local filesystem or remote object store
save_interval: 1dur
save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK
save_folder: /mnt/beegfs/alistgrp/imodoran/results/ # ${loggers.wandb.project}/${run_name}
run_name: # ${loggers.wandb.group}_${loggers.wandb.job_type}_${loggers.wandb.name}
task: # new, used for evaluation