-
Notifications
You must be signed in to change notification settings - Fork 513
/
gpt2-arc-easy-cpu-streaming-dataset.yaml
87 lines (73 loc) · 1.6 KB
/
gpt2-arc-easy-cpu-streaming-dataset.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
variables:
  global_seed: 17
  max_seq_len: 512
  data_local: ./my_data
  data_remote:  # If blank, files must be present in data_local

  # Run Name
  run_name:  # If left blank, will be read from env var $RUN_NAME

max_seq_len: ${variables.max_seq_len}
run_name: ${variables.run_name}

# Model
model:
  name: hf_causal_lm
  pretrained_model_name_or_path: gpt2
  pretrained: true  # false: only use the architecture; true: initialize with pretrained weights

# Tokenizer
tokenizer:
  name: gpt2
  kwargs:
    model_max_length: ${variables.max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    ############
    streams:
      my_data:
        remote: ${variables.data_remote}
        local: ${variables.data_local}
        split: train
    ############
    shuffle: true
    max_seq_len: ${variables.max_seq_len}
    decoder_only_format: true
  drop_last: true
  num_workers: 8

# Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 100ba
  alpha_f: 0.1

optimizer:
  name: decoupled_adamw
  lr: 6.0e-4
  betas:
    - 0.9
    - 0.95
  eps: 1.0e-08
  weight_decay: 0.0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 1ep
eval_interval: 1
eval_first: false
eval_subset_num_batches: -1
global_train_batch_size: 8

# System
seed: ${variables.global_seed}
device_eval_batch_size: 8
device_train_microbatch_size: 8
# device_train_microbatch_size: auto
precision: fp32

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}