
Commit

set fp16 to false if bf16, update bf16: auto in example YAMLs (#1122) [skip ci]

* set fp16 to false if bf16, update bf16: auto in example YAMLs

* unset fp16 so that it falls back properly if bf16 isn't available

* Update README.md [skip-ci]

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>

* test that bf16 disables fp16

---------

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
winglian and NanoCode012 committed Jan 22, 2024
1 parent eaaeefc commit 782b6a4
Showing 38 changed files with 86 additions and 67 deletions.
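The change described above amounts to: `bf16: auto` enables bf16 only on GPUs that support it, an enabled bf16 forces fp16 off, and an empty `fp16:` lets training fall back to fp16 when bf16 isn't available (or to fp32 if `fp16: false` is set). Below is a minimal sketch of that resolution logic for illustration only; the `resolve_precision` helper and the config shape are hypothetical, not axolotl's actual code, and `torch.cuda.is_bf16_supported()` is a real PyTorch call.

```python
# Illustrative sketch of the precision resolution described by this commit.
# The helper name and config shape are hypothetical, not axolotl's actual code.
import torch


def resolve_precision(cfg: dict) -> dict:
    """Resolve bf16/fp16 flags the way this commit describes:
    - bf16: "auto"  -> enable bf16 only if the current GPU supports it
    - bf16 enabled  -> force fp16 off (don't request both AMP modes)
    - bf16 unavailable and fp16 left empty -> fall back to fp16
      (fp16: false would mean falling back to fp32 instead)
    """
    bf16 = cfg.get("bf16")
    fp16 = cfg.get("fp16")

    if bf16 == "auto":
        bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        if not bf16 and fp16 is None:
            fp16 = True  # fp16 left empty -> use fp16 when bf16 isn't available

    if bf16:
        fp16 = False  # bf16 wins; fp16 must be disabled

    return {"bf16": bool(bf16), "fp16": bool(fp16), "tf32": bool(cfg.get("tf32"))}
```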
4 changes: 2 additions & 2 deletions README.md
@@ -464,8 +464,8 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
```yaml
load_in_4bit: true
load_in_8bit: true
-bf16: true # require >=ampere
-fp16: true
+bf16: auto # require >=ampere, auto will detect if your GPU supports this and choose automatically.
+fp16: # leave empty to use fp16 when bf16 is 'auto'. set to false if you want to fallback to fp32
tf32: true # require >=ampere
bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP (automatic mixed precision)
float16: true # use instead of fp16 when you don't want AMP
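The commit message also mentions a test that bf16 disables fp16. A minimal, hypothetical check in that spirit, exercising the illustrative `resolve_precision` sketch above rather than axolotl's real validation code:

```python
# Hypothetical check in the spirit of "test that bf16 disables fp16".
# It targets the illustrative resolve_precision sketch above, not axolotl's code.
def test_bf16_disables_fp16():
    resolved = resolve_precision({"bf16": True, "fp16": True, "tf32": True})
    assert resolved["bf16"] is True
    assert resolved["fp16"] is False
```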
4 changes: 2 additions & 2 deletions examples/cerebras/btlm-ft.yml
@@ -53,8 +53,8 @@ lr_quadratic_warmup: true
learning_rate: 0.000085
train_on_inputs: true
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: true

gradient_checkpointing: false
4 changes: 2 additions & 2 deletions examples/cerebras/qlora.yml
@@ -36,8 +36,8 @@ lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:
4 changes: 2 additions & 2 deletions examples/code-llama/13b/lora.yml
@@ -41,8 +41,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/code-llama/13b/qlora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/code-llama/34b/lora.yml
@@ -41,8 +41,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/code-llama/34b/qlora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/code-llama/7b/lora.yml
@@ -41,8 +41,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/code-llama/7b/qlora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/falcon/config-7b-lora.yml
@@ -38,8 +38,8 @@ lr_scheduler: cosine
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:
4 changes: 2 additions & 2 deletions examples/falcon/config-7b-qlora.yml
@@ -64,8 +64,8 @@ lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: true
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row
4 changes: 2 additions & 2 deletions examples/falcon/config-7b.yml
@@ -38,8 +38,8 @@ lr_scheduler: cosine
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:
4 changes: 2 additions & 2 deletions examples/gptj/qlora.yml
@@ -33,8 +33,8 @@ lr_scheduler: cosine
learning_rate: 0.0001
train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:
2 changes: 1 addition & 1 deletion examples/jeopardy-bot/config.yml
@@ -31,7 +31,7 @@ lr_scheduler: cosine
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
-bf16: true
+bf16: auto
tf32: true
early_stopping_patience:
resume_from_checkpoint:
4 changes: 2 additions & 2 deletions examples/llama-2/fft_optimized.yml
@@ -41,8 +41,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/llama-2/lora.yml
@@ -41,8 +41,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/llama-2/qlora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/llama-2/relora.yml
@@ -47,8 +47,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/mamba/config.yml
@@ -34,8 +34,8 @@ learning_rate: 5e-5
train_on_inputs: false
group_by_length: true

-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: true

gradient_checkpointing: false
4 changes: 2 additions & 2 deletions examples/mistral/config.yml
@@ -34,8 +34,8 @@ learning_rate: 0.000005

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/mistral/mixtral.yml
@@ -63,8 +63,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/mistral/qlora.yml
@@ -50,8 +50,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
2 changes: 1 addition & 1 deletion examples/mpt-7b/config.yml
@@ -33,7 +33,7 @@ lr_scheduler: cosine
learning_rate: 0.0000002
train_on_inputs: false
group_by_length: false
-bf16: true
+bf16: auto
tf32: true
early_stopping_patience:
resume_from_checkpoint:
4 changes: 2 additions & 2 deletions examples/phi/phi-ft.yml
@@ -46,8 +46,8 @@ learning_rate: 0.000003

train_on_inputs: false
group_by_length: true
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: true

gradient_checkpointing:
4 changes: 2 additions & 2 deletions examples/phi/phi-qlora.yml
@@ -46,8 +46,8 @@ learning_rate: 0.000003

train_on_inputs: false
group_by_length: true
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: true

gradient_checkpointing:
4 changes: 2 additions & 2 deletions examples/phi/phi2-ft.yml
@@ -49,8 +49,8 @@ learning_rate: 1e-5

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: true

gradient_checkpointing: true
2 changes: 1 addition & 1 deletion examples/pythia/lora.yml
@@ -27,7 +27,7 @@ num_epochs: 4
learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
-bf16: true
+bf16: auto
tf32: true
early_stopping_patience:
resume_from_checkpoint:
4 changes: 2 additions & 2 deletions examples/qwen/lora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: false
4 changes: 2 additions & 2 deletions examples/qwen/qlora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: false
2 changes: 1 addition & 1 deletion examples/redpajama/config-3b.yml
@@ -34,7 +34,7 @@ lr_scheduler: cosine
learning_rate: 0.0000002
train_on_inputs: false
group_by_length: false
-bf16: true
+bf16: auto
tf32: true
early_stopping_patience:
resume_from_checkpoint:
2 changes: 1 addition & 1 deletion examples/replit-3b/config-lora.yml
@@ -33,7 +33,7 @@ lr_scheduler:
learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
-bf16: true
+bf16: auto
tf32: true
gradient_checkpointing:
early_stopping_patience:
4 changes: 2 additions & 2 deletions examples/tiny-llama/lora.yml
@@ -41,8 +41,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/tiny-llama/pretrain.yml
@@ -34,8 +34,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/tiny-llama/qlora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
4 changes: 2 additions & 2 deletions examples/xgen-7b/xgen-7b-8k-qlora.yml
@@ -62,8 +62,8 @@ lr_scheduler: cosine
learning_rate: 0.00002
train_on_inputs: false
group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row
4 changes: 2 additions & 2 deletions examples/yi-34B-chat/qlora.yml
@@ -7,8 +7,8 @@ load_in_8bit: false
load_in_4bit: true
strict: false
sequence_len: 1024
-bf16: true
-fp16: false
+bf16: auto
+fp16:
tf32: false
flash_attention: true
special_tokens:

0 comments on commit 782b6a4
