update per PR feedback

axolotl-ai-cloud · Dec 12, 2023 · 5467f13 · 5467f13
1 parent 0c053ca
commit 5467f13
Show file tree

Hide file tree

Showing 27 changed files with 46 additions and 29 deletions.
diff --git a/README.md b/README.md
@@ -691,10 +691,11 @@ warmup_ratio: 0.05  # cannot use with warmup_steps
 learning_rate: 0.00003
 lr_quadratic_warmup:
 logging_steps:
+eval_steps: # Leave empty to eval at each epoch, an integer to eval every N steps, or a decimal for a fraction of total steps
+evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
 save_strategy: # Set to `no` to skip checkpoint saves
 save_steps: # Leave empty to save at each epoch
-saves_per_epoch: # number of times per epoch to save a checkpoint
-eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
 save_total_limit: # Maximum number of checkpoints to keep at a time
 # Maximum number of iterations to train for. It takes precedence over num_epochs, which means that
 # if both are set, num_epochs will not be guaranteed.

diff --git a/examples/cerebras/btlm-ft.yml b/examples/cerebras/btlm-ft.yml
@@ -73,7 +73,7 @@ gptq_model_v1:
 
 warmup_steps: 32
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 save_total_limit:
 
 debug:

diff --git a/examples/cerebras/qlora.yml b/examples/cerebras/qlora.yml
@@ -50,7 +50,7 @@ gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1

diff --git a/examples/code-llama/13b/lora.yml b/examples/code-llama/13b/lora.yml
@@ -55,7 +55,7 @@ flash_attention: true
 
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/code-llama/13b/qlora.yml b/examples/code-llama/13b/qlora.yml
@@ -57,7 +57,7 @@ flash_attention: true
 
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/code-llama/34b/lora.yml b/examples/code-llama/34b/lora.yml
@@ -55,7 +55,7 @@ flash_attention: true
 
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/code-llama/34b/qlora.yml b/examples/code-llama/34b/qlora.yml
@@ -57,7 +57,7 @@ flash_attention: true
 
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/code-llama/7b/lora.yml b/examples/code-llama/7b/lora.yml
@@ -55,7 +55,7 @@ flash_attention: true
 
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/code-llama/7b/qlora.yml b/examples/code-llama/7b/qlora.yml
@@ -57,7 +57,7 @@ flash_attention: true
 
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/gptj/qlora.yml b/examples/gptj/qlora.yml
@@ -47,7 +47,7 @@ gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1

diff --git a/examples/llama-2/fft_optimized.yml b/examples/llama-2/fft_optimized.yml
@@ -60,7 +60,7 @@ flash_attn_fuse_mlp: true
 warmup_steps: 100
 evals_per_epoch: 4
 eval_table_size:
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed: #deepspeed/zero2.json # multi-gpu only
 weight_decay: 0.1

diff --git a/examples/llama-2/gptq-lora.yml b/examples/llama-2/gptq-lora.yml
@@ -63,7 +63,7 @@ sdp_attention:
 flash_optimum:
 warmup_steps: 100
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1

diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml
@@ -57,7 +57,7 @@ warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml
@@ -58,7 +58,7 @@ flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/llama-2/tiny-llama.yml b/examples/llama-2/tiny-llama.yml
@@ -56,7 +56,7 @@ flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/mistral/config.yml b/examples/mistral/config.yml
@@ -49,7 +49,7 @@ warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/mistral/mixtral.yml b/examples/mistral/mixtral.yml
@@ -70,7 +70,7 @@ warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed: deepspeed/zero2.json
 weight_decay: 0.0

diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml
@@ -69,7 +69,7 @@ warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/openllama-3b/config.yml b/examples/openllama-3b/config.yml
@@ -50,7 +50,7 @@ gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1

diff --git a/examples/openllama-3b/lora.yml b/examples/openllama-3b/lora.yml
@@ -55,7 +55,7 @@ gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1

diff --git a/examples/openllama-3b/qlora.yml b/examples/openllama-3b/qlora.yml
@@ -49,7 +49,7 @@ gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1

diff --git a/examples/phi/phi-ft.yml b/examples/phi/phi-ft.yml
@@ -60,7 +60,7 @@ flash_attention:
 
 warmup_steps: 100
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1

diff --git a/examples/phi/phi-qlora.yml b/examples/phi/phi-qlora.yml
@@ -60,7 +60,7 @@ flash_attention:
 
 warmup_steps: 100
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1

diff --git a/examples/qwen/lora.yml b/examples/qwen/lora.yml
@@ -59,7 +59,7 @@ warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/qwen/qlora.yml b/examples/qwen/qlora.yml
@@ -59,7 +59,7 @@ warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0

diff --git a/examples/replit-3b/config-lora.yml b/examples/replit-3b/config-lora.yml
@@ -46,7 +46,7 @@ gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0

diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py
@@ -78,9 +78,13 @@ def normalize_config(cfg):
         cfg.torch_dtype = torch.float32
 
     if cfg.saves_per_epoch:
-        cfg.save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
+        save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
+        if save_steps < 1.0:  # prevent saves on every step
+            cfg.save_steps = save_steps
     if cfg.evals_per_epoch:
-        cfg.eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
+        eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
+        if eval_steps < 1.0:  # prevent evals on every step
+            cfg.eval_steps = eval_steps
 
     cfg.dataset_processes = cfg.dataset_processes or os.cpu_count()
 
@@ -362,10 +366,22 @@ def validate_config(cfg):
         raise ValueError(
             "save_steps and saves_per_epoch are mutually exclusive and cannot be used together."
         )
+    if cfg.saves_per_epoch and cfg.save_strategy and cfg.save_strategy != "steps":
+        raise ValueError(
+            "save_strategy must be empty or set to `steps` when used with saves_per_epoch."
+        )
     if cfg.evals_per_epoch and cfg.eval_steps:
         raise ValueError(
             "eval_steps and evals_per_epoch are mutually exclusive and cannot be used together."
         )
+    if (
+        cfg.evals_per_epoch
+        and cfg.evaluation_strategy
+        and cfg.evaluation_strategy != "steps"
+    ):
+        raise ValueError(
+            "evaluation_strategy must be empty or set to `steps` when used with evals_per_epoch."
+        )
     if cfg.save_strategy and cfg.save_steps and cfg.save_strategy != "steps":
         raise ValueError(
             "save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps."