From 2f1ee83ee2d8e934a96afc9cb1ddea625cd59ba4 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 12 Dec 2023 00:01:25 -0500
Subject: [PATCH] update per PR feedback

---
 README.md                          |  5 +++--
 examples/cerebras/btlm-ft.yml      |  2 +-
 examples/cerebras/qlora.yml        |  2 +-
 examples/code-llama/13b/lora.yml   |  2 +-
 examples/code-llama/13b/qlora.yml  |  2 +-
 examples/code-llama/34b/lora.yml   |  2 +-
 examples/code-llama/34b/qlora.yml  |  2 +-
 examples/code-llama/7b/lora.yml    |  2 +-
 examples/code-llama/7b/qlora.yml   |  2 +-
 examples/gptj/qlora.yml            |  2 +-
 examples/llama-2/fft_optimized.yml |  2 +-
 examples/llama-2/gptq-lora.yml     |  2 +-
 examples/llama-2/lora.yml          |  2 +-
 examples/llama-2/qlora.yml         |  2 +-
 examples/llama-2/tiny-llama.yml    |  2 +-
 examples/mistral/config.yml        |  2 +-
 examples/mistral/mixtral.yml       |  2 +-
 examples/mistral/qlora.yml         |  2 +-
 examples/openllama-3b/config.yml   |  2 +-
 examples/openllama-3b/lora.yml     |  2 +-
 examples/openllama-3b/qlora.yml    |  2 +-
 examples/phi/phi-ft.yml            |  2 +-
 examples/phi/phi-qlora.yml         |  2 +-
 examples/qwen/lora.yml             |  2 +-
 examples/qwen/qlora.yml            |  2 +-
 examples/replit-3b/config-lora.yml |  2 +-
 src/axolotl/utils/config.py        | 20 ++++++++++++++++++--
 27 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 3ef2495ec..7f092d308 100644
--- a/README.md
+++ b/README.md
@@ -691,10 +691,11 @@ warmup_ratio: 0.05 # cannot use with warmup_steps
 learning_rate: 0.00003
 lr_quadratic_warmup:
 logging_steps:
+eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
 save_strategy: # Set to `no` to skip checkpoint saves
 save_steps: # Leave empty to save at each epoch
-saves_per_epoch: # number of times per epoch to save a checkpoint
-eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
 save_total_limit: # Checkpoints saved at a time
 # Maximum number of iterations to train for. It precedes num_epochs which means that
 # if both are set, num_epochs will not be guaranteed.
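As a minimal sketch of how the per-epoch options documented above are meant to be combined (the values below are illustrative only and are not taken from this patch):

    num_epochs: 3                # example value
    evals_per_epoch: 4           # run evals 4x per epoch; leave eval_steps unset
    saves_per_epoch: 1           # save once per epoch; leave save_steps unset
    save_strategy:               # leave empty (or "steps"); other values now raise a ValueError
    evaluation_strategy:         # leave empty (or "steps") when using evals_per_epoch
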
diff --git a/examples/cerebras/btlm-ft.yml b/examples/cerebras/btlm-ft.yml
index 34bf945c7..d0975214b 100644
--- a/examples/cerebras/btlm-ft.yml
+++ b/examples/cerebras/btlm-ft.yml
@@ -73,7 +73,7 @@ gptq_model_v1:
 
 warmup_steps: 32
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 save_total_limit:
 
 debug:
diff --git a/examples/cerebras/qlora.yml b/examples/cerebras/qlora.yml
index ba7884851..03155c6c2 100644
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -50,7 +50,7 @@ gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/code-llama/13b/lora.yml b/examples/code-llama/13b/lora.yml
index 1f13ee4c5..fc43ad14e 100644
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -55,7 +55,7 @@ flash_attention: true
 
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/code-llama/13b/qlora.yml b/examples/code-llama/13b/qlora.yml
index 9e6ee1303..06b9ac72f 100644
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -57,7 +57,7 @@ flash_attention: true
 
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/code-llama/34b/lora.yml b/examples/code-llama/34b/lora.yml
index dd720a3e9..c2f1d5ce1 100644
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -55,7 +55,7 @@ flash_attention: true
 
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/code-llama/34b/qlora.yml b/examples/code-llama/34b/qlora.yml
index 2b9c885c1..ad1e21675 100644
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -57,7 +57,7 @@ flash_attention: true
 
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/code-llama/7b/lora.yml b/examples/code-llama/7b/lora.yml
index b8d8cc7c3..630c8da6f 100644
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -55,7 +55,7 @@ flash_attention: true
 
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/code-llama/7b/qlora.yml b/examples/code-llama/7b/qlora.yml
index 482236b64..12462dcb7 100644
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -57,7 +57,7 @@ flash_attention: true
 
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/gptj/qlora.yml b/examples/gptj/qlora.yml
index 27725da06..700d10e67 100644
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -47,7 +47,7 @@ gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/llama-2/fft_optimized.yml b/examples/llama-2/fft_optimized.yml
index bd9c383e7..5530283bf 100644
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -60,7 +60,7 @@ flash_attn_fuse_mlp: true
 warmup_steps: 100
 evals_per_epoch: 4
 eval_table_size:
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed: #deepspeed/zero2.json # multi-gpu only
 weight_decay: 0.1
diff --git a/examples/llama-2/gptq-lora.yml b/examples/llama-2/gptq-lora.yml
index 3f3493e25..a3235c1fb 100644
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -63,7 +63,7 @@ sdp_attention:
 flash_optimum:
 warmup_steps: 100
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml
index d9e7a4b14..afb7dcd06 100644
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -57,7 +57,7 @@ warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml
index e6fce8ab3..d68882d6a 100644
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -58,7 +58,7 @@ flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/llama-2/tiny-llama.yml b/examples/llama-2/tiny-llama.yml
index 8d10f1cee..c72db4e5b 100644
--- a/examples/llama-2/tiny-llama.yml
+++ b/examples/llama-2/tiny-llama.yml
@@ -56,7 +56,7 @@ flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/mistral/config.yml b/examples/mistral/config.yml
index 7c76abdb6..1c37b05c1 100644
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -49,7 +49,7 @@ warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/mistral/mixtral.yml b/examples/mistral/mixtral.yml
index ff951b135..6e080e226 100644
--- a/examples/mistral/mixtral.yml
+++ b/examples/mistral/mixtral.yml
@@ -70,7 +70,7 @@ warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed: deepspeed/zero2.json
 weight_decay: 0.0
diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml
index bf11370c6..64b26f4fa 100644
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -69,7 +69,7 @@ warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/openllama-3b/config.yml b/examples/openllama-3b/config.yml
index 7d63240be..0a404c79d 100644
--- a/examples/openllama-3b/config.yml
+++ b/examples/openllama-3b/config.yml
@@ -50,7 +50,7 @@ gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/openllama-3b/lora.yml b/examples/openllama-3b/lora.yml
index 3555fbb20..4fbb634f9 100644
--- a/examples/openllama-3b/lora.yml
+++ b/examples/openllama-3b/lora.yml
@@ -55,7 +55,7 @@ gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/openllama-3b/qlora.yml b/examples/openllama-3b/qlora.yml
index c2a909e25..3d6218b30 100644
--- a/examples/openllama-3b/qlora.yml
+++ b/examples/openllama-3b/qlora.yml
@@ -49,7 +49,7 @@ gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/phi/phi-ft.yml b/examples/phi/phi-ft.yml
index e80b6eb5c..eaebd21ef 100644
--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -60,7 +60,7 @@ flash_attention:
 
 warmup_steps: 100
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/phi/phi-qlora.yml b/examples/phi/phi-qlora.yml
index b4230c420..691a83509 100644
--- a/examples/phi/phi-qlora.yml
+++ b/examples/phi/phi-qlora.yml
@@ -60,7 +60,7 @@ flash_attention:
 
 warmup_steps: 100
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/qwen/lora.yml b/examples/qwen/lora.yml
index f50b436b8..0ad9fc0f1 100644
--- a/examples/qwen/lora.yml
+++ b/examples/qwen/lora.yml
@@ -59,7 +59,7 @@ warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/qwen/qlora.yml b/examples/qwen/qlora.yml
index 5eb26debb..1ce0cbdc0 100644
--- a/examples/qwen/qlora.yml
+++ b/examples/qwen/qlora.yml
@@ -59,7 +59,7 @@ warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/replit-3b/config-lora.yml b/examples/replit-3b/config-lora.yml
index c6ecd16d5..01314acc1 100644
--- a/examples/replit-3b/config-lora.yml
+++ b/examples/replit-3b/config-lora.yml
@@ -46,7 +46,7 @@ gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
 evals_per_epoch: 4
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0
diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py
index 34ae99ce4..b04c207dd 100644
--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -78,9 +78,13 @@ def normalize_config(cfg):
         cfg.torch_dtype = torch.float32
 
     if cfg.saves_per_epoch:
-        cfg.save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
+        save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
+        if save_steps < 1.0:  # prevent saves on every step
+            cfg.save_steps = save_steps
     if cfg.evals_per_epoch:
-        cfg.eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
+        eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
+        if eval_steps < 1.0:  # prevent evals on every step
+            cfg.eval_steps = eval_steps
 
     cfg.dataset_processes = cfg.dataset_processes or os.cpu_count()
 
@@ -362,10 +366,22 @@ def validate_config(cfg):
         raise ValueError(
             "save_steps and saves_per_epoch are mutually exclusive and cannot be used together."
         )
+    if cfg.saves_per_epoch and cfg.save_strategy and cfg.save_strategy != "steps":
+        raise ValueError(
+            "save_strategy must be empty or set to `steps` when used with saves_per_epoch."
+        )
     if cfg.evals_per_epoch and cfg.eval_steps:
         raise ValueError(
             "eval_steps and evals_per_epoch are mutually exclusive and cannot be used together."
         )
+    if (
+        cfg.evals_per_epoch
+        and cfg.evaluation_strategy
+        and cfg.evaluation_strategy != "steps"
+    ):
+        raise ValueError(
+            "evaluation_strategy must be empty or set to `steps` when used with evals_per_epoch."
+        )
     if cfg.save_strategy and cfg.save_steps and cfg.save_strategy != "steps":
         raise ValueError(
             "save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps."
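A small standalone sketch of the arithmetic normalize_config performs above, assuming the hypothetical helper name per_epoch_to_ratio and example values (a float between 0 and 1 for save_steps/eval_steps is read as a fraction of total training steps, per the README comment in this patch):

    def per_epoch_to_ratio(times_per_epoch, num_epochs):
        # Mirrors the conversion above: "N times per epoch" -> ratio of total steps.
        ratio = 1.0 / (times_per_epoch * num_epochs)
        # Guard introduced by this patch: a ratio >= 1.0 would mean "every step",
        # so the value is left unset in that case.
        return ratio if ratio < 1.0 else None

    print(per_epoch_to_ratio(4, 3))  # 0.0833... -> eval/save every 1/12 of training
    print(per_epoch_to_ratio(1, 1))  # None -> skipped to avoid per-step saves/evals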