feature: loss watchdog for terminating training runs that are failing
Karl-Johan Alm committed Nov 28, 2023
1 parent a48dbf6 commit 49c9cd7
Showing 4 changed files with 40 additions and 0 deletions.
README.md: 3 additions & 0 deletions
@@ -694,6 +694,9 @@ max_steps:
eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128

loss_watchdog_threshold: # High loss value indicating that learning has broken down (a good estimate is ~2 times the loss at the start of training)
loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)

# Save model as safetensors (require safetensors package)
save_safetensors:

examples/mistral/qlora.yml: 3 additions & 0 deletions
@@ -62,6 +62,9 @@ logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_steps: 10
eval_steps: 0.05
eval_table_size:
src/axolotl/core/trainer_builder.py: 4 additions & 0 deletions
@@ -25,6 +25,7 @@
from axolotl.utils.callbacks import (
EvalFirstStepCallback,
GPUStatsCallback,
LossWatchDogCallback,
SaveAxolotlConfigtoWandBCallback,
SaveBetterTransformerModelCallback,
bench_eval_callback_factory,
@@ -430,6 +431,9 @@ def get_callbacks(self):
SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
)

if self.cfg.loss_watchdog_threshold is not None:
callbacks.append(LossWatchDogCallback(self.cfg))

return callbacks

def get_post_trainer_create_callbacks(self, trainer):
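The builder only attaches the watchdog when a threshold is configured. Below is a minimal sketch of that registration pattern, not part of the commit; SimpleNamespace is a hypothetical stand-in for the axolotl config object.

from types import SimpleNamespace

from axolotl.utils.callbacks import LossWatchDogCallback

def build_callbacks(cfg):
    # Mirrors the conditional registration added to get_callbacks() above.
    callbacks = []
    if cfg.loss_watchdog_threshold is not None:
        callbacks.append(LossWatchDogCallback(cfg))
    return callbacks

# No threshold configured: the watchdog stays disabled.
cfg = SimpleNamespace(loss_watchdog_threshold=None, loss_watchdog_patience=None)
assert build_callbacks(cfg) == []

# Threshold configured: the watchdog is registered.
cfg = SimpleNamespace(loss_watchdog_threshold=5.0, loss_watchdog_patience=3)
assert len(build_callbacks(cfg)) == 1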
src/axolotl/utils/callbacks.py: 30 additions & 0 deletions
@@ -124,6 +124,36 @@ def on_step_end(
return control


class LossWatchDogCallback(TrainerCallback):
"""Callback to track loss and stop training if loss is too high"""

def __init__(self, cfg):
self.cfg = cfg
self.logged = False
self.violations = 0
self.threshold = cfg.loss_watchdog_threshold
self.patience = cfg.loss_watchdog_patience or 3

def on_step_end(
self,
_args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**_kwargs,
):
if len(state.log_history) > 0 and "loss" in state.log_history[-1]:
if state.log_history[-1]["loss"] > self.threshold:
self.violations += 1
if self.violations >= self.patience:
LOG.warning(
"Loss is too high, stopping training (loss_watchdog_threshold)"
)
control.should_training_stop = True
else:
self.violations = 0
return control


def bench_eval_callback_factory(trainer, tokenizer):
accuracy = evaluate.load("accuracy")
abcd_idx = [
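To make the stopping behavior concrete, here is a small sketch, not part of the commit, that drives the callback directly with fabricated loss values. _Cfg is a hypothetical stand-in for the axolotl config, and None is passed for the args parameter since the callback never reads it (it is named _args above).

from transformers import TrainerControl, TrainerState

from axolotl.utils.callbacks import LossWatchDogCallback

class _Cfg:
    # Hypothetical config stand-in; only the two watchdog fields are needed.
    loss_watchdog_threshold = 5.0
    loss_watchdog_patience = 3

watchdog = LossWatchDogCallback(_Cfg())
state = TrainerState()      # log_history starts as an empty list
control = TrainerControl()

# Three logged losses in a row above the 5.0 threshold exhaust the patience of 3.
for loss in (6.1, 7.3, 9.8):
    state.log_history.append({"loss": loss})
    watchdog.on_step_end(None, state, control)

print(control.should_training_stop)  # True after the third violation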
