From a74bb8ef38a7cc0c005a0aec98ea02e239e95ae5 Mon Sep 17 00:00:00 2001 From: Joe Early Date: Thu, 13 Jun 2024 10:51:50 +0100 Subject: [PATCH] Check for 'CUDA error: out of memory' with auto-microbatching --- composer/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index c680d1d3d7c..ba455cd78d2 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -307,7 +307,7 @@ def _get_initial_device_train_microbatch_size( def _is_cuda_oom(e: RuntimeError): """Determines if error is CUDA Out of Memory and if auto_microbatching is enabled.""" - if 'CUDA out of memory' in str(e): + if any(s in str(e) for s in ['CUDA out of memory', 'CUDA error: out of memory']): return True # With batch_norm, large batch sizes sometimes result in cuDNN instead of Cuda OOMs. if 'cuDNN error: CUDNN_STATUS_NOT_SUPPORTED. This error may appear if you passed in a non-contiguous input.' in str(