Skip to content

Commit

Permalink
Update NCCL_ASYNC_ERROR_HANDLING env variable (#3267)
Browse files Browse the repository at this point in the history
* Update env variable

* Update CLI env variables according to the installed torch version

* Apply pre-commit formatting

---------

Co-authored-by: Pau Riba <pau.riba@helsing.ai>
Co-authored-by: Pau Riba <priba@cvc.uab.cat>
  • Loading branch information
3 people authored May 13, 2024
1 parent e12bbb7 commit 986a394
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion composer/cli/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import psutil
import torch
from packaging import version

import composer
from composer.loggers.mosaicml_logger import (
Expand Down Expand Up @@ -317,6 +318,13 @@ def _launch_processes(
log.info('Starting distributed environment on local node for global_rank(%s-%s)', base_rank, base_rank + nproc - 1)
log.info('Distributed KV store: tcp://%s:%s', master_addr, master_port)

nccl_env_variable = {
(
'NCCL_ASYNC_ERROR_HANDLING' if version.parse(torch.__version__) < version.parse('2.2.0') else 'TORCH_NCCL_ASYNC_ERROR_HANDLING'
):
'1',
}

for local_rank in range(nproc):
global_rank = base_rank + local_rank
if command_mode and module_mode:
Expand All @@ -339,7 +347,7 @@ def _launch_processes(
MASTER_ADDR=master_addr,
MASTER_PORT=str(master_port),
PYTHONUNBUFFERED='1',
NCCL_ASYNC_ERROR_HANDLING='1',
**nccl_env_variable,
):
# Populate the distributed variables in all launcher args
for arg in training_script_args:
Expand Down

0 comments on commit 986a394

Please sign in to comment.