fix issue for single GPU training/inference. By always initializing process group. Megvii-BaseDetection#1722
rpehkone committed Apr 15, 2024
1 parent 68e0286 commit 85c22a4
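
For context on what the fix enables: with the process group now initialized even when `world_size == 1`, user code launched on a single GPU can call `torch.distributed` helpers exactly as it does in multi-GPU runs. Below is a minimal sketch of such a call site, assuming the usual `from yolox.core import launch` entry point and keyword names matching the parameters visible in the diff (`num_gpus_per_machine`, `num_machines`, `machine_rank`, `backend`, `dist_url`, `args`); the `train` function and its placeholder arguments are hypothetical.

import torch.distributed as dist

from yolox.core import launch


def train(exp, args):
    # Hypothetical user entry point: these torch.distributed calls are valid
    # even on a single GPU once launch() always initializes the process group.
    print(f"rank {dist.get_rank()} / world size {dist.get_world_size()}")


if __name__ == "__main__":
    launch(
        train,
        num_gpus_per_machine=1,   # single-GPU run, the case this commit fixes
        num_machines=1,
        machine_rank=0,
        backend="nccl",
        dist_url="auto",
        args=(None, None),        # placeholder (exp, args) pair for the sketch
    )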
Showing 1 changed file with 26 additions and 17 deletions.

yolox/core/launch.py
@@ -57,17 +57,34 @@ def launch(
         args (tuple): arguments passed to main_func
     """
     world_size = num_machines * num_gpus_per_machine
+    if world_size <= 0:
+        raise ValueError('`world_size` should be positive, currently {}'.format(world_size))
+
+    # Even if `world_size == 1`, we have to initialize the process group,
+    # so the user code can use all the `torch.dist`` facilities. This
+    # makes the code uniform whether there is one or more processes.
+
+    if dist_url == "auto":
+        assert (
+            num_machines == 1
+        ), "`dist_url=auto` cannot work with distributed training."
+        port = _find_free_port()
+        dist_url = f"tcp://127.0.0.1:{port}"
+
+    worker_args = (
+        main_func,
+        world_size,
+        num_gpus_per_machine,
+        machine_rank,
+        backend,
+        dist_url,
+        args,
+    )
+
     if world_size > 1:
         # https://github.com/pytorch/pytorch/pull/14391
         # TODO prctl in spawned processes
 
-        if dist_url == "auto":
-            assert (
-                num_machines == 1
-            ), "dist_url=auto cannot work with distributed training."
-            port = _find_free_port()
-            dist_url = f"tcp://127.0.0.1:{port}"
-
         start_method = "spawn"
         cache = vars(args[1]).get("cache", False)
 
@@ -82,20 +99,12 @@ def launch(
         mp.start_processes(
             _distributed_worker,
             nprocs=num_gpus_per_machine,
-            args=(
-                main_func,
-                world_size,
-                num_gpus_per_machine,
-                machine_rank,
-                backend,
-                dist_url,
-                args,
-            ),
+            args=worker_args,
             daemon=False,
             start_method=start_method,
         )
     else:
-        main_func(*args)
+        _distributed_worker(0, *worker_args)
 
 
 def _distributed_worker(
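
The notable change is the `else` branch: a single-process run (`world_size == 1`) no longer calls `main_func(*args)` directly but goes through the same `_distributed_worker` entry point as spawned processes, so the process group exists before user code runs. The worker's body is collapsed in this view; as a rough sketch of the standard pattern such a worker follows (an assumption, not the repository's exact implementation):

import torch
import torch.distributed as dist


def _worker_sketch(local_rank, main_func, world_size, num_gpus_per_machine,
                   machine_rank, backend, dist_url, args):
    # Hypothetical sketch of the worker entry point: set up the process group
    # first, so torch.distributed behaves identically for world_size == 1
    # and for multi-process launches.
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    dist.init_process_group(
        backend=backend,
        init_method=dist_url,   # e.g. "tcp://127.0.0.1:<free port>" from dist_url="auto"
        world_size=world_size,
        rank=global_rank,
    )
    torch.cuda.set_device(local_rank)
    dist.barrier()              # ensure the group is up before running user code
    main_func(*args)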
