fix distributed devices (#612)
* fix distributed devices

* Update distributed.py

* Update distributed.py
maximegmd authored Sep 21, 2023
1 parent c1382e7 commit 2fe95cd
Showing 1 changed file with 12 additions and 4 deletions.
src/axolotl/utils/distributed.py (12 additions & 4 deletions)
@@ -77,7 +77,9 @@ def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-name
     value_scalar = fn()
     if not is_distributed():
         return [value_scalar]
-    value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float()
+    value_tensor = torch.tensor(
+        value_scalar, device=torch.cuda.current_device()
+    ).float()
 
     if not is_main_process():
         dist.gather(value_tensor, dst=0)
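
For context on the one-line change above: when `device` is passed as a bare int, `torch.tensor` treats it as a CUDA device ordinal. `dist.get_rank()` returns the process's global rank, which matches a valid local GPU index only on a single node with no device remapping, while `torch.cuda.current_device()` returns the local index of the GPU bound to the current process. A minimal sketch of the distinction, assuming CUDA is available and a process group is initialized (the helper name is illustrative, not from the repo):

```python
import torch
import torch.distributed as dist

def local_device() -> int:
    # dist.get_rank() is the GLOBAL rank (0..world_size-1 across all nodes).
    # On a 2-node x 4-GPU job, rank 5 would map to cuda:5, which does not
    # exist on a node with only 4 GPUs -- the failure this commit fixes.
    # torch.cuda.current_device() is the LOCAL ordinal of the GPU the
    # launcher bound to this process, so it is always a valid device.
    return torch.cuda.current_device()

value_tensor = torch.tensor(3.14, device=local_device()).float()
```
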
@@ -137,9 +139,13 @@ def compute_and_broadcast(fn): # pylint: disable=invalid-name
     """
     if is_main_process():
         value_scalar = fn()
-        value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float()
+        value_tensor = torch.tensor(
+            value_scalar, device=torch.cuda.current_device()
+        ).float()
     else:
-        value_tensor = torch.tensor(0.0, device=dist.get_rank())  # Placeholder tensor
+        value_tensor = torch.tensor(
+            0.0, device=torch.cuda.current_device()
+        )  # Placeholder tensor
 
     # Broadcast the tensor to all processes.
     barrier()
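
In compute_and_broadcast, both branches now allocate their tensor on the process's own GPU before the collective runs. A hedged usage sketch of that broadcast pattern (names and the seed example are illustrative, not from the repo):

```python
import torch
import torch.distributed as dist

def broadcast_scalar(fn) -> float:
    # Only rank 0 runs fn(); every other rank allocates a placeholder on its
    # own local GPU so the broadcast has a matching receive buffer.
    if dist.get_rank() == 0:
        value_tensor = torch.tensor(fn(), device=torch.cuda.current_device()).float()
    else:
        value_tensor = torch.tensor(0.0, device=torch.cuda.current_device())
    dist.barrier()
    # src=0 is a rank id; ranks are fine for src/dst, wrong for device=.
    dist.broadcast(value_tensor, src=0)
    return value_tensor.item()

# Example: every rank receives the same randomly drawn seed from rank 0.
# seed = int(broadcast_scalar(lambda: torch.randint(0, 2**31, (1,)).item()))
```
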
@@ -164,7 +170,9 @@ def gather_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-name
     - A list of computed values from all ranks if on the gathering rank, otherwise None.
     """
     value_scalar = fn()
-    value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float()
+    value_tensor = torch.tensor(
+        value_scalar, device=torch.cuda.current_device()
+    ).float()
 
     # Placeholder tensor for gathering results
     if is_main_process():
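
The same device fix applies to the gather path. A sketch of that pattern, assuming an initialized process group whose backend supports `dist.gather` (names are illustrative, not from the repo):

```python
import torch
import torch.distributed as dist

def gather_scalars(fn, world_size: int):
    value_tensor = torch.tensor(fn(), device=torch.cuda.current_device()).float()
    if dist.get_rank() == 0:
        # One receive buffer per rank, each on this process's local GPU.
        buffers = [torch.zeros_like(value_tensor) for _ in range(world_size)]
        dist.gather(value_tensor, gather_list=buffers, dst=0)
        return [t.item() for t in buffers]
    dist.gather(value_tensor, dst=0)  # non-destination ranks pass no gather_list
    return None
```
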
