Commit

deepspeed moe
liuhatry committed Sep 23, 2021
1 parent 8160021 commit 28bc3e2
Showing 7 changed files with 510 additions and 239 deletions.
7 changes: 1 addition & 6 deletions bagua/torch_api/moe/__init__.py
@@ -1,6 +1 @@
-from typing import List
-
-from .moe_layer import MOELayer  # noqa: F401
-from .top2gate import Top2Gate  # noqa: F401
-
-__all__: List[str] = []
+from .layer import MoE
36 changes: 36 additions & 0 deletions bagua/torch_api/moe/experts.py
@@ -0,0 +1,36 @@
# The file has been adapted from DeepSpeed:
# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/experts.py
# Git commit hash: bff6126f0ddbd1a03da66867571ac87b11c21ac1
# We retain the following license from the original files:

# Copyright 2020 The Microsoft DeepSpeed Team

import torch
import copy


class Experts(torch.nn.Module):
    def __init__(self, expert, num_local_experts=1):
        super(Experts, self).__init__()

        self.deepspeed_experts = torch.nn.ModuleList(
            [copy.deepcopy(expert) for i in range(num_local_experts)])
        self.num_local_experts = num_local_experts

        # TODO: revisit allreduce for moe.gate...
        for expert in self.deepspeed_experts:
            # TODO: Create param groups to handle expert + data case (e.g. param.group = moe_group)
            for name, param in expert.named_parameters():
                param.allreduce = False

    def forward(self, inputs):
        chunks = inputs.chunk(self.num_local_experts, dim=1)
        expert_outputs = []
        for chunk, expert in zip(chunks, self.deepspeed_experts):
            out = expert(chunk)
            if type(out) is tuple:
                out = out[0]  # Ignore the bias term for now
            expert_outputs += [out]

        expert_output = torch.cat(expert_outputs, dim=1)
        return expert_output
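
For context, a minimal sketch of how Experts fans an already-routed batch out over its local expert copies. This is not part of the commit: the SimpleExpert module, the tensor shapes, and the standalone usage are illustrative assumptions; it only assumes the bagua.torch_api.moe.experts module added above is importable.

import torch
from bagua.torch_api.moe.experts import Experts

# Hypothetical expert: a tiny MLP standing in for a real FFN block.
class SimpleExpert(torch.nn.Module):
    def __init__(self, hidden_size=8):
        super().__init__()
        self.fc = torch.nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        return torch.relu(self.fc(x))

# Two local expert copies; Experts deep-copies the prototype module.
experts = Experts(SimpleExpert(hidden_size=8), num_local_experts=2)

# Input laid out as (groups, num_local_experts, capacity, hidden):
# dim=1 is chunked so each local expert sees only its own slice.
x = torch.randn(1, 2, 4, 8)
y = experts(x)
print(y.shape)  # torch.Size([1, 2, 4, 8]) -- slices concatenated back along dim=1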
96 changes: 96 additions & 0 deletions bagua/torch_api/moe/layer.py
@@ -0,0 +1,96 @@
# The file has been adapted from DeepSpeed:
# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/layer.py
# Git commit hash: bff6126f0ddbd1a03da66867571ac87b11c21ac1
# We retain the following license from the original files:

# Copyright 2020 The Microsoft DeepSpeed Team

import bagua.torch_api as bagua
import logging
import torch.nn.init as init
import torch
import torch.distributed as dist

#from deepspeed.utils import logger, log_dist

#import deepspeed.utils.groups as groups
from .sharded_moe import MOELayer, TopKGate
from .experts import Experts
import copy
import typing


class MoE(torch.nn.Module):
    def __init__(self,
                 hidden_size,
                 expert,
                 num_local_experts=1,
                 k=1,
                 output_dropout_prob=0.0,
                 capacity_factor=1.,
                 eval_capacity_factor=1.,
                 min_capacity=4,
                 noisy_gate_policy: typing.Optional[str] = None):
        """Initialize an MoE layer.
        Arguments:
            hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.
            expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).
            num_local_experts (int, optional): default=1, number of local experts per gpu.
            k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
            output_dropout_prob (float, optional): default=0.0, output dropout probability.
            capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
            eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
            min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
            noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
        """

        super(MoE, self).__init__()

        assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
            'Unsupported noisy_gate_policy: ' + noisy_gate_policy

        self.num_experts = num_local_experts * bagua.get_world_size()
        logging.info(f'num_experts: {self.num_experts} | num_local_experts: {num_local_experts} | world_size: {bagua.get_world_size()}')

        experts = Experts(expert, num_local_experts)
        self.deepspeed_moe = MOELayer(TopKGate(hidden_size,
                                               self.num_experts,
                                               k,
                                               capacity_factor,
                                               eval_capacity_factor,
                                               min_capacity,
                                               noisy_gate_policy),
                                      experts,
                                      num_local_experts,
                                      group=dist.group.WORLD)

        self.dropout = torch.nn.Dropout(output_dropout_prob)

    def forward(self, hidden_states, used_token=None):
        """MoE forward
        Arguments:
            hidden_states (Tensor): input to the layer
            used_token (Tensor, optional): default: None, mask only used tokens
        Returns:
            A tuple including output, gate loss, and expert count.
            * output (Tensor): output of the model
            * l_aux (Tensor): gate loss value
            * exp_counts (int): expert count
        """
        output = self.deepspeed_moe(hidden_states, used_token)
        output = self.dropout(output)
        return output, self.deepspeed_moe.l_aux, self.deepspeed_moe.exp_counts
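
A usage sketch of the MoE layer above, not part of the commit. It assumes one process per GPU with bagua's distributed backend already initialized (e.g. launched via bagua's launcher with bagua.init_process_group() called); the expert MLP, tensor shapes, and the 0.01 weight on the gate loss are illustrative assumptions.

import torch
import bagua.torch_api as bagua
from bagua.torch_api.moe.layer import MoE

hidden_size = 512

# Expert prototype: a feed-forward block; MoE deep-copies it per local expert.
expert = torch.nn.Sequential(
    torch.nn.Linear(hidden_size, 4 * hidden_size),
    torch.nn.ReLU(),
    torch.nn.Linear(4 * hidden_size, hidden_size),
)

moe = MoE(hidden_size=hidden_size,
          expert=expert,
          num_local_experts=2,          # experts hosted on this rank
          k=1,                          # top-1 gating
          noisy_gate_policy='RSample').cuda()

tokens = torch.randn(16, 10, hidden_size).cuda()  # (batch, seq, hidden)
output, l_aux, exp_counts = moe(tokens)

# l_aux is the load-balancing (gate) loss; add it to the task loss before backward.
loss = output.mean() + 0.01 * l_aux
loss.backward()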
96 changes: 0 additions & 96 deletions bagua/torch_api/moe/moe_layer.py

This file was deleted.
