feat(python): support moe #208
Changes from 27 commits
@@ -0,0 +1 @@
from .layer import MoE  # noqa: F401
@@ -0,0 +1,37 @@
# The file has been adapted from DeepSpeed:
# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/experts.py
# Git commit hash: bff6126f0ddbd1a03da66867571ac87b11c21ac1
# We retain the following license from the original files:

# Copyright 2020 The Microsoft DeepSpeed Team

Review comment: add our license. Reply: done.

import torch
import copy


class Experts(torch.nn.Module):
    def __init__(self, expert, num_local_experts=1):
        super(Experts, self).__init__()

        self.deepspeed_experts = torch.nn.ModuleList(
            [copy.deepcopy(expert) for i in range(num_local_experts)]
        )
        self.num_local_experts = num_local_experts

        # TODO: revisit allreduce for moe.gate...
        for expert in self.deepspeed_experts:
            # TODO: Create param groups to handle expert + data case (e.g. param.group = moe_group)
            for name, param in expert.named_parameters():
                param.allreduce = False

    def forward(self, inputs):
        chunks = inputs.chunk(self.num_local_experts, dim=1)
        expert_outputs = []
        for chunk, expert in zip(chunks, self.deepspeed_experts):
            out = expert(chunk)
            if type(out) is tuple:
                out = out[0]  # Ignore the bias term for now
            expert_outputs += [out]

        expert_output = torch.cat(expert_outputs, dim=1)
        return expert_output
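
For readers skimming the diff, here is a minimal sketch (not part of this PR) of how Experts behaves: it deep-copies the given expert module num_local_experts times, splits the input into one chunk per local expert along dim=1, and concatenates the per-expert outputs back together. The torch.nn.Linear expert and all shapes below are illustrative assumptions.

import torch
# Assumes the Experts class from the diff above is in scope.

hidden_size = 16
expert = torch.nn.Linear(hidden_size, hidden_size)

# Two local experts: the input is expected to carry one chunk per expert
# along dim=1, e.g. (groups, num_local_experts * capacity, hidden_size).
experts = Experts(expert, num_local_experts=2)

inputs = torch.randn(4, 2 * 8, hidden_size)  # 4 groups, capacity 8 per expert (illustrative)
out = experts(inputs)

# Each chunk is processed by its own expert copy and re-concatenated,
# so the output shape matches the input shape here.
assert out.shape == inputs.shape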
@@ -0,0 +1,91 @@
# The file has been adapted from DeepSpeed:
# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/layer.py
# Git commit hash: bff6126f0ddbd1a03da66867571ac87b11c21ac1
# We retain the following license from the original files:

# Copyright 2020 The Microsoft DeepSpeed Team

Review comment: add our license. Reply: done.

import bagua.torch_api as bagua
import logging
import torch
import torch.distributed as dist

from .sharded_moe import MOELayer, TopKGate
from .experts import Experts
import typing


class MoE(torch.nn.Module):
    def __init__(self,
                 hidden_size,
                 expert,
                 num_local_experts=1,
                 k=1,
                 output_dropout_prob=0.0,
                 capacity_factor=1.,
                 eval_capacity_factor=1.,
                 min_capacity=4,
                 noisy_gate_policy: typing.Optional[str] = None):
        """Initialize an MoE layer.

        Arguments:
            hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.

            expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).

            num_local_experts (int, optional): default=1, number of local experts per gpu.

            k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.

            output_dropout_prob (float, optional): default=0.0, output dropout probability.

            capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.

            eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.

            min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.

            noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
        """

        super(MoE, self).__init__()

        assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
            'Unsupported noisy_gate_policy: ' + noisy_gate_policy

        self.num_experts = num_local_experts * bagua.get_world_size()
        logging.info(f'num_experts: {self.num_experts} | num_local_experts: {num_local_experts} | world_size: {bagua.get_world_size()}')

        experts = Experts(expert, num_local_experts)
        self.deepspeed_moe = MOELayer(TopKGate(hidden_size,
                                               self.num_experts,
                                               k,
                                               capacity_factor,
                                               eval_capacity_factor,
                                               min_capacity,
                                               noisy_gate_policy),
                                      experts,
                                      num_local_experts,
                                      group=dist.group.WORLD)

        self.dropout = torch.nn.Dropout(output_dropout_prob)

    def forward(self, hidden_states, used_token=None):
        """ MoE forward

        Arguments:
            hidden_states (Tensor): input to the layer
            used_token (Tensor, optional): default: None, mask only used tokens

        Returns:
            A tuple including output, gate loss, and expert count.

            * output (Tensor): output of the model

            * l_aux (Tensor): gate loss value

            * exp_counts (int): expert count
        """
        output = self.deepspeed_moe(hidden_states, used_token)
        output = self.dropout(output)
        return output, self.deepspeed_moe.l_aux, self.deepspeed_moe.exp_counts
Review comment: move the whole moe directory to bagua/torch_api/model_parallel/moe. Reply: done.
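
As a rough usage sketch of the new layer (not part of this PR): build an expert module whose input and output dimensions equal hidden_size, wrap it in MoE, and consume the (output, l_aux, exp_counts) tuple returned by forward. The import path below follows the review suggestion above and, like the expert definition, shapes, and loss coefficient, is an assumption; torch.distributed and Bagua must already be initialized, and the sharded_moe module referenced by layer.py must be available.

import torch
from bagua.torch_api.model_parallel.moe import MoE  # assumed final path per the review comment

hidden_size = 512

# Any torch.nn.Module whose input and output dimension is hidden_size can serve as the expert.
expert = torch.nn.Sequential(
    torch.nn.Linear(hidden_size, 4 * hidden_size),
    torch.nn.ReLU(),
    torch.nn.Linear(4 * hidden_size, hidden_size),
)

moe = MoE(hidden_size=hidden_size,
          expert=expert,
          num_local_experts=2,
          k=1,
          noisy_gate_policy='RSample')

hidden_states = torch.randn(8, 128, hidden_size)  # (batch, sequence, hidden), illustrative
output, l_aux, exp_counts = moe(hidden_states)

# l_aux is the gate's load-balancing loss; during training it would typically be
# added to the task loss, e.g. loss = task_loss + 0.01 * l_aux (coefficient is an assumption).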