feat(python): support moe #208
Changes from 27 commits
@@ -0,0 +1 @@
from .layer import MoE  # noqa: F401
@@ -0,0 +1,37 @@
# The file has been adapted from DeepSpeed:
# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/experts.py
# Git commit hash: bff6126f0ddbd1a03da66867571ac87b11c21ac1
# We retain the following license from the original files:

# Copyright 2020 The Microsoft DeepSpeed Team

Review comment: add our license. Reply: done.

import torch
import copy


class Experts(torch.nn.Module):
    def __init__(self, expert, num_local_experts=1):
        super(Experts, self).__init__()

        self.deepspeed_experts = torch.nn.ModuleList(
            [copy.deepcopy(expert) for i in range(num_local_experts)]
        )
        self.num_local_experts = num_local_experts

        # TODO: revisit allreduce for moe.gate...
        for expert in self.deepspeed_experts:
            # TODO: Create param groups to handle expert + data case (e.g. param.group = moe_group)
            for name, param in expert.named_parameters():
                param.allreduce = False

    def forward(self, inputs):
        chunks = inputs.chunk(self.num_local_experts, dim=1)
        expert_outputs = []
        for chunk, expert in zip(chunks, self.deepspeed_experts):
            out = expert(chunk)
            if type(out) is tuple:
                out = out[0]  # Ignore the bias term for now
            expert_outputs += [out]

        expert_output = torch.cat(expert_outputs, dim=1)
        return expert_output
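
For readers skimming the diff, here is a minimal sketch (not part of this PR) of how Experts behaves: it deep-copies the given expert module num_local_experts times, splits the input into one chunk per local expert along dim=1, and concatenates the per-expert outputs back together. The torch.nn.Linear expert and all shapes below are illustrative assumptions.

import torch
# Assumes the Experts class from the diff above is in scope.

hidden_size = 16
expert = torch.nn.Linear(hidden_size, hidden_size)

# Two local experts: the input is expected to carry one chunk per expert
# along dim=1, e.g. (groups, num_local_experts * capacity, hidden_size).
experts = Experts(expert, num_local_experts=2)

inputs = torch.randn(4, 2 * 8, hidden_size)  # 4 groups, capacity 8 per expert (illustrative)
out = experts(inputs)

# Each chunk is processed by its own expert copy and re-concatenated,
# so the output shape matches the input shape here.
assert out.shape == inputs.shape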
@@ -0,0 +1,91 @@
# The file has been adapted from DeepSpeed:
# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/layer.py
# Git commit hash: bff6126f0ddbd1a03da66867571ac87b11c21ac1
# We retain the following license from the original files:

# Copyright 2020 The Microsoft DeepSpeed Team

Review comment: add our license. Reply: done.

import bagua.torch_api as bagua
import logging
import torch
import torch.distributed as dist

from .sharded_moe import MOELayer, TopKGate
from .experts import Experts
import typing


class MoE(torch.nn.Module):
    def __init__(self,
                 hidden_size,
                 expert,
                 num_local_experts=1,
                 k=1,
                 output_dropout_prob=0.0,
                 capacity_factor=1.,
                 eval_capacity_factor=1.,
                 min_capacity=4,
                 noisy_gate_policy: typing.Optional[str] = None):
        """Initialize an MoE layer.

        Arguments:
            hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.

            expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).

            num_local_experts (int, optional): default=1, number of local experts per gpu.

            k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.

            output_dropout_prob (float, optional): default=0.0, output dropout probability.

            capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.

            eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.

            min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.

            noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
        """

        super(MoE, self).__init__()

        assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
            'Unsupported noisy_gate_policy: ' + noisy_gate_policy

        self.num_experts = num_local_experts * bagua.get_world_size()
        logging.info(f'num_experts: {self.num_experts} | num_local_experts: {num_local_experts} | world_size: {bagua.get_world_size()}')

        experts = Experts(expert, num_local_experts)
        self.deepspeed_moe = MOELayer(TopKGate(hidden_size,
                                               self.num_experts,
                                               k,
                                               capacity_factor,
                                               eval_capacity_factor,
                                               min_capacity,
                                               noisy_gate_policy),
                                      experts,
                                      num_local_experts,
                                      group=dist.group.WORLD)

        self.dropout = torch.nn.Dropout(output_dropout_prob)

    def forward(self, hidden_states, used_token=None):
        """ MoE forward

        Arguments:
            hidden_states (Tensor): input to the layer
            used_token (Tensor, optional): default: None, mask only used tokens

        Returns:
            A tuple including output, gate loss, and expert count.

            * output (Tensor): output of the model

            * l_aux (Tensor): gate loss value

            * exp_counts (int): expert count
        """
        output = self.deepspeed_moe(hidden_states, used_token)
        output = self.dropout(output)
        return output, self.deepspeed_moe.l_aux, self.deepspeed_moe.exp_counts
Review comment: move the whole moe directory to bagua/torch_api/model_parallel/moe. Reply: done.
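
As a rough usage sketch of the new layer (not part of this PR): build an expert module whose input and output dimensions equal hidden_size, wrap it in MoE, and consume the (output, l_aux, exp_counts) tuple returned by forward. The import path below follows the review suggestion above and, like the expert definition, shapes, and loss coefficient, is an assumption; torch.distributed and Bagua must already be initialized, and the sharded_moe module referenced by layer.py must be available.

import torch
from bagua.torch_api.model_parallel.moe import MoE  # assumed final path per the review comment

hidden_size = 512

# Any torch.nn.Module whose input and output dimension is hidden_size can serve as the expert.
expert = torch.nn.Sequential(
    torch.nn.Linear(hidden_size, 4 * hidden_size),
    torch.nn.ReLU(),
    torch.nn.Linear(4 * hidden_size, hidden_size),
)

moe = MoE(hidden_size=hidden_size,
          expert=expert,
          num_local_experts=2,
          k=1,
          noisy_gate_policy='RSample')

hidden_states = torch.randn(8, 128, hidden_size)  # (batch, sequence, hidden), illustrative
output, l_aux, exp_counts = moe(hidden_states)

# l_aux is the gate's load-balancing loss; during training it would typically be
# added to the task loss, e.g. loss = task_loss + 0.01 * l_aux (coefficient is an assumption).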