
Commit

Merge pull request PaddlePaddle#9 from mthreads/kernels
Kernels
caizhi-mt authored and mt-robot committed Jul 26, 2023
2 parents ae56102 + 7e92cf7 commit 8d14d0b
Showing 80 changed files with 740 additions and 105 deletions.
16 changes: 16 additions & 0 deletions paddle/phi/kernels/autotune/gpu_timer.h
@@ -23,6 +23,9 @@
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#ifdef PADDLE_WITH_MUSA
#include <musa_runtime.h>
#endif

namespace phi {

@@ -32,6 +35,9 @@ class GpuTimer {
#ifdef PADDLE_WITH_HIP
hipEventCreate(&start_);
hipEventCreate(&stop_);
#elif defined(PADDLE_WITH_MUSA)
musaEventCreate(&start_);
musaEventCreate(&stop_);
#else
cudaEventCreate(&start_);
cudaEventCreate(&stop_);
@@ -46,6 +52,9 @@ class GpuTimer {
#ifdef PADDLE_WITH_HIP
hipEventDestroy(start_);
hipEventDestroy(stop_);
#elif defined(PADDLE_WITH_MUSA)
musaEventDestroy(start_);
musaEventDestroy(stop_);
#else
cudaEventDestroy(start_);
cudaEventDestroy(stop_);
@@ -55,6 +64,8 @@ class GpuTimer {
void Start(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
hipEventRecord(start_, stream);
#elif defined(PADDLE_WITH_MUSA)
musaEventRecord(start_, stream);
#else
cudaEventRecord(start_, stream);
#endif
@@ -63,6 +74,8 @@ class GpuTimer {
void Stop(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
hipEventRecord(stop_, stream);
#elif defined(PADDLE_WITH_MUSA)
musaEventRecord(stop_, stream);
#else
cudaEventRecord(stop_, stream);
#endif
@@ -73,6 +86,9 @@ class GpuTimer {
#ifdef PADDLE_WITH_HIP
hipEventSynchronize(stop_);
hipEventElapsedTime(&milliseconds, start_, stop_);
#elif defined(PADDLE_WITH_MUSA)
musaEventSynchronize(stop_);
musaEventElapsedTime(&milliseconds, start_, stop_);
#else
cudaEventSynchronize(stop_);
cudaEventElapsedTime(&milliseconds, start_, stop_);
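For reference, a minimal usage sketch of the timer with the new branch (assuming the musaEvent* calls behave one-for-one like their cudaEvent*/hipEvent* counterparts, that stream is a valid gpuStream_t, and that the elapsed-time accessor keeps its existing ElapsedTime() name):

phi::GpuTimer timer;             // constructor picks musaEventCreate under PADDLE_WITH_MUSA
timer.Start(stream);             // musaEventRecord(start_, stream)
// ... enqueue any device work on `stream` ...
timer.Stop(stream);              // musaEventRecord(stop_, stream)
float ms = timer.ElapsedTime();  // musaEventSynchronize(stop_) + musaEventElapsedTime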
2 changes: 1 addition & 1 deletion paddle/phi/kernels/batch_norm_kernel.cc
@@ -97,7 +97,7 @@ PD_REGISTER_KERNEL(batch_norm_infer,
}
#endif
#endif
#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
PD_REGISTER_KERNEL(batch_norm_infer,
GPU,
ALL_LAYOUT,
2 changes: 1 addition & 1 deletion paddle/phi/kernels/coalesce_tensor_kernel.cc
@@ -292,7 +292,7 @@ PD_REGISTER_KERNEL(coalesce_tensor,
}
#endif

#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
PD_REGISTER_KERNEL(coalesce_tensor,
GPU,
ALL_LAYOUT,
2 changes: 1 addition & 1 deletion paddle/phi/kernels/cpu/gelu_grad_kernel.cc
@@ -64,7 +64,7 @@ struct GeluGradFunctor {
} else {
#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_HIP)
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA)
auto x_data = x.data();
auto dx_data = dx.data();
auto dout_data = dout.data();
2 changes: 1 addition & 1 deletion paddle/phi/kernels/cpu/gelu_kernel.cc
@@ -53,7 +53,7 @@ struct GeluFunctor {
} else {
#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_HIP)
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA)
auto x_data = x.data();
auto out_data = out.data();
int n = std::min(x.size(), out.size());
9 changes: 6 additions & 3 deletions paddle/phi/kernels/funcs/blas/blas.h
@@ -175,7 +175,7 @@ class Blas {
T* c,
const int* ldc) const;

#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA)
template <typename T>
void MatMulWithHead(const phi::DenseTensor& mat_a,
const MatDescriptor& dim_a,
@@ -303,7 +303,7 @@ class Blas {
int batchCount) const;

#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_HIP)
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA)
template <typename T>
void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
@@ -445,7 +445,7 @@ class BlasT : private Blas<DeviceContext> {
Base()->template CSRMM<T>(args...);
}

#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA)
template <typename... ARGS>
void MatMulWithHead(ARGS... args) const {
Base()->template MatMulWithHead<T>(args...);
@@ -593,3 +593,6 @@ inline BlasT<DeviceContext, T> GetBlas(const DeviceContext& dev_ctx) {
#ifdef PADDLE_WITH_HIP
#include "paddle/phi/kernels/funcs/blas/blas_impl.hip.h"
#endif
#ifdef PADDLE_WITH_MUSA
// TODO
#endif
4 changes: 2 additions & 2 deletions paddle/phi/kernels/funcs/blas/blas_impl.h
@@ -1452,7 +1452,7 @@ void Blas<phi::CPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}

#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) // @{ Group Blas MKLML: BatchedGEMMWithHead
template <>
template <typename T>
void Blas<phi::CPUContext>::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA,
@@ -1698,7 +1698,7 @@ void Blas<DeviceContext>::MatMul(const T *mat_a,
}

#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_HIP)
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA)
// @{ Group Blas MKLML: MatMulWithHead
/*
* Multiple two matrixes with multiple heads
19 changes: 19 additions & 0 deletions paddle/phi/kernels/funcs/dropout_impl.cu.h
@@ -24,6 +24,10 @@ limitations under the License. */
#include <hip/hip_runtime.h>
#include <hiprand_kernel.h>
#endif
#ifdef PADDLE_WITH_MUSA
#include <musa_runtime.h>
#include <murand_kernel.h>
#endif

#include "paddle/phi/kernels/funcs/dropout_impl_util.h"

@@ -142,6 +146,10 @@ __global__ void VectorizedRandomGenerator(const size_t n,
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = hiprandStatePhilox4_32_10_t;
#elif defined(PADDLE_WITH_MUSA)
murand_state_philox4x32_10 state;
murand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = murand_state_philox4x32_10;
#else
curandStatePhilox4_32_10_t state;
curand_init(seed, idx + THREAD_ID_X, increment, &state);
@@ -212,6 +220,10 @@ __global__ void VectorizedGeneratorMask(const size_t n,
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = hiprandStatePhilox4_32_10_t;
#elif defined(PADDLE_WITH_MUSA)
murand_state_philox4x32_10 state;
murand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = murand_state_philox4x32_10;
#else
curandStatePhilox4_32_10_t state;
curand_init(seed, idx + THREAD_ID_X, increment, &state);
@@ -295,6 +307,11 @@ void DropoutFwGPUKernelDriver(
hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream));
#elif defined(PADDLE_WITH_MUSA)
PADDLE_ENFORCE_GPU_SUCCESS(
musaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
PADDLE_ENFORCE_GPU_SUCCESS(
musaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
@@ -430,6 +447,8 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
if (upscale_in_train && dropout_prob == 1.0f) {
#ifdef PADDLE_WITH_HIP
hipMemset(grad_x->data<T>(), 0, grad_x->numel() * sizeof(T));
#elif defined(PADDLE_WITH_MUSA)
musaMemset(grad_x->data<T>(), 0, grad_x->numel() * sizeof(T));
#else
cudaMemset(grad_x->data<T>(), 0, grad_x->numel() * sizeof(T));
#endif
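The same three-way Philox dispatch appears in VectorizedGeneratorMask above and again in fused_dropout_add_kernel.cu below; once SType is fixed, the rest of a kernel can stay backend-agnostic. A sketch of such a helper (murand_uniform4 is assumed to mirror the float4-returning curand_uniform4/hiprand_uniform4 signature):

template <typename SType>
__device__ inline float4 UniformFour(SType* state) {
#ifdef PADDLE_WITH_HIP
  return hiprand_uniform4(state);
#elif defined(PADDLE_WITH_MUSA)
  return murand_uniform4(state);  // assumed MUSA analogue of curand_uniform4
#else
  return curand_uniform4(state);
#endif
}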
6 changes: 3 additions & 3 deletions paddle/phi/kernels/funcs/embedding_grad.h
@@ -96,7 +96,7 @@ __global__ void EmbeddingGradDeterministicKernel(T* table,
unsigned long long int matchmask = // NOLINT
__ballot(match_found_this_thread); // NOLINT
int first_remaining_peer = __ffsll(matchmask) - 1;
#else
#else // MUSA and CUDA
// If and only if match_found_this_thread of the Nth thread is non-zero,
// set the Nth bit of matchmask to 1.
unsigned int matchmask =
@@ -112,7 +112,7 @@ __global__ void EmbeddingGradDeterministicKernel(T* table,
while (matchmask) {
#ifdef PADDLE_WITH_HIP
first_remaining_peer = __ffsll(matchmask) - 1;
#else
#else // CUDA and MUSA
first_remaining_peer = __ffs(matchmask) - 1;
#endif
my_s[threadIdx.x] +=
@@ -142,7 +142,7 @@ void LaunchEmbeddingGradDeterministicKernel(const GPUContext& ctx,
#ifdef PADDLE_WITH_HIP
constexpr int kWarpSize = 64;
constexpr int kBlockDimY = 16;
#else
#else // CUDA and MUSA
constexpr int kWarpSize = 32;
constexpr int kBlockDimY = 32;
#endif
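As the new comments spell out, HIP works on a 64-lane wavefront (64-bit ballot mask, __ffsll), while CUDA and MUSA use a 32-lane warp (32-bit mask, __ffs). Condensed, and assuming MUSA accepts the CUDA-style __ballot_sync(0xffffffff, pred) that the truncated CUDA branch continues with:

#ifdef PADDLE_WITH_HIP
  unsigned long long int matchmask = __ballot(match_found_this_thread);  // 64-bit wavefront mask
  int first_remaining_peer = __ffsll(matchmask) - 1;
#else  // CUDA and MUSA
  unsigned int matchmask = __ballot_sync(0xffffffff, match_found_this_thread);  // 32-bit warp mask
  int first_remaining_peer = __ffs(matchmask) - 1;
#endif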
7 changes: 6 additions & 1 deletion paddle/phi/kernels/funcs/fft.cu
@@ -104,7 +104,7 @@ inline bool use_cache(const int64_t* signal_size) {
}
return using_cache;
}
#elif defined(PADDLE_WITH_HIP)
#elif defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
inline bool use_cache(const int64_t* signal_size) { return true; }
#endif

@@ -200,6 +200,11 @@ void exec_fft(const phi::GPUContext& ctx,
phi::dynload::hipfftSetStream(config->plan(), ctx.stream()));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::hipfftSetWorkArea(config->plan(), workspace_tensor.data()));
#elif defined(PADDLE_WITH_MUSA)
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::mufftSetStream(config->plan(), ctx.stream()));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::mufftSetWorkArea(config->plan(), workspace_tensor.data()));
#endif

// execution of fft plan
2 changes: 2 additions & 0 deletions paddle/phi/kernels/funcs/fft_cache.h
@@ -25,6 +25,8 @@
#include "paddle/phi/kernels/funcs/cufft_util.h"
#elif defined(PADDLE_WITH_HIP)
#include "paddle/phi/kernels/funcs/hipfft_util.h"
#elif defined(PADDLE_WITH_MUSA)
#include "paddle/phi/kernels/funcs/mufft_util.h"
#endif

namespace phi {
4 changes: 2 additions & 2 deletions paddle/phi/kernels/funcs/layer_norm_impl.cu.h
@@ -1350,7 +1350,7 @@ __global__ void LayerNormBackwardComputeGradInput(const T *__restrict__ dout,
// WARP_SHFL_XOR(sum_loss, mask);
sum_loss1 += __shfl_xor(sum_loss1, mask, warpSize);
sum_loss2 += __shfl_xor(sum_loss2, mask, warpSize);
#else
#else // CUDA and MUSA
// WARP_SHFL_XOR(sum_loss, mask);
sum_loss1 += __shfl_xor_sync(0xffffffff, sum_loss1, mask, warpSize);
sum_loss2 += __shfl_xor_sync(0xffffffff, sum_loss2, mask, warpSize);
@@ -1501,7 +1501,7 @@ __global__ void LayerNormBackwardComputeGradInputWithSmallFeatureSize(
// WARP_SHFL_XOR(sum_loss, mask);
sum_loss1 += __shfl_xor(sum_loss1, mask, warpSize);
sum_loss2 += __shfl_xor(sum_loss2, mask, warpSize);
#else
#else // CUDA and MUSA
// WARP_SHFL_XOR(sum_loss, mask);
sum_loss1 += __shfl_xor_sync(0xffffffff, sum_loss1, mask, WarpSize);
sum_loss2 += __shfl_xor_sync(0xffffffff, sum_loss2, mask, WarpSize);
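Both hunks are the same butterfly reduction: HIP keeps the legacy, mask-free __shfl_xor for its 64-lane wavefront, while CUDA and MUSA take the mask-bearing __shfl_xor_sync. A condensed sketch, assuming MUSA accepts the full 0xffffffff mask exactly as CUDA does:

template <typename T>
__device__ inline T WarpReduceSum(T val) {
  for (int mask = warpSize / 2; mask > 0; mask >>= 1) {
#ifdef PADDLE_WITH_HIP
    val += __shfl_xor(val, mask, warpSize);
#else  // CUDA and MUSA
    val += __shfl_xor_sync(0xffffffff, val, mask, warpSize);
#endif
  }
  return val;
}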
3 changes: 3 additions & 0 deletions paddle/phi/kernels/funcs/math_cuda_utils.h
@@ -20,6 +20,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_HIP
#include <hip/hip_fp16.h>
#endif
#ifdef PADDLE_WITH_MUSA
#include <musa_fp16.h>
#endif

#include <algorithm>

3 changes: 3 additions & 0 deletions paddle/phi/kernels/funcs/select_impl.cu.h
@@ -23,6 +23,9 @@
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#ifdef __MCC__
//TODO
#endif

#include <algorithm>
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
4 changes: 4 additions & 0 deletions paddle/phi/kernels/funcs/softmax.cu
@@ -60,6 +60,8 @@ void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
context.template Alloc<T>(Y),
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
#elif defined(PADDLE_WITH_MUSA)
// TODO
#else
cudnnTensorDescriptor_t cudnn_x_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims);
@@ -117,6 +119,8 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
context.template Alloc<T>(XGrad),
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
#elif defined(PADDLE_WITH_MUSA)
// TODO
#else
cudnnTensorDescriptor_t cudnn_y_desc =
yDesc.descriptor<T>(layout, cudnn_tensor_dims);
3 changes: 3 additions & 0 deletions paddle/phi/kernels/funcs/sparse/sparse_blas.h
@@ -100,3 +100,6 @@ inline SparseBlasT<DeviceContext, T> GetSparseBlas(
#if defined(PADDLE_WITH_HIP) && HIP_VERSION >= 402
#include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h"
#endif
#if defined(PADDLE_WITH_MUSA)
#include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.mu.h"
#endif
4 changes: 4 additions & 0 deletions paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
@@ -91,6 +91,10 @@ __global__ void VectorizedDropoutForward(const size_t n,
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = hiprandStatePhilox4_32_10_t;
#elif defined(PADDLE_WITH_MUSA)
murand_state_philox4x32_10 state;
murand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = murand_state_philox4x32_10;
#else
curandStatePhilox4_32_10_t state;
curand_init(seed, idx + THREAD_ID_X, increment, &state);
6 changes: 5 additions & 1 deletion paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h
@@ -22,14 +22,18 @@
#include <hip/hip_runtime.h>
#include <hiprand_kernel.h>
#endif
#ifdef PADDLE_WITH_MUSA
#include <musa_runtime.h>
#include <murand_kernel.h>
#endif

#include "paddle/phi/kernels/funcs/aligned_vector.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)

#ifdef PADDLE_WITH_HIP
#define WARP_SIZE 64
#else
#else // MUSA & CUDA
#define WARP_SIZE 32
#endif

2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -299,7 +299,7 @@ void HardSwishGradKernel(const Context& dev_ctx,

} // namespace phi

#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
PD_REGISTER_KERNEL(relu_grad,
GPU,
ALL_LAYOUT,
2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/activation_kernel.cu
@@ -186,7 +186,7 @@ PD_REGISTER_KERNEL(relu,
float,
double,
phi::dtype::float16) {}
#else
#else // CUDA & MUSA
PD_REGISTER_KERNEL(relu,
GPU,
ALL_LAYOUT,
2 changes: 2 additions & 0 deletions paddle/phi/kernels/gpu/allclose_kernel.cu
@@ -87,6 +87,8 @@ void AllCloseKernel(const Context& dev_ctx,
grid = (grid > block) ? block : grid;
#ifdef PADDLE_WITH_HIP
hipMemset(out_data, true, sizeof(bool));
#elif defined(PADDLE_WITH_MUSA)
musaMemset(out_data, true, sizeof(bool));
#else
cudaMemset(out_data, true, sizeof(bool));
#endif