
Commit

Merge pull request PaddlePaddle#9 from mthreads/kernels
Kernels
caizhi-mt authored and mt-robot committed Jul 26, 2023
2 parents ae56102 + 7e92cf7 commit 8d14d0b
Showing 80 changed files with 740 additions and 105 deletions.
16 changes: 16 additions & 0 deletions paddle/phi/kernels/autotune/gpu_timer.h
@@ -23,6 +23,9 @@
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#ifdef PADDLE_WITH_MUSA
#include <musa_runtime.h>
#endif

namespace phi {

@@ -32,6 +35,9 @@ class GpuTimer {
#ifdef PADDLE_WITH_HIP
hipEventCreate(&start_);
hipEventCreate(&stop_);
#elif defined(PADDLE_WITH_MUSA)
musaEventCreate(&start_);
musaEventCreate(&stop_);
#else
cudaEventCreate(&start_);
cudaEventCreate(&stop_);
@@ -46,6 +52,9 @@ class GpuTimer {
#ifdef PADDLE_WITH_HIP
hipEventDestroy(start_);
hipEventDestroy(stop_);
#elif defined(PADDLE_WITH_MUSA)
musaEventDestroy(start_);
musaEventDestroy(stop_);
#else
cudaEventDestroy(start_);
cudaEventDestroy(stop_);
@@ -55,6 +64,8 @@ class GpuTimer {
void Start(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
hipEventRecord(start_, stream);
#elif defined(PADDLE_WITH_MUSA)
musaEventRecord(start_, stream);
#else
cudaEventRecord(start_, stream);
#endif
@@ -63,6 +74,8 @@ class GpuTimer {
void Stop(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
hipEventRecord(stop_, stream);
#elif defined(PADDLE_WITH_MUSA)
musaEventRecord(stop_, stream);
#else
cudaEventRecord(stop_, stream);
#endif
@@ -73,6 +86,9 @@ class GpuTimer {
#ifdef PADDLE_WITH_HIP
hipEventSynchronize(stop_);
hipEventElapsedTime(&milliseconds, start_, stop_);
#elif defined(PADDLE_WITH_MUSA)
musaEventSynchronize(stop_);
musaEventElapsedTime(&milliseconds, start_, stop_);
#else
cudaEventSynchronize(stop_);
cudaEventElapsedTime(&milliseconds, start_, stop_);
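For reference, a minimal usage sketch of the timer with the new branch (assuming the musaEvent* calls behave one-for-one like their cudaEvent*/hipEvent* counterparts, that stream is a valid gpuStream_t, and that the elapsed-time accessor keeps its existing ElapsedTime() name):

phi::GpuTimer timer;             // constructor picks musaEventCreate under PADDLE_WITH_MUSA
timer.Start(stream);             // musaEventRecord(start_, stream)
// ... enqueue any device work on `stream` ...
timer.Stop(stream);              // musaEventRecord(stop_, stream)
float ms = timer.ElapsedTime();  // musaEventSynchronize(stop_) + musaEventElapsedTime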
2 changes: 1 addition & 1 deletion paddle/phi/kernels/batch_norm_kernel.cc
@@ -97,7 +97,7 @@ PD_REGISTER_KERNEL(batch_norm_infer,
}
#endif
#endif
#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
PD_REGISTER_KERNEL(batch_norm_infer,
GPU,
ALL_LAYOUT,
2 changes: 1 addition & 1 deletion paddle/phi/kernels/coalesce_tensor_kernel.cc
@@ -292,7 +292,7 @@ PD_REGISTER_KERNEL(coalesce_tensor,
}
#endif

#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
PD_REGISTER_KERNEL(coalesce_tensor,
GPU,
ALL_LAYOUT,
2 changes: 1 addition & 1 deletion paddle/phi/kernels/cpu/gelu_grad_kernel.cc
@@ -64,7 +64,7 @@ struct GeluGradFunctor {
} else {
#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_HIP)
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA)
auto x_data = x.data();
auto dx_data = dx.data();
auto dout_data = dout.data();
2 changes: 1 addition & 1 deletion paddle/phi/kernels/cpu/gelu_kernel.cc
@@ -53,7 +53,7 @@ struct GeluFunctor {
} else {
#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_HIP)
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA)
auto x_data = x.data();
auto out_data = out.data();
int n = std::min(x.size(), out.size());
9 changes: 6 additions & 3 deletions paddle/phi/kernels/funcs/blas/blas.h
@@ -175,7 +175,7 @@ class Blas {
T* c,
const int* ldc) const;

#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA)
template <typename T>
void MatMulWithHead(const phi::DenseTensor& mat_a,
const MatDescriptor& dim_a,
@@ -303,7 +303,7 @@ class Blas {
int batchCount) const;

#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_HIP)
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA)
template <typename T>
void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
@@ -445,7 +445,7 @@ class BlasT : private Blas<DeviceContext> {
Base()->template CSRMM<T>(args...);
}

#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA)
template <typename... ARGS>
void MatMulWithHead(ARGS... args) const {
Base()->template MatMulWithHead<T>(args...);
@@ -593,3 +593,6 @@ inline BlasT<DeviceContext, T> GetBlas(const DeviceContext& dev_ctx) {
#ifdef PADDLE_WITH_HIP
#include "paddle/phi/kernels/funcs/blas/blas_impl.hip.h"
#endif
#ifdef PADDLE_WITH_MUSA
// TODO
#endif
4 changes: 2 additions & 2 deletions paddle/phi/kernels/funcs/blas/blas_impl.h
@@ -1452,7 +1452,7 @@ void Blas<phi::CPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
}

#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) // @{ Group Blas MKLML: BatchedGEMMWithHead
template <>
template <typename T>
void Blas<phi::CPUContext>::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA,
@@ -1698,7 +1698,7 @@ void Blas<DeviceContext>::MatMul(const T *mat_a,
}

#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_HIP)
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA)
// @{ Group Blas MKLML: MatMulWithHead
/*
* Multiple two matrixes with multiple heads
19 changes: 19 additions & 0 deletions paddle/phi/kernels/funcs/dropout_impl.cu.h
@@ -24,6 +24,10 @@ limitations under the License. */
#include <hip/hip_runtime.h>
#include <hiprand_kernel.h>
#endif
#ifdef PADDLE_WITH_MUSA
#include <musa_runtime.h>
#include <murand_kernel.h>
#endif

#include "paddle/phi/kernels/funcs/dropout_impl_util.h"

@@ -142,6 +146,10 @@ __global__ void VectorizedRandomGenerator(const size_t n,
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = hiprandStatePhilox4_32_10_t;
#elif defined(PADDLE_WITH_MUSA)
murand_state_philox4x32_10 state;
murand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = murand_state_philox4x32_10;
#else
curandStatePhilox4_32_10_t state;
curand_init(seed, idx + THREAD_ID_X, increment, &state);
@@ -212,6 +220,10 @@ __global__ void VectorizedGeneratorMask(const size_t n,
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = hiprandStatePhilox4_32_10_t;
#elif defined(PADDLE_WITH_MUSA)
murand_state_philox4x32_10 state;
murand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = murand_state_philox4x32_10;
#else
curandStatePhilox4_32_10_t state;
curand_init(seed, idx + THREAD_ID_X, increment, &state);
@@ -295,6 +307,11 @@ void DropoutFwGPUKernelDriver(
hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream));
#elif defined(PADDLE_WITH_MUSA)
PADDLE_ENFORCE_GPU_SUCCESS(
musaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
PADDLE_ENFORCE_GPU_SUCCESS(
musaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
@@ -430,6 +447,8 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
if (upscale_in_train && dropout_prob == 1.0f) {
#ifdef PADDLE_WITH_HIP
hipMemset(grad_x->data<T>(), 0, grad_x->numel() * sizeof(T));
#elif defined(PADDLE_WITH_MUSA)
musaMemset(grad_x->data<T>(), 0, grad_x->numel() * sizeof(T));
#else
cudaMemset(grad_x->data<T>(), 0, grad_x->numel() * sizeof(T));
#endif
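The same three-way Philox dispatch appears in VectorizedGeneratorMask above and again in fused_dropout_add_kernel.cu below; once SType is fixed, the rest of a kernel can stay backend-agnostic. A sketch of such a helper (murand_uniform4 is assumed to mirror the float4-returning curand_uniform4/hiprand_uniform4 signature):

template <typename SType>
__device__ inline float4 UniformFour(SType* state) {
#ifdef PADDLE_WITH_HIP
  return hiprand_uniform4(state);
#elif defined(PADDLE_WITH_MUSA)
  return murand_uniform4(state);  // assumed MUSA analogue of curand_uniform4
#else
  return curand_uniform4(state);
#endif
}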
6 changes: 3 additions & 3 deletions paddle/phi/kernels/funcs/embedding_grad.h
@@ -96,7 +96,7 @@ __global__ void EmbeddingGradDeterministicKernel(T* table,
unsigned long long int matchmask = // NOLINT
__ballot(match_found_this_thread); // NOLINT
int first_remaining_peer = __ffsll(matchmask) - 1;
#else
#else // MUSA and CUDA
// If and only if match_found_this_thread of the Nth thread is non-zero,
// set the Nth bit of matchmask to 1.
unsigned int matchmask =
@@ -112,7 +112,7 @@ __global__ void EmbeddingGradDeterministicKernel(T* table,
while (matchmask) {
#ifdef PADDLE_WITH_HIP
first_remaining_peer = __ffsll(matchmask) - 1;
#else
#else // CUDA and MUSA
first_remaining_peer = __ffs(matchmask) - 1;
#endif
my_s[threadIdx.x] +=
@@ -142,7 +142,7 @@ void LaunchEmbeddingGradDeterministicKernel(const GPUContext& ctx,
#ifdef PADDLE_WITH_HIP
constexpr int kWarpSize = 64;
constexpr int kBlockDimY = 16;
#else
#else // CUDA and MUSA
constexpr int kWarpSize = 32;
constexpr int kBlockDimY = 32;
#endif
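As the new comments spell out, HIP works on a 64-lane wavefront (64-bit ballot mask, __ffsll), while CUDA and MUSA use a 32-lane warp (32-bit mask, __ffs). Condensed, and assuming MUSA accepts the CUDA-style __ballot_sync(0xffffffff, pred) that the truncated CUDA branch continues with:

#ifdef PADDLE_WITH_HIP
  unsigned long long int matchmask = __ballot(match_found_this_thread);  // 64-bit wavefront mask
  int first_remaining_peer = __ffsll(matchmask) - 1;
#else  // CUDA and MUSA
  unsigned int matchmask = __ballot_sync(0xffffffff, match_found_this_thread);  // 32-bit warp mask
  int first_remaining_peer = __ffs(matchmask) - 1;
#endif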
7 changes: 6 additions & 1 deletion paddle/phi/kernels/funcs/fft.cu
@@ -104,7 +104,7 @@ inline bool use_cache(const int64_t* signal_size) {
}
return using_cache;
}
#elif defined(PADDLE_WITH_HIP)
#elif defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
inline bool use_cache(const int64_t* signal_size) { return true; }
#endif

@@ -200,6 +200,11 @@ void exec_fft(const phi::GPUContext& ctx,
phi::dynload::hipfftSetStream(config->plan(), ctx.stream()));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::hipfftSetWorkArea(config->plan(), workspace_tensor.data()));
#elif defined(PADDLE_WITH_MUSA)
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::mufftSetStream(config->plan(), ctx.stream()));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::mufftSetWorkArea(config->plan(), workspace_tensor.data()));
#endif

// execution of fft plan
2 changes: 2 additions & 0 deletions paddle/phi/kernels/funcs/fft_cache.h
@@ -25,6 +25,8 @@
#include "paddle/phi/kernels/funcs/cufft_util.h"
#elif defined(PADDLE_WITH_HIP)
#include "paddle/phi/kernels/funcs/hipfft_util.h"
#elif defined(PADDLE_WITH_MUSA)
#include "paddle/phi/kernels/funcs/mufft_util.h"
#endif

namespace phi {
4 changes: 2 additions & 2 deletions paddle/phi/kernels/funcs/layer_norm_impl.cu.h
@@ -1350,7 +1350,7 @@ __global__ void LayerNormBackwardComputeGradInput(const T *__restrict__ dout,
// WARP_SHFL_XOR(sum_loss, mask);
sum_loss1 += __shfl_xor(sum_loss1, mask, warpSize);
sum_loss2 += __shfl_xor(sum_loss2, mask, warpSize);
#else
#else // CUDA and MUSA
// WARP_SHFL_XOR(sum_loss, mask);
sum_loss1 += __shfl_xor_sync(0xffffffff, sum_loss1, mask, warpSize);
sum_loss2 += __shfl_xor_sync(0xffffffff, sum_loss2, mask, warpSize);
@@ -1501,7 +1501,7 @@ __global__ void LayerNormBackwardComputeGradInputWithSmallFeatureSize(
// WARP_SHFL_XOR(sum_loss, mask);
sum_loss1 += __shfl_xor(sum_loss1, mask, warpSize);
sum_loss2 += __shfl_xor(sum_loss2, mask, warpSize);
#else
#else // CUDA and MUSA
// WARP_SHFL_XOR(sum_loss, mask);
sum_loss1 += __shfl_xor_sync(0xffffffff, sum_loss1, mask, WarpSize);
sum_loss2 += __shfl_xor_sync(0xffffffff, sum_loss2, mask, WarpSize);
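Both hunks are the same butterfly reduction: HIP keeps the legacy, mask-free __shfl_xor for its 64-lane wavefront, while CUDA and MUSA take the mask-bearing __shfl_xor_sync. A condensed sketch, assuming MUSA accepts the full 0xffffffff mask exactly as CUDA does:

template <typename T>
__device__ inline T WarpReduceSum(T val) {
  for (int mask = warpSize / 2; mask > 0; mask >>= 1) {
#ifdef PADDLE_WITH_HIP
    val += __shfl_xor(val, mask, warpSize);
#else  // CUDA and MUSA
    val += __shfl_xor_sync(0xffffffff, val, mask, warpSize);
#endif
  }
  return val;
}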
3 changes: 3 additions & 0 deletions paddle/phi/kernels/funcs/math_cuda_utils.h
@@ -20,6 +20,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_HIP
#include <hip/hip_fp16.h>
#endif
#ifdef PADDLE_WITH_MUSA
#include <musa_fp16.h>
#endif

#include <algorithm>

3 changes: 3 additions & 0 deletions paddle/phi/kernels/funcs/select_impl.cu.h
@@ -23,6 +23,9 @@
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#ifdef __MCC__
//TODO
#endif

#include <algorithm>
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
4 changes: 4 additions & 0 deletions paddle/phi/kernels/funcs/softmax.cu
@@ -60,6 +60,8 @@ void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
context.template Alloc<T>(Y),
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
#elif defined(PADDLE_WITH_MUSA)
// TODO
#else
cudnnTensorDescriptor_t cudnn_x_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims);
@@ -117,6 +119,8 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
context.template Alloc<T>(XGrad),
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
#elif defined(PADDLE_WITH_MUSA)
// TODO
#else
cudnnTensorDescriptor_t cudnn_y_desc =
yDesc.descriptor<T>(layout, cudnn_tensor_dims);
3 changes: 3 additions & 0 deletions paddle/phi/kernels/funcs/sparse/sparse_blas.h
@@ -100,3 +100,6 @@ inline SparseBlasT<DeviceContext, T> GetSparseBlas(
#if defined(PADDLE_WITH_HIP) && HIP_VERSION >= 402
#include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h"
#endif
#if defined(PADDLE_WITH_MUSA)
#include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.mu.h"
#endif
4 changes: 4 additions & 0 deletions paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
@@ -91,6 +91,10 @@ __global__ void VectorizedDropoutForward(const size_t n,
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = hiprandStatePhilox4_32_10_t;
#elif defined(PADDLE_WITH_MUSA)
murand_state_philox4x32_10 state;
murand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = murand_state_philox4x32_10;
#else
curandStatePhilox4_32_10_t state;
curand_init(seed, idx + THREAD_ID_X, increment, &state);
6 changes: 5 additions & 1 deletion paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h
@@ -22,14 +22,18 @@
#include <hip/hip_runtime.h>
#include <hiprand_kernel.h>
#endif
#ifdef PADDLE_WITH_MUSA
#include <musa_runtime.h>
#include <murand_kernel.h>
#endif

#include "paddle/phi/kernels/funcs/aligned_vector.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)

#ifdef PADDLE_WITH_HIP
#define WARP_SIZE 64
#else
#else // MUSA & CUDA
#define WARP_SIZE 32
#endif

2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -299,7 +299,7 @@ void HardSwishGradKernel(const Context& dev_ctx,

} // namespace phi

#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
PD_REGISTER_KERNEL(relu_grad,
GPU,
ALL_LAYOUT,
2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/activation_kernel.cu
@@ -186,7 +186,7 @@ PD_REGISTER_KERNEL(relu,
float,
double,
phi::dtype::float16) {}
#else
#else // CUDA & MUSA
PD_REGISTER_KERNEL(relu,
GPU,
ALL_LAYOUT,
2 changes: 2 additions & 0 deletions paddle/phi/kernels/gpu/allclose_kernel.cu
@@ -87,6 +87,8 @@ void AllCloseKernel(const Context& dev_ctx,
grid = (grid > block) ? block : grid;
#ifdef PADDLE_WITH_HIP
hipMemset(out_data, true, sizeof(bool));
#elif defined(PADDLE_WITH_MUSA)
musaMemset(out_data, true, sizeof(bool));
#else
cudaMemset(out_data, true, sizeof(bool));
#endif