Skip to content

Commit

Permalink
[Sparse] Add sparse matmul kernel(coo*dense->dense) (#44346)
Browse files Browse the repository at this point in the history
  • Loading branch information
zhwesky2010 authored Jul 18, 2022
1 parent c6bf881 commit 3f70b1d
Show file tree
Hide file tree
Showing 15 changed files with 293 additions and 251 deletions.
4 changes: 4 additions & 0 deletions paddle/fluid/platform/dynload/cusparse.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif

// Apply DEFINE_WRAP to every routine in the CUSPARSE_ROUTINE_EACH_R3 list.
// Guarded because the list macro is only conditionally defined (presumably in
// the matching cusparse dynload header, keyed on the CUDA version -- confirm
// there); when it is absent no wrappers are emitted for these routines.
#ifdef CUSPARSE_ROUTINE_EACH_R3
CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif

} // namespace dynload
} // namespace platform
} // namespace paddle
10 changes: 5 additions & 5 deletions paddle/phi/api/yaml/sparse_api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -297,18 +297,18 @@
args : (Tensor x, Tensor y, Tensor mask)
output : Tensor(out)
kernel :
func : csr_masked_matmul{dense, dense, sparse_csr -> sparse_csr}
func : masked_matmul_csr{dense, dense, sparse_csr -> sparse_csr}
layout : x
backward: masked_matmul_grad

- api: matmul
args : (Tensor x, Tensor y)
output : Tensor(out)
kernel :
func : csr_dense_matmul{sparse_csr, dense -> dense},
csr_csr_matmul{sparse_csr, sparse_csr -> sparse_csr},
coo_dense_matmul{sparse_coo, dense -> dense},
coo_coo_matmul{sparse_coo, sparse_coo -> sparse_coo}
func : matmul_csr_dense {sparse_csr, dense -> dense},
matmul_csr_csr {sparse_csr, sparse_csr -> sparse_csr},
matmul_coo_dense {sparse_coo, dense -> dense},
matmul_coo_coo {sparse_coo, sparse_coo -> sparse_coo}
layout : x
backward: matmul_grad

Expand Down
7 changes: 5 additions & 2 deletions paddle/phi/api/yaml/sparse_bw_api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,17 @@
args : (Tensor x, Tensor y, Tensor out_grad)
output : Tensor(x_grad), Tensor(y_grad)
kernel :
func : csr_masked_matmul_grad{dense, dense, sparse_csr -> dense, dense}
func : masked_matmul_csr_grad{dense, dense, sparse_csr -> dense, dense}

- backward_api : matmul_grad
forward : matmul(Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad)
output : Tensor(x_grad), Tensor(y_grad)
kernel :
func : csr_dense_matmul_grad{sparse_csr, dense, dense -> sparse_csr, dense}
func : matmul_csr_dense_grad {sparse_csr, dense, dense -> sparse_csr, dense},
matmul_csr_csr_grad {sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr},
matmul_coo_dense_grad {sparse_coo, dense, dense -> sparse_coo, dense},
matmul_coo_coo_grad {sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}

- backward_api : multiply_grad
forward : multiply(Tensor x, Tensor y) -> Tensor(out)
Expand Down
4 changes: 4 additions & 0 deletions paddle/phi/backends/dynload/cusparse.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,9 @@ CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif

// Apply DEFINE_WRAP to every routine in the CUSPARSE_ROUTINE_EACH_R3 list.
// Guarded because the list macro is only conditionally defined (presumably in
// the matching cusparse dynload header, keyed on the CUDA version -- confirm
// there); when it is absent no wrappers are emitted for these routines.
#ifdef CUSPARSE_ROUTINE_EACH_R3
CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif

} // namespace dynload
} // namespace phi
3 changes: 3 additions & 0 deletions paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ class CuSparseDnVecDescriptor {
cusparseDnVecDescr_t descriptor_;
};

/************* SPARSE*DENSE->DENSE MATMUL ************/
template <>
template <typename T, typename TensorType>
void SparseBlas<phi::GPUContext>::SPMM(bool transa,
Expand Down Expand Up @@ -345,6 +346,7 @@ void SparseBlas<phi::GPUContext>::SPMM(bool transa,
});
}

/************* SPARSE*DENSE->DENSE MV ************/
template <>
template <typename T, typename TensorType>
void SparseBlas<phi::GPUContext>::SPMV(bool transa,
Expand Down Expand Up @@ -389,6 +391,7 @@ void SparseBlas<phi::GPUContext>::SPMV(bool transa,
});
}

/************* DENSE*DENSE->SPARSE MATMUL ************/
#if CUDA_VERSION >= 11030
template <>
template <typename T, typename TensorType>
Expand Down
12 changes: 6 additions & 6 deletions paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ namespace sparse {

// TODO(zhouwei25): implement CPU backward kernel of " CSR @ DENSE -> DENSE"
template <typename T, typename Context>
void CsrDenseMatmulGradKernel(const Context& dev_ctx,
void MatmulCsrDenseGradKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
Expand All @@ -34,7 +34,7 @@ void CsrDenseMatmulGradKernel(const Context& dev_ctx,

// TODO(zhouwei25): implement CPU backward kernel of " DENSE @ DENSE * CSR_MASK -> CSR"
template <typename T, typename Context>
void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
void MaskedMatmulCsrGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& dout,
Expand All @@ -47,18 +47,18 @@ void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
} // namespace sparse
} // namespace phi

PD_REGISTER_KERNEL(csr_dense_matmul_grad,
PD_REGISTER_KERNEL(matmul_csr_dense_grad,
CPU,
ALL_LAYOUT,
phi::sparse::CsrDenseMatmulGradKernel,
phi::sparse::MatmulCsrDenseGradKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}

PD_REGISTER_KERNEL(csr_masked_matmul_grad,
PD_REGISTER_KERNEL(masked_matmul_csr_grad,
CPU,
ALL_LAYOUT,
phi::sparse::CsrMaskedMatmulGradKernel,
phi::sparse::MaskedMatmulCsrGradKernel,
float,
double) {}
12 changes: 6 additions & 6 deletions paddle/phi/kernels/sparse/cpu/matmul_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ namespace sparse {

// TODO(zhouwei25): implement CPU kernel of " CSR @ DENSE -> DENSE"
template <typename T, typename Context>
void CsrDenseMatmulKernel(const Context& dev_ctx,
void MatmulCsrDenseKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
DenseTensor* out) {
Expand All @@ -32,7 +32,7 @@ void CsrDenseMatmulKernel(const Context& dev_ctx,

// TODO(zhouwei25): implement CPU kernel of " DENSE @ DENSE * CSR_MASK -> CSR"
template <typename T, typename Context>
void CsrMaskedMatmulKernel(const Context& dev_ctx,
void MaskedMatmulCsrKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& mask,
Expand All @@ -44,18 +44,18 @@ void CsrMaskedMatmulKernel(const Context& dev_ctx,
} // namespace sparse
} // namespace phi

PD_REGISTER_KERNEL(csr_dense_matmul,
PD_REGISTER_KERNEL(matmul_csr_dense,
CPU,
ALL_LAYOUT,
phi::sparse::CsrDenseMatmulKernel,
phi::sparse::MatmulCsrDenseKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}

PD_REGISTER_KERNEL(csr_masked_matmul,
PD_REGISTER_KERNEL(masked_matmul_csr,
CPU,
ALL_LAYOUT,
phi::sparse::CsrMaskedMatmulKernel,
phi::sparse::MaskedMatmulCsrKernel,
float,
double) {}
24 changes: 7 additions & 17 deletions paddle/phi/kernels/sparse/empty_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,37 +26,27 @@ template <typename T, typename Context>
void EmptyLikeCooKernel(const Context& dev_ctx,
const SparseCooTensor& x,
SparseCooTensor* out) {
const DenseTensor& x_indices = x.non_zero_indices();
out->set_dims(x.dims());
*(out->mutable_non_zero_indices()) = x.non_zero_indices();

const DenseTensor& x_values = x.non_zero_elements();
DenseTensor* out_indices = out->mutable_non_zero_indices();
DenseTensor* out_values = out->mutable_non_zero_elements();

phi::Copy(dev_ctx, x_indices, dev_ctx.GetPlace(), false, out_indices);

out_values->Resize(x_values.dims());
dev_ctx.template Alloc<T>(out_values);

out->set_dims(x.dims());
}

template <typename T, typename Context>
void EmptyLikeCsrKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
SparseCsrTensor* out) {
const DenseTensor& x_crows = x.non_zero_crows();
const DenseTensor& x_cols = x.non_zero_cols();
out->set_dims(x.dims());
*(out->mutable_non_zero_crows()) = x.non_zero_crows();
*(out->mutable_non_zero_cols()) = x.non_zero_cols();

const DenseTensor& x_values = x.non_zero_elements();
DenseTensor* out_crows = out->mutable_non_zero_crows();
DenseTensor* out_cols = out->mutable_non_zero_cols();
DenseTensor* out_values = out->mutable_non_zero_elements();

phi::Copy(dev_ctx, x_crows, dev_ctx.GetPlace(), false, out_crows);
phi::Copy(dev_ctx, x_cols, dev_ctx.GetPlace(), false, out_cols);

out_values->Resize(x_values.dims());
dev_ctx.template Alloc<T>(out_values);

out->set_dims(x.dims());
}

} // namespace sparse
Expand Down
60 changes: 54 additions & 6 deletions paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,52 @@ limitations under the License. */
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"

namespace phi {
namespace sparse {

template <typename T, typename Context>
void CsrDenseMatmulGradKernel(const Context& dev_ctx,
// Backward kernel of "COO @ DENSE -> DENSE" (x is SparseCoo, y and dout are
// dense).
//
// dev_ctx : device context used for allocation and to obtain the sparse BLAS
//           handle.
// x, y    : the forward inputs.
// dout    : gradient w.r.t. the forward output.
// dx      : output, gradient w.r.t. x as a SparseCoo tensor; skipped when null.
// dy      : output, gradient w.r.t. y as a dense tensor; skipped when null.
//
// The whole body is compiled only when CUDA_VERSION >= 11030; otherwise the
// kernel throws Unimplemented. Note that this also covers the dy path, even
// though that path itself only calls SPMM.
void MatmulCooDenseGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
SparseCooTensor* dx,
DenseTensor* dy) {
#if CUDA_VERSION >= 11030
auto sparse_blas = phi::funcs::sparse::GetSparseBlas<Context, T>(dev_ctx);

// dx{SparseCoo} = dout{Dense} * y'{Dense}
if (dx) {
// 'cusparseSDDMM' only supports CSR for now, so go COO->CSR->COO,
// which adds some conversion overhead.
// Build dx from x first so the result is produced on x's sparsity pattern.
EmptyLikeCooKernel<T, Context>(dev_ctx, x, dx);
SparseCsrTensor dx_csr = SparseCooToCsr<T, Context>(dev_ctx, *dx);
// Per the formula above, (false, true) presumably means "dout as-is, y
// transposed", and the scalars 1 and 0 are presumably alpha and beta --
// confirm against SparseBlas::SDDMM.
sparse_blas.SDDMM(
false, true, static_cast<T>(1), dout, y, static_cast<T>(0), &dx_csr);
// Convert the CSR result back into the caller's COO output tensor.
SparseCsrToCooKernel<T, Context>(dev_ctx, dx_csr, dx);
}

// dy{Dense} = x'{SparseCoo} * dout{Dense}
if (dy) {
// dy takes the dims and dtype of y, then gets its buffer allocated.
MetaTensor meta_dy(dy);
meta_dy.set_dims(y.dims());
meta_dy.set_dtype(y.dtype());
dev_ctx.template Alloc<T>(dy);

// First flag is SPMM's `transa`; true here matches the x' in the formula.
sparse_blas.SPMM(
true, false, static_cast<T>(1), x, dout, static_cast<T>(0), dy);
}
#else
// Toolkits older than CUDA 11.3: report that the backward is unsupported.
PADDLE_THROW(phi::errors::Unimplemented(
"backward of 'sparse.matmul' use cusparseSDDMM, which is supported from "
"CUDA 11.3"));
#endif
}

template <typename T, typename Context>
void MatmulCsrDenseGradKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
Expand Down Expand Up @@ -66,7 +105,7 @@ void CsrDenseMatmulGradKernel(const Context& dev_ctx,
}

template <typename T, typename Context>
void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
void MaskedMatmulCsrGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& dout,
Expand Down Expand Up @@ -119,18 +158,27 @@ void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
} // namespace sparse
} // namespace phi

PD_REGISTER_KERNEL(csr_dense_matmul_grad,
// Register phi::sparse::MatmulCooDenseGradKernel under the kernel name
// `matmul_coo_dense_grad` for the GPU backend, ALL_LAYOUT, with float and
// double listed as its data types.
PD_REGISTER_KERNEL(matmul_coo_dense_grad,
GPU,
ALL_LAYOUT,
phi::sparse::MatmulCooDenseGradKernel,
float,
double) {
// Declare input 0 (presumably `x`, the first tensor argument of the kernel)
// as SPARSE_COO layout.
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}

PD_REGISTER_KERNEL(matmul_csr_dense_grad,
GPU,
ALL_LAYOUT,
phi::sparse::CsrDenseMatmulGradKernel,
phi::sparse::MatmulCsrDenseGradKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}

PD_REGISTER_KERNEL(csr_masked_matmul_grad,
PD_REGISTER_KERNEL(masked_matmul_csr_grad,
GPU,
ALL_LAYOUT,
phi::sparse::CsrMaskedMatmulGradKernel,
phi::sparse::MaskedMatmulCsrGradKernel,
float,
double) {}
45 changes: 35 additions & 10 deletions paddle/phi/kernels/sparse/gpu/matmul_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ limitations under the License. */
namespace phi {
namespace sparse {

template <typename T, typename Context>
void CsrDenseMatmulKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
DenseTensor* out) {
template <typename T, typename Context, typename TensorType>
void MatmulKernelImpl(const Context& dev_ctx,
const TensorType& x,
const DenseTensor& y,
DenseTensor* out) {
#if CUDA_VERSION >= 11000
std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
std::vector<int64_t> ydim_vec = phi::vectorize(y.dims());
Expand Down Expand Up @@ -91,7 +91,23 @@ void CsrDenseMatmulKernel(const Context& dev_ctx,
}

template <typename T, typename Context>
void CsrMaskedMatmulKernel(const Context& dev_ctx,
void MatmulCooDenseKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& y,
DenseTensor* out) {
MatmulKernelImpl<T>(dev_ctx, x, y, out);
}

template <typename T, typename Context>
void MatmulCsrDenseKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
DenseTensor* out) {
MatmulKernelImpl<T>(dev_ctx, x, y, out);
}

template <typename T, typename Context>
void MaskedMatmulCsrKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& mask,
Expand Down Expand Up @@ -176,18 +192,27 @@ void CsrMaskedMatmulKernel(const Context& dev_ctx,
} // namespace sparse
} // namespace phi

PD_REGISTER_KERNEL(csr_dense_matmul,
PD_REGISTER_KERNEL(matmul_csr_dense,
GPU,
ALL_LAYOUT,
phi::sparse::CsrDenseMatmulKernel,
phi::sparse::MatmulCsrDenseKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}

PD_REGISTER_KERNEL(csr_masked_matmul,
// Register phi::sparse::MatmulCooDenseKernel under the kernel name
// `matmul_coo_dense` for the GPU backend, ALL_LAYOUT, with float and double
// listed as its data types.
PD_REGISTER_KERNEL(matmul_coo_dense,
GPU,
ALL_LAYOUT,
phi::sparse::MatmulCooDenseKernel,
float,
double) {
// Declare input 0 (presumably `x`, the first tensor argument of the kernel)
// as SPARSE_COO layout.
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}

PD_REGISTER_KERNEL(masked_matmul_csr,
GPU,
ALL_LAYOUT,
phi::sparse::CsrMaskedMatmulKernel,
phi::sparse::MaskedMatmulCsrKernel,
float,
double) {}
Loading

0 comments on commit 3f70b1d

Please sign in to comment.