[Sparse] Add sparse matmul kernel(coo*dense->dense) (#44346)

PaddlePaddle · Jul 18, 2022 · 3f70b1d · 3f70b1d
1 parent c6bf881
commit 3f70b1d
Show file tree

Hide file tree

Showing 15 changed files with 293 additions and 251 deletions.
diff --git a/paddle/fluid/platform/dynload/cusparse.cc b/paddle/fluid/platform/dynload/cusparse.cc
@@ -28,6 +28,10 @@ CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
 CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
 #endif
 
+#ifdef CUSPARSE_ROUTINE_EACH_R3
+CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP);
+#endif
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml
@@ -297,18 +297,18 @@
   args : (Tensor x, Tensor y, Tensor mask)
   output : Tensor(out)
   kernel :
-    func : csr_masked_matmul{dense, dense, sparse_csr -> sparse_csr}
+    func : masked_matmul_csr{dense, dense, sparse_csr -> sparse_csr}
     layout : x
   backward: masked_matmul_grad
 
 - api: matmul
   args : (Tensor x, Tensor y)
   output : Tensor(out)
   kernel :
-    func : csr_dense_matmul{sparse_csr, dense -> dense},
-           csr_csr_matmul{sparse_csr, sparse_csr -> sparse_csr},
-           coo_dense_matmul{sparse_coo, dense -> dense},
-           coo_coo_matmul{sparse_coo, sparse_coo -> sparse_coo}
+    func : matmul_csr_dense {sparse_csr, dense -> dense},
+           matmul_csr_csr {sparse_csr, sparse_csr -> sparse_csr},
+           matmul_coo_dense {sparse_coo, dense -> dense},
+           matmul_coo_coo {sparse_coo, sparse_coo -> sparse_coo}
     layout : x
   backward: matmul_grad
 

diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml
@@ -125,14 +125,17 @@
   args : (Tensor x, Tensor y, Tensor out_grad)
   output : Tensor(x_grad), Tensor(y_grad)
   kernel :
-    func : csr_masked_matmul_grad{dense, dense, sparse_csr -> dense, dense}
+    func : masked_matmul_csr_grad{dense, dense, sparse_csr -> dense, dense}
 
 - backward_api : matmul_grad
   forward : matmul(Tensor x, Tensor y) -> Tensor(out)
   args : (Tensor x, Tensor y, Tensor out_grad)
   output : Tensor(x_grad), Tensor(y_grad)
   kernel :
-    func : csr_dense_matmul_grad{sparse_csr, dense, dense -> sparse_csr, dense}
+    func : matmul_csr_dense_grad {sparse_csr, dense, dense -> sparse_csr, dense},
+           matmul_csr_csr_grad {sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr},
+           matmul_coo_dense_grad {sparse_coo, dense, dense -> sparse_coo, dense},
+           matmul_coo_coo_grad {sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}
 
 - backward_api : multiply_grad
   forward : multiply(Tensor x, Tensor y) -> Tensor(out)

diff --git a/paddle/phi/backends/dynload/cusparse.cc b/paddle/phi/backends/dynload/cusparse.cc
@@ -30,5 +30,9 @@ CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
 CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
 #endif
 
+#ifdef CUSPARSE_ROUTINE_EACH_R3
+CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP);
+#endif
+
 }  // namespace dynload
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h
@@ -298,6 +298,7 @@ class CuSparseDnVecDescriptor {
   cusparseDnVecDescr_t descriptor_;
 };
 
+/************* SPARSE*DENSE->DENSE MATMUL ************/
 template <>
 template <typename T, typename TensorType>
 void SparseBlas<phi::GPUContext>::SPMM(bool transa,
@@ -345,6 +346,7 @@ void SparseBlas<phi::GPUContext>::SPMM(bool transa,
   });
 }
 
+/************* SPARSE*DENSE->DENSE MV ************/
 template <>
 template <typename T, typename TensorType>
 void SparseBlas<phi::GPUContext>::SPMV(bool transa,
@@ -389,6 +391,7 @@ void SparseBlas<phi::GPUContext>::SPMV(bool transa,
   });
 }
 
+/************* DENSE*DENSE->SPARSE MATMUL ************/
 #if CUDA_VERSION >= 11030
 template <>
 template <typename T, typename TensorType>

diff --git a/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc
@@ -22,7 +22,7 @@ namespace sparse {
 
 // TODO(zhouwei25): implement CPU backward kernel of " CSR @ DENSE -> DENSE"
 template <typename T, typename Context>
-void CsrDenseMatmulGradKernel(const Context& dev_ctx,
+void MatmulCsrDenseGradKernel(const Context& dev_ctx,
                               const SparseCsrTensor& x,
                               const DenseTensor& y,
                               const DenseTensor& dout,
@@ -34,7 +34,7 @@ void CsrDenseMatmulGradKernel(const Context& dev_ctx,
 
 // TODO(zhouwei25): implement CPU kernel of " DENSE @ DENSE * CSR_MASK -> CSR"
 template <typename T, typename Context>
-void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
+void MaskedMatmulCsrGradKernel(const Context& dev_ctx,
                                const DenseTensor& x,
                                const DenseTensor& y,
                                const SparseCsrTensor& dout,
@@ -47,18 +47,18 @@ void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
 }  // namespace sparse
 }  // namespace phi
 
-PD_REGISTER_KERNEL(csr_dense_matmul_grad,
+PD_REGISTER_KERNEL(matmul_csr_dense_grad,
                    CPU,
                    ALL_LAYOUT,
-                   phi::sparse::CsrDenseMatmulGradKernel,
+                   phi::sparse::MatmulCsrDenseGradKernel,
                    float,
                    double) {
   kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
 }
 
-PD_REGISTER_KERNEL(csr_masked_matmul_grad,
+PD_REGISTER_KERNEL(masked_matmul_csr_grad,
                    CPU,
                    ALL_LAYOUT,
-                   phi::sparse::CsrMaskedMatmulGradKernel,
+                   phi::sparse::MaskedMatmulCsrGradKernel,
                    float,
                    double) {}
diff --git a/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc b/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc
@@ -22,7 +22,7 @@ namespace sparse {
 
 // TODO(zhouwei25): implement CPU kernel of " CSR @ DENSE -> DENSE"
 template <typename T, typename Context>
-void CsrDenseMatmulKernel(const Context& dev_ctx,
+void MatmulCsrDenseKernel(const Context& dev_ctx,
                           const SparseCsrTensor& x,
                           const DenseTensor& y,
                           DenseTensor* out) {
@@ -32,7 +32,7 @@ void CsrDenseMatmulKernel(const Context& dev_ctx,
 
 // TODO(zhouwei25): implement CPU kernel of " DENSE @ DENSE * CSR_MASK -> CSR"
 template <typename T, typename Context>
-void CsrMaskedMatmulKernel(const Context& dev_ctx,
+void MaskedMatmulCsrKernel(const Context& dev_ctx,
                            const DenseTensor& x,
                            const DenseTensor& y,
                            const SparseCsrTensor& mask,
@@ -44,18 +44,18 @@ void CsrMaskedMatmulKernel(const Context& dev_ctx,
 }  // namespace sparse
 }  // namespace phi
 
-PD_REGISTER_KERNEL(csr_dense_matmul,
+PD_REGISTER_KERNEL(matmul_csr_dense,
                    CPU,
                    ALL_LAYOUT,
-                   phi::sparse::CsrDenseMatmulKernel,
+                   phi::sparse::MatmulCsrDenseKernel,
                    float,
                    double) {
   kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
 }
 
-PD_REGISTER_KERNEL(csr_masked_matmul,
+PD_REGISTER_KERNEL(masked_matmul_csr,
                    CPU,
                    ALL_LAYOUT,
-                   phi::sparse::CsrMaskedMatmulKernel,
+                   phi::sparse::MaskedMatmulCsrKernel,
                    float,
                    double) {}
diff --git a/paddle/phi/kernels/sparse/empty_kernel.cc b/paddle/phi/kernels/sparse/empty_kernel.cc
@@ -26,37 +26,27 @@ template <typename T, typename Context>
 void EmptyLikeCooKernel(const Context& dev_ctx,
                         const SparseCooTensor& x,
                         SparseCooTensor* out) {
-  const DenseTensor& x_indices = x.non_zero_indices();
+  out->set_dims(x.dims());
+  *(out->mutable_non_zero_indices()) = x.non_zero_indices();
+  // Indices come straight from x; values are only resized and allocated.
   const DenseTensor& x_values = x.non_zero_elements();
-  DenseTensor* out_indices = out->mutable_non_zero_indices();
   DenseTensor* out_values = out->mutable_non_zero_elements();
-
-  phi::Copy(dev_ctx, x_indices, dev_ctx.GetPlace(), false, out_indices);
-
   out_values->Resize(x_values.dims());
   dev_ctx.template Alloc<T>(out_values);
-
-  out->set_dims(x.dims());
 }
 
 template <typename T, typename Context>
 void EmptyLikeCsrKernel(const Context& dev_ctx,
                         const SparseCsrTensor& x,
                         SparseCsrTensor* out) {
-  const DenseTensor& x_crows = x.non_zero_crows();
-  const DenseTensor& x_cols = x.non_zero_cols();
+  out->set_dims(x.dims());
+  *(out->mutable_non_zero_crows()) = x.non_zero_crows();
+  *(out->mutable_non_zero_cols()) = x.non_zero_cols();
+  // crows/cols come straight from x; values are only resized and allocated.
   const DenseTensor& x_values = x.non_zero_elements();
-  DenseTensor* out_crows = out->mutable_non_zero_crows();
-  DenseTensor* out_cols = out->mutable_non_zero_cols();
   DenseTensor* out_values = out->mutable_non_zero_elements();
-
-  phi::Copy(dev_ctx, x_crows, dev_ctx.GetPlace(), false, out_crows);
-  phi::Copy(dev_ctx, x_cols, dev_ctx.GetPlace(), false, out_cols);
-
   out_values->Resize(x_values.dims());
   dev_ctx.template Alloc<T>(out_values);
-
-  out->set_dims(x.dims());
 }
 
 }  // namespace sparse

diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu
@@ -22,13 +22,52 @@ limitations under the License. */
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/sparse/sparse_blas.h"
 #include "paddle/phi/kernels/sparse/empty_kernel.h"
+#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
 #include "paddle/phi/kernels/transpose_kernel.h"
 
 namespace phi {
 namespace sparse {
 
 template <typename T, typename Context>
-void CsrDenseMatmulGradKernel(const Context& dev_ctx,
+void MatmulCooDenseGradKernel(const Context& dev_ctx,
+                              const SparseCooTensor& x,
+                              const DenseTensor& y,
+                              const DenseTensor& dout,
+                              SparseCooTensor* dx,
+                              DenseTensor* dy) {
+#if CUDA_VERSION >= 11030
+  auto sparse_blas = phi::funcs::sparse::GetSparseBlas<Context, T>(dev_ctx);
+
+  // dx{SparseCoo} = dout{Dense} * y'{Dense}; skipped when dx is null.
+  if (dx) {
+    // 'cusparseSDDMM' only supports CSR for now, so dx (shaped like x) is
+    // converted COO->CSR->COO, at the cost of two extra conversions.
+    EmptyLikeCooKernel<T, Context>(dev_ctx, x, dx);
+    SparseCsrTensor dx_csr = SparseCooToCsr<T, Context>(dev_ctx, *dx);
+    sparse_blas.SDDMM(
+        false, true, static_cast<T>(1), dout, y, static_cast<T>(0), &dx_csr);
+    SparseCsrToCooKernel<T, Context>(dev_ctx, dx_csr, dx);
+  }
+
+  // dy{Dense} = x'{SparseCoo} * dout{Dense}; skipped when dy is null.
+  if (dy) {
+    MetaTensor meta_dy(dy);
+    meta_dy.set_dims(y.dims());
+    meta_dy.set_dtype(y.dtype());
+    dev_ctx.template Alloc<T>(dy);
+
+    sparse_blas.SPMM(
+        true, false, static_cast<T>(1), x, dout, static_cast<T>(0), dy);
+  }
+#else
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "backward of 'sparse.matmul' use cusparseSDDMM, which is supported from "
+      "CUDA 11.3"));
+#endif  // CUDA_VERSION >= 11030
+}
+
+template <typename T, typename Context>
+void MatmulCsrDenseGradKernel(const Context& dev_ctx,
                               const SparseCsrTensor& x,
                               const DenseTensor& y,
                               const DenseTensor& dout,
@@ -66,7 +105,7 @@ void CsrDenseMatmulGradKernel(const Context& dev_ctx,
 }
 
 template <typename T, typename Context>
-void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
+void MaskedMatmulCsrGradKernel(const Context& dev_ctx,
                                const DenseTensor& x,
                                const DenseTensor& y,
                                const SparseCsrTensor& dout,
@@ -119,18 +158,27 @@ void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
 }  // namespace sparse
 }  // namespace phi
 
-PD_REGISTER_KERNEL(csr_dense_matmul_grad,
+PD_REGISTER_KERNEL(matmul_coo_dense_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sparse::MatmulCooDenseGradKernel,
+                   float,
+                   double) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
+
+PD_REGISTER_KERNEL(matmul_csr_dense_grad,
                    GPU,
                    ALL_LAYOUT,
-                   phi::sparse::CsrDenseMatmulGradKernel,
+                   phi::sparse::MatmulCsrDenseGradKernel,
                    float,
                    double) {
   kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
 }
 
-PD_REGISTER_KERNEL(csr_masked_matmul_grad,
+PD_REGISTER_KERNEL(masked_matmul_csr_grad,
                    GPU,
                    ALL_LAYOUT,
-                   phi::sparse::CsrMaskedMatmulGradKernel,
+                   phi::sparse::MaskedMatmulCsrGradKernel,
                    float,
                    double) {}
diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu
@@ -31,11 +31,11 @@ limitations under the License. */
 namespace phi {
 namespace sparse {
 
-template <typename T, typename Context>
-void CsrDenseMatmulKernel(const Context& dev_ctx,
-                          const SparseCsrTensor& x,
-                          const DenseTensor& y,
-                          DenseTensor* out) {
+template <typename T, typename Context, typename TensorType>
+void MatmulKernelImpl(const Context& dev_ctx,
+                      const TensorType& x,
+                      const DenseTensor& y,
+                      DenseTensor* out) {
 #if CUDA_VERSION >= 11000
   std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
   std::vector<int64_t> ydim_vec = phi::vectorize(y.dims());
@@ -91,7 +91,23 @@ void CsrDenseMatmulKernel(const Context& dev_ctx,
 }
 
 template <typename T, typename Context>
-void CsrMaskedMatmulKernel(const Context& dev_ctx,
+void MatmulCooDenseKernel(const Context& dev_ctx,
+                          const SparseCooTensor& x,
+                          const DenseTensor& y,
+                          DenseTensor* out) {
+  MatmulKernelImpl<T>(dev_ctx, x, y, out);  // shared COO/CSR implementation
+}
+
+template <typename T, typename Context>
+void MatmulCsrDenseKernel(const Context& dev_ctx,
+                          const SparseCsrTensor& x,
+                          const DenseTensor& y,
+                          DenseTensor* out) {
+  MatmulKernelImpl<T>(dev_ctx, x, y, out);  // shared COO/CSR implementation
+}
+
+template <typename T, typename Context>
+void MaskedMatmulCsrKernel(const Context& dev_ctx,
                            const DenseTensor& x,
                            const DenseTensor& y,
                            const SparseCsrTensor& mask,
@@ -176,18 +192,27 @@ void CsrMaskedMatmulKernel(const Context& dev_ctx,
 }  // namespace sparse
 }  // namespace phi
 
-PD_REGISTER_KERNEL(csr_dense_matmul,
+PD_REGISTER_KERNEL(matmul_csr_dense,
                    GPU,
                    ALL_LAYOUT,
-                   phi::sparse::CsrDenseMatmulKernel,
+                   phi::sparse::MatmulCsrDenseKernel,
                    float,
                    double) {
   kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
 }
 
-PD_REGISTER_KERNEL(csr_masked_matmul,
+PD_REGISTER_KERNEL(matmul_coo_dense,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sparse::MatmulCooDenseKernel,
+                   float,
+                   double) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
+
+PD_REGISTER_KERNEL(masked_matmul_csr,
                    GPU,
                    ALL_LAYOUT,
-                   phi::sparse::CsrMaskedMatmulKernel,
+                   phi::sparse::MaskedMatmulCsrKernel,
                    float,
                    double) {}