From 4641b7fab4cc4bbf5054ee3bb36b6af01c14a76c Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Sun, 5 Nov 2023 08:06:00 +0000 Subject: [PATCH 01/25] add cusparseSpGEMM --- paddle/fluid/platform/dynload/cusparse.h | 46 +-- paddle/phi/backends/dynload/cusparse.h | 46 +-- paddle/phi/kernels/funcs/sparse/sparse_blas.h | 14 + .../funcs/sparse/sparse_blas_impl.cu.h | 130 +++++++- .../funcs/sparse/sparse_blas_impl.hip.h | 76 +++++ .../phi/kernels/sparse/gpu/matmul_kernel.cu | 113 +++++++ test/legacy_test/test_sparse_matmul_op.py | 285 ++++++++++-------- 7 files changed, 549 insertions(+), 161 deletions(-) diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h index 74f9b973a388c..cbc09ed47c1bf 100644 --- a/paddle/fluid/platform/dynload/cusparse.h +++ b/paddle/fluid/platform/dynload/cusparse.h @@ -30,26 +30,32 @@ namespace dynload { #if defined(PADDLE_WITH_CUDA) #if CUDA_VERSION >= 11000 -#define CUSPARSE_ROUTINE_EACH(__macro) \ - __macro(cusparseCreate); \ - __macro(cusparseSetStream); \ - __macro(cusparseCreateMatDescr); \ - __macro(cusparseDestroy); \ - __macro(cusparseSnnz); \ - __macro(cusparseDnnz); \ - __macro(cusparseSetMatType); \ - __macro(cusparseSetMatIndexBase); \ - __macro(cusparseCreateCsr); \ - __macro(cusparseCreateCoo); \ - __macro(cusparseCreateDnMat); \ - __macro(cusparseCreateDnVec); \ - __macro(cusparseSpMM_bufferSize); \ - __macro(cusparseSpMM); \ - __macro(cusparseDestroySpMat); \ - __macro(cusparseDestroyDnMat); \ - __macro(cusparseDestroyDnVec); \ - __macro(cusparseSpMV_bufferSize); \ - __macro(cusparseSpMV); +#define CUSPARSE_ROUTINE_EACH(__macro) \ + __macro(cusparseCreate); \ + __macro(cusparseSetStream); \ + __macro(cusparseCreateMatDescr); \ + __macro(cusparseDestroy); \ + __macro(cusparseSnnz); \ + __macro(cusparseDnnz); \ + __macro(cusparseSetMatType); \ + __macro(cusparseSetMatIndexBase); \ + __macro(cusparseCreateCsr); \ + __macro(cusparseCreateCoo); \ + __macro(cusparseCreateDnMat); \ + __macro(cusparseCreateDnVec); \ + __macro(cusparseSpMM_bufferSize); \ + __macro(cusparseSpMM); \ + __macro(cusparseDestroySpMat); \ + __macro(cusparseDestroyDnMat); \ + __macro(cusparseDestroyDnVec); \ + __macro(cusparseSpMV_bufferSize); \ + __macro(cusparseSpMV); \ + __macro(cusparseSpMatGetSize); \ + __macro(cusparseSpGEMM_createDescr); \ + __macro(cusparseSpGEMM_compute); \ + __macro(cusparseSpGEMM_workEstimation); \ + __macro(cusparseSpGEMM_copy); \ + __macro(cusparseSpGEMM_destroyDescr); CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) #endif diff --git a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h index fcbabd55b7ebb..6d90cb5eefdca 100644 --- a/paddle/phi/backends/dynload/cusparse.h +++ b/paddle/phi/backends/dynload/cusparse.h @@ -42,26 +42,32 @@ extern void *cusparse_dso_handle; #if defined(PADDLE_WITH_CUDA) #if CUDA_VERSION >= 11000 -#define CUSPARSE_ROUTINE_EACH(__macro) \ - __macro(cusparseCreate); \ - __macro(cusparseSetStream); \ - __macro(cusparseCreateMatDescr); \ - __macro(cusparseDestroy); \ - __macro(cusparseSnnz); \ - __macro(cusparseDnnz); \ - __macro(cusparseSetMatType); \ - __macro(cusparseSetMatIndexBase); \ - __macro(cusparseCreateCsr); \ - __macro(cusparseCreateCoo); \ - __macro(cusparseCreateDnMat); \ - __macro(cusparseCreateDnVec); \ - __macro(cusparseSpMM_bufferSize); \ - __macro(cusparseSpMM); \ - __macro(cusparseDestroySpMat); \ - __macro(cusparseDestroyDnMat); \ - __macro(cusparseDestroyDnVec); \ - __macro(cusparseSpMV_bufferSize); \ - 
__macro(cusparseSpMV); +#define CUSPARSE_ROUTINE_EACH(__macro) \ + __macro(cusparseCreate); \ + __macro(cusparseSetStream); \ + __macro(cusparseCreateMatDescr); \ + __macro(cusparseDestroy); \ + __macro(cusparseSnnz); \ + __macro(cusparseDnnz); \ + __macro(cusparseSetMatType); \ + __macro(cusparseSetMatIndexBase); \ + __macro(cusparseCreateCsr); \ + __macro(cusparseCreateCoo); \ + __macro(cusparseCreateDnMat); \ + __macro(cusparseCreateDnVec); \ + __macro(cusparseSpMM_bufferSize); \ + __macro(cusparseSpMM); \ + __macro(cusparseDestroySpMat); \ + __macro(cusparseDestroyDnMat); \ + __macro(cusparseDestroyDnVec); \ + __macro(cusparseSpMV_bufferSize); \ + __macro(cusparseSpMV); \ + __macro(cusparseSpMatGetSize); \ + __macro(cusparseSpGEMM_createDescr); \ + __macro(cusparseSpGEMM_compute); \ + __macro(cusparseSpGEMM_workEstimation); \ + __macro(cusparseSpGEMM_copy); \ + __macro(cusparseSpGEMM_destroyDescr); CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) #endif diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas.h b/paddle/phi/kernels/funcs/sparse/sparse_blas.h index f6d67488d1f48..87246fa006462 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas.h @@ -54,6 +54,15 @@ class SparseBlas { T beta, TensorType* mat_out) const; + template + void SPGEMM(bool transa, + bool transb, + T alpha, + const SparseCsrTensor& mat_a, + const SparseCsrTensor& mat_b, + T beta, + SparseCsrTensor* mat_out) const; + private: const DeviceContext& dev_ctx_; }; @@ -78,6 +87,11 @@ class SparseBlasT : private SparseBlas { Base()->template SDDMM(args...); } + template + void SPGEMM(ARGS... args) const { + Base()->template SPGEMM(args...); + } + private: const SparseBlas* Base() const { return static_cast*>(this); diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index fde5cb1768d47..54758d21d84d8 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -97,8 +97,8 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x, const_cast(crows_data), const_cast(cols_data), const_cast(values_data), - CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, gpu_type); }); @@ -151,7 +151,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x, const_cast(rows_data), const_cast(cols_data), const_cast(values_data), - CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, gpu_type); }); @@ -481,6 +481,130 @@ void SparseBlas::SDDMM(bool transa, } #endif +/************* SPARSE*SPARSE->SPARSE MATMUL ************/ +template <> +template +void SparseBlas::SPGEMM(bool transa, + bool transb, + T alpha, + const SparseCsrTensor& mat_a, + const SparseCsrTensor& mat_b, + T beta, + SparseCsrTensor* mat_out) const { + auto a_descriptor = CuSparseSpMatDescriptor(mat_a, dev_ctx_); + auto b_descriptor = CuSparseSpMatDescriptor(mat_b, dev_ctx_); + auto out_descriptor = CuSparseSpMatDescriptor(*mat_out, dev_ctx_); + + cudaDataType_t gpu_type = GetGpuDataType(); + size_t buffer_a_size = 0, buffer_b_size = 0; + cusparseSpGEMMDescr_t spgemmDesc; + phi::dynload::cusparseSpGEMM_createDescr(&spgemmDesc); + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpGEMM_workEstimation(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + 
out_descriptor.descriptor(), + gpu_type, + CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc, + &buffer_a_size, + nullptr); + }); + + phi::Allocator::AllocationPtr tmp_buffer_a = phi::memory_utils::Alloc( + dev_ctx_.GetPlace(), + buffer_a_size, + phi::Stream(reinterpret_cast(dev_ctx_.stream()))); + void* tmp_buffer_a_ptr = tmp_buffer_a->ptr(); + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpGEMM_workEstimation(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, + CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc, + &buffer_a_size, + tmp_buffer_a_ptr); + }); + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpGEMM_compute(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, + CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc, + &buffer_b_size, + nullptr); + }); + + phi::Allocator::AllocationPtr tmp_buffer_b = phi::memory_utils::Alloc( + dev_ctx_.GetPlace(), + buffer_b_size, + phi::Stream(reinterpret_cast(dev_ctx_.stream()))); + void* tmp_buffer_b_ptr = tmp_buffer_b->ptr(); + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpGEMM_compute(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, + CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc, + &buffer_b_size, + tmp_buffer_b_ptr); + }); + + // get matrix C non-zero entries C_nnz1 + int64_t C_num_rows1, C_num_cols1, C_nnz1; + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpMatGetSize( + out_descriptor.descriptor(), &C_num_rows1, &C_num_cols1, &C_nnz1); + }); + VLOG(0) << C_num_rows1 << " " << C_num_cols1 << " " << C_nnz1; + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpGEMM_copy(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, + CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc); + }); + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpMatGetSize( + out_descriptor.descriptor(), &C_num_rows1, &C_num_cols1, &C_nnz1); + }); + VLOG(0) << C_num_rows1 << " " << C_num_cols1 << " " << C_nnz1; +} } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h index cbd42be3cb6d4..e2b5c9dfdc02b 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h @@ -400,6 +400,82 @@ void SparseBlas::SDDMM(bool transa, }); } #endif + +/************* SPARSE*SPARSE->SPARSE MATMUL ************/ +template <> +template +void SparseBlas::SPGEMM(bool transa, + bool transb, + T alpha, + const SparseCsrTensor& mat_a, + const SparseCsrTensor& mat_b, + T beta, + SparseCsrTensor* mat_out) const { + // auto a_descriptor = RocSparseSpMatDescriptor(mat_a, dev_ctx_); + // auto b_descriptor = RocSparseSpMatDescriptor(mat_b, dev_ctx_); + // auto out_descriptor = RocSparseSpMatDescriptor(*mat_out, dev_ctx_); + + // rocsparse_datatype ttype = GetGpuDataType(); + // size_t buffer_size = 0; + + // // 
Query SpMM buffer + // dev_ctx_.CusparseCall([&](rocsparse_handle handle) { + // phi::dynload::rocsparse_spmm(handle, + // GetTransposeOperation(transa), + // GetTransposeOperation(transb), + // &alpha, + // a_descriptor.descriptor(), + // b_descriptor.descriptor(), + // &beta, + // out_descriptor.descriptor(), + // ttype, + // GetSpMMAlgorithm(mat_a), + // rocsparse_spmm_stage_buffer_size, + // &buffer_size, + // nullptr); + // }); + + // // Allocate buffer + // phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc( + // dev_ctx_.GetPlace(), + // buffer_size, + // phi::Stream(reinterpret_cast(dev_ctx_.stream()))); + // void* tmp_buffer_ptr = tmp_buffer->ptr(); + + // // Preprocess data + // dev_ctx_.CusparseCall([&](rocsparse_handle handle) { + // phi::dynload::rocsparse_spmm(handle, + // GetTransposeOperation(transa), + // GetTransposeOperation(transb), + // &alpha, + // a_descriptor.descriptor(), + // b_descriptor.descriptor(), + // &beta, + // out_descriptor.descriptor(), + // ttype, + // GetSpMMAlgorithm(mat_a), + // rocsparse_spmm_stage_preprocess, + // &buffer_size, + // tmp_buffer_ptr); + // }); + + // // Performs the actual SpMM computation + // dev_ctx_.CusparseCall([&](rocsparse_handle handle) { + // phi::dynload::rocsparse_spmm(handle, + // GetTransposeOperation(transa), + // GetTransposeOperation(transb), + // &alpha, + // a_descriptor.descriptor(), + // b_descriptor.descriptor(), + // &beta, + // out_descriptor.descriptor(), + // ttype, + // GetSpMMAlgorithm(mat_a), + // rocsparse_spmm_stage_compute, + // &buffer_size, + // tmp_buffer_ptr); + // }); +} } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index f39209e9b8604..553f466b05b2e 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -102,6 +102,85 @@ void MatmulKernelImpl(const Context& dev_ctx, #endif } +template +void MatmulKernelImpl(const Context& dev_ctx, + const SparseCsrTensor& x, + const SparseCsrTensor& y, + SparseCsrTensor* out) { +#if CUDA_VERSION >= 11000 || HIP_VERSION >= 402 + std::vector xdim_vec = phi::vectorize(x.dims()); + std::vector ydim_vec = phi::vectorize(y.dims()); + auto x_ndims = xdim_vec.size(); + auto y_ndims = ydim_vec.size(); + PADDLE_ENFORCE_EQ( + x_ndims, + y_ndims, + phi::errors::PreconditionNotMet("The dims size of Input(x) and Input(y) " + "should be equal, But received X's " + "dimensions=%d, Y's dimensions=%d.", + x_ndims, + y_ndims)); + PADDLE_ENFORCE_GE( + x_ndims, + 2, + phi::errors::InvalidArgument("the dims size of Input(x) and " + "Input(y) must be greater than " + "or eaqual to 2.")); + + for (size_t i = 0; i < x_ndims - 2; ++i) { + PADDLE_ENFORCE_EQ(xdim_vec[i], + ydim_vec[i], + phi::errors::InvalidArgument( + "x.dim[%d] and x.dim[%d] must be eaqul.", i, i)); + } + + PADDLE_ENFORCE_GE( + xdim_vec[x_ndims - 1], + ydim_vec[y_ndims - 2], + phi::errors::PreconditionNotMet( + "The shape of Input(x) and Input(y) is not suitable for matmul " + "opetation, x_dim[-1] must be eaqual to y_dim[-2].")); + // InferMeta of 'out' + // std::vector out_dim_vec(ydim_vec); + // out_dim_vec[y_ndims - 2] = xdim_vec[x_ndims - 2]; + // out_dim_vec[y_ndims - 1] = ydim_vec[y_ndims - 1]; + // MetaTensor meta_out(out); + + // meta_out.set_dims(phi::make_ddim(out_dim_vec)); + // meta_out.set_dtype(x.dtype()); + + // dev_ctx.template Alloc(out); + VLOG(0) << "tyep " << out->crows().dtype(); + // 
*(out->mutable_crows()) = x.crows(); + // *(out->mutable_cols()) = x.cols(); + + // const DenseTensor& x_values = x.values(); + // DenseTensor* out_values = out->mutable_values(); + // out_values->Resize(x_values.dims()); + // out->set_meta(x.meta()); + // dev_ctx.template Alloc(out_values); + +#ifdef PADDLE_WITH_HIP + phi::funcs::SetConstant set_zero; + // set_zero(dev_ctx, out, static_cast(0.0f)); +#endif + + auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); + sparse_blas.SPGEMM( + false, false, static_cast(1), x, y, static_cast(0), out); +#else +#ifdef PADDLE_WITH_CUDA + PADDLE_THROW( + phi::errors::Unimplemented("forward of 'sparse.matmul' use cusparseSpMM, " + "which is supported from CUDA 11.0")); +#elif defined(PADDLE_WITH_HIP) + PADDLE_THROW(phi::errors::Unimplemented( + "forward of 'sparse.matmul' use rocsparse_spmm, " + "which is supported from ROCM 4.2.0")); +#endif +#endif +} + template void MatmulCooDenseKernel(const Context& dev_ctx, const SparseCooTensor& x, @@ -118,6 +197,22 @@ void MatmulCsrDenseKernel(const Context& dev_ctx, MatmulKernelImpl(dev_ctx, x, y, out); } +template +void MatmulCooCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& y, + SparseCooTensor* out) { + // MatmulKernelImpl(dev_ctx, x, y, out); +} + +template +void MatmulCsrCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const SparseCsrTensor& y, + SparseCsrTensor* out) { + MatmulKernelImpl(dev_ctx, x, y, out); +} + template void MaskedMatmulCsrKernel(const Context& dev_ctx, const DenseTensor& x, @@ -222,6 +317,24 @@ PD_REGISTER_KERNEL(matmul_coo_dense, kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } +PD_REGISTER_KERNEL(matmul_coo_coo, + GPU, + ALL_LAYOUT, + phi::sparse::MatmulCooCooKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(matmul_csr_csr, + GPU, + ALL_LAYOUT, + phi::sparse::MatmulCsrCsrKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + PD_REGISTER_KERNEL(masked_matmul_csr, GPU, ALL_LAYOUT, diff --git a/test/legacy_test/test_sparse_matmul_op.py b/test/legacy_test/test_sparse_matmul_op.py index eb608dd379cac..cbb76d5c8e9ec 100644 --- a/test/legacy_test/test_sparse_matmul_op.py +++ b/test/legacy_test/test_sparse_matmul_op.py @@ -16,9 +16,6 @@ import re import unittest -import numpy as np -import scipy.sparse as sp - import paddle paddle.set_default_dtype('float64') @@ -36,14 +33,67 @@ def get_cuda_version(): return -1 -class TestMatmul(unittest.TestCase): +# class TestMatmul(unittest.TestCase): +# # x: sparse, y: dense, out: dense +# def check_result(self, x_shape, y_shape, format): +# if len(x_shape) == 3: +# mask = paddle.randint(0, 2, [x_shape[-2], x_shape[-1]]) +# else: +# mask = paddle.randint(0, 2, x_shape) +# origin_x = paddle.rand(x_shape) * mask +# origin_y = paddle.rand(y_shape) + +# dense_x = origin_x.detach() +# dense_x.stop_gradient = False +# dense_y = origin_y.detach() +# dense_y.stop_gradient = False +# dense_out = paddle.matmul(dense_x, dense_y) + +# if format == "coo": +# sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) +# else: +# sp_x = origin_x.detach().to_sparse_csr() +# sp_x.stop_gradient = False +# sp_y = origin_y.detach() +# sp_y.stop_gradient = False +# sp_out = paddle.sparse.matmul(sp_x, sp_y) + +# np.testing.assert_allclose( +# sp_out.numpy(), dense_out.numpy(), rtol=1e-05 +# ) +# if get_cuda_version() >= 11030: +# dense_out.backward() +# sp_out.backward() +# 
np.testing.assert_allclose( +# sp_x.grad.to_dense().numpy(), +# (dense_x.grad * mask).numpy(), +# rtol=1e-05, +# ) +# np.testing.assert_allclose( +# sp_y.grad.numpy(), dense_y.grad.numpy(), rtol=1e-05 +# ) + +# @unittest.skipIf( +# not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, +# "only support cuda>=11.0", +# ) +# def test_matmul_2d(self): +# self.check_result([16, 12], [12, 10], 'coo') +# self.check_result([16, 12], [12, 10], 'csr') + +# @unittest.skipIf( +# not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, +# "only support cuda>=11.8", +# ) +# def test_matmul_3d(self): +# self.check_result([8, 16, 12], [8, 12, 10], 'coo') +# self.check_result([8, 16, 12], [8, 12, 10], 'csr') + + +class TestMatmul2(unittest.TestCase): # x: sparse, y: dense, out: dense - def check_result(self, x_shape, y_shape, format): - if len(x_shape) == 3: - mask = paddle.randint(0, 2, [x_shape[-2], x_shape[-1]]) - else: - mask = paddle.randint(0, 2, x_shape) - origin_x = paddle.rand(x_shape) * mask + def check_result(self, x_shape, y_shape): + origin_x = paddle.rand(x_shape) origin_y = paddle.rand(y_shape) dense_x = origin_x.detach() @@ -51,123 +101,122 @@ def check_result(self, x_shape, y_shape, format): dense_y = origin_y.detach() dense_y.stop_gradient = False dense_out = paddle.matmul(dense_x, dense_y) - - if format == "coo": - sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) - else: - sp_x = origin_x.detach().to_sparse_csr() + print(dense_x) + print(dense_y) + print(dense_out) + sp_x = origin_x.detach().to_sparse_csr() + sp_y = origin_y.detach().to_sparse_csr() sp_x.stop_gradient = False - sp_y = origin_y.detach() sp_y.stop_gradient = False sp_out = paddle.sparse.matmul(sp_x, sp_y) - - np.testing.assert_allclose( - sp_out.numpy(), dense_out.numpy(), rtol=1e-05 - ) - if get_cuda_version() >= 11030: - dense_out.backward() - sp_out.backward() - np.testing.assert_allclose( - sp_x.grad.to_dense().numpy(), - (dense_x.grad * mask).numpy(), - rtol=1e-05, - ) - np.testing.assert_allclose( - sp_y.grad.numpy(), dense_y.grad.numpy(), rtol=1e-05 - ) + print(sp_out) + + # np.testing.assert_allclose( + # sp_out.numpy(), dense_out.numpy(), rtol=1e-05 + # ) + # if get_cuda_version() >= 11030: + # dense_out.backward() + # sp_out.backward() + # np.testing.assert_allclose( + # sp_x.grad.to_dense().numpy(), + # (dense_x.grad * mask).numpy(), + # rtol=1e-05, + # ) + # np.testing.assert_allclose( + # sp_y.grad.numpy(), dense_y.grad.numpy(), rtol=1e-05 + # ) @unittest.skipIf( not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, "only support cuda>=11.0", ) def test_matmul_2d(self): - self.check_result([16, 12], [12, 10], 'coo') - self.check_result([16, 12], [12, 10], 'csr') - - @unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, - "only support cuda>=11.8", - ) - def test_matmul_3d(self): - self.check_result([8, 16, 12], [8, 12, 10], 'coo') - self.check_result([8, 16, 12], [8, 12, 10], 'csr') - - -class TestMaskedMatmul(unittest.TestCase): - # x: dense, y: dense, out: sparse_`csr - @unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11030, - "only support on cuda>=11.3", - ) - def test_masked_matmul_2d(self): - np_mask = np.random.rand(10, 6) < 0.2 - - np_x = np.random.rand(10, 12) - np_y = np.random.rand(12, 6) - np_out = sp.csr_matrix(np.matmul(np_x, np_y) * np_mask) - - np_out_grad = sp.csr_matrix(np.ones([10, 6]) * np_mask) - # dx(dense) = dout(csr) * y'(dense) - np_x_grad = np_out_grad @ np_y.transpose(1, 0) - # 
dy(dense) = x'(dense) * dout(csr) -> dy'(dense) = dout'(csr) * x(dense) - np_y_grad = (np_out_grad.transpose() @ np_x).transpose(1, 0) - - x = paddle.to_tensor(np_x, stop_gradient=False) - y = paddle.to_tensor(np_y, stop_gradient=False) - mask = paddle.to_tensor(np.ones([10, 6]) * np_mask).to_sparse_csr() - out = paddle.sparse.masked_matmul(x, y, mask) - - np.testing.assert_allclose( - np_out.indptr, out.crows().numpy(), rtol=1e-05 - ) - np.testing.assert_allclose( - np_out.indices, out.cols().numpy(), rtol=1e-05 - ) - np.testing.assert_allclose( - np_out.data, out.values().numpy(), rtol=1e-05 - ) - - out.backward() - np.testing.assert_allclose(out.is_sparse_csr(), True, rtol=1e-05) - np.testing.assert_allclose(np_x_grad, x.grad.numpy(), rtol=1e-05) - np.testing.assert_allclose(np_y_grad, y.grad.numpy(), rtol=1e-05) - - @unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, - "only support on cuda>=11.8", - ) - def test_masked_matmul_3d(self): - paddle.set_default_dtype('float32') - origin_x = paddle.rand([16, 16, 12]) - mask = paddle.randint(0, 2, [16, 12]) - origin_x = origin_x * mask - origin_y = paddle.rand([16, 12, 10]) - - dense_x = origin_x.detach() - dense_x.stop_gradient = False - dense_y = origin_y.detach() - dense_y.stop_gradient = False - dense_out = paddle.matmul(dense_x, dense_y) - dense_out.backward() - - sp_x = origin_x.detach().to_sparse_csr() - sp_x.stop_gradient = False - sp_y = origin_y.detach() - sp_y.stop_gradient = False - sp_out = paddle.sparse.matmul(sp_x, sp_y) - sp_out.backward() - - np.testing.assert_allclose( - sp_out.numpy(), dense_out.numpy(), rtol=1e-05 - ) - np.testing.assert_allclose( - sp_x.grad.to_dense().numpy(), - (dense_x.grad * mask).numpy(), - rtol=1e-05, - ) - np.testing.assert_allclose( - sp_y.grad.numpy(), dense_y.grad.numpy(), rtol=1e-05 - ) + self.check_result([3, 4], [4, 5]) + + # @unittest.skipIf( + # not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, + # "only support cuda>=11.8", + # ) + # def test_matmul_3d(self): + # self.check_result([8, 16, 12], [8, 12, 10], 'coo') + # self.check_result([8, 16, 12], [8, 12, 10], 'csr') + + +# class TestMaskedMatmul(unittest.TestCase): +# # x: dense, y: dense, out: sparse_`csr +# @unittest.skipIf( +# not paddle.is_compiled_with_cuda() or get_cuda_version() < 11030, +# "only support on cuda>=11.3", +# ) +# def test_masked_matmul_2d(self): +# np_mask = np.random.rand(10, 6) < 0.2 + +# np_x = np.random.rand(10, 12) +# np_y = np.random.rand(12, 6) +# np_out = sp.csr_matrix(np.matmul(np_x, np_y) * np_mask) + +# np_out_grad = sp.csr_matrix(np.ones([10, 6]) * np_mask) +# # dx(dense) = dout(csr) * y'(dense) +# np_x_grad = np_out_grad @ np_y.transpose(1, 0) +# # dy(dense) = x'(dense) * dout(csr) -> dy'(dense) = dout'(csr) * x(dense) +# np_y_grad = (np_out_grad.transpose() @ np_x).transpose(1, 0) + +# x = paddle.to_tensor(np_x, stop_gradient=False) +# y = paddle.to_tensor(np_y, stop_gradient=False) +# mask = paddle.to_tensor(np.ones([10, 6]) * np_mask).to_sparse_csr() +# out = paddle.sparse.masked_matmul(x, y, mask) + +# np.testing.assert_allclose( +# np_out.indptr, out.crows().numpy(), rtol=1e-05 +# ) +# np.testing.assert_allclose( +# np_out.indices, out.cols().numpy(), rtol=1e-05 +# ) +# np.testing.assert_allclose( +# np_out.data, out.values().numpy(), rtol=1e-05 +# ) + +# out.backward() +# np.testing.assert_allclose(out.is_sparse_csr(), True, rtol=1e-05) +# np.testing.assert_allclose(np_x_grad, x.grad.numpy(), rtol=1e-05) +# np.testing.assert_allclose(np_y_grad, 
y.grad.numpy(), rtol=1e-05) + +# @unittest.skipIf( +# not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, +# "only support on cuda>=11.8", +# ) +# def test_masked_matmul_3d(self): +# paddle.set_default_dtype('float32') +# origin_x = paddle.rand([16, 16, 12]) +# mask = paddle.randint(0, 2, [16, 12]) +# origin_x = origin_x * mask +# origin_y = paddle.rand([16, 12, 10]) + +# dense_x = origin_x.detach() +# dense_x.stop_gradient = False +# dense_y = origin_y.detach() +# dense_y.stop_gradient = False +# dense_out = paddle.matmul(dense_x, dense_y) +# dense_out.backward() + +# sp_x = origin_x.detach().to_sparse_csr() +# sp_x.stop_gradient = False +# sp_y = origin_y.detach() +# sp_y.stop_gradient = False +# sp_out = paddle.sparse.matmul(sp_x, sp_y) +# sp_out.backward() + +# np.testing.assert_allclose( +# sp_out.numpy(), dense_out.numpy(), rtol=1e-05 +# ) +# np.testing.assert_allclose( +# sp_x.grad.to_dense().numpy(), +# (dense_x.grad * mask).numpy(), +# rtol=1e-05, +# ) +# np.testing.assert_allclose( +# sp_y.grad.numpy(), dense_y.grad.numpy(), rtol=1e-05 +# ) if __name__ == "__main__": From 782f92d3b739a441ee8602b3f519913570185626 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Fri, 10 Nov 2023 08:06:02 +0000 Subject: [PATCH 02/25] refine paddle.sparse.matmul --- test/legacy_test/test_sparse_matmul_op.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/legacy_test/test_sparse_matmul_op.py b/test/legacy_test/test_sparse_matmul_op.py index cbb76d5c8e9ec..d3efa2056673c 100644 --- a/test/legacy_test/test_sparse_matmul_op.py +++ b/test/legacy_test/test_sparse_matmul_op.py @@ -93,8 +93,10 @@ def get_cuda_version(): class TestMatmul2(unittest.TestCase): # x: sparse, y: dense, out: dense def check_result(self, x_shape, y_shape): - origin_x = paddle.rand(x_shape) - origin_y = paddle.rand(y_shape) + # origin_x = paddle.rand(x_shape) + # origin_y = paddle.rand(y_shape) + origin_x = paddle.to_tensor([[1, -1], [0, 1]]) + origin_y = paddle.to_tensor([[1, 1], [1, 1]]) dense_x = origin_x.detach() dense_x.stop_gradient = False From 6c5dbb717897c27a4f5b3ea16128be193077bdb4 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Mon, 27 Nov 2023 13:30:50 +0000 Subject: [PATCH 03/25] test --- .../funcs/sparse/sparse_blas_impl.cu.h | 45 +++++++++++++++---- .../phi/kernels/sparse/gpu/matmul_kernel.cu | 9 +++- test/legacy_test/test_sparse_matmul_op.py | 9 +++- 3 files changed, 52 insertions(+), 11 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 54758d21d84d8..1f66581490119 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -493,13 +493,42 @@ void SparseBlas::SPGEMM(bool transa, SparseCsrTensor* mat_out) const { auto a_descriptor = CuSparseSpMatDescriptor(mat_a, dev_ctx_); auto b_descriptor = CuSparseSpMatDescriptor(mat_b, dev_ctx_); - auto out_descriptor = CuSparseSpMatDescriptor(*mat_out, dev_ctx_); + + cusparseSpMatDescr_t mat_out_descr; + // auto out_descriptor = CuSparseSpMatDescriptor(*mat_out, dev_ctx_); cudaDataType_t gpu_type = GetGpuDataType(); size_t buffer_a_size = 0, buffer_b_size = 0; cusparseSpGEMMDescr_t spgemmDesc; + + std::vector xdim_vec = phi::vectorize(mat_a.dims()); + auto x_ndims = xdim_vec.size(); + std::vector ydim_vec = phi::vectorize(mat_b.dims()); + auto y_ndims = ydim_vec.size(); + int64_t M = xdim_vec[x_ndims - 2]; + int64_t N = ydim_vec[y_ndims - 1]; + 
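+  // Note: M and N give the shape of the output CSR matrix. The cuSPARSE SpGEMM
+  // workflow below is staged: workEstimation and compute are each called twice
+  // (first with a null buffer to query the required size, then again with the
+  // allocated buffer), and spgemmDesc carries state between the stages.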
phi::dynload::cusparseSpGEMM_createDescr(&spgemmDesc); + // float* a; + // cudaMalloc((void**)&a, 10 * sizeof(float)); + // float* b; + // cudaMalloc((void**)&b, 10 * sizeof(float)); + // float* c; + // cudaMalloc((void**)&c, 10 * sizeof(float)); + + phi::dynload::cusparseCreateCsr(&mat_out_descr, + M, + N, + 0, + nullptr, + nullptr, + nullptr, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + gpu_type); + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpGEMM_workEstimation(handle, GetTransposeOperation(transa), @@ -508,7 +537,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - out_descriptor.descriptor(), + mat_out_descr, gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, @@ -530,7 +559,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - out_descriptor.descriptor(), + mat_out_descr, gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, @@ -546,7 +575,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - out_descriptor.descriptor(), + mat_out_descr, gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, @@ -568,7 +597,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - out_descriptor.descriptor(), + mat_out_descr, gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, @@ -581,7 +610,7 @@ void SparseBlas::SPGEMM(bool transa, dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpMatGetSize( - out_descriptor.descriptor(), &C_num_rows1, &C_num_cols1, &C_nnz1); + mat_out_descr, &C_num_rows1, &C_num_cols1, &C_nnz1); }); VLOG(0) << C_num_rows1 << " " << C_num_cols1 << " " << C_nnz1; @@ -593,7 +622,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - out_descriptor.descriptor(), + mat_out_descr, gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc); @@ -601,7 +630,7 @@ void SparseBlas::SPGEMM(bool transa, dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpMatGetSize( - out_descriptor.descriptor(), &C_num_rows1, &C_num_cols1, &C_nnz1); + mat_out_descr, &C_num_rows1, &C_num_cols1, &C_nnz1); }); VLOG(0) << C_num_rows1 << " " << C_num_cols1 << " " << C_nnz1; } diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index 553f466b05b2e..222b57b108247 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -150,7 +150,14 @@ void MatmulKernelImpl(const Context& dev_ctx, // meta_out.set_dtype(x.dtype()); // dev_ctx.template Alloc(out); - VLOG(0) << "tyep " << out->crows().dtype(); + // VLOG(0) << "tyep " << out->crows().dtype(); + // EmptyLikeCsrKernel(dev_ctx, x, out); + + // VLOG(0) << "tyep " << out->crows().dtype(); + // *(out->mutable_crows()) = x.crows(); + // *(out->mutable_cols()) = DenseTensor(phi::DataType::INT64); + // *(out->mutable_values()) = DenseTensor(phi::DataType::INT64); + // *(out->mutable_crows()) = x.crows(); // *(out->mutable_cols()) = x.cols(); diff --git a/test/legacy_test/test_sparse_matmul_op.py b/test/legacy_test/test_sparse_matmul_op.py index d3efa2056673c..8fb34270d7313 100644 --- a/test/legacy_test/test_sparse_matmul_op.py +++ b/test/legacy_test/test_sparse_matmul_op.py @@ -95,8 +95,12 @@ class TestMatmul2(unittest.TestCase): def check_result(self, x_shape, y_shape): # origin_x = paddle.rand(x_shape) # origin_y = paddle.rand(y_shape) - 
origin_x = paddle.to_tensor([[1, -1], [0, 1]]) - origin_y = paddle.to_tensor([[1, 1], [1, 1]]) + origin_x = paddle.to_tensor( + [[1.0, -1.0, 1.0], [1.0, 1.0, 1.0]], dtype="float32" + ) + origin_y = paddle.to_tensor( + [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], dtype="float32" + ) dense_x = origin_x.detach() dense_x.stop_gradient = False @@ -110,6 +114,7 @@ def check_result(self, x_shape, y_shape): sp_y = origin_y.detach().to_sparse_csr() sp_x.stop_gradient = False sp_y.stop_gradient = False + print(sp_x) sp_out = paddle.sparse.matmul(sp_x, sp_y) print(sp_out) From 96c9cd59c241ed4b78d17f38a41b3c70c9abfb9b Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Mon, 11 Dec 2023 07:13:57 +0000 Subject: [PATCH 04/25] fix --- .../funcs/sparse/sparse_blas_impl.cu.h | 202 +++++++++-- .../funcs/sparse/sparse_blas_impl.hip.h | 76 ---- .../phi/kernels/sparse/gpu/matmul_kernel.cu | 32 +- test/legacy_test/test_sparse_matmul_op.py | 329 +++++++++--------- 4 files changed, 335 insertions(+), 304 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 1f66581490119..1e30a17bf1271 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -26,6 +26,7 @@ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/empty_kernel.h" namespace phi { namespace funcs { @@ -42,6 +43,15 @@ cudaDataType_t GetGpuDataType() { } } +template +cusparseIndexType_t GetCusparseIndexType() { + if (std::is_same::value) { + return CUSPARSE_INDEX_32I; + } else if (std::is_same::value) { + return CUSPARSE_INDEX_64I; + } +} + inline cusparseOperation_t GetTransposeOperation(const bool trans) { if (trans) { return CUSPARSE_OPERATION_TRANSPOSE; @@ -86,9 +96,9 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x, const IntT* crows_data = x.non_zero_crows().data(); const IntT* cols_data = x.non_zero_cols().data(); const T* values_data = x.non_zero_elements().data(); - int64_t batch_nnz = x.nnz() / batch_size; cudaDataType_t gpu_type = GetGpuDataType(); + cusparseIndexType_t index_type = GetCusparseIndexType(); dev_ctx.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseCreateCsr(descriptor, M, @@ -97,8 +107,8 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x, const_cast(crows_data), const_cast(cols_data), const_cast(values_data), - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, + index_type, + index_type, CUSPARSE_INDEX_BASE_ZERO, gpu_type); }); @@ -143,6 +153,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x, int64_t batch_nnz = nnz / batch_size; cudaDataType_t gpu_type = GetGpuDataType(); + cusparseIndexType_t index_type = GetCusparseIndexType(); dev_ctx.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseCreateCoo(descriptor, M, @@ -151,7 +162,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x, const_cast(rows_data), const_cast(cols_data), const_cast(values_data), - CUSPARSE_INDEX_32I, + index_type, CUSPARSE_INDEX_BASE_ZERO, gpu_type); }); @@ -493,31 +504,28 @@ void SparseBlas::SPGEMM(bool transa, SparseCsrTensor* mat_out) const { auto a_descriptor = CuSparseSpMatDescriptor(mat_a, dev_ctx_); auto b_descriptor = CuSparseSpMatDescriptor(mat_b, dev_ctx_); - - cusparseSpMatDescr_t mat_out_descr; // auto out_descriptor = CuSparseSpMatDescriptor(*mat_out, dev_ctx_); + // VLOG(0) << mat_out->dims(); + + 
cusparseSpMatDescr_t out_descr; + cudaDataType_t gpu_type = GetGpuDataType(); size_t buffer_a_size = 0, buffer_b_size = 0; cusparseSpGEMMDescr_t spgemmDesc; - std::vector xdim_vec = phi::vectorize(mat_a.dims()); - auto x_ndims = xdim_vec.size(); - std::vector ydim_vec = phi::vectorize(mat_b.dims()); - auto y_ndims = ydim_vec.size(); - int64_t M = xdim_vec[x_ndims - 2]; - int64_t N = ydim_vec[y_ndims - 1]; + std::vector out_dim_vec = phi::vectorize(mat_out->dims()); + auto out_ndims = out_dim_vec.size(); + int64_t M = out_dim_vec[out_ndims - 2]; + int64_t N = out_dim_vec[out_ndims - 1]; + int batch_size = 1; + for (int i = 0; i < out_ndims - 2; i++) { + batch_size *= out_dim_vec[i]; + } phi::dynload::cusparseSpGEMM_createDescr(&spgemmDesc); - // float* a; - // cudaMalloc((void**)&a, 10 * sizeof(float)); - // float* b; - // cudaMalloc((void**)&b, 10 * sizeof(float)); - // float* c; - // cudaMalloc((void**)&c, 10 * sizeof(float)); - - phi::dynload::cusparseCreateCsr(&mat_out_descr, + phi::dynload::cusparseCreateCsr(&out_descr, M, N, 0, @@ -537,7 +545,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - mat_out_descr, + out_descr, gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, @@ -559,7 +567,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - mat_out_descr, + out_descr, gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, @@ -575,7 +583,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - mat_out_descr, + out_descr, gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, @@ -597,7 +605,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - mat_out_descr, + out_descr, gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, @@ -610,10 +618,17 @@ void SparseBlas::SPGEMM(bool transa, dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpMatGetSize( - mat_out_descr, &C_num_rows1, &C_num_cols1, &C_nnz1); + out_descr, &C_num_rows1, &C_num_cols1, &C_nnz1); }); VLOG(0) << C_num_rows1 << " " << C_num_cols1 << " " << C_nnz1; + DenseTensor out_crows = phi::Empty(dev_ctx_, {M + 1}); + DenseTensor out_cols = phi::Empty(dev_ctx_, {C_nnz1}); + DenseTensor out_values = phi::Empty(dev_ctx_, {C_nnz1}); + mat_out->SetMember(out_crows, out_cols, out_values, mat_out->dims()); + + auto out_descriptor = CuSparseSpMatDescriptor(*mat_out, dev_ctx_); + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpGEMM_copy(handle, GetTransposeOperation(transa), @@ -622,7 +637,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - mat_out_descr, + out_descriptor.descriptor(), gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc); @@ -630,10 +645,143 @@ void SparseBlas::SPGEMM(bool transa, dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpMatGetSize( - mat_out_descr, &C_num_rows1, &C_num_cols1, &C_nnz1); + out_descr, &C_num_rows1, &C_num_cols1, &C_nnz1); }); VLOG(0) << C_num_rows1 << " " << C_num_cols1 << " " << C_nnz1; } + +// template <> +// template +// void SparseBlas::SPGEMM(bool transa, +// bool transb, +// T alpha0, +// const SparseCsrTensor& mat_a, +// const SparseCsrTensor& mat_b, +// T beta0, +// SparseCsrTensor* mat_out) const { +// #define A_NUM_ROWS 4 // C compatibility +// const int A_num_rows = 4; +// const int A_num_cols = 4; +// const int A_nnz = 9; +// const int B_num_rows = 4; +// const int B_num_cols = 4; +// const 
int B_nnz = 9; +// int hA_csrOffsets[] = { 0, 3, 4, 7, 9 }; +// int hA_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3 }; +// float hA_values[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, +// 6.0f, 7.0f, 8.0f, 9.0f }; +// int hB_csrOffsets[] = { 0, 2, 4, 7, 8 }; +// int hB_columns[] = { 0, 3, 1, 3, 0, 1, 2, 1 }; +// float hB_values[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, +// 6.0f, 7.0f, 8.0f }; +// int hC_csrOffsets[] = { 0, 4, 6, 10, 12 }; +// int hC_columns[] = { 0, 1, 2, 3, 1, 3, 0, 1, 2, 3, 1, 3 }; +// float hC_values[] = { 11.0f, 36.0f, 14.0f, 2.0f, 12.0f, +// 16.0f, 35.0f, 92.0f, 42.0f, 10.0f, +// 96.0f, 32.0f }; +// const int C_nnz = 12; +// #define C_NUM_NNZ 12 // C compatibility +// float alpha = 1.0f; +// float beta = 0.0f; +// cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; +// cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; +// cudaDataType computeType = CUDA_R_32F; +// //-------------------------------------------------------------------------- +// // Device memory management: Allocate and copy A, B +// int *dA_csrOffsets, *dA_columns, *dB_csrOffsets, *dB_columns, +// *dC_csrOffsets, *dC_columns; +// float *dA_values, *dB_values, *dC_values; +// // allocate A +// cudaMalloc((void**) &dA_csrOffsets, +// (A_num_rows + 1) * sizeof(int)); +// cudaMalloc((void**) &dA_columns, A_nnz * sizeof(int)); +// cudaMalloc((void**) &dA_values, A_nnz * sizeof(float)); +// // allocate B +// cudaMalloc((void**) &dB_csrOffsets, +// (B_num_rows + 1) * sizeof(int)); +// cudaMalloc((void**) &dB_columns, B_nnz * sizeof(int)); +// cudaMalloc((void**) &dB_values, B_nnz * sizeof(float)); +// // allocate C offsets +// cudaMalloc((void**) &dC_csrOffsets, +// (A_num_rows + 1) * sizeof(int)); + +// // copy A +// cudaMemcpy(dA_csrOffsets, hA_csrOffsets, +// (A_num_rows + 1) * sizeof(int), +// cudaMemcpyHostToDevice); +// cudaMemcpy(dA_columns, hA_columns, A_nnz * sizeof(int), +// cudaMemcpyHostToDevice); +// cudaMemcpy(dA_values, hA_values, +// A_nnz * sizeof(float), cudaMemcpyHostToDevice); +// // copy B +// cudaMemcpy(dB_csrOffsets, hB_csrOffsets, +// (B_num_rows + 1) * sizeof(int), +// cudaMemcpyHostToDevice); +// cudaMemcpy(dB_columns, hB_columns, B_nnz * sizeof(int), +// cudaMemcpyHostToDevice); +// cudaMemcpy(dB_values, hB_values, +// B_nnz * sizeof(float), cudaMemcpyHostToDevice); +// //-------------------------------------------------------------------------- +// // CUSPARSE APIs +// cusparseHandle_t handle = nullptr; +// cusparseSpMatDescr_t matA, matB, matC; +// void* dBuffer1 = nullptr, *dBuffer2 = nullptr; +// size_t bufferSize1 = 0, bufferSize2 = 0; +// phi::dynload::cusparseCreate(&handle); +// // Create sparse matrix A in CSR format +// phi::dynload::cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz, +// dA_csrOffsets, dA_columns, dA_values, +// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, +// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); +// phi::dynload::cusparseCreateCsr(&matB, B_num_rows, B_num_cols, B_nnz, +// dB_csrOffsets, dB_columns, dB_values, +// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, +// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); +// phi::dynload::cusparseCreateCsr(&matC, A_num_rows, B_num_cols, 0, +// nullptr, nullptr, nullptr, +// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, +// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); +// //-------------------------------------------------------------------------- +// // SpGEMM Computation +// cusparseSpGEMMDescr_t spgemmDesc; +// phi::dynload::cusparseSpGEMM_createDescr(&spgemmDesc); + +// // ask bufferSize1 bytes for external memory + +// 
phi::dynload::cusparseSpGEMM_workEstimation(handle, opA, opB, +// &alpha, matA, matB, &beta, matC, +// computeType, CUSPARSE_SPGEMM_DEFAULT, +// spgemmDesc, &bufferSize1, nullptr); +// cudaMalloc((void**) &dBuffer1, bufferSize1); +// // inspect the matrices A and B to understand the memory requirement for +// // the next step + +// phi::dynload::cusparseSpGEMM_workEstimation(handle, opA, opB, +// &alpha, matA, matB, &beta, matC, +// computeType, CUSPARSE_SPGEMM_DEFAULT, +// spgemmDesc, &bufferSize1, dBuffer1); + +// // ask bufferSize2 bytes for external memory + +// phi::dynload::cusparseSpGEMM_compute(handle, opA, opB, +// &alpha, matA, matB, &beta, matC, +// computeType, CUSPARSE_SPGEMM_DEFAULT, +// spgemmDesc, &bufferSize2, nullptr); +// cudaMalloc((void**) &dBuffer2, bufferSize2); + +// // compute the intermediate product of A * B +// phi::dynload::cusparseSpGEMM_compute(handle, opA, opB, +// &alpha, matA, matB, &beta, matC, +// computeType, +// CUSPARSE_SPGEMM_DEFAULT, +// spgemmDesc, &bufferSize2, +// dBuffer2); +// // get matrix C non-zero entries C_nnz1 +// int64_t C_num_rows1, C_num_cols1, C_nnz1; +// phi::dynload::cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, +// &C_nnz1); +// VLOG(0) << C_num_rows1 << " " << C_num_cols1 << " " << C_nnz1; +// } } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h index e2b5c9dfdc02b..cbd42be3cb6d4 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h @@ -400,82 +400,6 @@ void SparseBlas::SDDMM(bool transa, }); } #endif - -/************* SPARSE*SPARSE->SPARSE MATMUL ************/ -template <> -template -void SparseBlas::SPGEMM(bool transa, - bool transb, - T alpha, - const SparseCsrTensor& mat_a, - const SparseCsrTensor& mat_b, - T beta, - SparseCsrTensor* mat_out) const { - // auto a_descriptor = RocSparseSpMatDescriptor(mat_a, dev_ctx_); - // auto b_descriptor = RocSparseSpMatDescriptor(mat_b, dev_ctx_); - // auto out_descriptor = RocSparseSpMatDescriptor(*mat_out, dev_ctx_); - - // rocsparse_datatype ttype = GetGpuDataType(); - // size_t buffer_size = 0; - - // // Query SpMM buffer - // dev_ctx_.CusparseCall([&](rocsparse_handle handle) { - // phi::dynload::rocsparse_spmm(handle, - // GetTransposeOperation(transa), - // GetTransposeOperation(transb), - // &alpha, - // a_descriptor.descriptor(), - // b_descriptor.descriptor(), - // &beta, - // out_descriptor.descriptor(), - // ttype, - // GetSpMMAlgorithm(mat_a), - // rocsparse_spmm_stage_buffer_size, - // &buffer_size, - // nullptr); - // }); - - // // Allocate buffer - // phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc( - // dev_ctx_.GetPlace(), - // buffer_size, - // phi::Stream(reinterpret_cast(dev_ctx_.stream()))); - // void* tmp_buffer_ptr = tmp_buffer->ptr(); - - // // Preprocess data - // dev_ctx_.CusparseCall([&](rocsparse_handle handle) { - // phi::dynload::rocsparse_spmm(handle, - // GetTransposeOperation(transa), - // GetTransposeOperation(transb), - // &alpha, - // a_descriptor.descriptor(), - // b_descriptor.descriptor(), - // &beta, - // out_descriptor.descriptor(), - // ttype, - // GetSpMMAlgorithm(mat_a), - // rocsparse_spmm_stage_preprocess, - // &buffer_size, - // tmp_buffer_ptr); - // }); - - // // Performs the actual SpMM computation - // dev_ctx_.CusparseCall([&](rocsparse_handle handle) { - // phi::dynload::rocsparse_spmm(handle, - // 
GetTransposeOperation(transa), - // GetTransposeOperation(transb), - // &alpha, - // a_descriptor.descriptor(), - // b_descriptor.descriptor(), - // &beta, - // out_descriptor.descriptor(), - // ttype, - // GetSpMMAlgorithm(mat_a), - // rocsparse_spmm_stage_compute, - // &buffer_size, - // tmp_buffer_ptr); - // }); -} } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index 222b57b108247..76bfe8efce2c1 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/matmul_kernel.h" #include +#include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/ddim.h" @@ -140,37 +141,6 @@ void MatmulKernelImpl(const Context& dev_ctx, phi::errors::PreconditionNotMet( "The shape of Input(x) and Input(y) is not suitable for matmul " "opetation, x_dim[-1] must be eaqual to y_dim[-2].")); - // InferMeta of 'out' - // std::vector out_dim_vec(ydim_vec); - // out_dim_vec[y_ndims - 2] = xdim_vec[x_ndims - 2]; - // out_dim_vec[y_ndims - 1] = ydim_vec[y_ndims - 1]; - // MetaTensor meta_out(out); - - // meta_out.set_dims(phi::make_ddim(out_dim_vec)); - // meta_out.set_dtype(x.dtype()); - - // dev_ctx.template Alloc(out); - // VLOG(0) << "tyep " << out->crows().dtype(); - // EmptyLikeCsrKernel(dev_ctx, x, out); - - // VLOG(0) << "tyep " << out->crows().dtype(); - // *(out->mutable_crows()) = x.crows(); - // *(out->mutable_cols()) = DenseTensor(phi::DataType::INT64); - // *(out->mutable_values()) = DenseTensor(phi::DataType::INT64); - - // *(out->mutable_crows()) = x.crows(); - // *(out->mutable_cols()) = x.cols(); - - // const DenseTensor& x_values = x.values(); - // DenseTensor* out_values = out->mutable_values(); - // out_values->Resize(x_values.dims()); - // out->set_meta(x.meta()); - // dev_ctx.template Alloc(out_values); - -#ifdef PADDLE_WITH_HIP - phi::funcs::SetConstant set_zero; - // set_zero(dev_ctx, out, static_cast(0.0f)); -#endif auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); sparse_blas.SPGEMM( diff --git a/test/legacy_test/test_sparse_matmul_op.py b/test/legacy_test/test_sparse_matmul_op.py index 8fb34270d7313..a3dca531db7b3 100644 --- a/test/legacy_test/test_sparse_matmul_op.py +++ b/test/legacy_test/test_sparse_matmul_op.py @@ -16,6 +16,9 @@ import re import unittest +import numpy as np +import scipy.sparse as sp + import paddle paddle.set_default_dtype('float64') @@ -33,197 +36,183 @@ def get_cuda_version(): return -1 -# class TestMatmul(unittest.TestCase): -# # x: sparse, y: dense, out: dense -# def check_result(self, x_shape, y_shape, format): -# if len(x_shape) == 3: -# mask = paddle.randint(0, 2, [x_shape[-2], x_shape[-1]]) -# else: -# mask = paddle.randint(0, 2, x_shape) -# origin_x = paddle.rand(x_shape) * mask -# origin_y = paddle.rand(y_shape) - -# dense_x = origin_x.detach() -# dense_x.stop_gradient = False -# dense_y = origin_y.detach() -# dense_y.stop_gradient = False -# dense_out = paddle.matmul(dense_x, dense_y) - -# if format == "coo": -# sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) -# else: -# sp_x = origin_x.detach().to_sparse_csr() -# sp_x.stop_gradient = False -# sp_y = origin_y.detach() -# sp_y.stop_gradient = False -# sp_out = paddle.sparse.matmul(sp_x, sp_y) - -# np.testing.assert_allclose( -# sp_out.numpy(), dense_out.numpy(), rtol=1e-05 -# ) -# if 
get_cuda_version() >= 11030: -# dense_out.backward() -# sp_out.backward() -# np.testing.assert_allclose( -# sp_x.grad.to_dense().numpy(), -# (dense_x.grad * mask).numpy(), -# rtol=1e-05, -# ) -# np.testing.assert_allclose( -# sp_y.grad.numpy(), dense_y.grad.numpy(), rtol=1e-05 -# ) - -# @unittest.skipIf( -# not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, -# "only support cuda>=11.0", -# ) -# def test_matmul_2d(self): -# self.check_result([16, 12], [12, 10], 'coo') -# self.check_result([16, 12], [12, 10], 'csr') - -# @unittest.skipIf( -# not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, -# "only support cuda>=11.8", -# ) -# def test_matmul_3d(self): -# self.check_result([8, 16, 12], [8, 12, 10], 'coo') -# self.check_result([8, 16, 12], [8, 12, 10], 'csr') +class TestMatmul(unittest.TestCase): + # x: sparse, y: dense, out: dense + def check_result(self, x_shape, y_shape, format): + if len(x_shape) == 3: + mask = paddle.randint(0, 2, [x_shape[-2], x_shape[-1]]) + else: + mask = paddle.randint(0, 2, x_shape) + origin_x = paddle.rand(x_shape) * mask + origin_y = paddle.rand(y_shape) + + dense_x = origin_x.detach() + dense_x.stop_gradient = False + dense_y = origin_y.detach() + dense_y.stop_gradient = False + dense_out = paddle.matmul(dense_x, dense_y) + + if format == "coo": + sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) + else: + sp_x = origin_x.detach().to_sparse_csr() + sp_x.stop_gradient = False + sp_y = origin_y.detach() + sp_y.stop_gradient = False + sp_out = paddle.sparse.matmul(sp_x, sp_y) + + np.testing.assert_allclose( + sp_out.numpy(), dense_out.numpy(), rtol=1e-05 + ) + if get_cuda_version() >= 11030: + dense_out.backward() + sp_out.backward() + np.testing.assert_allclose( + sp_x.grad.to_dense().numpy(), + (dense_x.grad * mask).numpy(), + rtol=1e-05, + ) + np.testing.assert_allclose( + sp_y.grad.numpy(), dense_y.grad.numpy(), rtol=1e-05 + ) + + @unittest.skipIf( + not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, + "only support cuda>=11.0", + ) + def test_matmul_2d(self): + self.check_result([16, 12], [12, 10], 'coo') + self.check_result([16, 12], [12, 10], 'csr') + + @unittest.skipIf( + not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, + "only support cuda>=11.8", + ) + def test_matmul_3d(self): + self.check_result([8, 16, 12], [8, 12, 10], 'coo') + self.check_result([8, 16, 12], [8, 12, 10], 'csr') class TestMatmul2(unittest.TestCase): - # x: sparse, y: dense, out: dense + # x: csr sparse, y: csr sparse, out: csr sparse def check_result(self, x_shape, y_shape): - # origin_x = paddle.rand(x_shape) - # origin_y = paddle.rand(y_shape) - origin_x = paddle.to_tensor( - [[1.0, -1.0, 1.0], [1.0, 1.0, 1.0]], dtype="float32" - ) - origin_y = paddle.to_tensor( - [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], dtype="float32" - ) + origin_x = paddle.rand(x_shape) + origin_y = paddle.rand(y_shape) dense_x = origin_x.detach() dense_x.stop_gradient = False dense_y = origin_y.detach() dense_y.stop_gradient = False dense_out = paddle.matmul(dense_x, dense_y) - print(dense_x) - print(dense_y) - print(dense_out) + sp_x = origin_x.detach().to_sparse_csr() + # only support 32-bit index. + sp_x_crows = paddle.cast(sp_x.crows(), "int32") + sp_x_cols = paddle.cast(sp_x.cols(), "int32") + sp_x = paddle.sparse.sparse_csr_tensor( + sp_x_crows, sp_x_cols, sp_x.values(), sp_x.shape + ) + sp_y = origin_y.detach().to_sparse_csr() + # only support 32-bit index. 
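+        # (the SpGEMM path creates its output CSR descriptor and result
+        #  buffers with 32-bit indices, so crows/cols are cast to int32 here)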
+ sp_y_crows = paddle.cast(sp_y.crows(), "int32") + sp_y_cols = paddle.cast(sp_y.cols(), "int32") + sp_y = paddle.sparse.sparse_csr_tensor( + sp_y_crows, sp_y_cols, sp_y.values(), sp_y.shape + ) + sp_x.stop_gradient = False sp_y.stop_gradient = False - print(sp_x) + sp_out = paddle.sparse.matmul(sp_x, sp_y) - print(sp_out) - - # np.testing.assert_allclose( - # sp_out.numpy(), dense_out.numpy(), rtol=1e-05 - # ) - # if get_cuda_version() >= 11030: - # dense_out.backward() - # sp_out.backward() - # np.testing.assert_allclose( - # sp_x.grad.to_dense().numpy(), - # (dense_x.grad * mask).numpy(), - # rtol=1e-05, - # ) - # np.testing.assert_allclose( - # sp_y.grad.numpy(), dense_y.grad.numpy(), rtol=1e-05 - # ) + + np.testing.assert_allclose( + sp_out.to_dense().numpy(), dense_out.numpy(), rtol=1e-05 + ) @unittest.skipIf( not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, "only support cuda>=11.0", ) def test_matmul_2d(self): - self.check_result([3, 4], [4, 5]) - - # @unittest.skipIf( - # not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, - # "only support cuda>=11.8", - # ) - # def test_matmul_3d(self): - # self.check_result([8, 16, 12], [8, 12, 10], 'coo') - # self.check_result([8, 16, 12], [8, 12, 10], 'csr') - - -# class TestMaskedMatmul(unittest.TestCase): -# # x: dense, y: dense, out: sparse_`csr -# @unittest.skipIf( -# not paddle.is_compiled_with_cuda() or get_cuda_version() < 11030, -# "only support on cuda>=11.3", -# ) -# def test_masked_matmul_2d(self): -# np_mask = np.random.rand(10, 6) < 0.2 - -# np_x = np.random.rand(10, 12) -# np_y = np.random.rand(12, 6) -# np_out = sp.csr_matrix(np.matmul(np_x, np_y) * np_mask) - -# np_out_grad = sp.csr_matrix(np.ones([10, 6]) * np_mask) -# # dx(dense) = dout(csr) * y'(dense) -# np_x_grad = np_out_grad @ np_y.transpose(1, 0) -# # dy(dense) = x'(dense) * dout(csr) -> dy'(dense) = dout'(csr) * x(dense) -# np_y_grad = (np_out_grad.transpose() @ np_x).transpose(1, 0) - -# x = paddle.to_tensor(np_x, stop_gradient=False) -# y = paddle.to_tensor(np_y, stop_gradient=False) -# mask = paddle.to_tensor(np.ones([10, 6]) * np_mask).to_sparse_csr() -# out = paddle.sparse.masked_matmul(x, y, mask) - -# np.testing.assert_allclose( -# np_out.indptr, out.crows().numpy(), rtol=1e-05 -# ) -# np.testing.assert_allclose( -# np_out.indices, out.cols().numpy(), rtol=1e-05 -# ) -# np.testing.assert_allclose( -# np_out.data, out.values().numpy(), rtol=1e-05 -# ) - -# out.backward() -# np.testing.assert_allclose(out.is_sparse_csr(), True, rtol=1e-05) -# np.testing.assert_allclose(np_x_grad, x.grad.numpy(), rtol=1e-05) -# np.testing.assert_allclose(np_y_grad, y.grad.numpy(), rtol=1e-05) - -# @unittest.skipIf( -# not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, -# "only support on cuda>=11.8", -# ) -# def test_masked_matmul_3d(self): -# paddle.set_default_dtype('float32') -# origin_x = paddle.rand([16, 16, 12]) -# mask = paddle.randint(0, 2, [16, 12]) -# origin_x = origin_x * mask -# origin_y = paddle.rand([16, 12, 10]) - -# dense_x = origin_x.detach() -# dense_x.stop_gradient = False -# dense_y = origin_y.detach() -# dense_y.stop_gradient = False -# dense_out = paddle.matmul(dense_x, dense_y) -# dense_out.backward() - -# sp_x = origin_x.detach().to_sparse_csr() -# sp_x.stop_gradient = False -# sp_y = origin_y.detach() -# sp_y.stop_gradient = False -# sp_out = paddle.sparse.matmul(sp_x, sp_y) -# sp_out.backward() - -# np.testing.assert_allclose( -# sp_out.numpy(), dense_out.numpy(), rtol=1e-05 -# ) -# np.testing.assert_allclose( 
-# sp_x.grad.to_dense().numpy(), -# (dense_x.grad * mask).numpy(), -# rtol=1e-05, -# ) -# np.testing.assert_allclose( -# sp_y.grad.numpy(), dense_y.grad.numpy(), rtol=1e-05 -# ) + self.check_result([16, 12], [12, 10]) + + +class TestMaskedMatmul(unittest.TestCase): + # x: dense, y: dense, out: sparse_`csr + @unittest.skipIf( + not paddle.is_compiled_with_cuda() or get_cuda_version() < 11030, + "only support on cuda>=11.3", + ) + def test_masked_matmul_2d(self): + np_mask = np.random.rand(10, 6) < 0.2 + + np_x = np.random.rand(10, 12) + np_y = np.random.rand(12, 6) + np_out = sp.csr_matrix(np.matmul(np_x, np_y) * np_mask) + + np_out_grad = sp.csr_matrix(np.ones([10, 6]) * np_mask) + # dx(dense) = dout(csr) * y'(dense) + np_x_grad = np_out_grad @ np_y.transpose(1, 0) + # dy(dense) = x'(dense) * dout(csr) -> dy'(dense) = dout'(csr) * x(dense) + np_y_grad = (np_out_grad.transpose() @ np_x).transpose(1, 0) + + x = paddle.to_tensor(np_x, stop_gradient=False) + y = paddle.to_tensor(np_y, stop_gradient=False) + mask = paddle.to_tensor(np.ones([10, 6]) * np_mask).to_sparse_csr() + out = paddle.sparse.masked_matmul(x, y, mask) + + np.testing.assert_allclose( + np_out.indptr, out.crows().numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + np_out.indices, out.cols().numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + np_out.data, out.values().numpy(), rtol=1e-05 + ) + + out.backward() + np.testing.assert_allclose(out.is_sparse_csr(), True, rtol=1e-05) + np.testing.assert_allclose(np_x_grad, x.grad.numpy(), rtol=1e-05) + np.testing.assert_allclose(np_y_grad, y.grad.numpy(), rtol=1e-05) + + @unittest.skipIf( + not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, + "only support on cuda>=11.8", + ) + def test_masked_matmul_3d(self): + paddle.set_default_dtype('float32') + origin_x = paddle.rand([16, 16, 12]) + mask = paddle.randint(0, 2, [16, 12]) + origin_x = origin_x * mask + origin_y = paddle.rand([16, 12, 10]) + + dense_x = origin_x.detach() + dense_x.stop_gradient = False + dense_y = origin_y.detach() + dense_y.stop_gradient = False + dense_out = paddle.matmul(dense_x, dense_y) + dense_out.backward() + + sp_x = origin_x.detach().to_sparse_csr() + sp_x.stop_gradient = False + sp_y = origin_y.detach() + sp_y.stop_gradient = False + sp_out = paddle.sparse.matmul(sp_x, sp_y) + sp_out.backward() + + np.testing.assert_allclose( + sp_out.numpy(), dense_out.numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + sp_x.grad.to_dense().numpy(), + (dense_x.grad * mask).numpy(), + rtol=1e-05, + ) + np.testing.assert_allclose( + sp_y.grad.numpy(), dense_y.grad.numpy(), rtol=1e-05 + ) if __name__ == "__main__": From 044ba38357c69dba1ff1d14499adb0e77c791526 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Tue, 12 Dec 2023 05:29:41 +0000 Subject: [PATCH 05/25] coo*coo --- .../funcs/sparse/sparse_blas_impl.cu.h | 166 +----------------- .../phi/kernels/sparse/gpu/matmul_kernel.cu | 36 ++-- test/legacy_test/test_sparse_matmul_op.py | 58 +++++- 3 files changed, 85 insertions(+), 175 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 1e30a17bf1271..440f570de1371 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -504,27 +504,16 @@ void SparseBlas::SPGEMM(bool transa, SparseCsrTensor* mat_out) const { auto a_descriptor = CuSparseSpMatDescriptor(mat_a, dev_ctx_); auto b_descriptor = CuSparseSpMatDescriptor(mat_b, dev_ctx_); - // 
auto out_descriptor = CuSparseSpMatDescriptor(*mat_out, dev_ctx_); - - // VLOG(0) << mat_out->dims(); - - cusparseSpMatDescr_t out_descr; cudaDataType_t gpu_type = GetGpuDataType(); size_t buffer_a_size = 0, buffer_b_size = 0; - cusparseSpGEMMDescr_t spgemmDesc; std::vector out_dim_vec = phi::vectorize(mat_out->dims()); auto out_ndims = out_dim_vec.size(); int64_t M = out_dim_vec[out_ndims - 2]; int64_t N = out_dim_vec[out_ndims - 1]; - int batch_size = 1; - for (int i = 0; i < out_ndims - 2; i++) { - batch_size *= out_dim_vec[i]; - } - - phi::dynload::cusparseSpGEMM_createDescr(&spgemmDesc); + cusparseSpMatDescr_t out_descr; phi::dynload::cusparseCreateCsr(&out_descr, M, N, @@ -537,6 +526,8 @@ void SparseBlas::SPGEMM(bool transa, CUSPARSE_INDEX_BASE_ZERO, gpu_type); + cusparseSpGEMMDescr_t spgemmDesc; + phi::dynload::cusparseSpGEMM_createDescr(&spgemmDesc); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpGEMM_workEstimation(handle, GetTransposeOperation(transa), @@ -613,18 +604,14 @@ void SparseBlas::SPGEMM(bool transa, tmp_buffer_b_ptr); }); - // get matrix C non-zero entries C_nnz1 - int64_t C_num_rows1, C_num_cols1, C_nnz1; - + int64_t num_rows, num_cols1, nnz; dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpMatGetSize( - out_descr, &C_num_rows1, &C_num_cols1, &C_nnz1); + phi::dynload::cusparseSpMatGetSize(out_descr, &num_rows, &num_cols1, &nnz); }); - VLOG(0) << C_num_rows1 << " " << C_num_cols1 << " " << C_nnz1; DenseTensor out_crows = phi::Empty(dev_ctx_, {M + 1}); - DenseTensor out_cols = phi::Empty(dev_ctx_, {C_nnz1}); - DenseTensor out_values = phi::Empty(dev_ctx_, {C_nnz1}); + DenseTensor out_cols = phi::Empty(dev_ctx_, {nnz}); + DenseTensor out_values = phi::Empty(dev_ctx_, {nnz}); mat_out->SetMember(out_crows, out_cols, out_values, mat_out->dims()); auto out_descriptor = CuSparseSpMatDescriptor(*mat_out, dev_ctx_); @@ -642,146 +629,7 @@ void SparseBlas::SPGEMM(bool transa, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc); }); - - dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpMatGetSize( - out_descr, &C_num_rows1, &C_num_cols1, &C_nnz1); - }); - VLOG(0) << C_num_rows1 << " " << C_num_cols1 << " " << C_nnz1; } - -// template <> -// template -// void SparseBlas::SPGEMM(bool transa, -// bool transb, -// T alpha0, -// const SparseCsrTensor& mat_a, -// const SparseCsrTensor& mat_b, -// T beta0, -// SparseCsrTensor* mat_out) const { -// #define A_NUM_ROWS 4 // C compatibility -// const int A_num_rows = 4; -// const int A_num_cols = 4; -// const int A_nnz = 9; -// const int B_num_rows = 4; -// const int B_num_cols = 4; -// const int B_nnz = 9; -// int hA_csrOffsets[] = { 0, 3, 4, 7, 9 }; -// int hA_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3 }; -// float hA_values[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, -// 6.0f, 7.0f, 8.0f, 9.0f }; -// int hB_csrOffsets[] = { 0, 2, 4, 7, 8 }; -// int hB_columns[] = { 0, 3, 1, 3, 0, 1, 2, 1 }; -// float hB_values[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, -// 6.0f, 7.0f, 8.0f }; -// int hC_csrOffsets[] = { 0, 4, 6, 10, 12 }; -// int hC_columns[] = { 0, 1, 2, 3, 1, 3, 0, 1, 2, 3, 1, 3 }; -// float hC_values[] = { 11.0f, 36.0f, 14.0f, 2.0f, 12.0f, -// 16.0f, 35.0f, 92.0f, 42.0f, 10.0f, -// 96.0f, 32.0f }; -// const int C_nnz = 12; -// #define C_NUM_NNZ 12 // C compatibility -// float alpha = 1.0f; -// float beta = 0.0f; -// cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; -// cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; -// cudaDataType computeType = CUDA_R_32F; 
-// //-------------------------------------------------------------------------- -// // Device memory management: Allocate and copy A, B -// int *dA_csrOffsets, *dA_columns, *dB_csrOffsets, *dB_columns, -// *dC_csrOffsets, *dC_columns; -// float *dA_values, *dB_values, *dC_values; -// // allocate A -// cudaMalloc((void**) &dA_csrOffsets, -// (A_num_rows + 1) * sizeof(int)); -// cudaMalloc((void**) &dA_columns, A_nnz * sizeof(int)); -// cudaMalloc((void**) &dA_values, A_nnz * sizeof(float)); -// // allocate B -// cudaMalloc((void**) &dB_csrOffsets, -// (B_num_rows + 1) * sizeof(int)); -// cudaMalloc((void**) &dB_columns, B_nnz * sizeof(int)); -// cudaMalloc((void**) &dB_values, B_nnz * sizeof(float)); -// // allocate C offsets -// cudaMalloc((void**) &dC_csrOffsets, -// (A_num_rows + 1) * sizeof(int)); - -// // copy A -// cudaMemcpy(dA_csrOffsets, hA_csrOffsets, -// (A_num_rows + 1) * sizeof(int), -// cudaMemcpyHostToDevice); -// cudaMemcpy(dA_columns, hA_columns, A_nnz * sizeof(int), -// cudaMemcpyHostToDevice); -// cudaMemcpy(dA_values, hA_values, -// A_nnz * sizeof(float), cudaMemcpyHostToDevice); -// // copy B -// cudaMemcpy(dB_csrOffsets, hB_csrOffsets, -// (B_num_rows + 1) * sizeof(int), -// cudaMemcpyHostToDevice); -// cudaMemcpy(dB_columns, hB_columns, B_nnz * sizeof(int), -// cudaMemcpyHostToDevice); -// cudaMemcpy(dB_values, hB_values, -// B_nnz * sizeof(float), cudaMemcpyHostToDevice); -// //-------------------------------------------------------------------------- -// // CUSPARSE APIs -// cusparseHandle_t handle = nullptr; -// cusparseSpMatDescr_t matA, matB, matC; -// void* dBuffer1 = nullptr, *dBuffer2 = nullptr; -// size_t bufferSize1 = 0, bufferSize2 = 0; -// phi::dynload::cusparseCreate(&handle); -// // Create sparse matrix A in CSR format -// phi::dynload::cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz, -// dA_csrOffsets, dA_columns, dA_values, -// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, -// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); -// phi::dynload::cusparseCreateCsr(&matB, B_num_rows, B_num_cols, B_nnz, -// dB_csrOffsets, dB_columns, dB_values, -// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, -// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); -// phi::dynload::cusparseCreateCsr(&matC, A_num_rows, B_num_cols, 0, -// nullptr, nullptr, nullptr, -// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, -// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); -// //-------------------------------------------------------------------------- -// // SpGEMM Computation -// cusparseSpGEMMDescr_t spgemmDesc; -// phi::dynload::cusparseSpGEMM_createDescr(&spgemmDesc); - -// // ask bufferSize1 bytes for external memory - -// phi::dynload::cusparseSpGEMM_workEstimation(handle, opA, opB, -// &alpha, matA, matB, &beta, matC, -// computeType, CUSPARSE_SPGEMM_DEFAULT, -// spgemmDesc, &bufferSize1, nullptr); -// cudaMalloc((void**) &dBuffer1, bufferSize1); -// // inspect the matrices A and B to understand the memory requirement for -// // the next step - -// phi::dynload::cusparseSpGEMM_workEstimation(handle, opA, opB, -// &alpha, matA, matB, &beta, matC, -// computeType, CUSPARSE_SPGEMM_DEFAULT, -// spgemmDesc, &bufferSize1, dBuffer1); - -// // ask bufferSize2 bytes for external memory - -// phi::dynload::cusparseSpGEMM_compute(handle, opA, opB, -// &alpha, matA, matB, &beta, matC, -// computeType, CUSPARSE_SPGEMM_DEFAULT, -// spgemmDesc, &bufferSize2, nullptr); -// cudaMalloc((void**) &dBuffer2, bufferSize2); - -// // compute the intermediate product of A * B -// phi::dynload::cusparseSpGEMM_compute(handle, opA, opB, -// 
&alpha, matA, matB, &beta, matC, -// computeType, -// CUSPARSE_SPGEMM_DEFAULT, -// spgemmDesc, &bufferSize2, -// dBuffer2); -// // get matrix C non-zero entries C_nnz1 -// int64_t C_num_rows1, C_num_cols1, C_nnz1; -// phi::dynload::cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, -// &C_nnz1); -// VLOG(0) << C_num_rows1 << " " << C_num_cols1 << " " << C_nnz1; -// } } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index 76bfe8efce2c1..7ce3bc26b7dad 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/math_function_impl.h" #include "paddle/phi/kernels/funcs/sparse/sparse_blas.h" #include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" namespace phi { namespace sparse { @@ -108,7 +109,7 @@ void MatmulKernelImpl(const Context& dev_ctx, const SparseCsrTensor& x, const SparseCsrTensor& y, SparseCsrTensor* out) { -#if CUDA_VERSION >= 11000 || HIP_VERSION >= 402 +#if CUDA_VERSION >= 11000 std::vector xdim_vec = phi::vectorize(x.dims()); std::vector ydim_vec = phi::vectorize(y.dims()); auto x_ndims = xdim_vec.size(); @@ -147,13 +148,12 @@ void MatmulKernelImpl(const Context& dev_ctx, false, false, static_cast(1), x, y, static_cast(0), out); #else #ifdef PADDLE_WITH_CUDA - PADDLE_THROW( - phi::errors::Unimplemented("forward of 'sparse.matmul' use cusparseSpMM, " - "which is supported from CUDA 11.0")); -#elif defined(PADDLE_WITH_HIP) PADDLE_THROW(phi::errors::Unimplemented( - "forward of 'sparse.matmul' use rocsparse_spmm, " - "which is supported from ROCM 4.2.0")); + "forward of 'sparse.matmul' use cusparseSpGEMM, " + "which is supported from CUDA 11.0")); +#elif defined(PADDLE_WITH_HIP) + PADDLE_THROW( + phi::errors::Unimplemented("'sparse.matmul' for HIP is not implemented")); #endif #endif } @@ -174,14 +174,6 @@ void MatmulCsrDenseKernel(const Context& dev_ctx, MatmulKernelImpl(dev_ctx, x, y, out); } -template -void MatmulCooCooKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const SparseCooTensor& y, - SparseCooTensor* out) { - // MatmulKernelImpl(dev_ctx, x, y, out); -} - template void MatmulCsrCsrKernel(const Context& dev_ctx, const SparseCsrTensor& x, @@ -190,6 +182,20 @@ void MatmulCsrCsrKernel(const Context& dev_ctx, MatmulKernelImpl(dev_ctx, x, y, out); } +template +void MatmulCooCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& y, + SparseCooTensor* out) { + // 'cusparseSPGEMM' only support CSR now, so use COO->CSR->COO, + SparseCsrTensor x_csr = CooToCsr(dev_ctx, x); + SparseCsrTensor y_csr = CooToCsr(dev_ctx, y); + SparseCsrTensor out_csr; + out_csr.set_dims(out->dims()); + MatmulKernelImpl(dev_ctx, x_csr, y_csr, &out_csr); + CsrToCooKernel(dev_ctx, out_csr, out); +} + template void MaskedMatmulCsrKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/test/legacy_test/test_sparse_matmul_op.py b/test/legacy_test/test_sparse_matmul_op.py index a3dca531db7b3..d4d32cf17cd56 100644 --- a/test/legacy_test/test_sparse_matmul_op.py +++ b/test/legacy_test/test_sparse_matmul_op.py @@ -93,7 +93,7 @@ def test_matmul_3d(self): self.check_result([8, 16, 12], [8, 12, 10], 'csr') -class TestMatmul2(unittest.TestCase): +class TestMatmulCSR(unittest.TestCase): # x: csr sparse, y: csr sparse, out: csr sparse def 
check_result(self, x_shape, y_shape): origin_x = paddle.rand(x_shape) @@ -129,6 +129,62 @@ def check_result(self, x_shape, y_shape): np.testing.assert_allclose( sp_out.to_dense().numpy(), dense_out.numpy(), rtol=1e-05 ) + # if get_cuda_version() >= 11030: + # dense_out.backward() + # sp_out.backward() + # print(sp_x.grad) + # np.testing.assert_allclose( + # sp_x.grad.to_dense().numpy(), + # dense_x.grad.numpy(), + # rtol=1e-05, + # ) + # np.testing.assert_allclose( + # sp_y.grad.numpy(), dense_y.grad.numpy(), rtol=1e-05 + # ) + + @unittest.skipIf( + not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, + "only support cuda>=11.0", + ) + def test_matmul_2d(self): + self.check_result([16, 12], [12, 10]) + self.check_result([8, 16, 12], [8, 12, 10]) + + +class TestMatmulCOO(unittest.TestCase): + # x: csr sparse, y: csr sparse, out: csr sparse + def check_result(self, x_shape, y_shape): + origin_x = paddle.rand(x_shape) + origin_y = paddle.rand(y_shape) + + dense_x = origin_x.detach() + dense_x.stop_gradient = False + dense_y = origin_y.detach() + dense_y.stop_gradient = False + dense_out = paddle.matmul(dense_x, dense_y) + + sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) + + # only support 32-bit index. + sp_x_indices = paddle.cast(sp_x.indices(), "int32") + sp_x = paddle.sparse.sparse_coo_tensor( + sp_x_indices, sp_x.values(), sp_x.shape + ) + + sp_y = origin_y.detach().to_sparse_coo(len(y_shape)) + # only support 32-bit index. + sp_y_indices = paddle.cast(sp_y.indices(), "int32") + sp_y = paddle.sparse.sparse_coo_tensor( + sp_y_indices, sp_y.values(), sp_y.shape + ) + + sp_x.stop_gradient = False + sp_y.stop_gradient = False + + sp_out = paddle.sparse.matmul(sp_x, sp_y) + np.testing.assert_allclose( + sp_out.to_dense().numpy(), dense_out.numpy(), rtol=1e-05 + ) @unittest.skipIf( not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, From ace8c96e502e673b6ed56abef633907204615fae Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Wed, 13 Dec 2023 11:39:17 +0000 Subject: [PATCH 06/25] add grad op --- paddle/fluid/platform/dynload/cusparse.h | 4 +- paddle/phi/backends/dynload/cusparse.h | 4 +- paddle/phi/kernels/funcs/sparse/sparse_blas.h | 3 + .../funcs/sparse/sparse_blas_impl.cu.h | 117 +++++++++++++----- .../kernels/sparse/gpu/matmul_grad_kernel.cu | 83 +++++++++++++ .../phi/kernels/sparse/gpu/matmul_kernel.cu | 25 +++- paddle/phi/kernels/sparse/matmul_kernel.h | 4 +- test/legacy_test/test_sparse_matmul_op.py | 44 ++++--- 8 files changed, 227 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h index cbc09ed47c1bf..abc1ef2c3e9ce 100644 --- a/paddle/fluid/platform/dynload/cusparse.h +++ b/paddle/fluid/platform/dynload/cusparse.h @@ -55,7 +55,9 @@ namespace dynload { __macro(cusparseSpGEMM_compute); \ __macro(cusparseSpGEMM_workEstimation); \ __macro(cusparseSpGEMM_copy); \ - __macro(cusparseSpGEMM_destroyDescr); + __macro(cusparseSpGEMM_destroyDescr); \ + __macro(cusparseCsr2cscEx2_bufferSize); \ + __macro(cusparseCsr2cscEx2); CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) #endif diff --git a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h index 6d90cb5eefdca..974355d10c441 100644 --- a/paddle/phi/backends/dynload/cusparse.h +++ b/paddle/phi/backends/dynload/cusparse.h @@ -67,7 +67,9 @@ extern void *cusparse_dso_handle; __macro(cusparseSpGEMM_compute); \ __macro(cusparseSpGEMM_workEstimation); \ __macro(cusparseSpGEMM_copy); \ - 
__macro(cusparseSpGEMM_destroyDescr); + __macro(cusparseSpGEMM_destroyDescr); \ + __macro(cusparseCsr2cscEx2_bufferSize); \ + __macro(cusparseCsr2cscEx2); CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) #endif diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas.h b/paddle/phi/kernels/funcs/sparse/sparse_blas.h index 87246fa006462..ca6e60842ec93 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas.h @@ -22,6 +22,9 @@ namespace phi { namespace funcs { namespace sparse { +template +SparseCsrTensor CSRTanspose(const phi::GPUContext& dev_ctx, + const phi::SparseCsrTensor& x); template class SparseBlas { diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 440f570de1371..663f6725c4f47 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -181,6 +181,79 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x, } } +template +SparseCsrTensor CSRTanspose(const phi::GPUContext& dev_ctx, + const phi::SparseCsrTensor& x) { + std::vector xdim_vec = phi::vectorize(x.dims()); + auto x_ndims = xdim_vec.size(); + int64_t M = xdim_vec[x_ndims - 2]; + int64_t N = xdim_vec[x_ndims - 1]; + int batch_size = 1; + for (int i = 0; i < x_ndims - 2; i++) { + batch_size *= xdim_vec[i]; + } + int64_t batch_nnz = x.nnz() / batch_size; + + const IntT* x_crows_data = x.non_zero_crows().data(); + const IntT* x_cols_data = x.non_zero_cols().data(); + const T* x_values_data = x.non_zero_elements().data(); + + SparseCsrTensor out; + DenseTensor out_crows = phi::Empty(dev_ctx, {N + 1}); + DenseTensor out_cols = phi::Empty(dev_ctx, {x.nnz()}); + DenseTensor out_values = phi::Empty(dev_ctx, {x.nnz()}); + out.SetMember(out_crows, out_cols, out_values, {N, M}); + const IntT* out_crows_data = out.non_zero_crows().data(); + const IntT* out_cols_data = out.non_zero_cols().data(); + const T* out_values_data = out.non_zero_elements().data(); + + cudaDataType_t gpu_type = GetGpuDataType(); + size_t buffer_size; + dev_ctx.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseCsr2cscEx2_bufferSize( + handle, + M, + N, + batch_nnz, + const_cast(x_values_data), + const_cast(x_crows_data), + const_cast(x_cols_data), + const_cast(out_values_data), + const_cast(out_crows_data), + const_cast(out_cols_data), + gpu_type, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, + &buffer_size); + }); + + phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + buffer_size, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); + void* tmp_buffer_ptr = tmp_buffer->ptr(); + + dev_ctx.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseCsr2cscEx2(handle, + M, + N, + batch_nnz, + const_cast(x_values_data), + const_cast(x_crows_data), + const_cast(x_cols_data), + const_cast(out_values_data), + const_cast(out_crows_data), + const_cast(out_cols_data), + gpu_type, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, + tmp_buffer_ptr); + }); + return out; +} + template class CuSparseSpMatDescriptor { public: @@ -504,28 +577,11 @@ void SparseBlas::SPGEMM(bool transa, SparseCsrTensor* mat_out) const { auto a_descriptor = CuSparseSpMatDescriptor(mat_a, dev_ctx_); auto b_descriptor = CuSparseSpMatDescriptor(mat_b, dev_ctx_); + auto out_descriptor = CuSparseSpMatDescriptor(*mat_out, dev_ctx_); cudaDataType_t 
gpu_type = GetGpuDataType(); size_t buffer_a_size = 0, buffer_b_size = 0; - std::vector out_dim_vec = phi::vectorize(mat_out->dims()); - auto out_ndims = out_dim_vec.size(); - int64_t M = out_dim_vec[out_ndims - 2]; - int64_t N = out_dim_vec[out_ndims - 1]; - - cusparseSpMatDescr_t out_descr; - phi::dynload::cusparseCreateCsr(&out_descr, - M, - N, - 0, - nullptr, - nullptr, - nullptr, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - gpu_type); - cusparseSpGEMMDescr_t spgemmDesc; phi::dynload::cusparseSpGEMM_createDescr(&spgemmDesc); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { @@ -536,7 +592,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - out_descr, + out_descriptor.descriptor(), gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, @@ -558,7 +614,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - out_descr, + out_descriptor.descriptor(), gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, @@ -574,7 +630,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - out_descr, + out_descriptor.descriptor(), gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, @@ -596,7 +652,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - out_descr, + out_descriptor.descriptor(), gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, @@ -604,17 +660,14 @@ void SparseBlas::SPGEMM(bool transa, tmp_buffer_b_ptr); }); - int64_t num_rows, num_cols1, nnz; + int64_t num_rows, num_cols, nnz; dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpMatGetSize(out_descr, &num_rows, &num_cols1, &nnz); + phi::dynload::cusparseSpMatGetSize( + out_descriptor.descriptor(), &num_rows, &num_cols, &nnz); }); - - DenseTensor out_crows = phi::Empty(dev_ctx_, {M + 1}); - DenseTensor out_cols = phi::Empty(dev_ctx_, {nnz}); - DenseTensor out_values = phi::Empty(dev_ctx_, {nnz}); - mat_out->SetMember(out_crows, out_cols, out_values, mat_out->dims()); - - auto out_descriptor = CuSparseSpMatDescriptor(*mat_out, dev_ctx_); + *(mat_out->mutable_cols()) = phi::Empty(dev_ctx_, {nnz}); + *(mat_out->mutable_values()) = phi::Empty(dev_ctx_, {nnz}); + auto res_descriptor = CuSparseSpMatDescriptor(*mat_out, dev_ctx_); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpGEMM_copy(handle, @@ -624,7 +677,7 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - out_descriptor.descriptor(), + res_descriptor.descriptor(), gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc); diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index 7dbdbe2acc992..a396dc220bd0c 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu @@ -139,6 +139,71 @@ void MatmulCsrDenseGradKernel(const Context& dev_ctx, #endif } +template +void MatmulCsrCsrGradKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const SparseCsrTensor& y, + const SparseCsrTensor& dout, + SparseCsrTensor* dx, + SparseCsrTensor* dy) { +#if CUDA_VERSION >= 11030 + auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); + + // dx{SparseCsr} = dout{Dense} * y'{Dense} + if (dx) { + // InferMeta of SparseCsrTensor 'dx', CreateLikeInferMeta + EmptyLikeCsrKernel(dev_ctx, x, dx); + // cusparseSPGEMM only support CUSPARSE_OPERATION_NON_TRANSPOSE. 
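+    // Assumed rationale for the explicit transpose: for out = x * y, the CSR
+    // gradients are dx = dout * y^T and dy = x^T * dout; since the transpose
+    // cannot be folded into the SpGEMM call, the transposed operand is
+    // materialized first (here via cusparseCsr2cscEx2) and then multiplied
+    // with both operands passed as non-transposed.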
+ SparseCsrTensor trans_y = + phi::funcs::sparse::CSRTanspose(dev_ctx, y); + + sparse_blas.SPGEMM( + false, false, static_cast(1), dout, trans_y, static_cast(0), dx); + } + + // dy{Dense} = x'{SparseCsr} * dout{Dense} + if (dy) { + // InferMeta of DenseTensor 'dy' + EmptyLikeCsrKernel(dev_ctx, y, dy); + SparseCsrTensor trans_x = + phi::funcs::sparse::CSRTanspose(dev_ctx, x); + +#ifdef PADDLE_WITH_HIP + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dy, static_cast(0.0f)); +#endif + + sparse_blas.SPGEMM( + false, false, static_cast(1), trans_x, dout, static_cast(0), dy); + } +#else +#ifdef PADDLE_WITH_CUDA + PADDLE_THROW(phi::errors::Unimplemented( + "backward of 'sparse.matmul' use cusparseSPGEMM, which is supported from " + "CUDA 11.3")); +#endif +#endif +} + +template +void MatmulCooCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& y, + const SparseCooTensor& dout, + SparseCooTensor* dx, + SparseCooTensor* dy) { + // 'cusparseSPGEMM' only support CSR now, so use COO->CSR->COO, + SparseCsrTensor x_csr = CooToCsr(dev_ctx, x); + SparseCsrTensor y_csr = CooToCsr(dev_ctx, y); + SparseCsrTensor dout_csr = CooToCsr(dev_ctx, dout); + SparseCsrTensor dx_csr, dy_csr; + dx_csr.set_dims(dx->dims()); + dy_csr.set_dims(dy->dims()); + MatmulCsrCsrGradKernel(dev_ctx, x_csr, y_csr, dout_csr, &dx_csr, &dy_csr); + CsrToCooKernel(dev_ctx, dx_csr, dx); + CsrToCooKernel(dev_ctx, dy_csr, dy); +} + template void MaskedMatmulCsrGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -211,6 +276,24 @@ PD_REGISTER_KERNEL(matmul_csr_dense_grad, kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } +PD_REGISTER_KERNEL(matmul_csr_csr_grad, + GPU, + ALL_LAYOUT, + phi::sparse::MatmulCsrCsrGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +PD_REGISTER_KERNEL(matmul_coo_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::MatmulCooCooGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + PD_REGISTER_KERNEL(masked_matmul_csr_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index 7ce3bc26b7dad..3f79eddc052e9 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/matmul_kernel.h" #include -#include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/ddim.h" @@ -143,17 +142,33 @@ void MatmulKernelImpl(const Context& dev_ctx, "The shape of Input(x) and Input(y) is not suitable for matmul " "opetation, x_dim[-1] must be eaqual to y_dim[-2].")); + std::vector out_dim_vec = phi::vectorize(out->dims()); + int batch_size = 1; + for (int i = 0; i < out_dim_vec.size() - 2; i++) { + batch_size *= out_dim_vec[i]; + } + + PADDLE_ENFORCE_EQ( + batch_size, + 1, + phi::errors::InvalidArgument( + "Batched computation is not supported in cusparseSPGEMM.")); + + // cusparseSPGEMM only support 32-bit indices. 
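+  // Only crows is pre-sized here (rows + 1 entries); cols and values start
+  // empty because the output nnz is unknown until cusparseSpGEMM_compute has
+  // run. SPGEMM later queries it with cusparseSpMatGetSize and resizes the
+  // cols/values buffers before cusparseSpGEMM_copy writes the result.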
+ DenseTensor out_crows = + phi::Empty(dev_ctx, {xdim_vec[x_ndims - 2] + 1}); + DenseTensor out_cols = phi::Empty(dev_ctx, {0}); + DenseTensor out_values = phi::Empty(dev_ctx, {0}); + out->SetMember(out_crows, out_cols, out_values, out->dims()); + auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); sparse_blas.SPGEMM( false, false, static_cast(1), x, y, static_cast(0), out); #else #ifdef PADDLE_WITH_CUDA PADDLE_THROW(phi::errors::Unimplemented( - "forward of 'sparse.matmul' use cusparseSpGEMM, " + "forward of 'sparse.matmul' use cusparseSPGEMM, " "which is supported from CUDA 11.0")); -#elif defined(PADDLE_WITH_HIP) - PADDLE_THROW( - phi::errors::Unimplemented("'sparse.matmul' for HIP is not implemented")); #endif #endif } diff --git a/paddle/phi/kernels/sparse/matmul_kernel.h b/paddle/phi/kernels/sparse/matmul_kernel.h index a261bbf3cd3f7..8ea921f57fce6 100644 --- a/paddle/phi/kernels/sparse/matmul_kernel.h +++ b/paddle/phi/kernels/sparse/matmul_kernel.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace phi { namespace sparse { -// TODO(zhouwei25): implement " COO @ COO -> COO" +/* COO @ COO -> COO */ template void MatmulCooCooKernel(const Context& dev_ctx, const SparseCooTensor& x, @@ -35,7 +35,7 @@ void MatmulCooDenseKernel(const Context& dev_ctx, const DenseTensor& y, DenseTensor* out); -// TODO(zhouwei25): implement " CSR @ CSR -> CSR" +/* CSR @ CSR -> CSR */ template void MatmulCsrCsrKernel(const Context& dev_ctx, const SparseCsrTensor& x, diff --git a/test/legacy_test/test_sparse_matmul_op.py b/test/legacy_test/test_sparse_matmul_op.py index d4d32cf17cd56..adc10462f42cf 100644 --- a/test/legacy_test/test_sparse_matmul_op.py +++ b/test/legacy_test/test_sparse_matmul_op.py @@ -96,7 +96,8 @@ def test_matmul_3d(self): class TestMatmulCSR(unittest.TestCase): # x: csr sparse, y: csr sparse, out: csr sparse def check_result(self, x_shape, y_shape): - origin_x = paddle.rand(x_shape) + mask = paddle.randint(0, 2, x_shape) + origin_x = paddle.rand(x_shape) * mask origin_y = paddle.rand(y_shape) dense_x = origin_x.detach() @@ -129,18 +130,17 @@ def check_result(self, x_shape, y_shape): np.testing.assert_allclose( sp_out.to_dense().numpy(), dense_out.numpy(), rtol=1e-05 ) - # if get_cuda_version() >= 11030: - # dense_out.backward() - # sp_out.backward() - # print(sp_x.grad) - # np.testing.assert_allclose( - # sp_x.grad.to_dense().numpy(), - # dense_x.grad.numpy(), - # rtol=1e-05, - # ) - # np.testing.assert_allclose( - # sp_y.grad.numpy(), dense_y.grad.numpy(), rtol=1e-05 - # ) + if get_cuda_version() >= 11030: + dense_out.backward() + sp_out.backward() + np.testing.assert_allclose( + sp_x.grad.to_dense().numpy(), + dense_x.grad.numpy(), + rtol=1e-05, + ) + np.testing.assert_allclose( + sp_y.grad.to_dense().numpy(), dense_y.grad.numpy(), rtol=1e-05 + ) @unittest.skipIf( not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, @@ -148,13 +148,13 @@ def check_result(self, x_shape, y_shape): ) def test_matmul_2d(self): self.check_result([16, 12], [12, 10]) - self.check_result([8, 16, 12], [8, 12, 10]) class TestMatmulCOO(unittest.TestCase): - # x: csr sparse, y: csr sparse, out: csr sparse + # x: coo sparse, y: coo sparse, out: coo sparse def check_result(self, x_shape, y_shape): - origin_x = paddle.rand(x_shape) + mask = paddle.randint(0, 2, x_shape) + origin_x = paddle.rand(x_shape) * mask origin_y = paddle.rand(y_shape) dense_x = origin_x.detach() @@ -186,6 +186,18 @@ def check_result(self, x_shape, y_shape): sp_out.to_dense().numpy(), dense_out.numpy(), 
rtol=1e-05 ) + if get_cuda_version() >= 11030: + dense_out.backward() + sp_out.backward() + np.testing.assert_allclose( + sp_x.grad.to_dense().numpy(), + dense_x.grad.numpy(), + rtol=1e-05, + ) + np.testing.assert_allclose( + sp_y.grad.to_dense().numpy(), dense_y.grad.numpy(), rtol=1e-05 + ) + @unittest.skipIf( not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, "only support cuda>=11.0", From ff405582c6cedfa58b286a40e0ac2816d202bec9 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Thu, 14 Dec 2023 03:28:22 +0000 Subject: [PATCH 07/25] fix --- paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 663f6725c4f47..911e329387fa6 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -224,7 +224,7 @@ SparseCsrTensor CSRTanspose(const phi::GPUContext& dev_ctx, gpu_type, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG_DEFAULT, + CUSPARSE_CSR2CSC_ALG1, &buffer_size); }); @@ -248,7 +248,7 @@ SparseCsrTensor CSRTanspose(const phi::GPUContext& dev_ctx, gpu_type, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG_DEFAULT, + CUSPARSE_CSR2CSC_ALG1, tmp_buffer_ptr); }); return out; From 03a08cc18192cf1b023de6e6121c9c52f7b68e07 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Fri, 15 Dec 2023 02:47:15 +0000 Subject: [PATCH 08/25] fix --- paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index a396dc220bd0c..66315b94be9df 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu @@ -168,11 +168,6 @@ void MatmulCsrCsrGradKernel(const Context& dev_ctx, SparseCsrTensor trans_x = phi::funcs::sparse::CSRTanspose(dev_ctx, x); -#ifdef PADDLE_WITH_HIP - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, dy, static_cast(0.0f)); -#endif - sparse_blas.SPGEMM( false, false, static_cast(1), trans_x, dout, static_cast(0), dy); } From 14deb877786aec4be5caaa0d9348a395ee628c42 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Tue, 19 Dec 2023 06:59:36 +0000 Subject: [PATCH 09/25] fix 3d --- paddle/fluid/platform/dynload/cusparse.h | 4 +- paddle/phi/backends/dynload/cusparse.h | 4 +- .../funcs/sparse/sparse_blas_impl.cu.h | 75 +------ .../kernels/sparse/gpu/matmul_grad_kernel.cu | 52 +++-- .../phi/kernels/sparse/gpu/matmul_kernel.cu | 61 ++---- .../kernels/sparse/gpu/transpose_kernel.cu | 204 ++++++++++-------- test/legacy_test/test_sparse_matmul_op.py | 20 +- 7 files changed, 189 insertions(+), 231 deletions(-) diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h index abc1ef2c3e9ce..cbc09ed47c1bf 100644 --- a/paddle/fluid/platform/dynload/cusparse.h +++ b/paddle/fluid/platform/dynload/cusparse.h @@ -55,9 +55,7 @@ namespace dynload { __macro(cusparseSpGEMM_compute); \ __macro(cusparseSpGEMM_workEstimation); \ __macro(cusparseSpGEMM_copy); \ - __macro(cusparseSpGEMM_destroyDescr); \ - __macro(cusparseCsr2cscEx2_bufferSize); \ - __macro(cusparseCsr2cscEx2); + __macro(cusparseSpGEMM_destroyDescr); CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) #endif diff --git 
a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h index 974355d10c441..6d90cb5eefdca 100644 --- a/paddle/phi/backends/dynload/cusparse.h +++ b/paddle/phi/backends/dynload/cusparse.h @@ -67,9 +67,7 @@ extern void *cusparse_dso_handle; __macro(cusparseSpGEMM_compute); \ __macro(cusparseSpGEMM_workEstimation); \ __macro(cusparseSpGEMM_copy); \ - __macro(cusparseSpGEMM_destroyDescr); \ - __macro(cusparseCsr2cscEx2_bufferSize); \ - __macro(cusparseCsr2cscEx2); + __macro(cusparseSpGEMM_destroyDescr); CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) #endif diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 911e329387fa6..6474b34e7284a 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -181,79 +181,6 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x, } } -template -SparseCsrTensor CSRTanspose(const phi::GPUContext& dev_ctx, - const phi::SparseCsrTensor& x) { - std::vector xdim_vec = phi::vectorize(x.dims()); - auto x_ndims = xdim_vec.size(); - int64_t M = xdim_vec[x_ndims - 2]; - int64_t N = xdim_vec[x_ndims - 1]; - int batch_size = 1; - for (int i = 0; i < x_ndims - 2; i++) { - batch_size *= xdim_vec[i]; - } - int64_t batch_nnz = x.nnz() / batch_size; - - const IntT* x_crows_data = x.non_zero_crows().data(); - const IntT* x_cols_data = x.non_zero_cols().data(); - const T* x_values_data = x.non_zero_elements().data(); - - SparseCsrTensor out; - DenseTensor out_crows = phi::Empty(dev_ctx, {N + 1}); - DenseTensor out_cols = phi::Empty(dev_ctx, {x.nnz()}); - DenseTensor out_values = phi::Empty(dev_ctx, {x.nnz()}); - out.SetMember(out_crows, out_cols, out_values, {N, M}); - const IntT* out_crows_data = out.non_zero_crows().data(); - const IntT* out_cols_data = out.non_zero_cols().data(); - const T* out_values_data = out.non_zero_elements().data(); - - cudaDataType_t gpu_type = GetGpuDataType(); - size_t buffer_size; - dev_ctx.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseCsr2cscEx2_bufferSize( - handle, - M, - N, - batch_nnz, - const_cast(x_values_data), - const_cast(x_crows_data), - const_cast(x_cols_data), - const_cast(out_values_data), - const_cast(out_crows_data), - const_cast(out_cols_data), - gpu_type, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - &buffer_size); - }); - - phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - buffer_size, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - void* tmp_buffer_ptr = tmp_buffer->ptr(); - - dev_ctx.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseCsr2cscEx2(handle, - M, - N, - batch_nnz, - const_cast(x_values_data), - const_cast(x_crows_data), - const_cast(x_cols_data), - const_cast(out_values_data), - const_cast(out_crows_data), - const_cast(out_cols_data), - gpu_type, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - tmp_buffer_ptr); - }); - return out; -} - template class CuSparseSpMatDescriptor { public: @@ -682,6 +609,8 @@ void SparseBlas::SPGEMM(bool transa, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc); }); + + phi::dynload::cusparseSpGEMM_destroyDescr(spgemmDesc); } } // namespace sparse } // namespace funcs diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index 66315b94be9df..f425968f433d3 100644 --- 
a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/sparse/sparse_blas.h" #include "paddle/phi/kernels/sparse/empty_kernel.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" +#include "paddle/phi/kernels/sparse/unary_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" namespace phi { @@ -149,13 +150,22 @@ void MatmulCsrCsrGradKernel(const Context& dev_ctx, #if CUDA_VERSION >= 11030 auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); + std::vector xdim_vec = phi::vectorize(x.dims()); + auto x_ndims = xdim_vec.size(); + std::vector perm; + if (x_ndims == 2) { + perm = {1, 0}; + } else { + perm = {0, 2, 1}; + } + // dx{SparseCsr} = dout{Dense} * y'{Dense} if (dx) { // InferMeta of SparseCsrTensor 'dx', CreateLikeInferMeta EmptyLikeCsrKernel(dev_ctx, x, dx); // cusparseSPGEMM only support CUSPARSE_OPERATION_NON_TRANSPOSE. - SparseCsrTensor trans_y = - phi::funcs::sparse::CSRTanspose(dev_ctx, y); + SparseCsrTensor trans_y; + TransposeCsrKernel(dev_ctx, y, perm, &trans_y); sparse_blas.SPGEMM( false, false, static_cast(1), dout, trans_y, static_cast(0), dx); @@ -165,8 +175,8 @@ void MatmulCsrCsrGradKernel(const Context& dev_ctx, if (dy) { // InferMeta of DenseTensor 'dy' EmptyLikeCsrKernel(dev_ctx, y, dy); - SparseCsrTensor trans_x = - phi::funcs::sparse::CSRTanspose(dev_ctx, x); + SparseCsrTensor trans_x; + TransposeCsrKernel(dev_ctx, x, perm, &trans_x); sparse_blas.SPGEMM( false, false, static_cast(1), trans_x, dout, static_cast(0), dy); @@ -271,23 +281,23 @@ PD_REGISTER_KERNEL(matmul_csr_dense_grad, kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } -PD_REGISTER_KERNEL(matmul_csr_csr_grad, - GPU, - ALL_LAYOUT, - phi::sparse::MatmulCsrCsrGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); -} - -PD_REGISTER_KERNEL(matmul_coo_coo_grad, - GPU, - ALL_LAYOUT, - phi::sparse::MatmulCooCooGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} +// PD_REGISTER_KERNEL(matmul_csr_csr_grad, +// GPU, +// ALL_LAYOUT, +// phi::sparse::MatmulCsrCsrGradKernel, +// float, +// double) { +// kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +// } + +// PD_REGISTER_KERNEL(matmul_coo_coo_grad, +// GPU, +// ALL_LAYOUT, +// phi::sparse::MatmulCooCooGradKernel, +// float, +// double) { +// kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +// } PD_REGISTER_KERNEL(masked_matmul_csr_grad, GPU, diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index 3f79eddc052e9..520e8112c6742 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -104,10 +104,26 @@ void MatmulKernelImpl(const Context& dev_ctx, } template -void MatmulKernelImpl(const Context& dev_ctx, - const SparseCsrTensor& x, - const SparseCsrTensor& y, - SparseCsrTensor* out) { +void MatmulCooDenseKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& y, + DenseTensor* out) { + MatmulKernelImpl(dev_ctx, x, y, out); +} + +template +void MatmulCsrDenseKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const DenseTensor& y, + DenseTensor* out) { + MatmulKernelImpl(dev_ctx, x, y, out); +} + +template +void MatmulCsrCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const SparseCsrTensor& y, + 
SparseCsrTensor* out) { #if CUDA_VERSION >= 11000 std::vector xdim_vec = phi::vectorize(x.dims()); std::vector ydim_vec = phi::vectorize(y.dims()); @@ -148,15 +164,8 @@ void MatmulKernelImpl(const Context& dev_ctx, batch_size *= out_dim_vec[i]; } - PADDLE_ENFORCE_EQ( - batch_size, - 1, - phi::errors::InvalidArgument( - "Batched computation is not supported in cusparseSPGEMM.")); - - // cusparseSPGEMM only support 32-bit indices. - DenseTensor out_crows = - phi::Empty(dev_ctx, {xdim_vec[x_ndims - 2] + 1}); + int64_t out_crows_size = batch_size * (xdim_vec[x_ndims - 2] + 1); + DenseTensor out_crows = phi::Empty(dev_ctx, {out_crows_size}); DenseTensor out_cols = phi::Empty(dev_ctx, {0}); DenseTensor out_values = phi::Empty(dev_ctx, {0}); out->SetMember(out_crows, out_cols, out_values, out->dims()); @@ -173,30 +182,6 @@ void MatmulKernelImpl(const Context& dev_ctx, #endif } -template -void MatmulCooDenseKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& y, - DenseTensor* out) { - MatmulKernelImpl(dev_ctx, x, y, out); -} - -template -void MatmulCsrDenseKernel(const Context& dev_ctx, - const SparseCsrTensor& x, - const DenseTensor& y, - DenseTensor* out) { - MatmulKernelImpl(dev_ctx, x, y, out); -} - -template -void MatmulCsrCsrKernel(const Context& dev_ctx, - const SparseCsrTensor& x, - const SparseCsrTensor& y, - SparseCsrTensor* out) { - MatmulKernelImpl(dev_ctx, x, y, out); -} - template void MatmulCooCooKernel(const Context& dev_ctx, const SparseCooTensor& x, @@ -207,7 +192,7 @@ void MatmulCooCooKernel(const Context& dev_ctx, SparseCsrTensor y_csr = CooToCsr(dev_ctx, y); SparseCsrTensor out_csr; out_csr.set_dims(out->dims()); - MatmulKernelImpl(dev_ctx, x_csr, y_csr, &out_csr); + MatmulCsrCsrKernel(dev_ctx, x_csr, y_csr, &out_csr); CsrToCooKernel(dev_ctx, out_csr, out); } diff --git a/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu b/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu index cc67cb0021ddc..cdb5ae5036949 100644 --- a/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu @@ -17,6 +17,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/sparse/empty_kernel.h" @@ -24,51 +25,51 @@ namespace phi { namespace sparse { -__global__ void TransposeCooCudaKernel(const int64_t *x_indices_data, +template +__global__ void TransposeCooCudaKernel(const IntT *x_indices_data, const int *perm, const std::size_t n_dim, - const int64_t x_nnz, - int64_t *out_indices_data) { - CUDA_KERNEL_LOOP_TYPE(index, x_nnz * n_dim, int64_t) { - int64_t i = index / x_nnz; - int64_t j = index % x_nnz; + const IntT x_nnz, + IntT *out_indices_data) { + CUDA_KERNEL_LOOP_TYPE(index, x_nnz * n_dim, IntT) { + IntT i = index / x_nnz; + IntT j = index % x_nnz; out_indices_data[index] = x_indices_data[j + perm[i] * x_nnz]; } } -template -__global__ void TransposeCsr2DCudaKernel(const int64_t *x_crows_data, - const int64_t *x_cols_data, +template +__global__ void TransposeCsr2DCudaKernel(const IntT *x_crows_data, + const IntT *x_cols_data, const T *x_values_data, const int *perm, - const int64_t *x_dims, - const int64_t *out_dims, - const int64_t x_nnz, - int64_t *out_crows_data, - int64_t *out_cols_data, + const IntT *x_dims, + const IntT *out_dims, + const IntT x_nnz, + IntT 
*out_crows_data, + IntT *out_cols_data, T *out_values_data) { - int64_t __index__ = - static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + IntT __index__ = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; // compute out_crows_data by x_cols_data - for (int64_t i = __index__; i <= out_dims[0]; i += blockDim.x * gridDim.x) { + for (IntT i = __index__; i <= out_dims[0]; i += blockDim.x * gridDim.x) { out_crows_data[i] = 0; } __syncthreads(); if (__index__ == 0) { - for (int64_t i = 0; i < x_nnz; ++i) { + for (IntT i = 0; i < x_nnz; ++i) { int j = x_cols_data[i]; out_crows_data[j + 2]++; } - for (int64_t i = 0; i < out_dims[0]; i += 1) { + for (IntT i = 0; i < out_dims[0]; i += 1) { out_crows_data[i + 1] += out_crows_data[i]; } // compute out_cols_data and out_values_data by out_crows_data and x for (int i = 0; i < x_dims[0]; ++i) { - int64_t start = x_crows_data[i]; - int64_t end = x_crows_data[i + 1]; - for (int64_t j = start; j < end; ++j) { - int64_t x_cols_j = x_cols_data[j] + 1; - int64_t jjj = out_crows_data[x_cols_j]; + IntT start = x_crows_data[i]; + IntT end = x_crows_data[i + 1]; + for (IntT j = start; j < end; ++j) { + IntT x_cols_j = x_cols_data[j] + 1; + IntT jjj = out_crows_data[x_cols_j]; out_cols_data[jjj] = i; out_values_data[jjj] = x_values_data[j]; out_crows_data[x_cols_j]++; @@ -77,20 +78,19 @@ __global__ void TransposeCsr2DCudaKernel(const int64_t *x_crows_data, } } -template -__global__ void TransposeCsr3DCudaKernel(const int64_t *x_crows_data, - const int64_t *x_cols_data, +template +__global__ void TransposeCsr3DCudaKernel(const IntT *x_crows_data, + const IntT *x_cols_data, const T *x_values_data, const int *perm, - const int64_t *x_dims, - const int64_t *out_dims, + const IntT *x_dims, + const IntT *out_dims, const std::size_t n_dim, - const int64_t x_nnz, - int64_t *out_crows_data, - int64_t *out_cols_data, + const IntT x_nnz, + IntT *out_crows_data, + IntT *out_cols_data, T *out_values_data) { - int64_t __index__ = - static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + IntT __index__ = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; if (__index__ == 0) { int out_n_rows = out_dims[1]; int x_n_rows = x_dims[1]; @@ -109,11 +109,11 @@ __global__ void TransposeCsr3DCudaKernel(const int64_t *x_crows_data, } // compute out_cols_data and out_values_data by out_crows_data and x for (int i = 0; i < x_n_rows; ++i) { - int64_t start = x_crows_data[i]; - int64_t end = x_crows_data[i + 1]; - for (int64_t j = start; j < end; ++j) { - int64_t x_cols_j = x_cols_data[j] + 1; - int64_t jjj = out_crows_data[x_cols_j]; + IntT start = x_crows_data[i]; + IntT end = x_crows_data[i + 1]; + for (IntT j = start; j < end; ++j) { + IntT x_cols_j = x_cols_data[j] + 1; + IntT jjj = out_crows_data[x_cols_j]; out_cols_data[jjj] = i; out_values_data[jjj] = x_values_data[j]; out_crows_data[x_cols_j]++; @@ -153,23 +153,23 @@ __global__ void TransposeCsr3DCudaKernel(const int64_t *x_crows_data, } } -template -void TransposeCooKernel(const Context &dev_ctx, - const SparseCooTensor &x, - const std::vector &perm, - SparseCooTensor *out) { +template +void TransposeCooImpl(const Context &dev_ctx, + const SparseCooTensor &x, + const std::vector &perm, + SparseCooTensor *out) { // create out sparse tensor - int64_t x_nnz = x.nnz(); + IntT x_nnz = x.nnz(); std::size_t n_dim = perm.size(); DDim out_dims = x.dims().transpose(perm); - DenseTensor out_indices = EmptyLike(dev_ctx, x.indices()); + DenseTensor out_indices = EmptyLike(dev_ctx, x.indices()); DenseTensor out_values(x.values()); 
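+  // Transposing a COO tensor leaves the values untouched; only the rows of
+  // the (n_dim x nnz) indices matrix are permuted, which the CUDA kernel
+  // below does as out_indices[i][j] = x_indices[perm[i]][j].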
out->SetMember(out_indices, out_values, out_dims, x.coalesced()); // compute values of indices const DenseTensor &x_indices = x.indices(); - const auto *x_indices_data = x_indices.data(); - auto *out_indices_data = out_indices.data(); + const auto *x_indices_data = x_indices.data(); + auto *out_indices_data = out_indices.data(); int *d_perm; auto d_perm_tensor = memory_utils::Alloc( @@ -193,10 +193,21 @@ void TransposeCooKernel(const Context &dev_ctx, } template -void TransposeCsrKernel(const Context &dev_ctx, - const SparseCsrTensor &x, +void TransposeCooKernel(const Context &dev_ctx, + const SparseCooTensor &x, const std::vector &perm, - SparseCsrTensor *out) { + SparseCooTensor *out) { + PD_VISIT_BASE_INTEGRAL_TYPES(x.indices().dtype(), "TransposeCooKernel", ([&] { + TransposeCooImpl( + dev_ctx, x, perm, out); + })); +} + +template +void TransposeCsrImpl(const Context &dev_ctx, + const SparseCsrTensor &x, + const std::vector &perm, + SparseCsrTensor *out) { std::size_t n_dim = perm.size(); const DenseTensor &x_crows = x.crows(); const DenseTensor &x_cols = x.cols(); @@ -213,12 +224,12 @@ void TransposeCsrKernel(const Context &dev_ctx, // create out sparse tensor DDim out_dims = x.dims().transpose(perm); if (n_dim == 2) { - out_crows = Empty(dev_ctx, {out_dims[0] + 1}); + out_crows = Empty(dev_ctx, {out_dims[0] + 1}); } else { out_crows = - Empty(dev_ctx, {out_dims[0] * (out_dims[1] + 1)}); + Empty(dev_ctx, {out_dims[0] * (out_dims[1] + 1)}); } - out_cols = EmptyLike(dev_ctx, x.cols()); + out_cols = EmptyLike(dev_ctx, x.cols()); out_values = EmptyLike(dev_ctx, x.values()); out->SetMember(out_crows, out_cols, out_values, out_dims); // transpose by two stages @@ -238,14 +249,14 @@ void TransposeCsrKernel(const Context &dev_ctx, TransposeCsrKernel(dev_ctx, temp, {2, 0, 1}, out); return; } - int64_t *out_crows_data = out_crows.data(); - int64_t *out_cols_data = out_cols.data(); + IntT *out_crows_data = out_crows.data(); + IntT *out_cols_data = out_cols.data(); T *out_values_data = out_values.data(); - const int64_t *x_crows_data = x_crows.data(); - const int64_t *x_cols_data = x_cols.data(); + const IntT *x_crows_data = x_crows.data(); + const IntT *x_cols_data = x_cols.data(); const T *x_values_data = x_values.data(); int *d_perm; - int64_t *d_x_dims, *d_out_dims; + IntT *d_x_dims, *d_out_dims; auto d_perm_tensor = memory_utils::Alloc( dev_ctx.GetPlace(), @@ -260,58 +271,71 @@ void TransposeCsrKernel(const Context &dev_ctx, dev_ctx.stream()); auto d_x_dims_tensor = memory_utils::Alloc( dev_ctx.GetPlace(), - sizeof(int64_t) * x.dims().size(), + sizeof(IntT) * x.dims().size(), phi::Stream(reinterpret_cast(dev_ctx.stream()))); - d_x_dims = reinterpret_cast(d_x_dims_tensor->ptr()); + d_x_dims = reinterpret_cast(d_x_dims_tensor->ptr()); memory_utils::Copy(dev_ctx.GetPlace(), d_x_dims, phi::CPUPlace(), x.dims().Get(), - sizeof(int64_t) * x.dims().size(), + sizeof(IntT) * x.dims().size(), dev_ctx.stream()); auto d_out_dims_tensor = memory_utils::Alloc( dev_ctx.GetPlace(), - sizeof(int64_t) * out_dims.size(), + sizeof(IntT) * out_dims.size(), phi::Stream(reinterpret_cast(dev_ctx.stream()))); - d_out_dims = reinterpret_cast(d_out_dims_tensor->ptr()); + d_out_dims = reinterpret_cast(d_out_dims_tensor->ptr()); memory_utils::Copy(dev_ctx.GetPlace(), d_out_dims, phi::CPUPlace(), out_dims.Get(), - sizeof(int64_t) * out_dims.size(), + sizeof(IntT) * out_dims.size(), dev_ctx.stream()); - int64_t x_nnz = x.nnz(); + IntT x_nnz = x.nnz(); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, 
out_dims[0], 1); if (perm.size() == 2) { - TransposeCsr2DCudaKernel<<>>(x_crows_data, - x_cols_data, - x_values_data, - d_perm, - d_x_dims, - d_out_dims, - x_nnz, - out_crows_data, - out_cols_data, - out_values_data); + TransposeCsr2DCudaKernel<<>>(x_crows_data, + x_cols_data, + x_values_data, + d_perm, + d_x_dims, + d_out_dims, + x_nnz, + out_crows_data, + out_cols_data, + out_values_data); } else { - TransposeCsr3DCudaKernel<<<1, 1, 0, dev_ctx.stream()>>>(x_crows_data, - x_cols_data, - x_values_data, - d_perm, - d_x_dims, - d_out_dims, - perm.size(), - x_nnz, - out_crows_data, - out_cols_data, - out_values_data); + TransposeCsr3DCudaKernel + <<<1, 1, 0, dev_ctx.stream()>>>(x_crows_data, + x_cols_data, + x_values_data, + d_perm, + d_x_dims, + d_out_dims, + perm.size(), + x_nnz, + out_crows_data, + out_cols_data, + out_values_data); } } + +template +void TransposeCsrKernel(const Context &dev_ctx, + const SparseCsrTensor &x, + const std::vector &perm, + SparseCsrTensor *out) { + PD_VISIT_BASE_INTEGRAL_TYPES(x.crows().dtype(), "TransposeCsrKernel", ([&] { + TransposeCsrImpl( + dev_ctx, x, perm, out); + })); +} + } // namespace sparse } // namespace phi diff --git a/test/legacy_test/test_sparse_matmul_op.py b/test/legacy_test/test_sparse_matmul_op.py index adc10462f42cf..aa4b739347152 100644 --- a/test/legacy_test/test_sparse_matmul_op.py +++ b/test/legacy_test/test_sparse_matmul_op.py @@ -36,7 +36,7 @@ def get_cuda_version(): return -1 -class TestMatmul(unittest.TestCase): +class TestMatmulSparseDense(unittest.TestCase): # x: sparse, y: dense, out: dense def check_result(self, x_shape, y_shape, format): if len(x_shape) == 3: @@ -93,7 +93,7 @@ def test_matmul_3d(self): self.check_result([8, 16, 12], [8, 12, 10], 'csr') -class TestMatmulCSR(unittest.TestCase): +class TestMatmulCSRCSR(unittest.TestCase): # x: csr sparse, y: csr sparse, out: csr sparse def check_result(self, x_shape, y_shape): mask = paddle.randint(0, 2, x_shape) @@ -149,8 +149,15 @@ def check_result(self, x_shape, y_shape): def test_matmul_2d(self): self.check_result([16, 12], [12, 10]) + @unittest.skipIf( + not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, + "only support cuda>=11.8", + ) + def test_matmul_3d(self): + self.check_result([8, 16, 12], [8, 12, 10]) + -class TestMatmulCOO(unittest.TestCase): +class TestMatmulCOOCOO(unittest.TestCase): # x: coo sparse, y: coo sparse, out: coo sparse def check_result(self, x_shape, y_shape): mask = paddle.randint(0, 2, x_shape) @@ -205,6 +212,13 @@ def check_result(self, x_shape, y_shape): def test_matmul_2d(self): self.check_result([16, 12], [12, 10]) + @unittest.skipIf( + not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, + "only support cuda>=11.8", + ) + def test_matmul_3d(self): + self.check_result([8, 16, 12], [8, 12, 10]) + class TestMaskedMatmul(unittest.TestCase): # x: dense, y: dense, out: sparse_`csr From a2019d187fe58c036fff2f378e507014e5551323 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Tue, 19 Dec 2023 07:11:52 +0000 Subject: [PATCH 10/25] codestyle --- paddle/phi/kernels/funcs/sparse/sparse_blas.h | 4 ---- test/legacy_test/test_sparse_matmul_op.py | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas.h b/paddle/phi/kernels/funcs/sparse/sparse_blas.h index ca6e60842ec93..f180b043444d1 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas.h @@ -22,10 +22,6 @@ namespace phi { namespace funcs { namespace sparse { 
-template -SparseCsrTensor CSRTanspose(const phi::GPUContext& dev_ctx, - const phi::SparseCsrTensor& x); - template class SparseBlas { public: diff --git a/test/legacy_test/test_sparse_matmul_op.py b/test/legacy_test/test_sparse_matmul_op.py index aa4b739347152..8ba0223e8112d 100644 --- a/test/legacy_test/test_sparse_matmul_op.py +++ b/test/legacy_test/test_sparse_matmul_op.py @@ -93,7 +93,7 @@ def test_matmul_3d(self): self.check_result([8, 16, 12], [8, 12, 10], 'csr') -class TestMatmulCSRCSR(unittest.TestCase): +class TestMatmulCsrCsr(unittest.TestCase): # x: csr sparse, y: csr sparse, out: csr sparse def check_result(self, x_shape, y_shape): mask = paddle.randint(0, 2, x_shape) @@ -157,7 +157,7 @@ def test_matmul_3d(self): self.check_result([8, 16, 12], [8, 12, 10]) -class TestMatmulCOOCOO(unittest.TestCase): +class TestMatmulCooCoo(unittest.TestCase): # x: coo sparse, y: coo sparse, out: coo sparse def check_result(self, x_shape, y_shape): mask = paddle.randint(0, 2, x_shape) From 67bb445ad6f6e430b7b804e42cda055b2d0979bf Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Tue, 19 Dec 2023 07:34:02 +0000 Subject: [PATCH 11/25] fix --- .../kernels/sparse/gpu/matmul_grad_kernel.cu | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index f425968f433d3..c27ca5506d87b 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu @@ -281,23 +281,23 @@ PD_REGISTER_KERNEL(matmul_csr_dense_grad, kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } -// PD_REGISTER_KERNEL(matmul_csr_csr_grad, -// GPU, -// ALL_LAYOUT, -// phi::sparse::MatmulCsrCsrGradKernel, -// float, -// double) { -// kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); -// } - -// PD_REGISTER_KERNEL(matmul_coo_coo_grad, -// GPU, -// ALL_LAYOUT, -// phi::sparse::MatmulCooCooGradKernel, -// float, -// double) { -// kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -// } +PD_REGISTER_KERNEL(matmul_csr_csr_grad, + GPU, + ALL_LAYOUT, + phi::sparse::MatmulCsrCsrGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +PD_REGISTER_KERNEL(matmul_coo_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::MatmulCooCooGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} PD_REGISTER_KERNEL(masked_matmul_csr_grad, GPU, From 0ee354510f97a3fd55bc4e87bdf9fe6603b655d3 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Tue, 19 Dec 2023 21:04:53 +0800 Subject: [PATCH 12/25] fix --- .../kernels/sparse/gpu/transpose_kernel.cu | 45 +++++++------------ 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu b/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu index cdb5ae5036949..24c22af84429c 100644 --- a/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu @@ -24,16 +24,14 @@ namespace phi { namespace sparse { - -template -__global__ void TransposeCooCudaKernel(const IntT *x_indices_data, +__global__ void TransposeCooCudaKernel(const int64_t *x_indices_data, const int *perm, const std::size_t n_dim, - const IntT x_nnz, - IntT *out_indices_data) { - CUDA_KERNEL_LOOP_TYPE(index, x_nnz * n_dim, IntT) { - IntT i = index / x_nnz; - IntT j = index % x_nnz; + const int64_t x_nnz, + int64_t *out_indices_data) { + 
CUDA_KERNEL_LOOP_TYPE(index, x_nnz * n_dim, int64_t) { + int64_t i = index / x_nnz; + int64_t j = index % x_nnz; out_indices_data[index] = x_indices_data[j + perm[i] * x_nnz]; } } @@ -153,23 +151,23 @@ __global__ void TransposeCsr3DCudaKernel(const IntT *x_crows_data, } } -template -void TransposeCooImpl(const Context &dev_ctx, - const SparseCooTensor &x, - const std::vector &perm, - SparseCooTensor *out) { +template +void TransposeCooKernel(const Context &dev_ctx, + const SparseCooTensor &x, + const std::vector &perm, + SparseCooTensor *out) { // create out sparse tensor - IntT x_nnz = x.nnz(); + int64_t x_nnz = x.nnz(); std::size_t n_dim = perm.size(); DDim out_dims = x.dims().transpose(perm); - DenseTensor out_indices = EmptyLike(dev_ctx, x.indices()); + DenseTensor out_indices = EmptyLike(dev_ctx, x.indices()); DenseTensor out_values(x.values()); out->SetMember(out_indices, out_values, out_dims, x.coalesced()); // compute values of indices const DenseTensor &x_indices = x.indices(); - const auto *x_indices_data = x_indices.data(); - auto *out_indices_data = out_indices.data(); + const auto *x_indices_data = x_indices.data(); + auto *out_indices_data = out_indices.data(); int *d_perm; auto d_perm_tensor = memory_utils::Alloc( @@ -192,17 +190,6 @@ void TransposeCooImpl(const Context &dev_ctx, x_indices_data, d_perm, n_dim, x_nnz, out_indices_data); } -template -void TransposeCooKernel(const Context &dev_ctx, - const SparseCooTensor &x, - const std::vector &perm, - SparseCooTensor *out) { - PD_VISIT_BASE_INTEGRAL_TYPES(x.indices().dtype(), "TransposeCooKernel", ([&] { - TransposeCooImpl( - dev_ctx, x, perm, out); - })); -} - template void TransposeCsrImpl(const Context &dev_ctx, const SparseCsrTensor &x, @@ -292,7 +279,7 @@ void TransposeCsrImpl(const Context &dev_ctx, sizeof(IntT) * out_dims.size(), dev_ctx.stream()); - IntT x_nnz = x.nnz(); + IntT x_nnz = static_cast(x.nnz()); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_dims[0], 1); if (perm.size() == 2) { From c8ff3d5c309532cd3f7e2268e92dfe949a66ced1 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Thu, 21 Dec 2023 14:49:35 +0000 Subject: [PATCH 13/25] use CastCsrKernel --- .../kernels/sparse/gpu/matmul_grad_kernel.cu | 73 ++++++-- .../phi/kernels/sparse/gpu/matmul_kernel.cu | 26 ++- .../kernels/sparse/gpu/transpose_kernel.cu | 167 ++++++++---------- .../kernels/sparse/impl/unary_kernel_impl.h | 3 +- test/legacy_test/test_sparse_matmul_op.py | 25 --- 5 files changed, 159 insertions(+), 135 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index c27ca5506d87b..b78e96c891b91 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu @@ -159,27 +159,66 @@ void MatmulCsrCsrGradKernel(const Context& dev_ctx, perm = {0, 2, 1}; } - // dx{SparseCsr} = dout{Dense} * y'{Dense} + // cusparseSpGEMM only support 32-bit index. + SparseCsrTensor dout_tmp; + CastCsrKernel( + dev_ctx, dout, DataType::INT32, dout.values().dtype(), &dout_tmp); + + // dx{SparseCsr} = dout{SparseCsr} * y'{SparseCsr} if (dx) { - // InferMeta of SparseCsrTensor 'dx', CreateLikeInferMeta - EmptyLikeCsrKernel(dev_ctx, x, dx); - // cusparseSPGEMM only support CUSPARSE_OPERATION_NON_TRANSPOSE. 
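+    // cusparseSpGEMM only accepts 32-bit CSR indices, so int32 copies of
+    // dout and the transposed operand are multiplied into dx_tmp, and the
+    // result is cast back to the default int64 indices afterwards.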
- SparseCsrTensor trans_y; - TransposeCsrKernel(dev_ctx, y, perm, &trans_y); + SparseCsrTensor x_tmp, dx_tmp; + CastCsrKernel( + dev_ctx, x, DataType::INT32, x.values().dtype(), &x_tmp); + + EmptyLikeCsrKernel(dev_ctx, x_tmp, &dx_tmp); - sparse_blas.SPGEMM( - false, false, static_cast(1), dout, trans_y, static_cast(0), dx); + // cusparseSpGEMM only support CUSPARSE_OPERATION_NON_TRANSPOSE. + SparseCsrTensor trans_y, trans_y_tmp; + TransposeCsrKernel(dev_ctx, y, perm, &trans_y); + CastCsrKernel(dev_ctx, + trans_y, + DataType::INT32, + trans_y.values().dtype(), + &trans_y_tmp); + + sparse_blas.SPGEMM(false, + false, + static_cast(1), + dout_tmp, + trans_y_tmp, + static_cast(0), + &dx_tmp); + + CastCsrKernel( + dev_ctx, dx_tmp, DataType::INT64, dx_tmp.values().dtype(), dx); } - // dy{Dense} = x'{SparseCsr} * dout{Dense} + // dy{SparseCsr} = x'{SparseCsr} * dout{SparseCsr} if (dy) { - // InferMeta of DenseTensor 'dy' - EmptyLikeCsrKernel(dev_ctx, y, dy); - SparseCsrTensor trans_x; - TransposeCsrKernel(dev_ctx, x, perm, &trans_x); + SparseCsrTensor y_tmp, dy_tmp; + CastCsrKernel( + dev_ctx, y, DataType::INT32, y.values().dtype(), &y_tmp); + EmptyLikeCsrKernel(dev_ctx, y_tmp, &dy_tmp); - sparse_blas.SPGEMM( - false, false, static_cast(1), trans_x, dout, static_cast(0), dy); + // cusparseSpGEMM only support CUSPARSE_OPERATION_NON_TRANSPOSE. + SparseCsrTensor trans_x, trans_x_tmp; + TransposeCsrKernel(dev_ctx, x, perm, &trans_x); + CastCsrKernel(dev_ctx, + trans_x, + DataType::INT32, + trans_x.values().dtype(), + &trans_x_tmp); + + sparse_blas.SPGEMM(false, + false, + static_cast(1), + trans_x_tmp, + dout_tmp, + static_cast(0), + &dy_tmp); + + CastCsrKernel( + dev_ctx, dy_tmp, DataType::INT64, dy_tmp.values().dtype(), dy); } #else #ifdef PADDLE_WITH_CUDA @@ -197,7 +236,7 @@ void MatmulCooCooGradKernel(const Context& dev_ctx, const SparseCooTensor& dout, SparseCooTensor* dx, SparseCooTensor* dy) { - // 'cusparseSPGEMM' only support CSR now, so use COO->CSR->COO, + // cusparseSpGEMM only support CSR now, so use COO->CSR->COO SparseCsrTensor x_csr = CooToCsr(dev_ctx, x); SparseCsrTensor y_csr = CooToCsr(dev_ctx, y); SparseCsrTensor dout_csr = CooToCsr(dev_ctx, dout); @@ -288,6 +327,7 @@ PD_REGISTER_KERNEL(matmul_csr_csr_grad, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_CSR); } PD_REGISTER_KERNEL(matmul_coo_coo_grad, @@ -297,6 +337,7 @@ PD_REGISTER_KERNEL(matmul_coo_coo_grad, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } PD_REGISTER_KERNEL(masked_matmul_csr_grad, diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index 520e8112c6742..3216dbcaadbaa 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/math_function_impl.h" #include "paddle/phi/kernels/funcs/sparse/sparse_blas.h" #include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/impl/unary_kernel_impl.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" namespace phi { @@ -158,21 +159,36 @@ void MatmulCsrCsrKernel(const Context& dev_ctx, "The shape of Input(x) and Input(y) is not suitable for matmul " "opetation, x_dim[-1] must be eaqual to y_dim[-2].")); + // cusparseSpGEMM only support 32-bit index. 
+ SparseCsrTensor x_tmp, y_tmp, out_tmp; + CastCsrKernel( + dev_ctx, x, DataType::INT32, x.values().dtype(), &x_tmp); + CastCsrKernel( + dev_ctx, y, DataType::INT32, y.values().dtype(), &y_tmp); + std::vector out_dim_vec = phi::vectorize(out->dims()); int batch_size = 1; for (int i = 0; i < out_dim_vec.size() - 2; i++) { batch_size *= out_dim_vec[i]; } - int64_t out_crows_size = batch_size * (xdim_vec[x_ndims - 2] + 1); DenseTensor out_crows = phi::Empty(dev_ctx, {out_crows_size}); DenseTensor out_cols = phi::Empty(dev_ctx, {0}); DenseTensor out_values = phi::Empty(dev_ctx, {0}); - out->SetMember(out_crows, out_cols, out_values, out->dims()); + out_tmp.SetMember(out_crows, out_cols, out_values, out->dims()); auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); - sparse_blas.SPGEMM( - false, false, static_cast(1), x, y, static_cast(0), out); + sparse_blas.SPGEMM(false, + false, + static_cast(1), + x_tmp, + y_tmp, + static_cast(0), + &out_tmp); + + CastCsrKernel( + dev_ctx, out_tmp, DataType::INT64, out_tmp.values().dtype(), out); + #else #ifdef PADDLE_WITH_CUDA PADDLE_THROW(phi::errors::Unimplemented( @@ -307,6 +323,7 @@ PD_REGISTER_KERNEL(matmul_coo_coo, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } PD_REGISTER_KERNEL(matmul_csr_csr, @@ -316,6 +333,7 @@ PD_REGISTER_KERNEL(matmul_csr_csr, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_CSR); } PD_REGISTER_KERNEL(masked_matmul_csr, diff --git a/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu b/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu index 24c22af84429c..cc67cb0021ddc 100644 --- a/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu @@ -17,13 +17,13 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/sparse/empty_kernel.h" namespace phi { namespace sparse { + __global__ void TransposeCooCudaKernel(const int64_t *x_indices_data, const int *perm, const std::size_t n_dim, @@ -36,38 +36,39 @@ __global__ void TransposeCooCudaKernel(const int64_t *x_indices_data, } } -template -__global__ void TransposeCsr2DCudaKernel(const IntT *x_crows_data, - const IntT *x_cols_data, +template +__global__ void TransposeCsr2DCudaKernel(const int64_t *x_crows_data, + const int64_t *x_cols_data, const T *x_values_data, const int *perm, - const IntT *x_dims, - const IntT *out_dims, - const IntT x_nnz, - IntT *out_crows_data, - IntT *out_cols_data, + const int64_t *x_dims, + const int64_t *out_dims, + const int64_t x_nnz, + int64_t *out_crows_data, + int64_t *out_cols_data, T *out_values_data) { - IntT __index__ = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + int64_t __index__ = + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; // compute out_crows_data by x_cols_data - for (IntT i = __index__; i <= out_dims[0]; i += blockDim.x * gridDim.x) { + for (int64_t i = __index__; i <= out_dims[0]; i += blockDim.x * gridDim.x) { out_crows_data[i] = 0; } __syncthreads(); if (__index__ == 0) { - for (IntT i = 0; i < x_nnz; ++i) { + for (int64_t i = 0; i < x_nnz; ++i) { int j = x_cols_data[i]; out_crows_data[j + 2]++; } - for (IntT i = 0; i < 
out_dims[0]; i += 1) { + for (int64_t i = 0; i < out_dims[0]; i += 1) { out_crows_data[i + 1] += out_crows_data[i]; } // compute out_cols_data and out_values_data by out_crows_data and x for (int i = 0; i < x_dims[0]; ++i) { - IntT start = x_crows_data[i]; - IntT end = x_crows_data[i + 1]; - for (IntT j = start; j < end; ++j) { - IntT x_cols_j = x_cols_data[j] + 1; - IntT jjj = out_crows_data[x_cols_j]; + int64_t start = x_crows_data[i]; + int64_t end = x_crows_data[i + 1]; + for (int64_t j = start; j < end; ++j) { + int64_t x_cols_j = x_cols_data[j] + 1; + int64_t jjj = out_crows_data[x_cols_j]; out_cols_data[jjj] = i; out_values_data[jjj] = x_values_data[j]; out_crows_data[x_cols_j]++; @@ -76,19 +77,20 @@ __global__ void TransposeCsr2DCudaKernel(const IntT *x_crows_data, } } -template -__global__ void TransposeCsr3DCudaKernel(const IntT *x_crows_data, - const IntT *x_cols_data, +template +__global__ void TransposeCsr3DCudaKernel(const int64_t *x_crows_data, + const int64_t *x_cols_data, const T *x_values_data, const int *perm, - const IntT *x_dims, - const IntT *out_dims, + const int64_t *x_dims, + const int64_t *out_dims, const std::size_t n_dim, - const IntT x_nnz, - IntT *out_crows_data, - IntT *out_cols_data, + const int64_t x_nnz, + int64_t *out_crows_data, + int64_t *out_cols_data, T *out_values_data) { - IntT __index__ = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + int64_t __index__ = + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; if (__index__ == 0) { int out_n_rows = out_dims[1]; int x_n_rows = x_dims[1]; @@ -107,11 +109,11 @@ __global__ void TransposeCsr3DCudaKernel(const IntT *x_crows_data, } // compute out_cols_data and out_values_data by out_crows_data and x for (int i = 0; i < x_n_rows; ++i) { - IntT start = x_crows_data[i]; - IntT end = x_crows_data[i + 1]; - for (IntT j = start; j < end; ++j) { - IntT x_cols_j = x_cols_data[j] + 1; - IntT jjj = out_crows_data[x_cols_j]; + int64_t start = x_crows_data[i]; + int64_t end = x_crows_data[i + 1]; + for (int64_t j = start; j < end; ++j) { + int64_t x_cols_j = x_cols_data[j] + 1; + int64_t jjj = out_crows_data[x_cols_j]; out_cols_data[jjj] = i; out_values_data[jjj] = x_values_data[j]; out_crows_data[x_cols_j]++; @@ -190,11 +192,11 @@ void TransposeCooKernel(const Context &dev_ctx, x_indices_data, d_perm, n_dim, x_nnz, out_indices_data); } -template -void TransposeCsrImpl(const Context &dev_ctx, - const SparseCsrTensor &x, - const std::vector &perm, - SparseCsrTensor *out) { +template +void TransposeCsrKernel(const Context &dev_ctx, + const SparseCsrTensor &x, + const std::vector &perm, + SparseCsrTensor *out) { std::size_t n_dim = perm.size(); const DenseTensor &x_crows = x.crows(); const DenseTensor &x_cols = x.cols(); @@ -211,12 +213,12 @@ void TransposeCsrImpl(const Context &dev_ctx, // create out sparse tensor DDim out_dims = x.dims().transpose(perm); if (n_dim == 2) { - out_crows = Empty(dev_ctx, {out_dims[0] + 1}); + out_crows = Empty(dev_ctx, {out_dims[0] + 1}); } else { out_crows = - Empty(dev_ctx, {out_dims[0] * (out_dims[1] + 1)}); + Empty(dev_ctx, {out_dims[0] * (out_dims[1] + 1)}); } - out_cols = EmptyLike(dev_ctx, x.cols()); + out_cols = EmptyLike(dev_ctx, x.cols()); out_values = EmptyLike(dev_ctx, x.values()); out->SetMember(out_crows, out_cols, out_values, out_dims); // transpose by two stages @@ -236,14 +238,14 @@ void TransposeCsrImpl(const Context &dev_ctx, TransposeCsrKernel(dev_ctx, temp, {2, 0, 1}, out); return; } - IntT *out_crows_data = out_crows.data(); - IntT *out_cols_data = 
out_cols.data(); + int64_t *out_crows_data = out_crows.data(); + int64_t *out_cols_data = out_cols.data(); T *out_values_data = out_values.data(); - const IntT *x_crows_data = x_crows.data(); - const IntT *x_cols_data = x_cols.data(); + const int64_t *x_crows_data = x_crows.data(); + const int64_t *x_cols_data = x_cols.data(); const T *x_values_data = x_values.data(); int *d_perm; - IntT *d_x_dims, *d_out_dims; + int64_t *d_x_dims, *d_out_dims; auto d_perm_tensor = memory_utils::Alloc( dev_ctx.GetPlace(), @@ -258,71 +260,58 @@ void TransposeCsrImpl(const Context &dev_ctx, dev_ctx.stream()); auto d_x_dims_tensor = memory_utils::Alloc( dev_ctx.GetPlace(), - sizeof(IntT) * x.dims().size(), + sizeof(int64_t) * x.dims().size(), phi::Stream(reinterpret_cast(dev_ctx.stream()))); - d_x_dims = reinterpret_cast(d_x_dims_tensor->ptr()); + d_x_dims = reinterpret_cast(d_x_dims_tensor->ptr()); memory_utils::Copy(dev_ctx.GetPlace(), d_x_dims, phi::CPUPlace(), x.dims().Get(), - sizeof(IntT) * x.dims().size(), + sizeof(int64_t) * x.dims().size(), dev_ctx.stream()); auto d_out_dims_tensor = memory_utils::Alloc( dev_ctx.GetPlace(), - sizeof(IntT) * out_dims.size(), + sizeof(int64_t) * out_dims.size(), phi::Stream(reinterpret_cast(dev_ctx.stream()))); - d_out_dims = reinterpret_cast(d_out_dims_tensor->ptr()); + d_out_dims = reinterpret_cast(d_out_dims_tensor->ptr()); memory_utils::Copy(dev_ctx.GetPlace(), d_out_dims, phi::CPUPlace(), out_dims.Get(), - sizeof(IntT) * out_dims.size(), + sizeof(int64_t) * out_dims.size(), dev_ctx.stream()); - IntT x_nnz = static_cast(x.nnz()); + int64_t x_nnz = x.nnz(); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_dims[0], 1); if (perm.size() == 2) { - TransposeCsr2DCudaKernel<<>>(x_crows_data, - x_cols_data, - x_values_data, - d_perm, - d_x_dims, - d_out_dims, - x_nnz, - out_crows_data, - out_cols_data, - out_values_data); + TransposeCsr2DCudaKernel<<>>(x_crows_data, + x_cols_data, + x_values_data, + d_perm, + d_x_dims, + d_out_dims, + x_nnz, + out_crows_data, + out_cols_data, + out_values_data); } else { - TransposeCsr3DCudaKernel - <<<1, 1, 0, dev_ctx.stream()>>>(x_crows_data, - x_cols_data, - x_values_data, - d_perm, - d_x_dims, - d_out_dims, - perm.size(), - x_nnz, - out_crows_data, - out_cols_data, - out_values_data); + TransposeCsr3DCudaKernel<<<1, 1, 0, dev_ctx.stream()>>>(x_crows_data, + x_cols_data, + x_values_data, + d_perm, + d_x_dims, + d_out_dims, + perm.size(), + x_nnz, + out_crows_data, + out_cols_data, + out_values_data); } } - -template -void TransposeCsrKernel(const Context &dev_ctx, - const SparseCsrTensor &x, - const std::vector &perm, - SparseCsrTensor *out) { - PD_VISIT_BASE_INTEGRAL_TYPES(x.crows().dtype(), "TransposeCsrKernel", ([&] { - TransposeCsrImpl( - dev_ctx, x, perm, out); - })); -} - } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h index f0ea90ee1f09b..723bf3d2697cf 100644 --- a/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h +++ b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h @@ -25,7 +25,6 @@ #include "paddle/phi/kernels/isfinite_kernel.h" #include "paddle/phi/kernels/scale_kernel.h" #include "paddle/phi/kernels/sparse/empty_kernel.h" -#include "paddle/phi/kernels/trunc_kernel.h" namespace phi { namespace sparse { @@ -202,6 +201,8 @@ void CastCsrKernel(const Context& dev_ctx, meta.set_dims(x_values.dims()); phi::CastKernel(dev_ctx, x_values, value_dtype, out_values); } + + out->set_dims(x.dims()); } 
template diff --git a/test/legacy_test/test_sparse_matmul_op.py b/test/legacy_test/test_sparse_matmul_op.py index 8ba0223e8112d..14c45500857fa 100644 --- a/test/legacy_test/test_sparse_matmul_op.py +++ b/test/legacy_test/test_sparse_matmul_op.py @@ -107,20 +107,7 @@ def check_result(self, x_shape, y_shape): dense_out = paddle.matmul(dense_x, dense_y) sp_x = origin_x.detach().to_sparse_csr() - # only support 32-bit index. - sp_x_crows = paddle.cast(sp_x.crows(), "int32") - sp_x_cols = paddle.cast(sp_x.cols(), "int32") - sp_x = paddle.sparse.sparse_csr_tensor( - sp_x_crows, sp_x_cols, sp_x.values(), sp_x.shape - ) - sp_y = origin_y.detach().to_sparse_csr() - # only support 32-bit index. - sp_y_crows = paddle.cast(sp_y.crows(), "int32") - sp_y_cols = paddle.cast(sp_y.cols(), "int32") - sp_y = paddle.sparse.sparse_csr_tensor( - sp_y_crows, sp_y_cols, sp_y.values(), sp_y.shape - ) sp_x.stop_gradient = False sp_y.stop_gradient = False @@ -171,19 +158,7 @@ def check_result(self, x_shape, y_shape): dense_out = paddle.matmul(dense_x, dense_y) sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) - - # only support 32-bit index. - sp_x_indices = paddle.cast(sp_x.indices(), "int32") - sp_x = paddle.sparse.sparse_coo_tensor( - sp_x_indices, sp_x.values(), sp_x.shape - ) - sp_y = origin_y.detach().to_sparse_coo(len(y_shape)) - # only support 32-bit index. - sp_y_indices = paddle.cast(sp_y.indices(), "int32") - sp_y = paddle.sparse.sparse_coo_tensor( - sp_y_indices, sp_y.values(), sp_y.shape - ) sp_x.stop_gradient = False sp_y.stop_gradient = False From 253ca38449e9388f1c33021dc58dad675cb433cf Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Sat, 30 Dec 2023 07:54:06 +0000 Subject: [PATCH 14/25] optimize int32 index --- paddle/fluid/platform/dynload/cusparse.h | 1 + paddle/phi/backends/dynload/cusparse.h | 1 + .../funcs/sparse/sparse_blas_impl.cu.h | 131 ++++++++++++++++-- .../kernels/sparse/gpu/matmul_grad_kernel.cu | 62 ++------- .../phi/kernels/sparse/gpu/matmul_kernel.cu | 34 +---- .../kernels/sparse/gpu/transpose_kernel.cu | 89 +++++++----- test/legacy_test/test_sparse_matmul_op.py | 66 +++++++-- 7 files changed, 240 insertions(+), 144 deletions(-) diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h index cbc09ed47c1bf..f15445618d867 100644 --- a/paddle/fluid/platform/dynload/cusparse.h +++ b/paddle/fluid/platform/dynload/cusparse.h @@ -51,6 +51,7 @@ namespace dynload { __macro(cusparseSpMV_bufferSize); \ __macro(cusparseSpMV); \ __macro(cusparseSpMatGetSize); \ + __macro(cusparseCsrSetPointers); \ __macro(cusparseSpGEMM_createDescr); \ __macro(cusparseSpGEMM_compute); \ __macro(cusparseSpGEMM_workEstimation); \ diff --git a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h index 6d90cb5eefdca..d75b236c07ab1 100644 --- a/paddle/phi/backends/dynload/cusparse.h +++ b/paddle/phi/backends/dynload/cusparse.h @@ -63,6 +63,7 @@ extern void *cusparse_dso_handle; __macro(cusparseSpMV_bufferSize); \ __macro(cusparseSpMV); \ __macro(cusparseSpMatGetSize); \ + __macro(cusparseCsrSetPointers); \ __macro(cusparseSpGEMM_createDescr); \ __macro(cusparseSpGEMM_compute); \ __macro(cusparseSpGEMM_workEstimation); \ diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 6474b34e7284a..93a05c419bf64 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -26,6 +26,7 @@ #include 
"paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" namespace phi { @@ -493,6 +494,80 @@ void SparseBlas::SDDMM(bool transa, #endif /************* SPARSE*SPARSE->SPARSE MATMUL ************/ +template +class CuSparseSpGEMMCsrDescriptor { + public: + explicit CuSparseSpGEMMCsrDescriptor(const phi::SparseCsrTensor& x, + const phi::GPUContext& dev_ctx) + : dev_ctx_(dev_ctx) { + std::vector xdim_vec = phi::vectorize(x.dims()); + auto x_ndims = xdim_vec.size(); + + int64_t M = xdim_vec[x_ndims - 2]; + int64_t N = xdim_vec[x_ndims - 1]; + int batch_size = 1; + for (int i = 0; i < x_ndims - 2; i++) { + batch_size *= xdim_vec[i]; + } + + const int32_t *crows_data, *cols_data; + if (x.crows().dtype() == phi::DataType::INT32) { + crows_data = x.crows().data(); + cols_data = x.cols().data(); + } else { + phi::MetaTensor crows_meta(&crows_int); + crows_meta.set_dims(x.crows().dims()); + + phi::MetaTensor cols_meta(&cols_int); + cols_meta.set_dims(x.cols().dims()); + + phi::CastKernel( + dev_ctx, x.crows(), phi::DataType::INT32, &crows_int); + phi::CastKernel( + dev_ctx, x.cols(), phi::DataType::INT32, &cols_int); + + crows_data = crows_int.data(); + cols_data = cols_int.data(); + } + + const T* values_data = x.values().data(); + int64_t batch_nnz = x.nnz() / batch_size; + cudaDataType_t gpu_type = GetGpuDataType(); + dev_ctx.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseCreateCsr(&descriptor_, + M, + N, + batch_nnz, + const_cast(crows_data), + const_cast(cols_data), + const_cast(values_data), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + gpu_type); + }); + + VLOG(6) << "Create csr cusparseSpMatDescr_t " << &descriptor_; + } + + ~CuSparseSpGEMMCsrDescriptor() { + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseDestroySpMat(descriptor_); + }); + VLOG(6) << "Destroy cusparseSpMatDescr_t " << &descriptor_; + } + + const cusparseSpMatDescr_t& descriptor() const { return descriptor_; } + + private: + const phi::GPUContext& dev_ctx_; + cusparseSpMatDescr_t descriptor_; + + // temporarily save crows and cols for int64_t index csr + DenseTensor crows_int; + DenseTensor cols_int; +}; + template <> template void SparseBlas::SPGEMM(bool transa, @@ -502,9 +577,26 @@ void SparseBlas::SPGEMM(bool transa, const SparseCsrTensor& mat_b, T beta, SparseCsrTensor* mat_out) const { - auto a_descriptor = CuSparseSpMatDescriptor(mat_a, dev_ctx_); - auto b_descriptor = CuSparseSpMatDescriptor(mat_b, dev_ctx_); - auto out_descriptor = CuSparseSpMatDescriptor(*mat_out, dev_ctx_); + DenseTensor* mat_out_crows = mat_out->mutable_crows(); + DenseTensor* mat_out_cols = mat_out->mutable_cols(); + DenseTensor* mat_out_values = mat_out->mutable_values(); + + MetaTensor out_crows_meta(mat_out_crows); + out_crows_meta.set_dtype(phi::DataType::INT32); + out_crows_meta.set_dims(mat_a.crows().dims()); + dev_ctx_.template Alloc(mat_out_crows); + + MetaTensor out_cols_meta(mat_out_cols); + out_cols_meta.set_dtype(phi::DataType::INT32); + dev_ctx_.template Alloc(mat_out_cols); + + MetaTensor out_values_meta(mat_out_values); + out_values_meta.set_dtype(mat_a.values().dtype()); + dev_ctx_.template Alloc(mat_out_values); + + auto a_descriptor = CuSparseSpGEMMCsrDescriptor(mat_a, dev_ctx_); + auto b_descriptor = CuSparseSpGEMMCsrDescriptor(mat_b, dev_ctx_); + auto out_descriptor = 
CuSparseSpGEMMCsrDescriptor(*mat_out, dev_ctx_); cudaDataType_t gpu_type = GetGpuDataType(); size_t buffer_a_size = 0, buffer_b_size = 0; @@ -587,14 +679,25 @@ void SparseBlas::SPGEMM(bool transa, tmp_buffer_b_ptr); }); - int64_t num_rows, num_cols, nnz; + int64_t out_crows_size, out_cols_size, out_values_size; dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpMatGetSize( - out_descriptor.descriptor(), &num_rows, &num_cols, &nnz); + phi::dynload::cusparseSpMatGetSize(out_descriptor.descriptor(), + &out_crows_size, + &out_cols_size, + &out_values_size); }); - *(mat_out->mutable_cols()) = phi::Empty(dev_ctx_, {nnz}); - *(mat_out->mutable_values()) = phi::Empty(dev_ctx_, {nnz}); - auto res_descriptor = CuSparseSpMatDescriptor(*mat_out, dev_ctx_); + + // Reallocate space for cols and values of mat_out + mat_out_cols->Resize(make_dim(out_values_size)); + dev_ctx_.template Alloc(mat_out_cols); + mat_out_values->Resize(make_dim(out_values_size)); + dev_ctx_.template Alloc(mat_out_values); + + phi::dynload::cusparseCsrSetPointers( + out_descriptor.descriptor(), + const_cast(mat_out_crows->data()), + const_cast(mat_out_cols->data()), + const_cast(mat_out_values->data())); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpGEMM_copy(handle, @@ -604,12 +707,20 @@ void SparseBlas::SPGEMM(bool transa, a_descriptor.descriptor(), b_descriptor.descriptor(), &beta, - res_descriptor.descriptor(), + out_descriptor.descriptor(), gpu_type, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc); }); + if (mat_a.crows().dtype() == phi::DataType::INT64 || + mat_b.crows().dtype() == phi::DataType::INT64) { + phi::CastKernel( + dev_ctx_, *mat_out_crows, phi::DataType::INT64, mat_out_crows); + phi::CastKernel( + dev_ctx_, *mat_out_cols, phi::DataType::INT64, mat_out_cols); + } + phi::dynload::cusparseSpGEMM_destroyDescr(spgemmDesc); } } // namespace sparse diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index b78e96c891b91..702fa650e137f 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu @@ -159,66 +159,26 @@ void MatmulCsrCsrGradKernel(const Context& dev_ctx, perm = {0, 2, 1}; } - // cusparseSpGEMM only support 32-bit index. - SparseCsrTensor dout_tmp; - CastCsrKernel( - dev_ctx, dout, DataType::INT32, dout.values().dtype(), &dout_tmp); - // dx{SparseCsr} = dout{SparseCsr} * y'{SparseCsr} if (dx) { - SparseCsrTensor x_tmp, dx_tmp; - CastCsrKernel( - dev_ctx, x, DataType::INT32, x.values().dtype(), &x_tmp); - - EmptyLikeCsrKernel(dev_ctx, x_tmp, &dx_tmp); - // cusparseSpGEMM only support CUSPARSE_OPERATION_NON_TRANSPOSE. - SparseCsrTensor trans_y, trans_y_tmp; + // transopse y before cusparseSpGEMM computation. 
+ SparseCsrTensor trans_y; TransposeCsrKernel(dev_ctx, y, perm, &trans_y); - CastCsrKernel(dev_ctx, - trans_y, - DataType::INT32, - trans_y.values().dtype(), - &trans_y_tmp); - - sparse_blas.SPGEMM(false, - false, - static_cast(1), - dout_tmp, - trans_y_tmp, - static_cast(0), - &dx_tmp); - - CastCsrKernel( - dev_ctx, dx_tmp, DataType::INT64, dx_tmp.values().dtype(), dx); + + sparse_blas.SPGEMM( + false, false, static_cast(1), dout, trans_y, static_cast(0), dx); } // dy{SparseCsr} = x'{SparseCsr} * dout{SparseCsr} if (dy) { - SparseCsrTensor y_tmp, dy_tmp; - CastCsrKernel( - dev_ctx, y, DataType::INT32, y.values().dtype(), &y_tmp); - EmptyLikeCsrKernel(dev_ctx, y_tmp, &dy_tmp); - // cusparseSpGEMM only support CUSPARSE_OPERATION_NON_TRANSPOSE. - SparseCsrTensor trans_x, trans_x_tmp; + // transopse x before cusparseSpGEMM computation. + SparseCsrTensor trans_x; TransposeCsrKernel(dev_ctx, x, perm, &trans_x); - CastCsrKernel(dev_ctx, - trans_x, - DataType::INT32, - trans_x.values().dtype(), - &trans_x_tmp); - - sparse_blas.SPGEMM(false, - false, - static_cast(1), - trans_x_tmp, - dout_tmp, - static_cast(0), - &dy_tmp); - - CastCsrKernel( - dev_ctx, dy_tmp, DataType::INT64, dy_tmp.values().dtype(), dy); + + sparse_blas.SPGEMM( + false, false, static_cast(1), trans_x, dout, static_cast(0), dy); } #else #ifdef PADDLE_WITH_CUDA @@ -236,7 +196,7 @@ void MatmulCooCooGradKernel(const Context& dev_ctx, const SparseCooTensor& dout, SparseCooTensor* dx, SparseCooTensor* dy) { - // cusparseSpGEMM only support CSR now, so use COO->CSR->COO + // cusparseSpGEMM only support CSR now, so use COO->CSR->COO. SparseCsrTensor x_csr = CooToCsr(dev_ctx, x); SparseCsrTensor y_csr = CooToCsr(dev_ctx, y); SparseCsrTensor dout_csr = CooToCsr(dev_ctx, dout); diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index 3216dbcaadbaa..7965ae6a8dcba 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -159,40 +159,14 @@ void MatmulCsrCsrKernel(const Context& dev_ctx, "The shape of Input(x) and Input(y) is not suitable for matmul " "opetation, x_dim[-1] must be eaqual to y_dim[-2].")); - // cusparseSpGEMM only support 32-bit index. 
- SparseCsrTensor x_tmp, y_tmp, out_tmp; - CastCsrKernel( - dev_ctx, x, DataType::INT32, x.values().dtype(), &x_tmp); - CastCsrKernel( - dev_ctx, y, DataType::INT32, y.values().dtype(), &y_tmp); - - std::vector out_dim_vec = phi::vectorize(out->dims()); - int batch_size = 1; - for (int i = 0; i < out_dim_vec.size() - 2; i++) { - batch_size *= out_dim_vec[i]; - } - int64_t out_crows_size = batch_size * (xdim_vec[x_ndims - 2] + 1); - DenseTensor out_crows = phi::Empty(dev_ctx, {out_crows_size}); - DenseTensor out_cols = phi::Empty(dev_ctx, {0}); - DenseTensor out_values = phi::Empty(dev_ctx, {0}); - out_tmp.SetMember(out_crows, out_cols, out_values, out->dims()); - auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); - sparse_blas.SPGEMM(false, - false, - static_cast(1), - x_tmp, - y_tmp, - static_cast(0), - &out_tmp); - - CastCsrKernel( - dev_ctx, out_tmp, DataType::INT64, out_tmp.values().dtype(), out); + sparse_blas.SPGEMM( + false, false, static_cast(1), x, y, static_cast(0), out); #else #ifdef PADDLE_WITH_CUDA PADDLE_THROW(phi::errors::Unimplemented( - "forward of 'sparse.matmul' use cusparseSPGEMM, " + "forward of 'sparse.matmul' use cusparseSpGEMM, " "which is supported from CUDA 11.0")); #endif #endif @@ -203,7 +177,7 @@ void MatmulCooCooKernel(const Context& dev_ctx, const SparseCooTensor& x, const SparseCooTensor& y, SparseCooTensor* out) { - // 'cusparseSPGEMM' only support CSR now, so use COO->CSR->COO, + // 'cusparseSPGEMM' only support CSR now, so use COO->CSR->COO. SparseCsrTensor x_csr = CooToCsr(dev_ctx, x); SparseCsrTensor y_csr = CooToCsr(dev_ctx, y); SparseCsrTensor out_csr; diff --git a/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu b/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu index cc67cb0021ddc..e922aea7809f5 100644 --- a/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu @@ -17,6 +17,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/sparse/empty_kernel.h" @@ -36,16 +37,16 @@ __global__ void TransposeCooCudaKernel(const int64_t *x_indices_data, } } -template -__global__ void TransposeCsr2DCudaKernel(const int64_t *x_crows_data, - const int64_t *x_cols_data, +template +__global__ void TransposeCsr2DCudaKernel(const IntT *x_crows_data, + const IntT *x_cols_data, const T *x_values_data, const int *perm, const int64_t *x_dims, const int64_t *out_dims, const int64_t x_nnz, - int64_t *out_crows_data, - int64_t *out_cols_data, + IntT *out_crows_data, + IntT *out_cols_data, T *out_values_data) { int64_t __index__ = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; @@ -56,16 +57,16 @@ __global__ void TransposeCsr2DCudaKernel(const int64_t *x_crows_data, __syncthreads(); if (__index__ == 0) { for (int64_t i = 0; i < x_nnz; ++i) { - int j = x_cols_data[i]; + IntT j = x_cols_data[i]; out_crows_data[j + 2]++; } - for (int64_t i = 0; i < out_dims[0]; i += 1) { + for (int i = 0; i < out_dims[0]; i += 1) { out_crows_data[i + 1] += out_crows_data[i]; } // compute out_cols_data and out_values_data by out_crows_data and x for (int i = 0; i < x_dims[0]; ++i) { - int64_t start = x_crows_data[i]; - int64_t end = x_crows_data[i + 1]; + IntT start = x_crows_data[i]; + IntT end = x_crows_data[i + 1]; for (int64_t j = start; j < end; ++j) { int64_t x_cols_j = 
x_cols_data[j] + 1; int64_t jjj = out_crows_data[x_cols_j]; @@ -77,17 +78,17 @@ __global__ void TransposeCsr2DCudaKernel(const int64_t *x_crows_data, } } -template -__global__ void TransposeCsr3DCudaKernel(const int64_t *x_crows_data, - const int64_t *x_cols_data, +template +__global__ void TransposeCsr3DCudaKernel(const IntT *x_crows_data, + const IntT *x_cols_data, const T *x_values_data, const int *perm, const int64_t *x_dims, const int64_t *out_dims, const std::size_t n_dim, const int64_t x_nnz, - int64_t *out_crows_data, - int64_t *out_cols_data, + IntT *out_crows_data, + IntT *out_cols_data, T *out_values_data) { int64_t __index__ = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; @@ -109,8 +110,8 @@ __global__ void TransposeCsr3DCudaKernel(const int64_t *x_crows_data, } // compute out_cols_data and out_values_data by out_crows_data and x for (int i = 0; i < x_n_rows; ++i) { - int64_t start = x_crows_data[i]; - int64_t end = x_crows_data[i + 1]; + IntT start = x_crows_data[i]; + IntT end = x_crows_data[i + 1]; for (int64_t j = start; j < end; ++j) { int64_t x_cols_j = x_cols_data[j] + 1; int64_t jjj = out_crows_data[x_cols_j]; @@ -130,9 +131,9 @@ __global__ void TransposeCsr3DCudaKernel(const int64_t *x_crows_data, int x_cols_offset = 0; int out_cols_index = 0; for (int i = 0; i < x_dims[0]; ++i) { - int x_crows_index = i * (x_n_rows + 1); - int start = x_crows_data[x_crows_index + k]; - int end = x_crows_data[x_crows_index + 1 + k]; + IntT x_crows_index = i * (x_n_rows + 1); + IntT start = x_crows_data[x_crows_index + k]; + IntT end = x_crows_data[x_crows_index + 1 + k]; out_crows_data[i + 1] = end - start; for (int j = start; j < end; ++j) { out_cols_data[out_cols_index] = x_cols_data[x_cols_offset + j]; @@ -192,11 +193,11 @@ void TransposeCooKernel(const Context &dev_ctx, x_indices_data, d_perm, n_dim, x_nnz, out_indices_data); } -template -void TransposeCsrKernel(const Context &dev_ctx, - const SparseCsrTensor &x, - const std::vector &perm, - SparseCsrTensor *out) { +template +void TransposeCsrGpuKernel(const GPUContext &dev_ctx, + const SparseCsrTensor &x, + const std::vector &perm, + SparseCsrTensor *out) { std::size_t n_dim = perm.size(); const DenseTensor &x_crows = x.crows(); const DenseTensor &x_cols = x.cols(); @@ -213,36 +214,36 @@ void TransposeCsrKernel(const Context &dev_ctx, // create out sparse tensor DDim out_dims = x.dims().transpose(perm); if (n_dim == 2) { - out_crows = Empty(dev_ctx, {out_dims[0] + 1}); + out_crows = Empty(dev_ctx, {out_dims[0] + 1}); } else { out_crows = - Empty(dev_ctx, {out_dims[0] * (out_dims[1] + 1)}); + Empty(dev_ctx, {out_dims[0] * (out_dims[1] + 1)}); } - out_cols = EmptyLike(dev_ctx, x.cols()); - out_values = EmptyLike(dev_ctx, x.values()); + out_cols = EmptyLike(dev_ctx, x.cols()); + out_values = EmptyLike(dev_ctx, x.values()); out->SetMember(out_crows, out_cols, out_values, out_dims); // transpose by two stages if (perm[0] == 1 && perm[1] == 2) { // perm == {1, 2, 0} SparseCsrTensor temp; - TransposeCsrKernel(dev_ctx, x, {1, 0, 2}, &temp); - TransposeCsrKernel(dev_ctx, temp, {0, 2, 1}, out); + TransposeCsrKernel(dev_ctx, x, {1, 0, 2}, &temp); + TransposeCsrKernel(dev_ctx, temp, {0, 2, 1}, out); return; } else if (perm[0] == 2 && perm[1] == 0) { // perm == {2, 0, 1} SparseCsrTensor temp; - TransposeCsrKernel(dev_ctx, x, {0, 2, 1}, &temp); - TransposeCsrKernel(dev_ctx, temp, {1, 0, 2}, out); + TransposeCsrKernel(dev_ctx, x, {0, 2, 1}, &temp); + TransposeCsrKernel(dev_ctx, temp, {1, 0, 2}, out); return; } else if (perm[0] == 2 && 
perm[1] == 1) { // perm == {2, 1, 0} SparseCsrTensor temp; - TransposeCsrKernel(dev_ctx, x, {1, 0, 2}, &temp); - TransposeCsrKernel(dev_ctx, temp, {2, 0, 1}, out); + TransposeCsrKernel(dev_ctx, x, {1, 0, 2}, &temp); + TransposeCsrKernel(dev_ctx, temp, {2, 0, 1}, out); return; } - int64_t *out_crows_data = out_crows.data(); - int64_t *out_cols_data = out_cols.data(); + IntT *out_crows_data = out_crows.data(); + IntT *out_cols_data = out_cols.data(); T *out_values_data = out_values.data(); - const int64_t *x_crows_data = x_crows.data(); - const int64_t *x_cols_data = x_cols.data(); + const IntT *x_crows_data = x_crows.data(); + const IntT *x_cols_data = x_cols.data(); const T *x_values_data = x_values.data(); int *d_perm; int64_t *d_x_dims, *d_out_dims; @@ -312,6 +313,18 @@ void TransposeCsrKernel(const Context &dev_ctx, out_values_data); } } + +template +void TransposeCsrKernel(const Context &dev_ctx, + const SparseCsrTensor &x, + const std::vector &perm, + SparseCsrTensor *out) { + PD_VISIT_BASE_INTEGRAL_TYPES(x.crows().dtype(), "TransposeCsrKernel", ([&] { + TransposeCsrGpuKernel( + dev_ctx, x, perm, out); + })); +} + } // namespace sparse } // namespace phi diff --git a/test/legacy_test/test_sparse_matmul_op.py b/test/legacy_test/test_sparse_matmul_op.py index 14c45500857fa..3d9ff843fd113 100644 --- a/test/legacy_test/test_sparse_matmul_op.py +++ b/test/legacy_test/test_sparse_matmul_op.py @@ -93,9 +93,9 @@ def test_matmul_3d(self): self.check_result([8, 16, 12], [8, 12, 10], 'csr') -class TestMatmulCsrCsr(unittest.TestCase): - # x: csr sparse, y: csr sparse, out: csr sparse - def check_result(self, x_shape, y_shape): +class TestMatmulSparseSparseIndex64(unittest.TestCase): + # x: sparse, y: sparse, out: sparse + def check_result(self, x_shape, y_shape, format): mask = paddle.randint(0, 2, x_shape) origin_x = paddle.rand(x_shape) * mask origin_y = paddle.rand(y_shape) @@ -106,8 +106,12 @@ def check_result(self, x_shape, y_shape): dense_y.stop_gradient = False dense_out = paddle.matmul(dense_x, dense_y) - sp_x = origin_x.detach().to_sparse_csr() - sp_y = origin_y.detach().to_sparse_csr() + if format == "coo": + sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) + sp_y = origin_y.detach().to_sparse_coo(len(y_shape)) + else: + sp_x = origin_x.detach().to_sparse_csr() + sp_y = origin_y.detach().to_sparse_csr() sp_x.stop_gradient = False sp_y.stop_gradient = False @@ -134,19 +138,21 @@ def check_result(self, x_shape, y_shape): "only support cuda>=11.0", ) def test_matmul_2d(self): - self.check_result([16, 12], [12, 10]) + self.check_result([16, 12], [12, 10], 'coo') + self.check_result([16, 12], [12, 10], 'csr') @unittest.skipIf( not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, "only support cuda>=11.8", ) def test_matmul_3d(self): - self.check_result([8, 16, 12], [8, 12, 10]) + self.check_result([8, 16, 12], [8, 12, 10], 'coo') + self.check_result([8, 16, 12], [8, 12, 10], 'csr') -class TestMatmulCooCoo(unittest.TestCase): - # x: coo sparse, y: coo sparse, out: coo sparse - def check_result(self, x_shape, y_shape): +class TestMatmulSparseSparseIndex32(unittest.TestCase): + # x: sparse, y: sparse, out: sparse + def check_result(self, x_shape, y_shape, format): mask = paddle.randint(0, 2, x_shape) origin_x = paddle.rand(x_shape) * mask origin_y = paddle.rand(y_shape) @@ -157,17 +163,45 @@ def check_result(self, x_shape, y_shape): dense_y.stop_gradient = False dense_out = paddle.matmul(dense_x, dense_y) - sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) - sp_y = 
origin_y.detach().to_sparse_coo(len(y_shape)) + if format == "coo": + sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) + # cast to 32-bit index. + sp_x_indices = paddle.cast(sp_x.indices(), "int32") + sp_x = paddle.sparse.sparse_coo_tensor( + sp_x_indices, sp_x.values(), sp_x.shape + ) + + sp_y = origin_y.detach().to_sparse_coo(len(y_shape)) + # cast to 32-bit index. + sp_y_indices = paddle.cast(sp_y.indices(), "int32") + sp_y = paddle.sparse.sparse_coo_tensor( + sp_y_indices, sp_y.values(), sp_y.shape + ) + else: + sp_x = origin_x.detach().to_sparse_csr() + # cast to 32-bit index. + sp_x_crows = paddle.cast(sp_x.crows(), "int32") + sp_x_cols = paddle.cast(sp_x.cols(), "int32") + sp_x = paddle.sparse.sparse_csr_tensor( + sp_x_crows, sp_x_cols, sp_x.values(), sp_x.shape + ) + + sp_y = origin_y.detach().to_sparse_csr() + # cast to 32-bit index. + sp_y_crows = paddle.cast(sp_y.crows(), "int32") + sp_y_cols = paddle.cast(sp_y.cols(), "int32") + sp_y = paddle.sparse.sparse_csr_tensor( + sp_y_crows, sp_y_cols, sp_y.values(), sp_y.shape + ) sp_x.stop_gradient = False sp_y.stop_gradient = False sp_out = paddle.sparse.matmul(sp_x, sp_y) + np.testing.assert_allclose( sp_out.to_dense().numpy(), dense_out.numpy(), rtol=1e-05 ) - if get_cuda_version() >= 11030: dense_out.backward() sp_out.backward() @@ -185,14 +219,16 @@ def check_result(self, x_shape, y_shape): "only support cuda>=11.0", ) def test_matmul_2d(self): - self.check_result([16, 12], [12, 10]) + self.check_result([16, 12], [12, 10], 'coo') + self.check_result([16, 12], [12, 10], 'csr') @unittest.skipIf( not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, "only support cuda>=11.8", ) def test_matmul_3d(self): - self.check_result([8, 16, 12], [8, 12, 10]) + self.check_result([8, 16, 12], [8, 12, 10], 'coo') + self.check_result([8, 16, 12], [8, 12, 10], 'csr') class TestMaskedMatmul(unittest.TestCase): From 8dfe895679da9c4d762175602bb82c5b0ba779db Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Sat, 30 Dec 2023 08:35:57 +0000 Subject: [PATCH 15/25] fix --- paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index cc3189dc81b69..d0250035cb432 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -688,9 +688,9 @@ void SparseBlas::SPGEMM(bool transa, }); // Reallocate space for cols and values of mat_out - mat_out_cols->Resize(make_dim(out_values_size)); + mat_out_cols->Resize(common::make_dim(out_values_size)); dev_ctx_.template Alloc(mat_out_cols); - mat_out_values->Resize(make_dim(out_values_size)); + mat_out_values->Resize(common::make_dim(out_values_size)); dev_ctx_.template Alloc(mat_out_values); phi::dynload::cusparseCsrSetPointers( From 9c80efec77202f0d0cb788cb2eb5abf56fc8b955 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Sun, 31 Dec 2023 10:16:11 +0000 Subject: [PATCH 16/25] fix --- .../funcs/sparse/sparse_blas_impl.cu.h | 10 ++++++++++ .../kernels/sparse/gpu/transpose_kernel.cu | 20 +++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index d0250035cb432..b4b94ac37a400 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -547,6 
+547,16 @@ class CuSparseSpGEMMCsrDescriptor { gpu_type); }); +#if CUDA_VERSION >= 11080 + dev_ctx.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseCsrSetStridedBatch( + descriptor_, batch_size, M + 1, batch_nnz); + }); +#else + PADDLE_THROW(phi::errors::Unimplemented( + "Batch Sparse matmul use 'cusparseCsrSetStridedBatch', which is " + "supported from CUDA 11.8")); +#endif VLOG(6) << "Create csr cusparseSpMatDescr_t " << &descriptor_; } diff --git a/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu b/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu index e922aea7809f5..ac11b64cd0229 100644 --- a/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu @@ -60,16 +60,16 @@ __global__ void TransposeCsr2DCudaKernel(const IntT *x_crows_data, IntT j = x_cols_data[i]; out_crows_data[j + 2]++; } - for (int i = 0; i < out_dims[0]; i += 1) { + for (int64_t i = 0; i < out_dims[0]; i += 1) { out_crows_data[i + 1] += out_crows_data[i]; } // compute out_cols_data and out_values_data by out_crows_data and x for (int i = 0; i < x_dims[0]; ++i) { IntT start = x_crows_data[i]; IntT end = x_crows_data[i + 1]; - for (int64_t j = start; j < end; ++j) { - int64_t x_cols_j = x_cols_data[j] + 1; - int64_t jjj = out_crows_data[x_cols_j]; + for (IntT j = start; j < end; ++j) { + IntT x_cols_j = x_cols_data[j] + 1; + IntT jjj = out_crows_data[x_cols_j]; out_cols_data[jjj] = i; out_values_data[jjj] = x_values_data[j]; out_crows_data[x_cols_j]++; @@ -112,9 +112,9 @@ __global__ void TransposeCsr3DCudaKernel(const IntT *x_crows_data, for (int i = 0; i < x_n_rows; ++i) { IntT start = x_crows_data[i]; IntT end = x_crows_data[i + 1]; - for (int64_t j = start; j < end; ++j) { - int64_t x_cols_j = x_cols_data[j] + 1; - int64_t jjj = out_crows_data[x_cols_j]; + for (IntT j = start; j < end; ++j) { + IntT x_cols_j = x_cols_data[j] + 1; + IntT jjj = out_crows_data[x_cols_j]; out_cols_data[jjj] = i; out_values_data[jjj] = x_values_data[j]; out_crows_data[x_cols_j]++; @@ -131,9 +131,9 @@ __global__ void TransposeCsr3DCudaKernel(const IntT *x_crows_data, int x_cols_offset = 0; int out_cols_index = 0; for (int i = 0; i < x_dims[0]; ++i) { - IntT x_crows_index = i * (x_n_rows + 1); - IntT start = x_crows_data[x_crows_index + k]; - IntT end = x_crows_data[x_crows_index + 1 + k]; + int x_crows_index = i * (x_n_rows + 1); + int start = x_crows_data[x_crows_index + k]; + int end = x_crows_data[x_crows_index + 1 + k]; out_crows_data[i + 1] = end - start; for (int j = start; j < end; ++j) { out_cols_data[out_cols_index] = x_cols_data[x_cols_offset + j]; From 477dc5c14ad8fc3d40cfa2ef11c44f0eb3056552 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Tue, 2 Jan 2024 06:32:27 +0000 Subject: [PATCH 17/25] fix cuda<11.8 --- .../funcs/sparse/sparse_blas_impl.cu.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index b4b94ac37a400..73c777ef42198 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -546,18 +546,19 @@ class CuSparseSpGEMMCsrDescriptor { CUSPARSE_INDEX_BASE_ZERO, gpu_type); }); - + if (batch_size > 1) { #if CUDA_VERSION >= 11080 - dev_ctx.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseCsrSetStridedBatch( - descriptor_, batch_size, M + 1, batch_nnz); - }); + dev_ctx.CusparseCall([&](cusparseHandle_t handle) { + 
phi::dynload::cusparseCsrSetStridedBatch( + descriptor_, batch_size, M + 1, batch_nnz); + }); #else - PADDLE_THROW(phi::errors::Unimplemented( - "Batch Sparse matmul use 'cusparseCsrSetStridedBatch', which is " - "supported from CUDA 11.8")); + PADDLE_THROW(phi::errors::Unimplemented( + "Batch Sparse matmul use 'cusparseCsrSetStridedBatch', which is " + "supported from CUDA 11.8")); #endif - VLOG(6) << "Create csr cusparseSpMatDescr_t " << &descriptor_; + VLOG(6) << "Create csr cusparseSpMatDescr_t " << &descriptor_; + } } ~CuSparseSpGEMMCsrDescriptor() { From ea71c64204379d95d59b50697ade7f20938b0f35 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Wed, 3 Jan 2024 12:39:41 +0000 Subject: [PATCH 18/25] ci --- paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 73c777ef42198..4f87c3c4ba874 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -614,6 +614,7 @@ void SparseBlas::SPGEMM(bool transa, cusparseSpGEMMDescr_t spgemmDesc; phi::dynload::cusparseSpGEMM_createDescr(&spgemmDesc); + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpGEMM_workEstimation(handle, GetTransposeOperation(transa), From 4397dbb5e8f16752a8b53bd4118b3cfa118f0bfd Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Tue, 9 Jan 2024 16:07:01 +0000 Subject: [PATCH 19/25] fix batched computation --- .../funcs/sparse/sparse_blas_impl.cu.h | 478 +++++++++++------- .../kernels/sparse/gpu/matmul_grad_kernel.cu | 14 +- test/legacy_test/test_sparse_matmul_op.py | 59 +-- 3 files changed, 323 insertions(+), 228 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 4f87c3c4ba874..0ca2f5256b3b6 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -27,6 +27,7 @@ #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/concat_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" namespace phi { @@ -495,89 +496,12 @@ void SparseBlas::SDDMM(bool transa, /************* SPARSE*SPARSE->SPARSE MATMUL ************/ template -class CuSparseSpGEMMCsrDescriptor { - public: - explicit CuSparseSpGEMMCsrDescriptor(const phi::SparseCsrTensor& x, - const phi::GPUContext& dev_ctx) - : dev_ctx_(dev_ctx) { - std::vector xdim_vec = phi::vectorize(x.dims()); - auto x_ndims = xdim_vec.size(); - - int64_t M = xdim_vec[x_ndims - 2]; - int64_t N = xdim_vec[x_ndims - 1]; - int batch_size = 1; - for (int i = 0; i < x_ndims - 2; i++) { - batch_size *= xdim_vec[i]; - } - - const int32_t *crows_data, *cols_data; - if (x.crows().dtype() == phi::DataType::INT32) { - crows_data = x.crows().data(); - cols_data = x.cols().data(); - } else { - phi::MetaTensor crows_meta(&crows_int); - crows_meta.set_dims(x.crows().dims()); - - phi::MetaTensor cols_meta(&cols_int); - cols_meta.set_dims(x.cols().dims()); - - phi::CastKernel( - dev_ctx, x.crows(), phi::DataType::INT32, &crows_int); - phi::CastKernel( - dev_ctx, x.cols(), phi::DataType::INT32, &cols_int); - - crows_data = crows_int.data(); - cols_data = cols_int.data(); - } - - const T* values_data = x.values().data(); - int64_t batch_nnz = x.nnz() / batch_size; - cudaDataType_t 
gpu_type = GetGpuDataType(); - dev_ctx.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseCreateCsr(&descriptor_, - M, - N, - batch_nnz, - const_cast(crows_data), - const_cast(cols_data), - const_cast(values_data), - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - gpu_type); - }); - if (batch_size > 1) { -#if CUDA_VERSION >= 11080 - dev_ctx.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseCsrSetStridedBatch( - descriptor_, batch_size, M + 1, batch_nnz); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "Batch Sparse matmul use 'cusparseCsrSetStridedBatch', which is " - "supported from CUDA 11.8")); -#endif - VLOG(6) << "Create csr cusparseSpMatDescr_t " << &descriptor_; - } - } - - ~CuSparseSpGEMMCsrDescriptor() { - dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseDestroySpMat(descriptor_); - }); - VLOG(6) << "Destroy cusparseSpMatDescr_t " << &descriptor_; - } - - const cusparseSpMatDescr_t& descriptor() const { return descriptor_; } - - private: - const phi::GPUContext& dev_ctx_; - cusparseSpMatDescr_t descriptor_; - - // temporarily save crows and cols for int64_t index csr - DenseTensor crows_int; - DenseTensor cols_int; -}; +__global__ void GetBatchNNZ(const int32_t* crow_data, + int64_t rows, + int32_t* batch_nnz) { + int64_t i = static_cast(threadIdx.x); + batch_nnz[i] = crow_data[(i + 1) * (rows + 1) - 1]; +} template <> template @@ -605,125 +529,299 @@ void SparseBlas::SPGEMM(bool transa, out_values_meta.set_dtype(mat_a.values().dtype()); dev_ctx_.template Alloc(mat_out_values); - auto a_descriptor = CuSparseSpGEMMCsrDescriptor(mat_a, dev_ctx_); - auto b_descriptor = CuSparseSpGEMMCsrDescriptor(mat_b, dev_ctx_); - auto out_descriptor = CuSparseSpGEMMCsrDescriptor(*mat_out, dev_ctx_); + std::vector a_dim_vec = common::vectorize(mat_a.dims()); + auto a_ndims = a_dim_vec.size(); + const int64_t a_rows = a_dim_vec[a_ndims - 2]; + const int64_t a_cols = a_dim_vec[a_ndims - 1]; + int a_batch_size = 1; + for (int i = 0; i < a_ndims - 2; i++) { + a_batch_size *= a_dim_vec[i]; + } + + std::vector b_dim_vec = common::vectorize(mat_b.dims()); + auto b_ndims = b_dim_vec.size(); + const int64_t b_rows = b_dim_vec[b_ndims - 2]; + const int64_t b_cols = b_dim_vec[b_ndims - 1]; + int b_batch_size = 1; + for (int i = 0; i < b_ndims - 2; i++) { + b_batch_size *= b_dim_vec[i]; + } + + const int batch_size = a_batch_size; + + // cusparseSpGEMM only support 32-bit indices. 
+ DenseTensor a_crows_int, a_cols_int, b_crows_int, b_cols_int; + const int32_t *a_crows_data, *a_cols_data, *b_crows_data, *b_cols_data; + if (mat_a.crows().dtype() == phi::DataType::INT32) { + a_crows_data = mat_a.crows().data(); + a_cols_data = mat_a.cols().data(); + } else { + phi::MetaTensor crows_meta(&a_crows_int); + crows_meta.set_dims(mat_a.crows().dims()); + phi::MetaTensor cols_meta(&a_cols_int); + cols_meta.set_dims(mat_a.cols().dims()); + + phi::CastKernel( + dev_ctx_, mat_a.crows(), phi::DataType::INT32, &a_crows_int); + phi::CastKernel( + dev_ctx_, mat_a.cols(), phi::DataType::INT32, &a_cols_int); + + a_crows_data = a_crows_int.data(); + a_cols_data = a_cols_int.data(); + } + + if (mat_b.crows().dtype() == phi::DataType::INT32) { + b_crows_data = mat_b.crows().data(); + b_cols_data = mat_b.cols().data(); + } else { + phi::MetaTensor crows_meta(&b_crows_int); + crows_meta.set_dims(mat_b.crows().dims()); + phi::MetaTensor cols_meta(&b_cols_int); + cols_meta.set_dims(mat_b.cols().dims()); + + phi::CastKernel( + dev_ctx_, mat_b.crows(), phi::DataType::INT32, &b_crows_int); + phi::CastKernel( + dev_ctx_, mat_b.cols(), phi::DataType::INT32, &b_cols_int); + + b_crows_data = b_crows_int.data(); + b_cols_data = b_cols_int.data(); + } + + const T* a_values_data = mat_a.values().data(); + const T* b_values_data = mat_b.values().data(); + const int32_t* out_crows_data = mat_out->crows().data(); + + std::vector a_batch_nnz_vec(batch_size); + std::vector b_batch_nnz_vec(batch_size); + + if (batch_size == 1) { + a_batch_nnz_vec[0] = mat_a.nnz(); + b_batch_nnz_vec[0] = mat_b.nnz(); + } else { + phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc( + dev_ctx_.GetPlace(), + batch_size * sizeof(int32_t), + phi::Stream(reinterpret_cast(dev_ctx_.stream()))); + void* tmp_buffer_ptr = tmp_buffer->ptr(); + + GetBatchNNZ<<<1, batch_size, 0, dev_ctx_.stream()>>>( + a_crows_data, a_rows, static_cast(tmp_buffer_ptr)); + phi::backends::gpu::GpuMemcpyAsync(a_batch_nnz_vec.data(), + tmp_buffer_ptr, + batch_size * sizeof(int32_t), + gpuMemcpyDeviceToHost, + dev_ctx_.stream()); + + GetBatchNNZ<<<1, batch_size, 0, dev_ctx_.stream()>>>( + b_crows_data, b_rows, static_cast(tmp_buffer_ptr)); + phi::backends::gpu::GpuMemcpyAsync(b_batch_nnz_vec.data(), + tmp_buffer_ptr, + batch_size * sizeof(int32_t), + gpuMemcpyDeviceToHost, + dev_ctx_.stream()); + } + + std::vector out_batch_cols_vec(batch_size); + std::vector out_batch_values_vec(batch_size); cudaDataType_t gpu_type = GetGpuDataType(); - size_t buffer_a_size = 0, buffer_b_size = 0; - cusparseSpGEMMDescr_t spgemmDesc; - phi::dynload::cusparseSpGEMM_createDescr(&spgemmDesc); + for (int i = 0; i < batch_size; ++i) { + int32_t a_batch_nnz = a_batch_nnz_vec[i]; + int32_t b_batch_nnz = b_batch_nnz_vec[i]; - dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpGEMM_workEstimation(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc, - &buffer_a_size, - nullptr); - }); + const int32_t* a_batch_crows_data = a_crows_data + i * (a_rows + 1); + const int32_t* a_batch_cols_data = a_cols_data + i * a_batch_nnz; + const T* a_batch_values_data = a_values_data + i * a_batch_nnz; - phi::Allocator::AllocationPtr tmp_buffer_a = phi::memory_utils::Alloc( - dev_ctx_.GetPlace(), - buffer_a_size, - phi::Stream(reinterpret_cast(dev_ctx_.stream()))); - void* 
tmp_buffer_a_ptr = tmp_buffer_a->ptr(); + const int32_t* b_batch_crows_data = b_crows_data + i * (b_rows + 1); + const int32_t* b_batch_cols_data = b_cols_data + i * b_batch_nnz; + const T* b_batch_values_data = b_values_data + i * b_batch_nnz; - dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpGEMM_workEstimation(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc, - &buffer_a_size, - tmp_buffer_a_ptr); - }); + const int32_t* out_batch_crows_data = out_crows_data + i * (a_rows + 1); - dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpGEMM_compute(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc, - &buffer_b_size, - nullptr); - }); + cusparseSpMatDescr_t a_batch_desc, b_batch_desc, out_batch_desc; - phi::Allocator::AllocationPtr tmp_buffer_b = phi::memory_utils::Alloc( - dev_ctx_.GetPlace(), - buffer_b_size, - phi::Stream(reinterpret_cast(dev_ctx_.stream()))); - void* tmp_buffer_b_ptr = tmp_buffer_b->ptr(); + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseCreateCsr(&a_batch_desc, + a_rows, + a_cols, + a_batch_nnz, + const_cast(a_batch_crows_data), + const_cast(a_batch_cols_data), + const_cast(a_batch_values_data), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + gpu_type); + }); - dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpGEMM_compute(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc, - &buffer_b_size, - tmp_buffer_b_ptr); - }); + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseCreateCsr(&b_batch_desc, + b_rows, + b_cols, + b_batch_nnz, + const_cast(b_batch_crows_data), + const_cast(b_batch_cols_data), + const_cast(b_batch_values_data), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + gpu_type); + }); - int64_t out_crows_size, out_cols_size, out_values_size; - dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpMatGetSize(out_descriptor.descriptor(), - &out_crows_size, - &out_cols_size, - &out_values_size); - }); + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseCreateCsr(&out_batch_desc, + a_rows, + b_cols, + 0, + nullptr, + nullptr, + nullptr, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + gpu_type); + }); - // Reallocate space for cols and values of mat_out - mat_out_cols->Resize(common::make_dim(out_values_size)); - dev_ctx_.template Alloc(mat_out_cols); - mat_out_values->Resize(common::make_dim(out_values_size)); - dev_ctx_.template Alloc(mat_out_values); + size_t buffer_a_size = 0, buffer_b_size = 0; + cusparseSpGEMMDescr_t spgemm_desc; + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpGEMM_createDescr(&spgemm_desc); + }); - phi::dynload::cusparseCsrSetPointers( - out_descriptor.descriptor(), - const_cast(mat_out_crows->data()), - const_cast(mat_out_cols->data()), - const_cast(mat_out_values->data())); + 
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpGEMM_workEstimation(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_batch_desc, + b_batch_desc, + &beta, + out_batch_desc, + gpu_type, + CUSPARSE_SPGEMM_DEFAULT, + spgemm_desc, + &buffer_a_size, + nullptr); + }); - dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpGEMM_copy(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc); - }); + phi::Allocator::AllocationPtr tmp_buffer_a = phi::memory_utils::Alloc( + dev_ctx_.GetPlace(), + buffer_a_size, + phi::Stream(reinterpret_cast(dev_ctx_.stream()))); + void* tmp_buffer_a_ptr = tmp_buffer_a->ptr(); + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpGEMM_workEstimation(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_batch_desc, + b_batch_desc, + &beta, + out_batch_desc, + gpu_type, + CUSPARSE_SPGEMM_DEFAULT, + spgemm_desc, + &buffer_a_size, + tmp_buffer_a_ptr); + }); + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpGEMM_compute(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_batch_desc, + b_batch_desc, + &beta, + out_batch_desc, + gpu_type, + CUSPARSE_SPGEMM_DEFAULT, + spgemm_desc, + &buffer_b_size, + nullptr); + }); + + phi::Allocator::AllocationPtr tmp_buffer_b = phi::memory_utils::Alloc( + dev_ctx_.GetPlace(), + buffer_b_size, + phi::Stream(reinterpret_cast(dev_ctx_.stream()))); + void* tmp_buffer_b_ptr = tmp_buffer_b->ptr(); + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpGEMM_compute(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_batch_desc, + b_batch_desc, + &beta, + out_batch_desc, + gpu_type, + CUSPARSE_SPGEMM_DEFAULT, + spgemm_desc, + &buffer_b_size, + tmp_buffer_b_ptr); + }); + + int64_t out_num_crows, out_num_cols, out_num_values; + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpMatGetSize( + out_batch_desc, &out_num_crows, &out_num_cols, &out_num_values); + }); + + out_batch_cols_vec[i].Resize(common::make_dim(out_num_values)); + dev_ctx_.template Alloc(&out_batch_cols_vec[i]); + out_batch_values_vec[i].Resize(common::make_dim(out_num_values)); + dev_ctx_.template Alloc(&out_batch_values_vec[i]); + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseCsrSetPointers( + out_batch_desc, + const_cast(out_batch_crows_data), + const_cast(out_batch_cols_vec[i].data()), + const_cast(out_batch_values_vec[i].data())); + }); + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpGEMM_copy(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_batch_desc, + b_batch_desc, + &beta, + out_batch_desc, + gpu_type, + CUSPARSE_SPGEMM_DEFAULT, + spgemm_desc); + }); + + dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { + phi::dynload::cusparseSpGEMM_destroyDescr(spgemm_desc); + }); + } + + if (batch_size == 1) { + *(mat_out->mutable_cols()) = std::move(out_batch_cols_vec[0]); + *(mat_out->mutable_values()) = std::move(out_batch_values_vec[0]); + + } else { + std::vector cols_vec; + std::vector values_vec; + + for (int i = 0; i < batch_size; ++i) { + cols_vec.push_back(&out_batch_cols_vec[i]); 
+ values_vec.push_back(&out_batch_values_vec[i]); + } + + phi::ConcatKernel(dev_ctx_, cols_vec, 0, mat_out->mutable_cols()); + phi::ConcatKernel(dev_ctx_, values_vec, 0, mat_out->mutable_values()); + } if (mat_a.crows().dtype() == phi::DataType::INT64 || mat_b.crows().dtype() == phi::DataType::INT64) { @@ -732,8 +830,6 @@ void SparseBlas::SPGEMM(bool transa, phi::CastKernel( dev_ctx_, *mat_out_cols, phi::DataType::INT64, mat_out_cols); } - - phi::dynload::cusparseSpGEMM_destroyDescr(spgemmDesc); } } // namespace sparse } // namespace funcs diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index 1d320fdd4266f..21389dedca88b 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu @@ -197,12 +197,14 @@ void MatmulCooCooGradKernel(const Context& dev_ctx, SparseCooTensor* dx, SparseCooTensor* dy) { // cusparseSpGEMM only support CSR now, so use COO->CSR->COO. - SparseCsrTensor x_csr = CooToCsr(dev_ctx, x); - SparseCsrTensor y_csr = CooToCsr(dev_ctx, y); - SparseCsrTensor dout_csr = CooToCsr(dev_ctx, dout); - SparseCsrTensor dx_csr, dy_csr; - dx_csr.set_dims(dx->dims()); - dy_csr.set_dims(dy->dims()); + SparseCsrTensor x_csr, y_csr, dout_csr, dx_csr, dy_csr; + CooToCsrKernel(dev_ctx, x, &x_csr); + CooToCsrKernel(dev_ctx, y, &y_csr); + CooToCsrKernel(dev_ctx, dout, &dout_csr); + MetaTensor meta_dx_csr(&dx_csr); + phi::UnchangedInferMeta(dx, &meta_dx_csr); + MetaTensor meta_dy_csr(&dy_csr); + phi::UnchangedInferMeta(dy, &meta_dy_csr); MatmulCsrCsrGradKernel(dev_ctx, x_csr, y_csr, dout_csr, &dx_csr, &dy_csr); CsrToCooKernel(dev_ctx, dx_csr, dx); CsrToCooKernel(dev_ctx, dy_csr, dy); diff --git a/test/legacy_test/test_sparse_matmul_op.py b/test/legacy_test/test_sparse_matmul_op.py index 3d9ff843fd113..320ff86e93cd4 100644 --- a/test/legacy_test/test_sparse_matmul_op.py +++ b/test/legacy_test/test_sparse_matmul_op.py @@ -96,8 +96,7 @@ def test_matmul_3d(self): class TestMatmulSparseSparseIndex64(unittest.TestCase): # x: sparse, y: sparse, out: sparse def check_result(self, x_shape, y_shape, format): - mask = paddle.randint(0, 2, x_shape) - origin_x = paddle.rand(x_shape) * mask + origin_x = paddle.rand(x_shape) origin_y = paddle.rand(y_shape) dense_x = origin_x.detach() @@ -121,17 +120,17 @@ def check_result(self, x_shape, y_shape, format): np.testing.assert_allclose( sp_out.to_dense().numpy(), dense_out.numpy(), rtol=1e-05 ) - if get_cuda_version() >= 11030: - dense_out.backward() - sp_out.backward() - np.testing.assert_allclose( - sp_x.grad.to_dense().numpy(), - dense_x.grad.numpy(), - rtol=1e-05, - ) - np.testing.assert_allclose( - sp_y.grad.to_dense().numpy(), dense_y.grad.numpy(), rtol=1e-05 - ) + + dense_out.backward() + sp_out.backward() + np.testing.assert_allclose( + sp_x.grad.to_dense().numpy(), + dense_x.grad.numpy(), + rtol=1e-05, + ) + np.testing.assert_allclose( + sp_y.grad.to_dense().numpy(), dense_y.grad.numpy(), rtol=1e-05 + ) @unittest.skipIf( not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, @@ -142,8 +141,8 @@ def test_matmul_2d(self): self.check_result([16, 12], [12, 10], 'csr') @unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, - "only support cuda>=11.8", + not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, + "only support cuda>=11.0", ) def test_matmul_3d(self): self.check_result([8, 16, 12], [8, 12, 10], 'coo') @@ -153,8 +152,7 @@ def test_matmul_3d(self): class 
TestMatmulSparseSparseIndex32(unittest.TestCase): # x: sparse, y: sparse, out: sparse def check_result(self, x_shape, y_shape, format): - mask = paddle.randint(0, 2, x_shape) - origin_x = paddle.rand(x_shape) * mask + origin_x = paddle.rand(x_shape) origin_y = paddle.rand(y_shape) dense_x = origin_x.detach() @@ -196,23 +194,22 @@ def check_result(self, x_shape, y_shape, format): sp_x.stop_gradient = False sp_y.stop_gradient = False - sp_out = paddle.sparse.matmul(sp_x, sp_y) np.testing.assert_allclose( sp_out.to_dense().numpy(), dense_out.numpy(), rtol=1e-05 ) - if get_cuda_version() >= 11030: - dense_out.backward() - sp_out.backward() - np.testing.assert_allclose( - sp_x.grad.to_dense().numpy(), - dense_x.grad.numpy(), - rtol=1e-05, - ) - np.testing.assert_allclose( - sp_y.grad.to_dense().numpy(), dense_y.grad.numpy(), rtol=1e-05 - ) + + dense_out.backward() + sp_out.backward() + np.testing.assert_allclose( + sp_x.grad.to_dense().numpy(), + dense_x.grad.numpy(), + rtol=1e-05, + ) + np.testing.assert_allclose( + sp_y.grad.to_dense().numpy(), dense_y.grad.numpy(), rtol=1e-05 + ) @unittest.skipIf( not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, @@ -223,8 +220,8 @@ def test_matmul_2d(self): self.check_result([16, 12], [12, 10], 'csr') @unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11080, - "only support cuda>=11.8", + not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, + "only support cuda>=11.0", ) def test_matmul_3d(self): self.check_result([8, 16, 12], [8, 12, 10], 'coo') From 28449d483b4eeba6d31692454fbe94887a0398e8 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Tue, 9 Jan 2024 17:18:00 +0000 Subject: [PATCH 20/25] fix --- .../kernels/funcs/sparse/sparse_blas_impl.cu.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 0ca2f5256b3b6..b0bb531592e09 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -496,9 +496,9 @@ void SparseBlas::SDDMM(bool transa, /************* SPARSE*SPARSE->SPARSE MATMUL ************/ template -__global__ void GetBatchNNZ(const int32_t* crow_data, - int64_t rows, - int32_t* batch_nnz) { +__global__ void GetCsrBatchNNZ(const int32_t* crow_data, + int64_t rows, + int32_t* batch_nnz) { int64_t i = static_cast(threadIdx.x); batch_nnz[i] = crow_data[(i + 1) * (rows + 1) - 1]; } @@ -542,12 +542,6 @@ void SparseBlas::SPGEMM(bool transa, auto b_ndims = b_dim_vec.size(); const int64_t b_rows = b_dim_vec[b_ndims - 2]; const int64_t b_cols = b_dim_vec[b_ndims - 1]; - int b_batch_size = 1; - for (int i = 0; i < b_ndims - 2; i++) { - b_batch_size *= b_dim_vec[i]; - } - - const int batch_size = a_batch_size; // cusparseSpGEMM only support 32-bit indices. 
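// Descriptive sketch of the casting step that follows (no behavior change
// implied): because this SpGEMM path consumes 32-bit indices, CSR crows/cols
// tensors stored as INT64 are first cast into temporary INT32 DenseTensors
// with CastKernel, and the raw int32_t pointers used to build the per-batch
// CSR descriptors refer either to the original buffers (INT32 input) or to
// these temporaries (INT64 input).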
DenseTensor a_crows_int, a_cols_int, b_crows_int, b_cols_int; @@ -593,6 +587,7 @@ void SparseBlas::SPGEMM(bool transa, const T* b_values_data = mat_b.values().data(); const int32_t* out_crows_data = mat_out->crows().data(); + const int batch_size = a_batch_size; std::vector a_batch_nnz_vec(batch_size); std::vector b_batch_nnz_vec(batch_size); @@ -606,7 +601,7 @@ void SparseBlas::SPGEMM(bool transa, phi::Stream(reinterpret_cast(dev_ctx_.stream()))); void* tmp_buffer_ptr = tmp_buffer->ptr(); - GetBatchNNZ<<<1, batch_size, 0, dev_ctx_.stream()>>>( + GetCsrBatchNNZ<<<1, batch_size, 0, dev_ctx_.stream()>>>( a_crows_data, a_rows, static_cast(tmp_buffer_ptr)); phi::backends::gpu::GpuMemcpyAsync(a_batch_nnz_vec.data(), tmp_buffer_ptr, @@ -614,7 +609,7 @@ void SparseBlas::SPGEMM(bool transa, gpuMemcpyDeviceToHost, dev_ctx_.stream()); - GetBatchNNZ<<<1, batch_size, 0, dev_ctx_.stream()>>>( + GetCsrBatchNNZ<<<1, batch_size, 0, dev_ctx_.stream()>>>( b_crows_data, b_rows, static_cast(tmp_buffer_ptr)); phi::backends::gpu::GpuMemcpyAsync(b_batch_nnz_vec.data(), tmp_buffer_ptr, From 76e5ed907561a344842dc8c1deb2b04d6ca119c3 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Tue, 9 Jan 2024 17:28:05 +0000 Subject: [PATCH 21/25] fix --- test/legacy_test/test_sparse_matmul_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/legacy_test/test_sparse_matmul_op.py b/test/legacy_test/test_sparse_matmul_op.py index 320ff86e93cd4..ae08b7df48c53 100644 --- a/test/legacy_test/test_sparse_matmul_op.py +++ b/test/legacy_test/test_sparse_matmul_op.py @@ -93,7 +93,7 @@ def test_matmul_3d(self): self.check_result([8, 16, 12], [8, 12, 10], 'csr') -class TestMatmulSparseSparseIndex64(unittest.TestCase): +class TestMatmulSparseSparseInt64Index(unittest.TestCase): # x: sparse, y: sparse, out: sparse def check_result(self, x_shape, y_shape, format): origin_x = paddle.rand(x_shape) @@ -149,7 +149,7 @@ def test_matmul_3d(self): self.check_result([8, 16, 12], [8, 12, 10], 'csr') -class TestMatmulSparseSparseIndex32(unittest.TestCase): +class TestMatmulSparseSparseInt32Index(unittest.TestCase): # x: sparse, y: sparse, out: sparse def check_result(self, x_shape, y_shape, format): origin_x = paddle.rand(x_shape) From ee0fee5057f8561145558bf4c21d4dbf83869eaa Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Wed, 10 Jan 2024 00:20:26 +0000 Subject: [PATCH 22/25] fix cuda version check --- paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu | 6 +++--- paddle/phi/kernels/sparse/gpu/matmul_kernel.cu | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index 21389dedca88b..83487697dcc75 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu @@ -147,7 +147,7 @@ void MatmulCsrCsrGradKernel(const Context& dev_ctx, const SparseCsrTensor& dout, SparseCsrTensor* dx, SparseCsrTensor* dy) { -#if CUDA_VERSION >= 11030 +#if CUDA_VERSION >= 11000 auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); std::vector xdim_vec = phi::vectorize(x.dims()); @@ -183,8 +183,8 @@ void MatmulCsrCsrGradKernel(const Context& dev_ctx, #else #ifdef PADDLE_WITH_CUDA PADDLE_THROW(phi::errors::Unimplemented( - "backward of 'sparse.matmul' use cusparseSPGEMM, which is supported from " - "CUDA 11.3")); + "backward of 'sparse.matmul' use cusparseSpGEMM, which is supported from " + "CUDA 11.0")); #endif #endif } diff --git 
a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index baddad7dc2764..fb8109202cf42 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -177,7 +177,7 @@ void MatmulCooCooKernel(const Context& dev_ctx, const SparseCooTensor& x, const SparseCooTensor& y, SparseCooTensor* out) { - // 'cusparseSPGEMM' only support CSR now, so use COO->CSR->COO. + // 'cusparseSpGEMM' only support CSR now, so use COO->CSR->COO. SparseCsrTensor x_csr = CooToCsr(dev_ctx, x); SparseCsrTensor y_csr = CooToCsr(dev_ctx, y); SparseCsrTensor out_csr; From 0d239358f45ebaf267ec6301659a550a579416a5 Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Wed, 10 Jan 2024 00:25:23 +0000 Subject: [PATCH 23/25] fix --- paddle/phi/kernels/sparse/impl/unary_kernel_impl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h index 723bf3d2697cf..32fe4ae07ab67 100644 --- a/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h +++ b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h @@ -201,8 +201,6 @@ void CastCsrKernel(const Context& dev_ctx, meta.set_dims(x_values.dims()); phi::CastKernel(dev_ctx, x_values, value_dtype, out_values); } - - out->set_dims(x.dims()); } template From e01251be3325a0b02e98166fd47d1bd5f385d2fe Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Thu, 11 Jan 2024 05:11:48 +0000 Subject: [PATCH 24/25] fix bugs --- .../funcs/sparse/sparse_blas_impl.cu.h | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index b0bb531592e09..faf3bffa4df0a 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -622,22 +622,21 @@ void SparseBlas::SPGEMM(bool transa, std::vector out_batch_values_vec(batch_size); cudaDataType_t gpu_type = GetGpuDataType(); - for (int i = 0; i < batch_size; ++i) { - int32_t a_batch_nnz = a_batch_nnz_vec[i]; - int32_t b_batch_nnz = b_batch_nnz_vec[i]; + const int32_t* a_batch_crows_data = a_crows_data; + const int32_t* a_batch_cols_data = a_cols_data; + const T* a_batch_values_data = a_values_data; - const int32_t* a_batch_crows_data = a_crows_data + i * (a_rows + 1); - const int32_t* a_batch_cols_data = a_cols_data + i * a_batch_nnz; - const T* a_batch_values_data = a_values_data + i * a_batch_nnz; + const int32_t* b_batch_crows_data = b_crows_data; + const int32_t* b_batch_cols_data = b_cols_data; + const T* b_batch_values_data = b_values_data; - const int32_t* b_batch_crows_data = b_crows_data + i * (b_rows + 1); - const int32_t* b_batch_cols_data = b_cols_data + i * b_batch_nnz; - const T* b_batch_values_data = b_values_data + i * b_batch_nnz; + const int32_t* out_batch_crows_data = out_crows_data; - const int32_t* out_batch_crows_data = out_crows_data + i * (a_rows + 1); + for (int i = 0; i < batch_size; ++i) { + int32_t a_batch_nnz = a_batch_nnz_vec[i]; + int32_t b_batch_nnz = b_batch_nnz_vec[i]; cusparseSpMatDescr_t a_batch_desc, b_batch_desc, out_batch_desc; - dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseCreateCsr(&a_batch_desc, a_rows, @@ -799,6 +798,16 @@ void SparseBlas::SPGEMM(bool transa, dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpGEMM_destroyDescr(spgemm_desc); }); + + a_batch_crows_data += 
a_rows + 1; + a_batch_cols_data += a_batch_nnz; + a_batch_values_data += a_batch_nnz; + + b_batch_crows_data += b_rows + 1; + b_batch_cols_data += b_batch_nnz; + b_batch_values_data += b_batch_nnz; + + out_batch_crows_data += a_rows + 1; } if (batch_size == 1) { From 1f34f1c643b83981d05017f76e955de8e032682d Mon Sep 17 00:00:00 2001 From: MayYouBeProsperous Date: Thu, 11 Jan 2024 09:45:27 +0000 Subject: [PATCH 25/25] fix --- .../funcs/sparse/sparse_blas_impl.cu.h | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index faf3bffa4df0a..2f5f21745a4fa 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -496,7 +496,7 @@ void SparseBlas::SDDMM(bool transa, /************* SPARSE*SPARSE->SPARSE MATMUL ************/ template -__global__ void GetCsrBatchNNZ(const int32_t* crow_data, +__global__ void GetCsrBatchNnz(const int32_t* crow_data, int64_t rows, int32_t* batch_nnz) { int64_t i = static_cast(threadIdx.x); @@ -521,14 +521,6 @@ void SparseBlas::SPGEMM(bool transa, out_crows_meta.set_dims(mat_a.crows().dims()); dev_ctx_.template Alloc(mat_out_crows); - MetaTensor out_cols_meta(mat_out_cols); - out_cols_meta.set_dtype(phi::DataType::INT32); - dev_ctx_.template Alloc(mat_out_cols); - - MetaTensor out_values_meta(mat_out_values); - out_values_meta.set_dtype(mat_a.values().dtype()); - dev_ctx_.template Alloc(mat_out_values); - std::vector a_dim_vec = common::vectorize(mat_a.dims()); auto a_ndims = a_dim_vec.size(); const int64_t a_rows = a_dim_vec[a_ndims - 2]; @@ -544,43 +536,49 @@ void SparseBlas::SPGEMM(bool transa, const int64_t b_cols = b_dim_vec[b_ndims - 1]; // cusparseSpGEMM only support 32-bit indices. 
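// Annotation for the revised casting block below: the int32_t index pointers
// now start out as nullptr, and the INT32 temporaries are shared_ptr-held
// DenseTensors that are allocated and filled via CastKernel only when the
// incoming crows/cols are INT64; when the input indices are already INT32,
// the original buffers are read directly and no temporary is created.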
- DenseTensor a_crows_int, a_cols_int, b_crows_int, b_cols_int; - const int32_t *a_crows_data, *a_cols_data, *b_crows_data, *b_cols_data; + const int32_t *a_crows_data = nullptr, *a_cols_data = nullptr, + *b_crows_data = nullptr, *b_cols_data = nullptr; + std::shared_ptr a_crows_int = nullptr, a_cols_int = nullptr, + b_crows_int = nullptr, b_cols_int = nullptr; if (mat_a.crows().dtype() == phi::DataType::INT32) { a_crows_data = mat_a.crows().data(); a_cols_data = mat_a.cols().data(); } else { - phi::MetaTensor crows_meta(&a_crows_int); + a_crows_int = std::make_shared(); + a_cols_int = std::make_shared(); + phi::MetaTensor crows_meta(a_crows_int.get()); crows_meta.set_dims(mat_a.crows().dims()); - phi::MetaTensor cols_meta(&a_cols_int); + phi::MetaTensor cols_meta(a_cols_int.get()); cols_meta.set_dims(mat_a.cols().dims()); phi::CastKernel( - dev_ctx_, mat_a.crows(), phi::DataType::INT32, &a_crows_int); + dev_ctx_, mat_a.crows(), phi::DataType::INT32, a_crows_int.get()); phi::CastKernel( - dev_ctx_, mat_a.cols(), phi::DataType::INT32, &a_cols_int); + dev_ctx_, mat_a.cols(), phi::DataType::INT32, a_cols_int.get()); - a_crows_data = a_crows_int.data(); - a_cols_data = a_cols_int.data(); + a_crows_data = a_crows_int->data(); + a_cols_data = a_cols_int->data(); } if (mat_b.crows().dtype() == phi::DataType::INT32) { b_crows_data = mat_b.crows().data(); b_cols_data = mat_b.cols().data(); } else { - phi::MetaTensor crows_meta(&b_crows_int); + b_crows_int = std::make_shared(); + b_cols_int = std::make_shared(); + phi::MetaTensor crows_meta(b_crows_int.get()); crows_meta.set_dims(mat_b.crows().dims()); - phi::MetaTensor cols_meta(&b_cols_int); + phi::MetaTensor cols_meta(b_cols_int.get()); cols_meta.set_dims(mat_b.cols().dims()); phi::CastKernel( - dev_ctx_, mat_b.crows(), phi::DataType::INT32, &b_crows_int); + dev_ctx_, mat_b.crows(), phi::DataType::INT32, b_crows_int.get()); phi::CastKernel( - dev_ctx_, mat_b.cols(), phi::DataType::INT32, &b_cols_int); + dev_ctx_, mat_b.cols(), phi::DataType::INT32, b_cols_int.get()); - b_crows_data = b_crows_int.data(); - b_cols_data = b_cols_int.data(); + b_crows_data = b_crows_int->data(); + b_cols_data = b_cols_int->data(); } const T* a_values_data = mat_a.values().data(); @@ -601,7 +599,7 @@ void SparseBlas::SPGEMM(bool transa, phi::Stream(reinterpret_cast(dev_ctx_.stream()))); void* tmp_buffer_ptr = tmp_buffer->ptr(); - GetCsrBatchNNZ<<<1, batch_size, 0, dev_ctx_.stream()>>>( + GetCsrBatchNnz<<<1, batch_size, 0, dev_ctx_.stream()>>>( a_crows_data, a_rows, static_cast(tmp_buffer_ptr)); phi::backends::gpu::GpuMemcpyAsync(a_batch_nnz_vec.data(), tmp_buffer_ptr, @@ -609,7 +607,7 @@ void SparseBlas::SPGEMM(bool transa, gpuMemcpyDeviceToHost, dev_ctx_.stream()); - GetCsrBatchNNZ<<<1, batch_size, 0, dev_ctx_.stream()>>>( + GetCsrBatchNnz<<<1, batch_size, 0, dev_ctx_.stream()>>>( b_crows_data, b_rows, static_cast(tmp_buffer_ptr)); phi::backends::gpu::GpuMemcpyAsync(b_batch_nnz_vec.data(), tmp_buffer_ptr, @@ -815,8 +813,7 @@ void SparseBlas::SPGEMM(bool transa, *(mat_out->mutable_values()) = std::move(out_batch_values_vec[0]); } else { - std::vector cols_vec; - std::vector values_vec; + std::vector cols_vec, values_vec; for (int i = 0; i < batch_size; ++i) { cols_vec.push_back(&out_batch_cols_vec[i]);