Add more error info for cublasLtMatmul (pytorch#89983)
Hit an error at 'cublasLtMatmul' when running a complicated model in bfloat16. The added error info will help with debugging and is also useful for future error reporting.
Pull Request resolved: pytorch#89983
Approved by: https://github.com/ngimel
zhaojuanmao authored and pytorchmergebot committed Dec 1, 2022
1 parent a747326 commit 850b53b
Showing 1 changed file with 28 additions and 2 deletions.
aten/src/ATen/cuda/CUDABlas.cpp: 28 additions & 2 deletions
@@ -741,7 +741,7 @@ void gemm_and_bias(
     TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED);
   }
 
-  TORCH_CUDABLAS_CHECK(cublasLtMatmul(
+  cublasStatus_t cublasStatus = cublasLtMatmul(
       ltHandle,
       computeDesc.descriptor(),
       &alpha_val,
@@ -757,7 +757,33 @@ void gemm_and_bias(
       &heuristicResult.algo,
       workspace.data_ptr(),
       workspaceSize,
-      at::cuda::getCurrentCUDAStream()));
+      at::cuda::getCurrentCUDAStream());
+  TORCH_CHECK(
+      cublasStatus == CUBLAS_STATUS_SUCCESS,
+      "CUDA error: ",
+      at::cuda::blas::_cublasGetErrorEnum(cublasStatus),
+      " when calling cublasLtMatmul with transpose_mat1 ",
+      transpose_mat1,
+      " transpose_mat2 ",
+      transpose_mat2,
+      " m ",
+      m,
+      " n ",
+      n,
+      " k ",
+      k,
+      " mat1_ld ",
+      mat1_ld,
+      " mat2_ld ",
+      mat2_ld,
+      " result_ld ",
+      result_ld,
+      " abcType ",
+      abcType,
+      " computeType ",
+      computeType,
+      " scaleType ",
+      scaleType);
 }
 
 template void gemm_and_bias(
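For context, here is a minimal sketch (not part of the commit) of the kind of ATen call that can be dispatched to the cuBLASLt gemm_and_bias path and would now surface the richer error message if cublasLtMatmul fails. The shapes, the use of at::addmm, and whether the cuBLASLt path is actually taken are assumptions; dispatch depends on the build, device, dtypes, and bias shape.

// Illustrative only: a bfloat16 matmul-with-bias on CUDA that may be
// routed to the cuBLASLt gemm_and_bias path. If cublasLtMatmul fails
// there, the TORCH_CHECK added by this commit reports the cuBLAS status
// together with the transpose flags, m/n/k, leading dimensions, and the
// data/compute/scale types.
#include <ATen/ATen.h>

int main() {
  auto opts = at::TensorOptions().dtype(at::kBFloat16).device(at::kCUDA);
  auto mat1 = at::randn({64, 128}, opts);  // [m, k]
  auto mat2 = at::randn({128, 32}, opts);  // [k, n]
  auto bias = at::randn({32}, opts);       // [n]
  // addmm(bias, mat1, mat2) computes bias + mat1 @ mat2.
  auto out = at::addmm(bias, mat1, mat2);
  return 0;
}

With this change, a failure in that path reports the cuBLAS status name plus the GEMM parameters listed above, rather than only the generic status check.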
