Skip to content

Commit

Permalink
[Feature](bangc-ops): redefine sizeof in MLU372
Browse files Browse the repository at this point in the history
  • Loading branch information
PetrelYy committed Aug 9, 2023
1 parent 91b67ce commit 36c423e
Show file tree
Hide file tree
Showing 9 changed files with 109 additions and 112 deletions.
1 change: 1 addition & 0 deletions bangc-ops/kernels/abs/abs_block.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#include "core/logging.h"
#include "kernels/debug.h"
#include "kernels/kernel.h"
#include "kernels/unary_op/unary_op_3pipeline.h"
#include "kernels/unary_op/unary_op_5pipeline.h"

Expand Down
5 changes: 5 additions & 0 deletions bangc-ops/kernels/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@
#endif // __BANG_ARCH__


// NOTE(review): on MLU372 this forces every `sizeof(...)` expression to a
// 32-bit unsigned result, presumably so size arithmetic inside kernels stays
// 32-bit instead of promoting to 64-bit size_t — confirm the motivation.
// The inner `sizeof(T)` does not recurse: a function-like macro is not
// re-expanded inside its own replacement list.
// WARNING: defining a macro whose name is a language keyword (`sizeof`) is
// undefined behavior per the C/C++ standards; it appears to be tolerated by
// the BANG toolchain, but a dedicated macro (e.g. SIZEOF_U32(T)) used at the
// call sites would be standard-conforming — TODO confirm with toolchain docs.
#if __BANG_ARCH__ == 372
#define sizeof(T) (uint32_t(sizeof(T)))
#endif // __BANG_ARCH__ == 372


// Rounds x up to the nearest multiple of y using integer arithmetic only.
#ifndef PAD_UP
#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#endif
Expand Down
12 changes: 6 additions & 6 deletions bangc-ops/kernels/log/log.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,14 @@ mluOpLog(mluOpHandle_t handle, const mluOpComputationPreference_t prefer,
int element_num = mluOpGetTensorElementNum(x_desc);
if (handle->arch == MLUOP_MLU270) {
VLOG(5) << "kernel Kernel5StagePipelineLog.";
KERNEL_CHECK(
(Kernel5StagePipelineLog(k_dim, k_type, handle->queue, x_desc->dtype,
prefer, x, y, element_num, coef)));
CHECK_RETURN("[mluOpLog] ", (Kernel5StagePipelineLog(
k_dim, k_type, handle->queue, x_desc->dtype,
prefer, x, y, element_num, coef)));
} else {
VLOG(5) << "kernel Kernel3StagePipelineLog.";
KERNEL_CHECK(
(Kernel3StagePipelineLog(k_dim, k_type, handle->queue, x_desc->dtype,
prefer, x, y, element_num, coef)));
CHECK_RETURN("[mluOpLog] ", (Kernel3StagePipelineLog(
k_dim, k_type, handle->queue, x_desc->dtype,
prefer, x, y, element_num, coef)));
}
GEN_CASE_END();
return MLUOP_STATUS_SUCCESS;
Expand Down
4 changes: 2 additions & 2 deletions bangc-ops/kernels/log/log.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@

#include "mlu_op.h"

void MLUOP_WIN_API Kernel3StagePipelineLog(
mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineLog(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
mluOpDataType_t d_type, const mluOpComputationPreference_t prefer,
const void *x, void *y, int num, float coef);

void MLUOP_WIN_API Kernel5StagePipelineLog(
mluOpStatus_t MLUOP_WIN_API Kernel5StagePipelineLog(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
mluOpDataType_t d_type, const mluOpComputationPreference_t prefer,
const void *x, void *y, int num, float coef);
Expand Down
73 changes: 35 additions & 38 deletions bangc-ops/kernels/log/log_block.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,11 @@
*************************************************************************/
#include "log.h"

#include "core/logging.h"
#include "kernels/debug.h"
#include "kernels/kernel.h"
#include "kernels/unary_op/unary_op_3pipeline.h"
#include "kernels/unary_op/unary_op_5pipeline.h"
#include "kernels/debug.h"
#include "core/logging.h"

#define LOG_LOW_BOUND 1e-8
#define LOG_SCALE 1e12
Expand Down Expand Up @@ -165,52 +166,48 @@ UNARY_OP_KERNEL_5PIPELINE_IMPLE(Log, float, Fast);
UNARY_OP_KERNEL_5PIPELINE_IMPLE(Log, half, Fast);
UNARY_OP_KERNEL_5PIPELINE_IMPLE(Log, half, HighAcc);

void MLUOP_WIN_API Kernel3StagePipelineLog(
mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineLog(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
mluOpDataType_t d_type, const mluOpComputationPreference_t prefer,
const void *x, void *y, int num, float coef) {
switch (d_type) {
case MLUOP_DTYPE_FLOAT: {
MLUBlockKernel3StagePipelineLogfloatFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, coef);
}; break;
case MLUOP_DTYPE_HALF: {
if (prefer == MLUOP_COMPUTATION_FAST) {
MLUBlockKernel3StagePipelineLoghalfFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, coef);
} else {
MLUBlockKernel3StagePipelineLoghalfHighAcc<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, coef);
}
}; break;
default: {
LOG(ERROR) << "Not implemented.";
break;
// launch kernel
if (d_type == mluOpDataType_t::MLUOP_DTYPE_FLOAT) {
KERNEL_CHECK(
MLUBlockKernel3StagePipelineLogfloatFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, coef));
} else if (d_type == mluOpDataType_t::MLUOP_DTYPE_HALF) {
if (prefer == MLUOP_COMPUTATION_FAST) {
KERNEL_CHECK(
MLUBlockKernel3StagePipelineLoghalfFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, coef));
} else {
KERNEL_CHECK(
MLUBlockKernel3StagePipelineLoghalfHighAcc<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, coef));
}
}
return MLUOP_STATUS_SUCCESS;
}

void MLUOP_WIN_API Kernel5StagePipelineLog(
mluOpStatus_t MLUOP_WIN_API Kernel5StagePipelineLog(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
mluOpDataType_t d_type, const mluOpComputationPreference_t prefer,
const void *x, void *y, int num, float coef) {
switch (d_type) {
case MLUOP_DTYPE_FLOAT: {
MLUBlockKernel5StagePipelineLogfloatFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, coef);
}; break;
case MLUOP_DTYPE_HALF: {
if (prefer == MLUOP_COMPUTATION_FAST) {
MLUBlockKernel5StagePipelineLoghalfFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, coef);
} else {
MLUBlockKernel5StagePipelineLoghalfHighAcc<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, coef);
}
}; break;
default: {
LOG(ERROR) << "Not implemented.";
break;
// launch kernel
if (d_type == mluOpDataType_t::MLUOP_DTYPE_FLOAT) {
KERNEL_CHECK(
MLUBlockKernel5StagePipelineLogfloatFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, coef));
} else if (d_type == mluOpDataType_t::MLUOP_DTYPE_HALF) {
if (prefer == MLUOP_COMPUTATION_FAST) {
KERNEL_CHECK(
MLUBlockKernel5StagePipelineLoghalfFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, coef));
} else {
KERNEL_CHECK(
MLUBlockKernel5StagePipelineLoghalfHighAcc<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, coef));
}
}
return MLUOP_STATUS_SUCCESS;
}
19 changes: 10 additions & 9 deletions bangc-ops/kernels/sqrt/sqrt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,14 @@ mluOpStatus_t MLUOP_WIN_API mluOpSqrt(mluOpHandle_t handle,
int element_num = mluOpGetTensorElementNum(x_desc);
if (handle->arch == MLUOP_MLU270) {
VLOG(5) << "kernel Kernel5StagePipelineSqrt.";
KERNEL_CHECK(
(Kernel5StagePipelineSqrt(k_dim, k_type, handle->queue, x_desc->dtype,
prefer, x, y, element_num)));
CHECK_RETURN("[mluOpSqrt] ", Kernel5StagePipelineSqrt(
k_dim, k_type, handle->queue,
x_desc->dtype, prefer, x, y, element_num));
} else {
VLOG(5) << "kernel Kernel3StagePipelineSqrt.";
KERNEL_CHECK(
(Kernel3StagePipelineSqrt(k_dim, k_type, handle->queue, x_desc->dtype,
prefer, x, y, element_num)));
CHECK_RETURN("[mluOpSqrt] ", Kernel3StagePipelineSqrt(
k_dim, k_type, handle->queue,
x_desc->dtype, prefer, x, y, element_num));
}

GEN_CASE_END();
Expand Down Expand Up @@ -113,9 +113,10 @@ mluOpStatus_t MLUOP_WIN_API mluOpSqrtBackward(

int num_elem = mluOpGetTensorElementNum(y_desc);
VLOG(5) << "Kernel Kernel3StagePipelineSqrtBackward.";
KERNEL_CHECK((Kernel3StagePipelineSqrtBackward(k_dim, k_type, handle->queue,
y_desc->dtype, y, diff_y,
diff_x, num_elem)));
CHECK_RETURN("[mluOpSqrtBackward] ",
Kernel3StagePipelineSqrtBackward(k_dim, k_type, handle->queue,
y_desc->dtype, y, diff_y,
diff_x, num_elem));
GEN_CASE_END();
return MLUOP_STATUS_SUCCESS;
}
6 changes: 3 additions & 3 deletions bangc-ops/kernels/sqrt/sqrt.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,17 @@

#include "mlu_op.h"

void MLUOP_WIN_API Kernel3StagePipelineSqrt(
mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineSqrt(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
mluOpDataType_t d_type, const mluOpComputationPreference_t prefer,
const void *x, void *y, int num);

void MLUOP_WIN_API Kernel5StagePipelineSqrt(
mluOpStatus_t MLUOP_WIN_API Kernel5StagePipelineSqrt(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
mluOpDataType_t d_type, const mluOpComputationPreference_t prefer,
const void *x, void *y, int num);

void MLUOP_WIN_API Kernel3StagePipelineSqrtBackward(
mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineSqrtBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
mluOpDataType_t d_type, const void *y, const void *diff_y, void *x,
int num);
Expand Down
99 changes: 46 additions & 53 deletions bangc-ops/kernels/sqrt/sqrt_block.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@
*************************************************************************/
#include "sqrt.h"

#include "core/logging.h"
#include "kernels/binary_op/binary_op_3pipeline.h"
#include "kernels/debug.h"
#include "kernels/unary_op/unary_op_3pipeline.h"
#include "kernels/unary_op/unary_op_5pipeline.h"
#include "kernels/debug.h"
#include "core/logging.h"

#define SQRT_HIGH_BOUND 1e4
#define SQRT_SCALE 1e-6
Expand Down Expand Up @@ -225,74 +225,67 @@ UNARY_OP_KERNEL_5PIPELINE_IMPLE(Sqrt, half, HighAcc);
BINARY_OP_3PIPELINE_IMPLE(SqrtBackward, float, Fast);
BINARY_OP_3PIPELINE_IMPLE(SqrtBackward, half, HighAcc);

void MLUOP_WIN_API Kernel3StagePipelineSqrt(
mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineSqrt(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
mluOpDataType_t d_type, const mluOpComputationPreference_t prefer,
const void *x, void *y, int num) {
switch (d_type) {
case MLUOP_DTYPE_FLOAT: {
MLUBlockKernel3StagePipelineSqrtfloatFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, 0.0);
}; break;
case MLUOP_DTYPE_HALF: {
if (prefer == MLUOP_COMPUTATION_FAST) {
MLUBlockKernel3StagePipelineSqrthalfFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, 0.0);
} else {
MLUBlockKernel3StagePipelineSqrthalfHighAcc<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, 0.0);
}
}; break;
default: {
LOG(ERROR) << "Not implemented.";
break;
// launch kernel
if (d_type == mluOpDataType_t::MLUOP_DTYPE_FLOAT) {
KERNEL_CHECK(
MLUBlockKernel3StagePipelineSqrtfloatFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, 0.0));
} else if (d_type == mluOpDataType_t::MLUOP_DTYPE_HALF) {
if (prefer == MLUOP_COMPUTATION_FAST) {
KERNEL_CHECK(
MLUBlockKernel3StagePipelineSqrthalfFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, 0.0));
} else {
KERNEL_CHECK(
MLUBlockKernel3StagePipelineSqrthalfHighAcc<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, 0.0));
}
}
return MLUOP_STATUS_SUCCESS;
}

void MLUOP_WIN_API Kernel5StagePipelineSqrt(
mluOpStatus_t MLUOP_WIN_API Kernel5StagePipelineSqrt(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
mluOpDataType_t d_type, const mluOpComputationPreference_t prefer,
const void *x, void *y, int num) {
switch (d_type) {
case MLUOP_DTYPE_FLOAT: {
MLUBlockKernel5StagePipelineSqrtfloatFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, 0.0);
}; break;
case MLUOP_DTYPE_HALF: {
if (prefer == MLUOP_COMPUTATION_FAST) {
MLUBlockKernel5StagePipelineSqrthalfFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, 0.0);
} else {
MLUBlockKernel5StagePipelineSqrthalfHighAcc<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, 0.0);
}
}; break;
default: {
LOG(ERROR) << "Not implemented.";
break;
// launch kernel
if (d_type == mluOpDataType_t::MLUOP_DTYPE_FLOAT) {
KERNEL_CHECK(
MLUBlockKernel5StagePipelineSqrtfloatFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, 0.0));
} else if (d_type == mluOpDataType_t::MLUOP_DTYPE_HALF) {
if (prefer == MLUOP_COMPUTATION_FAST) {
KERNEL_CHECK(
MLUBlockKernel5StagePipelineSqrthalfFast<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, 0.0));
} else {
KERNEL_CHECK(
MLUBlockKernel5StagePipelineSqrthalfHighAcc<<<k_dim, k_type, queue>>>(
(void *)x, (void *)y, num, 0.0));
}
}
return MLUOP_STATUS_SUCCESS;
}

void MLUOP_WIN_API Kernel3StagePipelineSqrtBackward(
mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineSqrtBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
mluOpDataType_t d_type, const void *y, const void *diff_y, void *x,
int num) {
switch (d_type) {
case MLUOP_DTYPE_FLOAT: {
MLUBlockKernel3StagePipelineSqrtBackwardfloatFast<<<k_dim, k_type,
queue>>>(
(void *)y, (void *)diff_y, (void *)x, num);
}; break;
case MLUOP_DTYPE_HALF: {
MLUBlockKernel3StagePipelineSqrtBackwardhalfHighAcc<<<k_dim, k_type,
// launch kernel
if (d_type == mluOpDataType_t::MLUOP_DTYPE_FLOAT) {
KERNEL_CHECK(
MLUBlockKernel3StagePipelineSqrtBackwardfloatFast<<<k_dim, k_type,
queue>>>(
(void *)y, (void *)diff_y, (void *)x, num);
}; break;
default: {
LOG(ERROR) << "Not implemented.";
break;
}
(void *)y, (void *)diff_y, (void *)x, num));
} else if (d_type == mluOpDataType_t::MLUOP_DTYPE_HALF) {
KERNEL_CHECK(
MLUBlockKernel3StagePipelineSqrtBackwardhalfHighAcc<<<k_dim, k_type,
queue>>>(
(void *)y, (void *)diff_y, (void *)x, num));
}
return MLUOP_STATUS_SUCCESS;
}
2 changes: 1 addition & 1 deletion bangc-ops/kernels/utils/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ __mlu_func__ void __mluop_exp(T *nram_dst, T *nram_src, void *nram_addition,
}

/******************************************************************************
* CNNL FUNC: __mluop_log
* MLUOPS FUNC: __mluop_log
* param 'nram_dst' is the nram destination address, which supports half or
* float data type.
* param 'nram_src' is the nram source address, which has the same data type
Expand Down

0 comments on commit 36c423e

Please sign in to comment.