gpu: sycl: sum: implemented
t4c1 committed May 31, 2024
1 parent 95c6fcd commit 01454e3
Showing 10 changed files with 611 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/common/dnnl_traits.hpp
@@ -174,6 +174,7 @@ PKIND_TRAITS_INST(binary);
PKIND_TRAITS_INST(matmul);
PKIND_TRAITS_INST(resampling);
PKIND_TRAITS_INST(reduction);
+PKIND_TRAITS_INST(sum);
PKIND_TRAITS_INST(sdpa);
#undef PKIND_TRAITS_INST
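
For context, PKIND_TRAITS_INST maps a primitive_kind enum value to its descriptor type, so generic code can deduce sum_desc_t from primitive_kind::sum; this is presumably what the new descriptor-based sum_pd_t constructor in sum_pd.hpp below hooks into. A minimal sketch of the specialization the macro likely generates (the actual macro body in dnnl_traits.hpp may differ):

// Hypothetical expansion sketch; the real PKIND_TRAITS_INST macro may differ.
template <>
struct pkind_traits<primitive_kind::sum> {
    typedef sum_desc_t desc_type; // lets templates recover the desc type
};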

23 changes: 21 additions & 2 deletions src/common/sum_pd.hpp
@@ -91,13 +91,30 @@ struct sum_pd_t : public primitive_desc_t {
}

protected:
+sum_desc_t desc_;
+
int n_;
std::vector<float> scales_;
memory_desc_t dst_md_, dst_acc_md_;
std::vector<memory_desc_t> src_mds_;
memory_desc_t original_dst_md_;

-sum_desc_t desc_;
+sum_pd_t(const sum_desc_t *adesc, const primitive_attr_t *attr,
+const sum_pd_t *hint_fwd)
+: primitive_desc_t(attr, primitive_kind::sum)
+, desc_(*adesc)
+, n_(desc_.n)
+, dst_md_(*desc_.dst_md)
+, original_dst_md_(*desc_.dst_md) {
+scales_.reserve(n_);
+for (int i = 0; i < n_; ++i)
+scales_.push_back(desc_.scales[i]);
+src_mds_.reserve(n_);
+for (int i = 0; i < n_; ++i)
+src_mds_.push_back(*desc_.src_mds[i]);
+
+init_desc();
+}

sum_pd_t(const primitive_attr_t *attr, const memory_desc_t *dst_md, int n,
const float *scales, const memory_desc_t *const *src_mds)
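
A note on the member reorder above (desc_ moved to the top of the member list): C++ initializes non-static members in declaration order, not in initializer-list order, so the new constructor's n_(desc_.n) is only safe because desc_ now precedes n_ in the class. A minimal illustration of that rule:

// Members initialize in declaration order: b_(a_ + 1) is well-defined
// only because a_ is declared before b_.
struct order_example_t {
    int a_;
    int b_;
    explicit order_example_t(int a) : a_(a), b_(a_ + 1) {}
};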
@@ -218,7 +235,9 @@ struct sum_pd_t : public primitive_desc_t {
if (!new_pd->is_initialized()) return nullptr; \
return new_pd.release(); \
} \
-const char *name() const override { return impl_name; }
+const char *name() const override { \
+return impl_name; \
+}

#define DECLARE_SUM_PD_T(impl_name, ...) \
DECLARE_SUM_PD_t(impl_name, __VA_ARGS__)
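
The name() change in the previous hunk is mechanical: once the function body spans several lines inside a #define, every line needs a trailing backslash, because a macro definition otherwise ends at the first unescaped newline. Schematically:

// Without the trailing backslashes the macro body would end at line one.
#define DECLARE_NAME_FN(impl_name) \
    const char *name() const override { \
        return impl_name; \
    }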
4 changes: 4 additions & 0 deletions src/gpu/gpu_sum_list.cpp
@@ -30,6 +30,8 @@

#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
#include "gpu/nvidia/cudnn_sum.hpp"
#include "gpu/sycl/ref_sum.hpp"
#include "gpu/sycl/ref_sum_many_inputs.hpp"
#endif

namespace dnnl {
@@ -49,6 +51,8 @@ constexpr impl_list_item_t impl_list[] = REG_SUM_P({
GPU_SUM_INSTANCE_INTEL(intel::ocl::simple_sum_t<data_type::f32>)
GPU_SUM_INSTANCE_INTEL(intel::ocl::ref_sum_t)
GPU_SUM_INSTANCE_NVIDIA(nvidia::cudnn_ref_sum_t)
+GPU_SUM_INSTANCE_GENERIC_SYCL(sycl::ref_sum_t)
+GPU_SUM_INSTANCE_GENERIC_SYCL(sycl::ref_sum_many_inputs_t)
nullptr,
});
// clang-format on
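
Order matters in this list: implementations are tried top to bottom and the first primitive descriptor whose init() succeeds is selected, so sycl::ref_sum_t (which rejects more than MAX_NUM_TENSORS inputs, per its pd_t::init() below) is preferred and sycl::ref_sum_many_inputs_t serves as the fallback. A self-contained sketch of that first-success dispatch pattern (names are illustrative, not the oneDNN API):

#include <functional>
#include <vector>

// The first entry whose init succeeds wins; later entries are fallbacks.
bool select_impl(const std::vector<std::function<bool()>> &impls) {
    for (const auto &try_init : impls)
        if (try_init()) return true;
    return false; // no implementation available
}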
114 changes: 114 additions & 0 deletions src/gpu/sycl/ref_sum.cpp
@@ -0,0 +1,114 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/sycl/ref_sum.hpp"
#include "gpu/sycl/sum_kernels.hpp"
#include "gpu/sycl/sycl_gpu_primitive.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

using namespace impl::sycl;

status_t ref_sum_t::pd_t::init_conf() {
conf_ = sycl_sum_conf_t();
conf_.n = n_inputs();

for (auto i = 0; i < conf_.n; ++i) {
conf_.src_md[i] = xpu::sycl::md_t(src_md(i));
conf_.src_scales[i] = scales()[i];
}
conf_.dst_md = xpu::sycl::md_t(dst_md());

// XXX: should probably be tuned.
conf_.block_size = 16;
conf_.wg_size = 32;
conf_.wk_size = memory_desc_wrapper(dst_md()).nelems();
return status::success;
}
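
Note: init_conf() copies each memory descriptor by value into xpu::sycl::md_t fields of conf_, presumably because sycl_sum_conf_t is captured by value into the SYCL kernel functor below and must therefore be device-copyable; host-side pointers would not survive that capture.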

status_t ref_sum_t::init(engine_t *engine) {
const auto kid = ::sycl::get_kernel_id<sum_kernel_vec_t>();
CHECK(create_kernel(engine, kid, &kernel_));

return status::success;
}

status_t ref_sum_t::execute(const exec_ctx_t &ctx) const {
using namespace memory_tracking::names;

parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
auto src0_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 0);
auto src1_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 1);
auto src2_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 2);
auto src3_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 3);
auto src4_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 4);
auto src5_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 5);
auto src6_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 6);
auto src7_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 7);
auto src8_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 8);
auto src9_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 9);
auto src10_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 10);
auto src11_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 11);
auto src12_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 12);
auto src13_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 13);
auto src14_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 14);
auto src15_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 15);

auto dst_mem_arg = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST);

sum_kernel_vec_t sum_kernel(pd()->conf_, src0_mem_arg, src1_mem_arg,
src2_mem_arg, src3_mem_arg, src4_mem_arg, src5_mem_arg,
src6_mem_arg, src7_mem_arg, src8_mem_arg, src9_mem_arg,
src10_mem_arg, src11_mem_arg, src12_mem_arg, src13_mem_arg,
src14_mem_arg, src15_mem_arg, dst_mem_arg);

const int block_size = pd()->conf_.block_size;
const int wg_size = pd()->conf_.wg_size;

const int t_work = pd()->conf_.wk_size;
const int wg_work = wg_size * block_size;
const int wg_cnt = utils::div_up(t_work, wg_work);
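// Worked example with illustrative numbers (not from the source): for
// wk_size = 1000, wg_size = 32 and block_size = 16, wg_work is 512,
// wg_cnt = div_up(1000, 512) = 2, and the nd_range below launches
// 2 * 32 = 64 work-items, each covering up to block_size dst elements.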

cgh.parallel_for(
::sycl::nd_range<1>(wg_cnt * wg_size, wg_size), sum_kernel);
});

return status::success;
}

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl
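
For reference, a minimal user-side sketch (standard oneDNN C++ API; the two equally shaped f32 sources and the 0.5f scale are illustrative) of a call that would route to this implementation on a generic SYCL GPU build:

#include "oneapi/dnnl/dnnl.hpp"

#include <vector>

void sum_two(dnnl::engine &eng, dnnl::stream &strm, dnnl::memory &src0,
        dnnl::memory &src1, dnnl::memory &dst) {
    std::vector<float> scales {1.f, 0.5f}; // dst = src0 + 0.5 * src1
    auto pd = dnnl::sum::primitive_desc(eng, dst.get_desc(), scales,
            {src0.get_desc(), src1.get_desc()});
    dnnl::sum(pd).execute(strm,
            {{DNNL_ARG_DST, dst}, {DNNL_ARG_MULTIPLE_SRC + 0, src0},
                    {DNNL_ARG_MULTIPLE_SRC + 1, src1}});
}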
96 changes: 96 additions & 0 deletions src/gpu/sycl/ref_sum.hpp
@@ -0,0 +1,96 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_SYCL_REF_SUM_HPP
#define GPU_SYCL_REF_SUM_HPP

#include "common/primitive.hpp"
#include "common/stream.hpp"
#include "gpu/gpu_sum_pd.hpp"
#include "gpu/sycl/sycl_gpu_primitive.hpp"
#include "gpu/sycl/sycl_io_helper.hpp"
#include "gpu/sycl/sycl_post_ops.hpp"
#include "gpu/sycl/sycl_primitive_conf.hpp"
#include "gpu/sycl/sycl_q10n.hpp"
#include "sycl/sycl_stream.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

struct ref_sum_t : public sycl_gpu_primitive_t {
using sycl_gpu_primitive_t::sycl_gpu_primitive_t;

struct pd_t : public gpu_sum_pd_t {
using gpu_sum_pd_t::gpu_sum_pd_t;

DECLARE_SUM_PD_T("dpcpp:ref:any", ref_sum_t);

status_t init(engine_t *engine) {
using namespace data_type;
using namespace format_tag;

const memory_desc_wrapper dst_d(dst_md());
if (!utils::one_of(dst_d.data_type(), f32, bf16, f16, s8, u8))
return status::unimplemented;
// Block formats are not yet supported
// Dimensions can not be > 6
if (!dst_d.is_plain() || dst_d.ndims() > MAX_NDIMS)
return status::unimplemented;

const int n = n_inputs();
for (auto i = 0; i < n; ++i) {
const memory_desc_wrapper src_d(src_md(i));
if (!utils::one_of(src_d.data_type(), f32, bf16, f16, s8, u8))
return status::unimplemented;
// Block formats are not yet supported
// Dimensions can not be > 6
if (!src_d.is_plain() || src_d.ndims() > MAX_NDIMS)
return status::unimplemented;
}

const bool ok = set_default_params() == status::success
&& n <= MAX_NUM_TENSORS;
if (!ok) return status::unimplemented;

return init_conf();
}

sycl_sum_conf_t conf_;

private:
status_t init_conf();

inline bool equal(float in_value, float in_compare_to) {
return std::fabs(in_value - in_compare_to) <= FLT_EPSILON;
}
};

status_t init(engine_t *engine) override;
status_t execute(const exec_ctx_t &ctx) const override;

private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
intel::compute::kernel_t kernel_;
};

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
79 changes: 79 additions & 0 deletions src/gpu/sycl/ref_sum_many_inputs.cpp
@@ -0,0 +1,79 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/sycl/ref_sum_many_inputs.hpp"
#include "common/primitive_desc_iface.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

using namespace impl::sycl;

status_t ref_sum_many_inputs_t::pd_t::init_conf() {
conf_ = sycl_sum_conf_t();
conf_.n = n_inputs();

return status::success;
}

status_t ref_sum_many_inputs_t::init(engine_t *engine) {
const size_t n = pd()->base_pds_.size();
base_prims_.resize(n);
for (size_t i = 0; i < n; ++i) {
CHECK(pd()->base_pds_[i]->impl()->create_primitive(
base_prims_[i], engine, cache_blob()));
}

return status::success;
}
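
Note: base_pds_ is presumably populated in the pd's init() in ref_sum_many_inputs.hpp (one of the files not shown above), one nested ref_sum_t descriptor per chunk of at most MAX_NUM_TENSORS inputs, matching the chaining in execute() below.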

status_t ref_sum_many_inputs_t::execute(const exec_ctx_t &ctx) const {
memory_arg_t dst_mem_arg = {ctx.args().at(DNNL_ARG_DST).mem, false};
memory_arg_t dst_read_mem_arg = {ctx.args().at(DNNL_ARG_DST).mem, true};

int n_remaining = pd()->conf_.n;
int in_arg_offset = 0;
int i = 0;

while (n_remaining > 0) {
bool pass_in_dst = i > 0;
int max_n_child_inputs = MAX_NUM_TENSORS - pass_in_dst;
int args_handled = std::min(n_remaining, max_n_child_inputs);
exec_args_t r_args;
r_args[DNNL_ARG_DST] = dst_mem_arg;
if (pass_in_dst) {
r_args[DNNL_ARG_MULTIPLE_SRC + 0] = dst_read_mem_arg;
}
for (int j = 0; j < args_handled; j++) {
r_args[DNNL_ARG_MULTIPLE_SRC + j + pass_in_dst]
= ctx.args().at(DNNL_ARG_MULTIPLE_SRC + j + in_arg_offset);
}
n_remaining -= args_handled;
in_arg_offset += args_handled;

exec_ctx_t r_ctx(ctx, std::move(r_args));
// Run the current child primitive before advancing the index;
// incrementing i first would skip base_prims_[0] and read past the
// end of base_prims_ on the last iteration.
CHECK(base_prims_[i]->execute(r_ctx));
i++;
}
return status::success;
}
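
A worked trace of the chaining loop above, assuming MAX_NUM_TENSORS is 16 (consistent with the sixteen source arguments bound in ref_sum.cpp) and n = 20 inputs:

// iteration 0: pass_in_dst = false, args_handled = 16;
//              the child sums srcs 0..15 into dst
// iteration 1: pass_in_dst = true, args_handled = 4;
//              the child reads dst back as its src 0 and sums it
//              with srcs 16..19 into dst
// Two chained ref_sum_t executions thus cover all 20 inputs.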

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl