gpu: sycl: sum: implemented
t4c1 committed May 31, 2024
1 parent 95c6fcd commit 01454e3
Showing 10 changed files with 611 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/common/dnnl_traits.hpp
@@ -174,6 +174,7 @@ PKIND_TRAITS_INST(binary);
PKIND_TRAITS_INST(matmul);
PKIND_TRAITS_INST(resampling);
PKIND_TRAITS_INST(reduction);
+PKIND_TRAITS_INST(sum);
PKIND_TRAITS_INST(sdpa);
#undef PKIND_TRAITS_INST
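
For context, PKIND_TRAITS_INST maps a primitive_kind enum value to its descriptor type, so generic code can deduce sum_desc_t from primitive_kind::sum; this is presumably what the new descriptor-based sum_pd_t constructor in sum_pd.hpp below hooks into. A minimal sketch of the specialization the macro likely generates (the actual macro body in dnnl_traits.hpp may differ):

// Hypothetical expansion sketch; the real PKIND_TRAITS_INST macro may differ.
template <>
struct pkind_traits<primitive_kind::sum> {
    typedef sum_desc_t desc_type; // lets templates recover the desc type
};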

23 changes: 21 additions & 2 deletions src/common/sum_pd.hpp
@@ -91,13 +91,30 @@ struct sum_pd_t : public primitive_desc_t {
}

protected:
+sum_desc_t desc_;
+
int n_;
std::vector<float> scales_;
memory_desc_t dst_md_, dst_acc_md_;
std::vector<memory_desc_t> src_mds_;
memory_desc_t original_dst_md_;

-sum_desc_t desc_;
+sum_pd_t(const sum_desc_t *adesc, const primitive_attr_t *attr,
+const sum_pd_t *hint_fwd)
+: primitive_desc_t(attr, primitive_kind::sum)
+, desc_(*adesc)
+, n_(desc_.n)
+, dst_md_(*desc_.dst_md)
+, original_dst_md_(*desc_.dst_md) {
+scales_.reserve(n_);
+for (int i = 0; i < n_; ++i)
+scales_.push_back(desc_.scales[i]);
+src_mds_.reserve(n_);
+for (int i = 0; i < n_; ++i)
+src_mds_.push_back(*desc_.src_mds[i]);
+
+init_desc();
+}

sum_pd_t(const primitive_attr_t *attr, const memory_desc_t *dst_md, int n,
const float *scales, const memory_desc_t *const *src_mds)
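
A note on the member reorder above (desc_ moved to the top of the member list): C++ initializes non-static members in declaration order, not in initializer-list order, so the new constructor's n_(desc_.n) is only safe because desc_ now precedes n_ in the class. A minimal illustration of that rule:

// Members initialize in declaration order: b_(a_ + 1) is well-defined
// only because a_ is declared before b_.
struct order_example_t {
    int a_;
    int b_;
    explicit order_example_t(int a) : a_(a), b_(a_ + 1) {}
};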
@@ -218,7 +235,9 @@ struct sum_pd_t : public primitive_desc_t {
if (!new_pd->is_initialized()) return nullptr; \
return new_pd.release(); \
} \
-const char *name() const override { return impl_name; }
+const char *name() const override { \
+return impl_name; \
+}

#define DECLARE_SUM_PD_T(impl_name, ...) \
DECLARE_SUM_PD_t(impl_name, __VA_ARGS__)
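
The name() change in the previous hunk is mechanical: once the function body spans several lines inside a #define, every line needs a trailing backslash, because a macro definition otherwise ends at the first unescaped newline. Schematically:

// Without the trailing backslashes the macro body would end at line one.
#define DECLARE_NAME_FN(impl_name) \
    const char *name() const override { \
        return impl_name; \
    }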
4 changes: 4 additions & 0 deletions src/gpu/gpu_sum_list.cpp
@@ -30,6 +30,8 @@

#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
#include "gpu/nvidia/cudnn_sum.hpp"
#include "gpu/sycl/ref_sum.hpp"
#include "gpu/sycl/ref_sum_many_inputs.hpp"
#endif

namespace dnnl {
@@ -49,6 +51,8 @@ constexpr impl_list_item_t impl_list[] = REG_SUM_P({
GPU_SUM_INSTANCE_INTEL(intel::ocl::simple_sum_t<data_type::f32>)
GPU_SUM_INSTANCE_INTEL(intel::ocl::ref_sum_t)
GPU_SUM_INSTANCE_NVIDIA(nvidia::cudnn_ref_sum_t)
+GPU_SUM_INSTANCE_GENERIC_SYCL(sycl::ref_sum_t)
+GPU_SUM_INSTANCE_GENERIC_SYCL(sycl::ref_sum_many_inputs_t)
nullptr,
});
// clang-format on
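
Order matters in this list: implementations are tried top to bottom and the first primitive descriptor whose init() succeeds is selected, so sycl::ref_sum_t (which rejects more than MAX_NUM_TENSORS inputs, per its pd_t::init() below) is preferred and sycl::ref_sum_many_inputs_t serves as the fallback. A self-contained sketch of that first-success dispatch pattern (names are illustrative, not the oneDNN API):

#include <functional>
#include <vector>

// The first entry whose init succeeds wins; later entries are fallbacks.
bool select_impl(const std::vector<std::function<bool()>> &impls) {
    for (const auto &try_init : impls)
        if (try_init()) return true;
    return false; // no implementation available
}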
114 changes: 114 additions & 0 deletions src/gpu/sycl/ref_sum.cpp
@@ -0,0 +1,114 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/sycl/ref_sum.hpp"
#include "gpu/sycl/sum_kernels.hpp"
#include "gpu/sycl/sycl_gpu_primitive.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

using namespace impl::sycl;

status_t ref_sum_t::pd_t::init_conf() {
conf_ = sycl_sum_conf_t();
conf_.n = n_inputs();

for (auto i = 0; i < conf_.n; ++i) {
conf_.src_md[i] = xpu::sycl::md_t(src_md(i));
conf_.src_scales[i] = scales()[i];
}
conf_.dst_md = xpu::sycl::md_t(dst_md());

// XXX: should probably be tuned.
conf_.block_size = 16;
conf_.wg_size = 32;
conf_.wk_size = memory_desc_wrapper(dst_md()).nelems();
return status::success;
}
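
Note: init_conf() copies each memory descriptor by value into xpu::sycl::md_t fields of conf_, presumably because sycl_sum_conf_t is captured by value into the SYCL kernel functor below and must therefore be device-copyable; host-side pointers would not survive that capture.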

status_t ref_sum_t::init(engine_t *engine) {
const auto kid = ::sycl::get_kernel_id<sum_kernel_vec_t>();
CHECK(create_kernel(engine, kid, &kernel_));

return status::success;
}

status_t ref_sum_t::execute(const exec_ctx_t &ctx) const {
using namespace memory_tracking::names;

parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
auto src0_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 0);
auto src1_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 1);
auto src2_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 2);
auto src3_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 3);
auto src4_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 4);
auto src5_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 5);
auto src6_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 6);
auto src7_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 7);
auto src8_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 8);
auto src9_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 9);
auto src10_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 10);
auto src11_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 11);
auto src12_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 12);
auto src13_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 13);
auto src14_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 14);
auto src15_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 15);

auto dst_mem_arg = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST);

sum_kernel_vec_t sum_kernel(pd()->conf_, src0_mem_arg, src1_mem_arg,
src2_mem_arg, src3_mem_arg, src4_mem_arg, src5_mem_arg,
src6_mem_arg, src7_mem_arg, src8_mem_arg, src9_mem_arg,
src10_mem_arg, src11_mem_arg, src12_mem_arg, src13_mem_arg,
src14_mem_arg, src15_mem_arg, dst_mem_arg);

const int block_size = pd()->conf_.block_size;
const int wg_size = pd()->conf_.wg_size;

const int t_work = pd()->conf_.wk_size;
const int wg_work = wg_size * block_size;
const int wg_cnt = utils::div_up(t_work, wg_work);
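// Worked example with illustrative numbers (not from the source): for
// wk_size = 1000, wg_size = 32 and block_size = 16, wg_work is 512,
// wg_cnt = div_up(1000, 512) = 2, and the nd_range below launches
// 2 * 32 = 64 work-items, each covering up to block_size dst elements.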

cgh.parallel_for(
::sycl::nd_range<1>(wg_cnt * wg_size, wg_size), sum_kernel);
});

return status::success;
}

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl
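
For reference, a minimal user-side sketch (standard oneDNN C++ API; the two equally shaped f32 sources and the 0.5f scale are illustrative) of a call that would route to this implementation on a generic SYCL GPU build:

#include "oneapi/dnnl/dnnl.hpp"

#include <vector>

void sum_two(dnnl::engine &eng, dnnl::stream &strm, dnnl::memory &src0,
        dnnl::memory &src1, dnnl::memory &dst) {
    std::vector<float> scales {1.f, 0.5f}; // dst = src0 + 0.5 * src1
    auto pd = dnnl::sum::primitive_desc(eng, dst.get_desc(), scales,
            {src0.get_desc(), src1.get_desc()});
    dnnl::sum(pd).execute(strm,
            {{DNNL_ARG_DST, dst}, {DNNL_ARG_MULTIPLE_SRC + 0, src0},
                    {DNNL_ARG_MULTIPLE_SRC + 1, src1}});
}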
96 changes: 96 additions & 0 deletions src/gpu/sycl/ref_sum.hpp
@@ -0,0 +1,96 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_SYCL_REF_SUM_HPP
#define GPU_SYCL_REF_SUM_HPP

#include "common/primitive.hpp"
#include "common/stream.hpp"
#include "gpu/gpu_sum_pd.hpp"
#include "gpu/sycl/sycl_gpu_primitive.hpp"
#include "gpu/sycl/sycl_io_helper.hpp"
#include "gpu/sycl/sycl_post_ops.hpp"
#include "gpu/sycl/sycl_primitive_conf.hpp"
#include "gpu/sycl/sycl_q10n.hpp"
#include "sycl/sycl_stream.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

struct ref_sum_t : public sycl_gpu_primitive_t {
using sycl_gpu_primitive_t::sycl_gpu_primitive_t;

struct pd_t : public gpu_sum_pd_t {
using gpu_sum_pd_t::gpu_sum_pd_t;

DECLARE_SUM_PD_T("dpcpp:ref:any", ref_sum_t);

status_t init(engine_t *engine) {
using namespace data_type;
using namespace format_tag;

const memory_desc_wrapper dst_d(dst_md());
if (!utils::one_of(dst_d.data_type(), f32, bf16, f16, s8, u8))
return status::unimplemented;
// Block formats are not yet supported
// Dimensions can not be > 6
if (!dst_d.is_plain() || dst_d.ndims() > MAX_NDIMS)
return status::unimplemented;

const int n = n_inputs();
for (auto i = 0; i < n; ++i) {
const memory_desc_wrapper src_d(src_md(i));
if (!utils::one_of(src_d.data_type(), f32, bf16, f16, s8, u8))
return status::unimplemented;
// Block formats are not yet supported
// Dimensions can not be > 6
if (!src_d.is_plain() || src_d.ndims() > MAX_NDIMS)
return status::unimplemented;
}

const bool ok = set_default_params() == status::success
&& n <= MAX_NUM_TENSORS;
if (!ok) return status::unimplemented;

return init_conf();
}

sycl_sum_conf_t conf_;

private:
status_t init_conf();

inline bool equal(float in_value, float in_compare_to) {
return std::fabs(in_value - in_compare_to) <= FLT_EPSILON;
}
};

status_t init(engine_t *engine) override;
status_t execute(const exec_ctx_t &ctx) const override;

private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
intel::compute::kernel_t kernel_;
};

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
79 changes: 79 additions & 0 deletions src/gpu/sycl/ref_sum_many_inputs.cpp
@@ -0,0 +1,79 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/sycl/ref_sum_many_inputs.hpp"
#include "common/primitive_desc_iface.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

using namespace impl::sycl;

status_t ref_sum_many_inputs_t::pd_t::init_conf() {
conf_ = sycl_sum_conf_t();
conf_.n = n_inputs();

return status::success;
}

status_t ref_sum_many_inputs_t::init(engine_t *engine) {
const size_t n = pd()->base_pds_.size();
base_prims_.resize(n);
for (size_t i = 0; i < n; ++i) {
CHECK(pd()->base_pds_[i]->impl()->create_primitive(
base_prims_[i], engine, cache_blob()));
}

return status::success;
}
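
Note: base_pds_ is presumably populated in the pd's init() in ref_sum_many_inputs.hpp (one of the files not shown above), one nested ref_sum_t descriptor per chunk of at most MAX_NUM_TENSORS inputs, matching the chaining in execute() below.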

status_t ref_sum_many_inputs_t::execute(const exec_ctx_t &ctx) const {
memory_arg_t dst_mem_arg = {ctx.args().at(DNNL_ARG_DST).mem, false};
memory_arg_t dst_read_mem_arg = {ctx.args().at(DNNL_ARG_DST).mem, true};

int n_remaining = pd()->conf_.n;
int in_arg_offset = 0;
int i = 0;

while (n_remaining > 0) {
bool pass_in_dst = i > 0;
int max_n_child_inputs = MAX_NUM_TENSORS - pass_in_dst;
int args_handled = std::min(n_remaining, max_n_child_inputs);
exec_args_t r_args;
r_args[DNNL_ARG_DST] = dst_mem_arg;
if (pass_in_dst) {
r_args[DNNL_ARG_MULTIPLE_SRC + 0] = dst_read_mem_arg;
}
for (int j = 0; j < args_handled; j++) {
r_args[DNNL_ARG_MULTIPLE_SRC + j + pass_in_dst]
= ctx.args().at(DNNL_ARG_MULTIPLE_SRC + j + in_arg_offset);
}
n_remaining -= args_handled;
in_arg_offset += args_handled;

exec_ctx_t r_ctx(ctx, std::move(r_args));
// Run the current child primitive before advancing the index;
// incrementing i first would skip base_prims_[0] and read past the
// end of base_prims_ on the last iteration.
CHECK(base_prims_[i]->execute(r_ctx));
i++;
}
return status::success;
}
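
A worked trace of the chaining loop above, assuming MAX_NUM_TENSORS is 16 (consistent with the sixteen source arguments bound in ref_sum.cpp) and n = 20 inputs:

// iteration 0: pass_in_dst = false, args_handled = 16;
//              the child sums srcs 0..15 into dst
// iteration 1: pass_in_dst = true, args_handled = 4;
//              the child reads dst back as its src 0 and sums it
//              with srcs 16..19 into dst
// Two chained ref_sum_t executions thus cover all 20 inputs.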

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl