cpu: aarch64: Enable ACL stateless API for indirect conv #2022

Merged 1 commit on Aug 6, 2024
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,5 +1,6 @@
#===============================================================================
# Copyright 2019-2021 Intel Corporation
# Copyright 2024 Arm Limited and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -29,4 +30,4 @@ compile_commands.json
.git-blame-ignore-revs
**/.DS_Store
__pycache__

.cache
2 changes: 1 addition & 1 deletion README.md
@@ -171,7 +171,7 @@ On a CPU based on Arm AArch64 architecture, oneDNN CPU engine can be built with
machine learning applications and provides AArch64 optimized implementations
of core functions. This functionality currently requires that ACL is downloaded
and built separately. See [Build from Source] section of the Developer Guide for
details. oneDNN only supports Compute Library versions 24.04 or later.
details. oneDNN only supports Compute Library versions 24.07 or later.

[Arm Compute Library (ACL)]: https://github.com/arm-software/ComputeLibrary

2 changes: 1 addition & 1 deletion cmake/ACL.cmake
@@ -31,7 +31,7 @@ endif()

find_package(ACL REQUIRED)

set(ACL_MINIMUM_VERSION "24.04")
set(ACL_MINIMUM_VERSION "24.07")

if(ACL_FOUND)
file(GLOB_RECURSE ACL_VERSION_FILE ${ACL_INCLUDE_DIR}/*/arm_compute_version.embed)
2 changes: 2 additions & 0 deletions src/common/memory_tracking.hpp
@@ -199,6 +199,7 @@ enum {
key_conv_gemm_zp_src_comp,
key_conv_int_dat_in_acc_dt,
key_conv_padded_bias,
key_conv_permuted_weights,
key_conv_rtus_space,
key_conv_store_wsp,
key_conv_tails,
@@ -225,6 +226,7 @@
key_gemm_blocked_a,
key_gemm_blocked_b,
key_gemm_accumulator,
key_gemm_pretranspose,
key_generic_acc,
key_gnorm_cvt,
key_gnorm_reduction,
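
The two keys added above (key_conv_permuted_weights and key_gemm_pretranspose) are consumed by the templated helpers introduced later in this PR: at primitive-descriptor creation, the sizes reported by ACL's workspace() query are booked against them, and at execution the scratchpad grantor hands the buffers back. The following is a minimal sketch of that book-then-get pattern, reusing the internal registrar/grantor calls the same way the hunks below do; the free-function wrappers and the size and alignment parameters are purely illustrative.

#include "common/memory_tracking.hpp"

using namespace dnnl::impl;

// Illustrative only: book a buffer against the new key at pd-creation time.
// In the real code, `size` and `alignment` come from the ACL
// MemoryRequirements entry (aux_mem_req[slot].size / .alignment).
void book_gemm_pretranspose(memory_tracking::registrar_t &scratchpad,
        size_t size, size_t alignment) {
    if (size > 0)
        scratchpad.book(memory_tracking::names::key_gemm_pretranspose, size, 1,
                alignment, alignment);
}

// Illustrative only: fetch the same buffer at execution time so it can be
// wrapped in an arm_compute::Tensor and added to the ITensorPack.
void *get_gemm_pretranspose(const memory_tracking::grantor_t &scratchpad) {
    return scratchpad.get<void>(memory_tracking::names::key_gemm_pretranspose);
}
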
37 changes: 0 additions & 37 deletions src/cpu/aarch64/acl_convolution_utils.cpp
@@ -310,43 +310,6 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
return status::success;
}

status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr) {
if (weights_md.ndims != 4) return status::unimplemented;

// Indirect is slower for small convolution kernels, except when src, weight and dst are BF16
if (weights_md.dims[2] == 1 && weights_md.dims[3] == 1
&& !everyone_is(data_type::bf16, src_md.data_type,
weights_md.data_type, dst_md.data_type))
return status::unimplemented;

CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));

// If we do not need to pad input channels for fast math mode then it would
// be faster to run convolution with im2row instead of using indirect kernel
int block_by = arm_compute::block_by(acp.weights_info.weight_format());
int ic = src_md.dims[1];
if (acp.fast_math && ic % block_by == 0) return status::unimplemented;

// clang-format off
// NOTE: indirect convolution method supports only nhwc layout.
ACL_CHECK_VALID(arm_compute::NEGEMMConv2d::validate(
&acp.src_tensor_info,
&acp.wei_tensor_info,
acp.with_bias ? &acp.bia_tensor_info : nullptr,
&acp.dst_tensor_info,
arm_compute::Conv2dInfo(acp.padstride_info,
acp.dilation_info,
acp.act_info,
acp.fast_math,
1, acp.weights_info)));
// clang-format on

return status::success;
}

status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
110 changes: 109 additions & 1 deletion src/cpu/aarch64/acl_convolution_utils.hpp
@@ -34,6 +34,7 @@ struct acl_obj_t {
arm_compute::Tensor wei_tensor;
arm_compute::Tensor bia_tensor;
arm_compute::Tensor dst_tensor;
arm_compute::experimental::MemoryRequirements aux_mem_req;
};

struct acl_conv_conf_t {
@@ -65,7 +66,7 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr);

status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr);
@@ -81,6 +82,113 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
const primitive_attr_t &attr);
} // namespace acl_convolution_utils

// Keys are anonymous with local linkage. So deduce the type automagically.
using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);

template <typename op_t, typename post_ops_t>
status_t init_scratchpad(op_t &conv, memory_tracking::registrar_t &scratchpad,
const std::map<int, conv_key_t> &conv_keys, engine_t *engine,
post_ops_t &post_ops, dnnl::impl::post_ops_t &attr_post_ops,
arm_compute::ActivationLayerInfo &act_info, bool &use_dst_acc_for_sum,
const dnnl::impl::memory_desc_t &dst_md) {

// Book temp mem.
const auto aux_mem_req = conv.workspace();
for (const auto &key : conv_keys) {
const auto id = key.first;
if (aux_mem_req[id].size > 0) {
scratchpad.book(key.second, aux_mem_req[id].size, 1,
aux_mem_req[id].alignment, aux_mem_req[id].alignment);
}
}

CHECK(post_ops.init(engine, attr_post_ops, dst_md, act_info));
use_dst_acc_for_sum = post_ops.has_sum();

if (use_dst_acc_for_sum) {
const memory_desc_wrapper dst_d(&dst_md);
scratchpad.book(memory_tracking::names::key_generic_acc, dst_d.nelems(),
dst_d.data_type_size());
}

return status::success;
}

template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
typename bia_data_t = src_data_t>
status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
conv_obj_t *acl_conv_obj, const conv_pd_t *pd,
const std::map<int, conv_key_t> &conv_keys) {

auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);

// import_memory() and free() methods do not allocate/free any additional
// memory, only acquire/release pointers.
arm_compute::Tensor src_tensor;
arm_compute::Tensor wei_tensor;
arm_compute::Tensor bia_tensor = nullptr;
arm_compute::Tensor dst_tensor;

auto const acp = pd->acp_;

src_tensor.allocator()->init(acp.src_tensor_info);
wei_tensor.allocator()->init(acp.wei_tensor_info);
dst_tensor.allocator()->init(acp.dst_tensor_info);

src_tensor.allocator()->import_memory(const_cast<src_data_t *>(src_base));
wei_tensor.allocator()->import_memory(const_cast<wei_data_t *>(wei_base));

const auto scratchpad = ctx.get_scratchpad_grantor();

// If we have an unfused sum post op, put the result in a scratchpad tensor.
// Result will be summed to the dst during acl_post_ops.execute
auto dst_base = acp.use_dst_acc_for_sum
? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
: CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
dst_tensor.allocator()->import_memory(dst_base);

if (acp.with_bias) {
auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
bia_tensor.allocator()->init(acp.bia_tensor_info);
bia_tensor.allocator()->import_memory(
const_cast<bia_data_t *>(bia_base));
}

arm_compute::ITensorPack pack
= {{arm_compute::TensorType::ACL_SRC_0, &src_tensor},
{arm_compute::TensorType::ACL_SRC_1, &wei_tensor},
{arm_compute::TensorType::ACL_SRC_2, &bia_tensor},
{arm_compute::TensorType::ACL_DST, &dst_tensor}};

// Get temp workspaces.
const auto aux_mem = acl_conv_obj->aux_mem_req;

// Hold onto tmp tensors while we need pack.
std::vector<arm_compute::Tensor> tmp_tensors(aux_mem.size());
for (const auto &key : conv_keys) {
const auto id = key.first;
if (aux_mem[id].size > 0) {
const auto info = arm_compute::TensorInfo(
arm_compute::TensorShape(aux_mem[id].size), 1,
arm_compute::DataType::U8);
auto buffer = scratchpad.get<void>(key.second);
tmp_tensors[id].allocator()->init(info, aux_mem[id].alignment);
tmp_tensors[id].allocator()->import_memory(buffer);
pack.add_tensor(aux_mem[id].slot, &tmp_tensors[id]);
}
}

acl_conv_obj->conv.prepare(pack);
acl_conv_obj->conv.run(pack);

void *dst = dst_tensor.buffer();
pd->post_ops.execute(ctx, dst);

return status::success;
}

template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
typename bia_data_t = src_data_t>
116 changes: 102 additions & 14 deletions src/cpu/aarch64/acl_indirect_gemm_convolution.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2021-2022 Arm Ltd. and affiliates
* Copyright 2021-2022, 2024 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,27 +14,115 @@
* limitations under the License.
*******************************************************************************/

#include "cpu/aarch64/acl_indirect_gemm_convolution.hpp"
#include "acl_indirect_gemm_convolution.hpp"
#include "acl_convolution_utils.hpp"
#include "common/memory_tracking.hpp"
#include "common/utils.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace aarch64 {

namespace {
// Keys are anonymous. So deduce the type automagically.
using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);

// Map: [slot , key]
const std::map<int, conv_key_t> indirect_conv_keys
= {{0, conv_key_t::key_gemm_tmp_buffer},
{2, conv_key_t::key_gemm_pretranspose},
{3, conv_key_t::key_conv_permuted_weights}};
} // namespace

status_t acl_indirect_gemm_convolution_fwd_t::init(engine_t *engine) {
auto acp_ = pd()->acp_;
acl_obj_->conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
&acp_.dst_tensor_info,
arm_compute::Conv2dInfo(acp_.padstride_info, acp_.dilation_info,
acp_.act_info, acp_.fast_math, 1, acp_.weights_info));
acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
return status::success;
}

status_t acl_indirect_gemm_convolution_fwd_t::execute_forward(
const exec_ctx_t &ctx) const {
// Lock here is needed because resource_mapper does not support
// concurrent multithreaded access.
std::lock_guard<std::mutex> _lock {this->mtx};
// Retrieve primitive resource and configured Compute Library objects
auto *acl_resource
= ctx.get_resource_mapper()->get<acl_indirect_gemm_resource_t>(
this);
acl_obj_t<arm_compute::NEGEMMConv2d> &acl_indirect_gemm_obj
= acl_resource->get_acl_obj();

return execute_forward_conv_acl<acl_obj_t<arm_compute::NEGEMMConv2d>, pd_t,
data_t>(ctx, acl_indirect_gemm_obj, pd());
return execute_forward_conv_acl<acl_obj_t<Op>, pd_t, data_t>(
ctx, acl_obj_.get(), pd(), indirect_conv_keys);
}

status_t acl_indirect_gemm_convolution_fwd_t::create_resource(
engine_t *engine, resource_mapper_t &mapper) const {

CHECK(pd()->post_ops.create_resource(engine, mapper));
return status::success;
}

status_t acl_indirect_gemm_convolution_fwd_t::pd_t::init_conf() {
if (weights_md_.ndims != 4) return status::unimplemented;

// Indirect is slower for small convolution kernels, except when src, weight and dst are BF16
if (weights_md_.dims[2] == 1 && weights_md_.dims[3] == 1
&& !dnnl::impl::utils::everyone_is(data_type::bf16,
src_md_.data_type, weights_md_.data_type,
dst_md_.data_type))
return status::unimplemented;

CHECK(acl_convolution_utils::acl_init_conf(
acp_, src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr()));

// If we do not need to pad input channels for fast math mode then it would
// be faster to run convolution with im2row instead of using indirect kernel
int block_by = arm_compute::block_by(acp_.weights_info.weight_format());
int ic = src_md_.dims[1];
if (acp_.fast_math && ic % block_by == 0) return status::unimplemented;

// clang-format off
// NOTE: indirect convolution method supports only nhwc layout.
ACL_CHECK_VALID(Op::validate(
&acp_.src_tensor_info,
&acp_.wei_tensor_info,
acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
&acp_.dst_tensor_info,
arm_compute::Conv2dInfo(acp_.padstride_info,
acp_.dilation_info,
acp_.act_info,
acp_.fast_math,
1, acp_.weights_info)));
// clang-format on

return status::success;
}

status_t acl_indirect_gemm_convolution_fwd_t::pd_t::init(engine_t *engine) {
using namespace data_type;
using smask_t = primitive_attr_t::skip_mask_t;

const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
&& attr()->has_default_values(smask_t::post_ops, f16);
const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
&& attr()->has_default_values(
smask_t::post_ops | smask_t::fpmath_mode, f32);
bool ok = is_fwd() && set_default_alg_kind(alg_kind::convolution_direct)
&& utils::one_of(true, is_fp16_ok, is_fp32_ok)
&& !has_zero_dim_memory();
if (!ok) return status::unimplemented;

CHECK(init_conf());

// Book memory.
Op conv;
conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
&acp_.dst_tensor_info,
arm_compute::Conv2dInfo(acp_.padstride_info, acp_.dilation_info,
acp_.act_info, acp_.fast_math, 1, acp_.weights_info));

auto scratchpad = scratchpad_registry().registrar();
return init_scratchpad(conv, scratchpad, indirect_conv_keys, engine,
post_ops, attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum,
dst_md_);
}

} // namespace aarch64
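
Taken together, this file now follows ACL's stateless operator flow rather than the mutex-guarded resource-mapper path visible in the removed lines: the operator is configured once at init, its auxiliary-memory requirements are captured via workspace(), and each execution only passes inputs, outputs, and scratchpad-backed workspaces through an ITensorPack. Below is a condensed, illustrative sketch of the per-execution part; it is templated on the operator type because the concrete alias Op is defined in the header and not shown in this diff, and the ITensorPack header path is assumed.

#include "arm_compute/core/ITensorPack.h" // header path assumed

// Illustrative only: OpT stands for the stateless ACL operator aliased as Op
// in the implementation, assumed to expose prepare()/run() taking an
// ITensorPack, exactly as the execute path above uses them.
template <typename OpT>
void run_stateless(OpT &conv, arm_compute::ITensorPack &pack) {
    // The pack already holds src/wei/bia/dst plus any workspace tensors
    // backed by the oneDNN scratchpad, so the operator keeps no mutable
    // per-primitive state and no lock is needed across threads.
    conv.prepare(pack);
    conv.run(pack);
}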