Skip to content

Commit

Permalink
gpu: ocl: reorder: support large buffers
Browse files Browse the repository at this point in the history
  • Loading branch information
atkassen authored and karturov committed Mar 27, 2024
1 parent bec487e commit 74a343b
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 37 deletions.
23 changes: 15 additions & 8 deletions src/gpu/ocl/generic_reorder.cl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2021-2023 Intel Corporation
* Copyright 2021-2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,14 +14,20 @@
* limitations under the License.
*******************************************************************************/

#define USE_CUSTOM_GWS_GET_ID

#include "gpu/ocl/dispatch.h"
#include "gpu/ocl/reorder_common.h"
#include "gpu/ocl/types_interop.h"

#define GWS_GET_THREAD_ID(index) (get_global_id(index) + offset.array[index])

KERNEL_ATTR
__kernel void generic_reorder(__global SRC_DATA_T *restrict src,
__global DST_DATA_T *restrict dst, __global float *restrict src_scales,
__global int *restrict src_zps, __global float *restrict dst_scales,
__global int *restrict dst_zps, float sum_scale, int sum_zp) {
__global int *restrict dst_zps, float sum_scale, int sum_zp,
int64x3_t offset) {

const int src_zp = GET_SRC_ZP(src_zps);
const int dst_zp = GET_DST_ZP(dst_zps);
Expand All @@ -33,8 +39,9 @@ __kernel void generic_reorder(__global SRC_DATA_T *restrict src,

#define LOOP_NEST_LEVEL 4
const uint sgId = get_sub_group_local_id();
uint d[6]; // tensor coordinates from workitem ID
uint b[6] = {0, 0, 0, 0, 0, 0}; // ajustment to coordinates per block (loop)
off_t d[6]; // tensor coordinates from workitem ID
off_t b[6]
= {0, 0, 0, 0, 0, 0}; // ajustment to coordinates per block (loop)

d[0] = GWS_GET_D0();
d[1] = GWS_GET_D1();
Expand Down Expand Up @@ -103,7 +110,7 @@ __kernel void generic_reorder(__global SRC_DATA_T *restrict src,
// In a majority of cases, src_off contains neighboring values for
// neighboring workitems. Exceptions occur when the src tensor has
// additional striding in some dimension(s).
const uint src_off = SRC_OFF(d[0] + b[0], d[1] + b[1], d[2] + b[2],
const off_t src_off = SRC_OFF(d[0] + b[0], d[1] + b[1], d[2] + b[2],
d[3] + b[3], d[4] + b[4], d[5] + b[5]);

// Data in cache (local mem) is organized as if it had 'fedcba' format
Expand Down Expand Up @@ -172,7 +179,7 @@ __kernel void generic_reorder(__global SRC_DATA_T *restrict src,
// neighboring workitems. Exceptions occur when the dst tensor has
// additional striding in some dimension(s), e.g., when reorder is used
// in reference concat.
const uint dst_off = DST_OFF(d[0] + b[0], d[1] + b[1], d[2] + b[2],
const off_t dst_off = DST_OFF(d[0] + b[0], d[1] + b[1], d[2] + b[2],
d[3] + b[3], d[4] + b[4], d[5] + b[5]);

DST_DATA_T dst_tmp;
Expand All @@ -197,14 +204,14 @@ __kernel void generic_reorder(__global SRC_DATA_T *restrict src,
dst_tmp = dst[dst_off];
#endif
#if WITH_SRC_SCALE
uint src_scale_idx = SCALE_OFF(SRC, d[0] + b[0], d[1] + b[1],
off_t src_scale_idx = SCALE_OFF(SRC, d[0] + b[0], d[1] + b[1],
d[2] + b[2], d[3] + b[3], d[4] + b[4], d[5] + b[5]);
src_scale = src_scale_idx < SRC_NUM_SCALES
? src_scales[src_scale_idx]
: 0.0;
#endif
#if WITH_DST_SCALE
uint dst_scale_idx = SCALE_OFF(DST, d[0] + b[0], d[1] + b[1],
off_t dst_scale_idx = SCALE_OFF(DST, d[0] + b[0], d[1] + b[1],
d[2] + b[2], d[3] + b[3], d[4] + b[4], d[5] + b[5]);
dst_scale = dst_scale_idx < DST_NUM_SCALES
? dst_scales[dst_scale_idx]
Expand Down
5 changes: 2 additions & 3 deletions src/gpu/ocl/generic_reorder.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2021-2023 Intel Corporation
* Copyright 2021-2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -992,8 +992,7 @@ status_t generic_reorder_t::execute(const exec_ctx_t &ctx) const {

auto nd_range = conf.dispatch.nd_range();

status = parallel_for(ctx, nd_range, kernel_, arg_list);
return status;
return large_parallel_for(ctx, nd_range, kernel_, arg_list, 8);
}

} // namespace ocl
Expand Down
51 changes: 29 additions & 22 deletions src/gpu/ocl/ref_reorder.cl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2019-2023 Intel Corporation
* Copyright 2019-2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,14 +14,21 @@
* limitations under the License.
*******************************************************************************/

#define USE_CUSTOM_GWS_GET_ID

#include "gpu/ocl/dispatch.h"
#include "gpu/ocl/reorder_common.h"
#include "gpu/ocl/types_interop.h"

#define TO_I4 ((DST_DT_U4 || DST_DT_S4) && (!SRC_DT_U4 && !SRC_DT_S4))
#define FROM_I4 ((SRC_DT_U4 || SRC_DT_S4) && (!DST_DT_U4 && !DST_DT_S4))
#define GWS_GET_THREAD_ID(index) (get_global_id(index) + offset.array[index])

KERNEL_ATTR
__kernel void ref_reorder(__global SRC_DATA_T *restrict src,
__global DST_DATA_T *restrict dst, __global float *restrict src_scales,
__global int *restrict src_zps, __global float *restrict dst_scales,
__global int *dst_zps, float sum_scale, int sum_zp) {
__global int *dst_zps, float sum_scale, int sum_zp, int64x3_t offset) {

const int src_zp = GET_SRC_ZP(src_zps);
const int dst_zp = GET_DST_ZP(dst_zps);
Expand All @@ -31,28 +38,28 @@ __kernel void ref_reorder(__global SRC_DATA_T *restrict src,
src += SRC_OFFSET0;
dst += DST_OFFSET0;

const int d0_blk_start = GWS_GET_D0();
const int d1_blk_start = GWS_GET_D1();
const int d2_blk_start = GWS_GET_D2();
const int d3_blk_start = GWS_GET_D3();
const int d4_blk_start = GWS_GET_D4();
const int d5_blk_start = GWS_GET_D5();
const off_t d0_blk_start = GWS_GET_D0();
const off_t d1_blk_start = GWS_GET_D1();
const off_t d2_blk_start = GWS_GET_D2();
const off_t d3_blk_start = GWS_GET_D3();
const off_t d4_blk_start = GWS_GET_D4();
const off_t d5_blk_start = GWS_GET_D5();

const int d0_blk_end = d0_blk_start + GWS_GET_D0_BLOCK();
const int d1_blk_end = d1_blk_start + GWS_GET_D1_BLOCK();
const int d2_blk_end = d2_blk_start + GWS_GET_D2_BLOCK();
const int d3_blk_end = d3_blk_start + GWS_GET_D3_BLOCK();
const int d4_blk_end = d4_blk_start + GWS_GET_D4_BLOCK();
const int d5_blk_end = d5_blk_start + GWS_GET_D5_BLOCK();
const off_t d0_blk_end = d0_blk_start + GWS_GET_D0_BLOCK();
const off_t d1_blk_end = d1_blk_start + GWS_GET_D1_BLOCK();
const off_t d2_blk_end = d2_blk_start + GWS_GET_D2_BLOCK();
const off_t d3_blk_end = d3_blk_start + GWS_GET_D3_BLOCK();
const off_t d4_blk_end = d4_blk_start + GWS_GET_D4_BLOCK();
const off_t d5_blk_end = d5_blk_start + GWS_GET_D5_BLOCK();

for_(int d0 = d0_blk_start; d0 < d0_blk_end; ++d0)
for_(int d1 = d1_blk_start; d1 < d1_blk_end; ++d1)
for_(int d2 = d2_blk_start; d2 < d2_blk_end; ++d2)
for_(int d3 = d3_blk_start; d3 < d3_blk_end; ++d3)
for_(int d4 = d4_blk_start; d4 < d4_blk_end; ++d4)
for (int d5 = d5_blk_start; d5 < d5_blk_end; ++d5) {
const int src_off = SRC_OFF(d0, d1, d2, d3, d4, d5);
const int dst_off = DST_OFF(d0, d1, d2, d3, d4, d5);
for_(off_t d0 = d0_blk_start; d0 < d0_blk_end; ++d0)
for_(off_t d1 = d1_blk_start; d1 < d1_blk_end; ++d1)
for_(off_t d2 = d2_blk_start; d2 < d2_blk_end; ++d2)
for_(off_t d3 = d3_blk_start; d3 < d3_blk_end; ++d3)
for_(off_t d4 = d4_blk_start; d4 < d4_blk_end; ++d4)
for (off_t d5 = d5_blk_start; d5 < d5_blk_end; ++d5) {
const off_t src_off = SRC_OFF(d0, d1, d2, d3, d4, d5);
const off_t dst_off = DST_OFF(d0, d1, d2, d3, d4, d5);
#if PAD_FILL_ZERO == 1
int pad_d0 = d0 >= SRC_D0;
int pad_d1 = NDIMS > 1 && d1 >= SRC_D1;
Expand Down
6 changes: 2 additions & 4 deletions src/gpu/ocl/ref_reorder.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2019-2023 Intel Corporation
* Copyright 2019-2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -141,9 +141,7 @@ status_t ref_reorder_t::execute(const exec_ctx_t &ctx) const {

auto nd_range = conf.dispatch.nd_range();

status = parallel_for(ctx, nd_range, kernel_, arg_list);

return status;
return large_parallel_for(ctx, nd_range, kernel_, arg_list, 8);
}

} // namespace ocl
Expand Down

0 comments on commit 74a343b

Please sign in to comment.