gpu: ocl: reorder: support large buffers

oneapi-src · Mar 27, 2024 · 74a343b · 74a343b
1 parent bec487e
commit 74a343b
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 37 deletions.
diff --git a/src/gpu/ocl/generic_reorder.cl b/src/gpu/ocl/generic_reorder.cl
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2023 Intel Corporation
+* Copyright 2021-2024 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,14 +14,20 @@
 * limitations under the License.
 *******************************************************************************/
 
+#define USE_CUSTOM_GWS_GET_ID
+
 #include "gpu/ocl/dispatch.h"
 #include "gpu/ocl/reorder_common.h"
+#include "gpu/ocl/types_interop.h"
+
+#define GWS_GET_THREAD_ID(index) (get_global_id(index) + offset.array[index])
 
 KERNEL_ATTR
 __kernel void generic_reorder(__global SRC_DATA_T *restrict src,
         __global DST_DATA_T *restrict dst, __global float *restrict src_scales,
         __global int *restrict src_zps, __global float *restrict dst_scales,
-        __global int *restrict dst_zps, float sum_scale, int sum_zp) {
+        __global int *restrict dst_zps, float sum_scale, int sum_zp,
+        int64x3_t offset) {
 
     const int src_zp = GET_SRC_ZP(src_zps);
     const int dst_zp = GET_DST_ZP(dst_zps);
@@ -33,8 +39,9 @@ __kernel void generic_reorder(__global SRC_DATA_T *restrict src,
 
 #define LOOP_NEST_LEVEL 4
     const uint sgId = get_sub_group_local_id();
-    uint d[6]; // tensor coordinates from workitem ID
-    uint b[6] = {0, 0, 0, 0, 0, 0}; // ajustment to coordinates per block (loop)
+    off_t d[6]; // tensor coordinates from workitem ID
+    off_t b[6]
+            = {0, 0, 0, 0, 0, 0}; // ajustment to coordinates per block (loop)
 
     d[0] = GWS_GET_D0();
     d[1] = GWS_GET_D1();
@@ -103,7 +110,7 @@ __kernel void generic_reorder(__global SRC_DATA_T *restrict src,
         // In a majority of cases, src_off contains neighboring values for
         // neighboring workitems. Exceptions occur when the src tensor has
         // additional striding in some dimension(s).
-        const uint src_off = SRC_OFF(d[0] + b[0], d[1] + b[1], d[2] + b[2],
+        const off_t src_off = SRC_OFF(d[0] + b[0], d[1] + b[1], d[2] + b[2],
                 d[3] + b[3], d[4] + b[4], d[5] + b[5]);
 
         // Data in cache (local mem) is organized as if it had 'fedcba' format
@@ -172,7 +179,7 @@ __kernel void generic_reorder(__global SRC_DATA_T *restrict src,
         // neighboring workitems. Exceptions occur when the dst tensor has
         // additional striding in some dimension(s), e.g., when reorder is used
         // in reference concat.
-        const uint dst_off = DST_OFF(d[0] + b[0], d[1] + b[1], d[2] + b[2],
+        const off_t dst_off = DST_OFF(d[0] + b[0], d[1] + b[1], d[2] + b[2],
                 d[3] + b[3], d[4] + b[4], d[5] + b[5]);
 
         DST_DATA_T dst_tmp;
@@ -197,14 +204,14 @@ __kernel void generic_reorder(__global SRC_DATA_T *restrict src,
             dst_tmp = dst[dst_off];
 #endif
 #if WITH_SRC_SCALE
-            uint src_scale_idx = SCALE_OFF(SRC, d[0] + b[0], d[1] + b[1],
+            off_t src_scale_idx = SCALE_OFF(SRC, d[0] + b[0], d[1] + b[1],
                     d[2] + b[2], d[3] + b[3], d[4] + b[4], d[5] + b[5]);
             src_scale = src_scale_idx < SRC_NUM_SCALES
                     ? src_scales[src_scale_idx]
                     : 0.0;
 #endif
 #if WITH_DST_SCALE
-            uint dst_scale_idx = SCALE_OFF(DST, d[0] + b[0], d[1] + b[1],
+            off_t dst_scale_idx = SCALE_OFF(DST, d[0] + b[0], d[1] + b[1],
                     d[2] + b[2], d[3] + b[3], d[4] + b[4], d[5] + b[5]);
             dst_scale = dst_scale_idx < DST_NUM_SCALES
                     ? dst_scales[dst_scale_idx]

diff --git a/src/gpu/ocl/generic_reorder.cpp b/src/gpu/ocl/generic_reorder.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2023 Intel Corporation
+* Copyright 2021-2024 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -992,8 +992,7 @@ status_t generic_reorder_t::execute(const exec_ctx_t &ctx) const {
 
     auto nd_range = conf.dispatch.nd_range();
 
-    status = parallel_for(ctx, nd_range, kernel_, arg_list);
-    return status;
+    return large_parallel_for(ctx, nd_range, kernel_, arg_list, 8);
 }
 
 } // namespace ocl

diff --git a/src/gpu/ocl/ref_reorder.cl b/src/gpu/ocl/ref_reorder.cl
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2023 Intel Corporation
+* Copyright 2019-2024 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,14 +14,21 @@
 * limitations under the License.
 *******************************************************************************/
 
+#define USE_CUSTOM_GWS_GET_ID
+
 #include "gpu/ocl/dispatch.h"
 #include "gpu/ocl/reorder_common.h"
+#include "gpu/ocl/types_interop.h"
+
+#define TO_I4 ((DST_DT_U4 || DST_DT_S4) && (!SRC_DT_U4 && !SRC_DT_S4))
+#define FROM_I4 ((SRC_DT_U4 || SRC_DT_S4) && (!DST_DT_U4 && !DST_DT_S4))
+#define GWS_GET_THREAD_ID(index) (get_global_id(index) + offset.array[index])
 
 KERNEL_ATTR
 __kernel void ref_reorder(__global SRC_DATA_T *restrict src,
         __global DST_DATA_T *restrict dst, __global float *restrict src_scales,
         __global int *restrict src_zps, __global float *restrict dst_scales,
-        __global int *dst_zps, float sum_scale, int sum_zp) {
+        __global int *dst_zps, float sum_scale, int sum_zp, int64x3_t offset) {
 
     const int src_zp = GET_SRC_ZP(src_zps);
     const int dst_zp = GET_DST_ZP(dst_zps);
@@ -31,28 +38,28 @@ __kernel void ref_reorder(__global SRC_DATA_T *restrict src,
     src += SRC_OFFSET0;
     dst += DST_OFFSET0;
 
-    const int d0_blk_start = GWS_GET_D0();
-    const int d1_blk_start = GWS_GET_D1();
-    const int d2_blk_start = GWS_GET_D2();
-    const int d3_blk_start = GWS_GET_D3();
-    const int d4_blk_start = GWS_GET_D4();
-    const int d5_blk_start = GWS_GET_D5();
+    const off_t d0_blk_start = GWS_GET_D0();
+    const off_t d1_blk_start = GWS_GET_D1();
+    const off_t d2_blk_start = GWS_GET_D2();
+    const off_t d3_blk_start = GWS_GET_D3();
+    const off_t d4_blk_start = GWS_GET_D4();
+    const off_t d5_blk_start = GWS_GET_D5();
 
-    const int d0_blk_end = d0_blk_start + GWS_GET_D0_BLOCK();
-    const int d1_blk_end = d1_blk_start + GWS_GET_D1_BLOCK();
-    const int d2_blk_end = d2_blk_start + GWS_GET_D2_BLOCK();
-    const int d3_blk_end = d3_blk_start + GWS_GET_D3_BLOCK();
-    const int d4_blk_end = d4_blk_start + GWS_GET_D4_BLOCK();
-    const int d5_blk_end = d5_blk_start + GWS_GET_D5_BLOCK();
+    const off_t d0_blk_end = d0_blk_start + GWS_GET_D0_BLOCK();
+    const off_t d1_blk_end = d1_blk_start + GWS_GET_D1_BLOCK();
+    const off_t d2_blk_end = d2_blk_start + GWS_GET_D2_BLOCK();
+    const off_t d3_blk_end = d3_blk_start + GWS_GET_D3_BLOCK();
+    const off_t d4_blk_end = d4_blk_start + GWS_GET_D4_BLOCK();
+    const off_t d5_blk_end = d5_blk_start + GWS_GET_D5_BLOCK();
 
-    for_(int d0 = d0_blk_start; d0 < d0_blk_end; ++d0)
-    for_(int d1 = d1_blk_start; d1 < d1_blk_end; ++d1)
-    for_(int d2 = d2_blk_start; d2 < d2_blk_end; ++d2)
-    for_(int d3 = d3_blk_start; d3 < d3_blk_end; ++d3)
-    for_(int d4 = d4_blk_start; d4 < d4_blk_end; ++d4)
-    for (int d5 = d5_blk_start; d5 < d5_blk_end; ++d5) {
-        const int src_off = SRC_OFF(d0, d1, d2, d3, d4, d5);
-        const int dst_off = DST_OFF(d0, d1, d2, d3, d4, d5);
+    for_(off_t d0 = d0_blk_start; d0 < d0_blk_end; ++d0)
+    for_(off_t d1 = d1_blk_start; d1 < d1_blk_end; ++d1)
+    for_(off_t d2 = d2_blk_start; d2 < d2_blk_end; ++d2)
+    for_(off_t d3 = d3_blk_start; d3 < d3_blk_end; ++d3)
+    for_(off_t d4 = d4_blk_start; d4 < d4_blk_end; ++d4)
+    for (off_t d5 = d5_blk_start; d5 < d5_blk_end; ++d5) {
+        const off_t src_off = SRC_OFF(d0, d1, d2, d3, d4, d5);
+        const off_t dst_off = DST_OFF(d0, d1, d2, d3, d4, d5);
 #if PAD_FILL_ZERO == 1
         int pad_d0 = d0 >= SRC_D0;
         int pad_d1 = NDIMS > 1 && d1 >= SRC_D1;

diff --git a/src/gpu/ocl/ref_reorder.cpp b/src/gpu/ocl/ref_reorder.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2023 Intel Corporation
+* Copyright 2019-2024 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -141,9 +141,7 @@ status_t ref_reorder_t::execute(const exec_ctx_t &ctx) const {
 
     auto nd_range = conf.dispatch.nd_range();
 
-    status = parallel_for(ctx, nd_range, kernel_, arg_list);
-
-    return status;
+    return large_parallel_for(ctx, nd_range, kernel_, arg_list, 8);
 }
 
 } // namespace ocl