From 3394f3e3df7fd2d924c41822a8564493fc06473a Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 16 Sep 2022 14:14:21 +0100 Subject: Rework direct convolution heuristic on OpenCL Resolves COMPMID-5634 Change-Id: I075de70d509d0c4430b4bcf3f218384e237a3a56 Signed-off-by: Gian Marco Iodice Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/453708 Tested-by: bsgcomp Reviewed-by: Viet-Hoa Do Comments-Addressed: bsgcomp Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8473 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir --- src/core/CL/DefaultLWSHeuristics.cpp | 9 ++- src/core/CL/cl_kernels/nhwc/direct_convolution.cl | 66 ++++++++++++---------- src/core/CL/cl_kernels/tile_helpers.h | 11 ++++ src/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 6 ++ .../ClDirectConvDefaultConfigValhall.cpp | 64 +++++++++++++++++---- 5 files changed, 113 insertions(+), 43 deletions(-) diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp index c739b9dc03..a53fdbbab6 100644 --- a/src/core/CL/DefaultLWSHeuristics.cpp +++ b/src/core/CL/DefaultLWSHeuristics.cpp @@ -61,7 +61,14 @@ cl::NDRange get_direct_lws(size_t gws_x, size_t gws_y, size_t gws_z) if(gws_x < gws_y) { - return cl::NDRange(4, 16, 1); + if(gws_x < 4) + { + return cl::NDRange(std::min(gws_x, static_cast(2u)), 32, 1); + } + else + { + return cl::NDRange(std::min(gws_x, static_cast(4u)), 8, 1); + } } else { diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl index e602fbb525..2e7ed5a4ca 100644 --- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl @@ -116,12 +116,12 @@ __kernel void direct_convolution_nhwc( // In case of dynamic tensor support, the following dimensions should be passed as function argument. 
#define _IWEI_WIDTH WEI_WIDTH #define _IWEI_HEIGHT WEI_HEIGHT -#define _ISRC_WIDTH src_w -#define _ISRC_HEIGHT src_h -#define _ISRC_CHANNELS src_c -#define _IDST_WIDTH dst_w -#define _IDST_HEIGHT dst_h -#define _IDST_CHANNELS dst_c +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _ISRC_CHANNELS SRC_CHANNELS +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _IDST_CHANNELS DST_CHANNELS #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) // If quantized, the output tile has to be quantized first before being stored to global memory @@ -159,12 +159,25 @@ __kernel void direct_convolution_nhwc( for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) { - int ck = 0; int xk = i % _IWEI_WIDTH; int yk = i / _IWEI_WIDTH; - int k = 0; - for(; k <= (_ISRC_CHANNELS - K0); k += K0) + TILE(int, M0, 1, my); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + int x_s = xi[i].v + xk; + int y_s = yi[i].v + yk; + my[i].v = x_s + y_s *_ISRC_WIDTH; + my[i].v = my[i].v + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT); + my[i].v = select(-1, my[i].v, x_s >= 0); + my[i].v = select(-1, my[i].v, x_s < _ISRC_WIDTH); + my[i].v = select(-1, my[i].v, y_s >= 0); + my[i].v = select(-1, my[i].v, y_s < _ISRC_HEIGHT); + }) + + int ck = 0; + for(; ck <= (_ISRC_CHANNELS - K0); ck += K0) { TILE(SRC_DATA_TYPE, M0, K0, a); TILE(WEI_DATA_TYPE, N0, K0, b); @@ -175,13 +188,8 @@ __kernel void direct_convolution_nhwc( a[i].v = ZERO_VALUE; }) - LOOP_UNROLLING(int, i, 0, 1, N0, - { - b[i].v = ZERO_VALUE; - }) - // Load tile from the src tensor - T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); + T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, my, a); // Load tile from the weights tensor T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); @@ -192,15 +200,13 @@ 
__kernel void direct_convolution_nhwc( // Apply the offset correction (correction usually needed for asymmetric quantized computation) // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c); - - ck += K0; } // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS // This #if directive should be removed in case of dynamic tensor support #if defined(LEFTOVER_LOOP) // Left-over accumulations - for(; k < _ISRC_CHANNELS; ++k) + for(; ck < _ISRC_CHANNELS; ++ck) { TILE(SRC_DATA_TYPE, M0, 1, a); TILE(WEI_DATA_TYPE, N0, 1, b); @@ -229,8 +235,6 @@ __kernel void direct_convolution_nhwc( // Apply the offset correction (operation usually needed for asymmetric quantized computation) // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c); - - ++ck; } #endif // defined(LEFTOVER_LOOP) } @@ -249,17 +253,6 @@ __kernel void direct_convolution_nhwc( #endif // HAS_BIAS - TILE(uint, M0, 1, dst_indirect_y); - - // Calculate the destination indirect Y - LOOP_UNROLLING(int, i, 0, 1, M0, - { - dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); - dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); - }) - - bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; - #if defined(IS_QUANTIZED) TILE(DST_DATA_TYPE, M0, N0, cq); @@ -271,6 +264,17 @@ __kernel void direct_convolution_nhwc( // Apply activation T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE); + TILE(uint, M0, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, i, 0, 1, M0, + { + dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); + dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + }) + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + // 
_IOUTPUT_TILE: c = fp32/fp16, cq=qasymm8 // Store the tile in reverse order so the invalid values are overwritten with the valid ones T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y); diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index 4b6144a22d..6279fb4fb6 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -653,6 +653,17 @@ }) \ }) +#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, yi, dst) \ + ({ \ + LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ + { \ + if(yi[_i].v >= 0) \ + { \ + dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[_i].v, STRIDE_Y); \ + } \ + }) \ + }) + /** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates * * @param[in] DATA_TYPE Data type diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp index fd14f009e1..781627117a 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -242,6 +242,12 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER"); build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0))); + build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1))); + build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(2))); + build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0))); + build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1))); + 
build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2))); build_options.add_option("-DDST_TENSOR_TYPE=BUFFER"); build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type)); build_options.add_option_if_else(_export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); diff --git a/src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp index ad94678335..b693568c67 100644 --- a/src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp +++ b/src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp @@ -144,32 +144,45 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( const int32_t ofm = dst_shape[0]; const int32_t m = dst_shape[1] * dst_shape[2]; + const int32_t k = wei_shape[0]; const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; desc.export_weights_to_cl_image = export_weights_to_cl_image; if(dst_shape[0] <= 4) { + // k0 should be as large as possible. However, we should avoid + // having left-over for loops that make the implementation slower.
+ if((k % 16) == 0) + { + desc.k0 = 16; + } + else if((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } + if(is_pointwise) { if(ofm == 4) { desc.m0 = 1; desc.n0 = 4; - desc.k0 = 16; } else { desc.m0 = 1; desc.n0 = 1; - desc.k0 = 16; } } else { desc.m0 = 1; desc.n0 = dst_shape[0]; - desc.k0 = 16; } } else @@ -178,21 +191,50 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16( { desc.m0 = 1; desc.n0 = 1; - desc.k0 = 16; + if((k % 16) == 0) + { + desc.k0 = 16; + } + else if((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } } else { - if(ofm > 16) + if(ofm >= 16) { - desc.m0 = 4; - desc.n0 = 4; - desc.k0 = 8; + if(m / 6 > 24000) + { + desc.m0 = 6; + } + else + { + desc.m0 = 5; + } + desc.n0 = 8; + desc.k0 = 4; } else { - desc.m0 = 4; - desc.n0 = 4; - desc.k0 = 16; + desc.m0 = 2; + desc.n0 = 8; + if((k % 16) == 0) + { + desc.k0 = 16; + } + else if((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } } } } -- cgit v1.2.1