From 7bc1a778520f3a8477d88819faeaca8ff77859fa Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice <gianmarco.iodice@arm.com>
Date: Wed, 8 Sep 2021 17:14:19 +0100
Subject: Optimize ClScaleKernel on NHWC (f32/f16/int8)

The new kernel performs the computation on multiples elements. The
OpenCL kernel has been re-implemented using the new TILE macros

Resolves COMPMID-4803,COMPMID-4804

Change-Id: Iac8fead65e21b64567a05dbc4fbaa61d362443f9
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6235
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/CL/cl_kernels/nhwc/scale.cl | 345 +++++++++++++++++++++++------------
 src/gpu/cl/kernels/ClScaleKernel.cpp |  89 +++++++--
 2 files changed, 293 insertions(+), 141 deletions(-)

diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
index 69cbbcd5f3..21579aed9f 100644
--- a/src/core/CL/cl_kernels/nhwc/scale.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -22,154 +22,257 @@
  * SOFTWARE.
  */
 #include "helpers.h"
+#include "tile_helpers.h"
 
-#if defined(DEPTH_OUT)
-/** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. (NHWC)
+//! @cond Doxygen_Suppress
+/** Performs scale on a tensor by interpolating with the NEAREAST NEIGHBOUR method. (NHWC)
  *
  * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
+ * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
+ * @note The scale value to apply on the source width must be passed at compile using -DSCALE_X (e.g., -DSCALE_X=0.5)
+ * @note The scale value to apply on the source height must be passed at compile using -DSCALE_Y (e.g., -DSCALE_Y=0.5)
+ * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
  *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S16/F16/F32.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_stride_z                       Stride of the source image in Z dimension (in bytes)
- * @param[in]  in_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
- * @param[in]  out_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
+ //! @endcond
 __kernel void scale_nearest_neighbour_nhwc(
-    TENSOR4D_DECLARATION(in),
-    TENSOR4D_DECLARATION(out))
+    TENSOR4D(src, SRC_TENSOR_TYPE),
+    TENSOR4D(dst, DST_TENSOR_TYPE))
 {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
+    // All the tensor dimensions are passed at compile time.
+    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _ISCALE_X SCALE_X
+#define _ISCALE_Y SCALE_Y
+
+    const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+    const int xo   = GET_SPATIAL_IDX(1, 1, 0);           // WIDTH
+#if defined(BATCHED_EXECUTION)
+    const int yo   = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
+    const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
+#else                                                         // defined(BATCHED_EXECUTION)
+    const int yo   = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+    const int bout = 0; // BATCH SIZE IDX
+#endif                                                        // defined(BATCHED_EXECUTION)
 
 #ifdef SAMPLING_POLICY_TOP_LEFT
-    float new_x = get_global_id(1) * SCALE_X;
-    float new_y = (get_global_id(2) % DEPTH_OUT) * SCALE_Y;
+    float xi_f = (xo * (float)SCALE_X);
+    float yi_f = (yo * (float)SCALE_Y);
 #elif SAMPLING_POLICY_CENTER
-    float       new_x = (get_global_id(1) + 0.5f) * SCALE_X;
-    float       new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * SCALE_Y;
-#else /* SAMPLING_POLICY */
+    float     xi_f = ((xo + 0.5f) * (float)SCALE_X);
+    float     yi_f = ((yo + 0.5f) * (float)SCALE_Y);
+#else // SAMPLING_POLICY
 #error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
+#endif // SAMPLING_POLICY
+
 #ifdef ALIGN_CORNERS
-    new_x = round(new_x);
-    new_y = round(new_y);
-#endif /* ALIGN_CORNERS */
-    const float clamped_x = clamp(new_x, 0.0f, (float)SRC_WIDTH - 1);
-    const float clamped_y = clamp(new_y, 0.0f, (float)SRC_HEIGHT - 1);
+    xi_f = round(xi_f);
+    yi_f = round(yi_f);
+#endif // ALIGN_CORNERS
+
+    const int xi0 = clamp((int)xi_f, 0, _ISRC_WIDTH - 1);
+    const int yi0 = clamp((int)yi_f, 0, _ISRC_HEIGHT - 1);
+
+    TILE(SRC_DATA_TYPE, 1, N0, in00);
+
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00);
+
+    TILE(uint, 1, 1, dst_indirect_y);
 
-    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)));
+    // Calculate the destination indirect Y
+    dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+
+    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, in00, dst_indirect_y);
 }
 
-/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
+//! @cond Doxygen_Suppress
+/** Performs scale on a tensor by interpolating with the BILINEAR method. (NHWC)
  *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
  * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
- * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
- * @note The value to be used at the edges of the images shoud be given as a preprocessor argument using -DCONSTANT_VALUE=value.
+ * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
+ * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
+ * @note The scale value to apply on the source width must be passed at compile using -DSCALE_X (e.g., -DSCALE_X=0.5)
+ * @note The scale value to apply on the source height must be passed at compile using -DSCALE_Y (e.g., -DSCALE_Y=0.5)
+ * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
  *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S16/F16/F32.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_stride_z                       Stride of the source image in Z dimension (in bytes)
- * @param[in]  in_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
- * @param[in]  out_step_z                        dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @note In case of QASYMM8, the following extra information must be passed at compile time:
+ * - The source offset e.g. -DOFFSET=4
+ * - The source scale e.g. -DSCALE=4
  *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
+ //! @endcond
 __kernel void scale_bilinear_nhwc(
-    TENSOR4D_DECLARATION(in),
-    TENSOR4D_DECLARATION(out))
+    TENSOR4D(src, SRC_TENSOR_TYPE),
+    TENSOR4D(dst, DST_TENSOR_TYPE))
 {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
+    // All the tensor dimensions are passed at compile time.
+    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _ISCALE_X SCALE_X
+#define _ISCALE_Y SCALE_Y
+
+    const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+    const int xo   = GET_SPATIAL_IDX(1, 1, 0);           // WIDTH
+#if defined(BATCHED_EXECUTION)
+    const int yo   = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
+    const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
+#else                                                         // defined(BATCHED_EXECUTION)
+    const int yo   = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+    const int bout = 0;                        // BATCH SIZE IDX
+#endif                                                        // defined(BATCHED_EXECUTION)
 
 #ifdef SAMPLING_POLICY_TOP_LEFT
-    const float new_x = get_global_id(1) * SCALE_X;
-    const float new_y = (get_global_id(2) % DEPTH_OUT) * SCALE_Y;
+    float xi_f = (xo * (float)SCALE_X);
+    float yi_f = (yo * (float)SCALE_Y);
 #elif SAMPLING_POLICY_CENTER
-    const float new_x = (get_global_id(1) + 0.5f) * SCALE_X - 0.5f;
-    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * SCALE_Y - 0.5f;
-#else /* SAMPLING_POLICY */
+    float     xi_f = ((xo + 0.5f) * (float)SCALE_X - 0.5f);
+    float     yi_f = ((yo + 0.5f) * (float)SCALE_Y - 0.5f);
+#else // SAMPLING_POLICY
 #error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
+#endif // SAMPLING_POLICY
+
+    const int xi = (int)floor(xi_f);
+    const int yi = (int)floor(yi_f);
 
-    const float new_xf     = floor(new_x);
-    const float new_yf     = floor(new_y);
-    const float clamped_x  = clamp(new_xf, 0.0f, SRC_WIDTH - 1.f);
-    const float clamped_x1 = clamp(new_xf + 1, 0.0f, SRC_WIDTH - 1.f);
-    const float clamped_y  = clamp(new_yf, 0.0f, SRC_HEIGHT - 1.f);
-    const float clamped_y1 = clamp(new_yf + 1, 0.0f, SRC_HEIGHT - 1.f);
+    TILE(SRC_DATA_TYPE, 1, N0, in00);
+    TILE(SRC_DATA_TYPE, 1, N0, in01);
+    TILE(SRC_DATA_TYPE, 1, N0, in10);
+    TILE(SRC_DATA_TYPE, 1, N0, in11);
 
-#if defined(OFFSET) && defined(SCALE)
-#define IN_DATA_TYPE int
-#else // defined(OFFSET) && defined(SCALE)
-#define IN_DATA_TYPE float
-#endif // defined(OFFSET) && defined(SCALE)
+    // Initialize the tiles to CONSTANT_VALUE
+    in00[0].v = CONSTANT_VALUE;
+    in01[0].v = CONSTANT_VALUE;
+    in10[0].v = CONSTANT_VALUE;
+    in11[0].v = CONSTANT_VALUE;
 
 #ifndef BORDER_MODE_REPLICATE
-    const bool check_x  = (0.f <= new_xf && new_xf < (float)SRC_WIDTH);
-    const bool check_x1 = (-1.f <= new_xf && new_xf < SRC_WIDTH - 1.f);
-    const bool check_y  = (0.f <= new_yf && new_yf < (float)SRC_HEIGHT);
-    const bool check_y1 = (-1.f <= new_yf && new_yf < SRC_HEIGHT - 1.f);
-
-    const IN_DATA_TYPE ins_0 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
-                                                                                       (get_global_id(2) / DEPTH_OUT)))),
-                                      check_x && check_y);
-    const IN_DATA_TYPE ins_1 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
-                                                                                       (get_global_id(2) / DEPTH_OUT)))),
-                                      check_x1 && check_y);
-    const IN_DATA_TYPE ins_2 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
-                                                                                       (get_global_id(2) / DEPTH_OUT)))),
-                                      check_x && check_y1);
-    const IN_DATA_TYPE ins_3 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
-                                                                                       (get_global_id(2) / DEPTH_OUT)))),
-                                      check_x1 && check_y1);
-    VEC_DATA_TYPE(IN_DATA_TYPE, 4)
-    ins = (VEC_DATA_TYPE(IN_DATA_TYPE, 4))(ins_0, ins_1, ins_2, ins_3);
-#else  /* BORDER_MODE_REPLICATE */
-    VEC_DATA_TYPE(IN_DATA_TYPE, 4)
-    ins = (VEC_DATA_TYPE(IN_DATA_TYPE, 4))(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
-                                           *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
-                                           *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
-                                           *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-#endif /* BORDER_MODE_REPLICATE */
-
-    const float a  = new_x - new_xf;
-    const float b  = 1.f - a;
-    const float a1 = new_y - new_yf;
-    const float b1 = 1.f - a1;
-
-#if defined(OFFSET) && defined(SCALE)
-    const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;
-    const float  fr     = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));
-    DATA_TYPE    res    = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE);
-
-    *((__global DATA_TYPE *)out.ptr) = res;
-#else  // defined(OFFSET) && defined(SCALE)
-    const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1));
-
-    *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
-#endif // defined(OFFSET) && defined(SCALE)
-}
-#endif /* defined(DEPTH_OUT) */
\ No newline at end of file
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in00);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in01);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in10);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in11);
+#else  // BORDER_MODE_REPLICATE
+    const int xi0  = clamp(xi, 0, _ISRC_WIDTH - 1);
+    const int yi0  = clamp(yi, 0, _ISRC_HEIGHT - 1);
+    const int xi1  = clamp(xi + 1, 0, _ISRC_WIDTH - 1);
+    const int yi1  = clamp(yi + 1, 0, _ISRC_HEIGHT - 1);
+
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in01);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in10);
+    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in11);
+#endif // BORDER_MODE_REPLICATE
+
+    TILE(DST_DATA_TYPE, 1, N0, out);
+
+#if defined(IS_FLOATING_POINT)
+    const SRC_DATA_TYPE a  = (SRC_DATA_TYPE)(xi_f - (float)xi);
+    const SRC_DATA_TYPE b  = (SRC_DATA_TYPE)(1.f - a);
+    const SRC_DATA_TYPE a1 = (SRC_DATA_TYPE)(yi_f - (float)yi);
+    const SRC_DATA_TYPE b1 = (SRC_DATA_TYPE)(1.f - a1);
+
+    // Calculate the output
+    out[0].v = ((in00[0].v * b * b1) + (in01[0].v * a * b1) + (in10[0].v * b * a1) + (in11[0].v * a * a1));
+#else  // defined(IS_FLOATING_POINT)
+    TILE(float, 1, N0, out_f);
+    TILE(float, 1, N0, in00_f);
+    TILE(float, 1, N0, in01_f);
+    TILE(float, 1, N0, in10_f);
+    TILE(float, 1, N0, in11_f);
+
+    const float a  = (xi_f - (float)xi);
+    const float b  = (1.f - a);
+    const float a1 = (yi_f - (float)yi);
+    const float b1 = (1.f - a1);
+
+    // Dequantize
+    LOOP_UNROLLING(int, n0, 0, 1, N0,
+    {
+        in00_f[0].s[n0] = ((float)in00[0].s[n0] - (float)OFFSET) * (float)SCALE;
+        in01_f[0].s[n0] = ((float)in01[0].s[n0] - (float)OFFSET) * (float)SCALE;
+        in10_f[0].s[n0] = ((float)in10[0].s[n0] - (float)OFFSET) * (float)SCALE;
+        in11_f[0].s[n0] = ((float)in11[0].s[n0] - (float)OFFSET) * (float)SCALE;
+    })
+
+    // Calculate the output in the floating-point domain
+    out_f[0].v = ((in00_f[0].v * b * b1) + (in01_f[0].v * a * b1) + (in10_f[0].v * b * a1) + (in11_f[0].v * a * a1));
+
+    // Quantize
+    LOOP_UNROLLING(int, n0, 0, 1, N0,
+    {
+        out[0].s[n0] = CONVERT_SAT(out_f[0].s[n0] / (float)SCALE + (float)OFFSET, DST_DATA_TYPE);
+    })
+#endif // defined(IS_FLOATING_POINT)
+
+    TILE(uint, 1, 1, dst_indirect_y);
+
+    // Calculate the destination indirect Y
+    dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+
+    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y);
+}
\ No newline at end of file
diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp
index 9307f7d4fb..d63c0e1754 100644
--- a/src/gpu/cl/kernels/ClScaleKernel.cpp
+++ b/src/gpu/cl/kernels/ClScaleKernel.cpp
@@ -114,33 +114,82 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
     // Create kernel
     const int          idx_width         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
     const int          idx_height        = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int          idx_channel       = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
     const unsigned int src_width         = src->dimension(idx_width);
     const unsigned int src_height        = src->dimension(idx_height);
+    const unsigned int src_channel       = src->dimension(idx_channel);
     const unsigned int dst_width         = dst->dimension(idx_width);
-    const unsigned int vec_size          = adjust_vec_size(is_nhwc ? 1 : 4, dst_width);
-    const unsigned int vec_size_leftover = (dst_width % vec_size);
+    const unsigned int dst_height        = dst->dimension(idx_height);
+    const unsigned int dst_channels      = dst->dimension(idx_channel);
+    unsigned int       vec_size          = 0;
+    unsigned int       vec_size_leftover = 0;
 
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
-    build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
-    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
-    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
-    build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
-    build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
-
-    build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
-    build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
-    build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
-    build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover)));
-    build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(dst->dimension(2)));
-    build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
-    build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
-    if(is_qasymm_bilinear)
+    if(_data_layout == DataLayout::NHWC)
     {
-        const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
-        build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
-        build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
+        vec_size          = adjust_vec_size(src->data_type() == DataType::F32 ? 4 : 8, dst_channels);
+        vec_size_leftover = dst_channels % vec_size;
+        build_opts.add_option("-DSRC_TENSOR_TYPE=BUFFER");
+        build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
+        build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
+        build_opts.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src_channel));
+        build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+        build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
+        build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst_width));
+        build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst_height));
+        build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst_channels));
+        build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+        build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
+        build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
+        build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
+        build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size));
+        build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover));
+        build_opts.add_option_if(src->num_dimensions() > 3, "-DBATCHED_EXECUTION");
+        build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+        build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
+        build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
+        build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOATING_POINT");
+        build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+        if(is_qasymm_bilinear)
+        {
+            const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
+            build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
+            build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
+        }
+        else
+        {
+            build_opts.add_option("-DSCALE=" + support::cpp11::to_string(1));
+            build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(0));
+        }
     }
+    else if(_data_layout == DataLayout::NCHW)
+    {
+        vec_size          = adjust_vec_size(4, dst_width);
+        vec_size_leftover = dst_width % vec_size;
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+        build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
+        build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
+        build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
+        build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
+        build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
+        build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+        build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover)));
+        build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+        build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
+        build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
+        build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+        if(is_qasymm_bilinear)
+        {
+            const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
+            build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
+            build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON("Unsupported data layout");
+    }
+
     std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use);
     std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
     std::string kernel_name = "scale_" + interpolation_name + "_";
-- 
cgit v1.2.1