From 511771fbe0a74e6d9dfd37ba9b4926a8315ec7aa Mon Sep 17 00:00:00 2001
From: Giorgio Arena <giorgio.arena@arm.com>
Date: Thu, 19 Aug 2021 13:04:56 +0100
Subject: Remove padding from ClScaleKernel

- Merge quantized kernels with fp for bilinear interpolation (both NCHW and NHWC)
- Pass dimensions at compile time rather than at run time
- Use tile-based approach to rework the NCHW kernels
- Remove unused functions/files

Resolve COMPMID-4723

Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Change-Id: Ifcdf02beb9daa9f318395751b3c85eb2fe874082
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6138
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
---
 SConscript                                         |   3 -
 src/core/CL/cl_kernels/common/elementwise_unary.cl |   1 -
 src/core/CL/cl_kernels/nchw/scale.cl               | 183 +++++++++++++++++----
 src/core/CL/cl_kernels/nchw/scale_quantized.cl     |  86 ----------
 src/core/CL/cl_kernels/nhwc/scale.cl               | 109 ++++++------
 src/core/CL/cl_kernels/nhwc/scale_quantized.cl     | 124 --------------
 src/core/CL/cl_kernels/warp_helpers.h              |   6 -
 src/core/CL/cl_kernels/warp_helpers_quantized.h    | 136 ---------------
 src/core/gpu/cl/ClKernelLibrary.cpp                |  10 --
 src/core/gpu/cl/kernels/ClScaleKernel.cpp          | 130 ++++-----------
 src/core/gpu/cl/kernels/ClScaleKernel.h            |   1 -
 src/runtime/gpu/cl/operators/ClScale.cpp           |  11 +-
 src/runtime/gpu/cl/operators/ClScale.h             |   4 -
 tests/validation/CL/Scale.cpp                      |  12 +-
 14 files changed, 246 insertions(+), 570 deletions(-)
 delete mode 100644 src/core/CL/cl_kernels/nchw/scale_quantized.cl
 delete mode 100644 src/core/CL/cl_kernels/nhwc/scale_quantized.cl
 delete mode 100644 src/core/CL/cl_kernels/warp_helpers_quantized.h

diff --git a/SConscript b/SConscript
index 2eed75a1a5..c384597752 100644
--- a/SConscript
+++ b/SConscript
@@ -241,7 +241,6 @@ if env['opencl'] and env['embed_kernels']:
                         'src/core/CL/cl_kernels/repeat.h',
                         'src/core/CL/cl_kernels/tile_helpers.h',
                         'src/core/CL/cl_kernels/types.h',
-                        'src/core/CL/cl_kernels/warp_helpers_quantized.h',
                         'src/core/CL/cl_kernels/warp_helpers.h'
                     ]
 
@@ -328,7 +327,6 @@ if env['opencl'] and env['embed_kernels']:
                     'src/core/CL/cl_kernels/nchw/remap.cl',
                     'src/core/CL/cl_kernels/nchw/reorg_layer.cl',
                     'src/core/CL/cl_kernels/nchw/scale.cl',
-                    'src/core/CL/cl_kernels/nchw/scale_quantized.cl',
                     'src/core/CL/cl_kernels/nchw/space_to_batch.cl',
                     'src/core/CL/cl_kernels/nchw/space_to_depth.cl',
                     'src/core/CL/cl_kernels/nchw/upsample_layer.cl',
@@ -355,7 +353,6 @@ if env['opencl'] and env['embed_kernels']:
                     'src/core/CL/cl_kernels/nhwc/remap.cl',
                     'src/core/CL/cl_kernels/nhwc/reorg_layer.cl',
                     'src/core/CL/cl_kernels/nhwc/scale.cl',
-                    'src/core/CL/cl_kernels/nhwc/scale_quantized.cl',
                     'src/core/CL/cl_kernels/nhwc/space_to_batch.cl',
                     'src/core/CL/cl_kernels/nhwc/space_to_depth.cl',
                     'src/core/CL/cl_kernels/nhwc/upsample_layer.cl',
diff --git a/src/core/CL/cl_kernels/common/elementwise_unary.cl b/src/core/CL/cl_kernels/common/elementwise_unary.cl
index d2d9d97d33..eba2dbc866 100644
--- a/src/core/CL/cl_kernels/common/elementwise_unary.cl
+++ b/src/core/CL/cl_kernels/common/elementwise_unary.cl
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "helpers.h"
-#include "warp_helpers.h"
 
 #if defined(DATA_TYPE) && defined(OPERATION)
 
diff --git a/src/core/CL/cl_kernels/nchw/scale.cl b/src/core/CL/cl_kernels/nchw/scale.cl
index 63a53cc4f2..2b4d6be9fb 100644
--- a/src/core/CL/cl_kernels/nchw/scale.cl
+++ b/src/core/CL/cl_kernels/nchw/scale.cl
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "helpers.h"
-#include "warp_helpers.h"
+#include "tile_helpers.h"
 
 /** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
  *
@@ -87,28 +87,55 @@ inline const float8 transform_bilinear(const float2 coord, const float2 scale)
  * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
  * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  input_width                       Input image width
- * @param[in]  input_height                      Input image height
- * @param[in]  scale_x                           The scale factor along x dimension
- * @param[in]  scale_y                           The scale factor along y dimension
  */
 __kernel void scale_nearest_neighbour_nchw(
     IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const float input_width,
-    const float input_height,
-    const float scale_x,
-    const float scale_y)
+    IMAGE_DECLARATION(out))
 {
-    Image        in          = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image        out         = CONVERT_TO_IMAGE_STRUCT(out);
-    const float2 r           = (float2)(scale_x, scale_y);
-    float8       transformed = transform_nearest(get_current_coords(), r);
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    float8 transformed = transform_nearest((float2)(x * VEC_SIZE, y), (float2)(SCALE_X, SCALE_Y));
 #ifdef ALIGN_CORNERS
     transformed = round(transformed);
 #endif // ALIGN_CORNERS
-    const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE);
-    vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
+
+    TILE(SELECT_DATA_TYPE(DATA_TYPE), 1, 4, cond);
+    cond[0].v = CONVERT(((transformed.even < 0) || (transformed.even >= (int)SRC_WIDTH)) || ((transformed.odd < 0) || (transformed.odd >= (int)SRC_HEIGHT)), SELECT_VEC_DATA_TYPE(DATA_TYPE, 4));
+
+    TILE(int, 1, 4, in_x);
+    TILE(int, 1, 4, in_y);
+    in_x[0].v = convert_int4(clamp(transformed.even, 0.f, SRC_WIDTH - 1.f));
+    in_y[0].v = convert_int4(clamp(transformed.odd, 0.f, SRC_HEIGHT - 1.f));
+
+    TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
+    LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
+    {
+        out_vals[0].s[i] = select(*((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * in_stride_y)), (DATA_TYPE)CONSTANT_VALUE, cond[0].s[i]);
+    })
+
+    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_step_x + y * out_stride_y;
+
+    if(x == get_global_size(0) - 1)
+    {
+#if VEC_SIZE == 1
+        VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+        (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else  // VEC_SIZE == 1
+        VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+        (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+    }
+    else
+    {
+#if VEC_SIZE == 1
+        VSTORE(VEC_SIZE)
+        (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else  // VEC_SIZE == 1
+        VSTORE(VEC_SIZE)
+        (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+    }
 }
 
 /** Performs an affine transformation on an image interpolating with the BILINEAR method.
@@ -127,22 +154,118 @@ __kernel void scale_nearest_neighbour_nchw(
  * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
  * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  input_width                       Input image width
- * @param[in]  input_height                      Input image height
- * @param[in]  scale_x                           The scale factor along x dimension
- * @param[in]  scale_y                           The scale factor along y dimension
  */
 __kernel void scale_bilinear_nchw(
     IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const float input_width,
-    const float input_height,
-    const float scale_x,
-    const float scale_y)
+    IMAGE_DECLARATION(out))
 {
-    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
-    const float2 r   = (float2)(scale_x, scale_y);
-    const float8 tc  = transform_bilinear(get_current_coords(), r);
-    vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    TILE(float, 1, 8, trans_coords);
+    TILE(float, 1, 8, floor_coords);
+    TILE(int, 1, 16, in_x);
+    TILE(int, 1, 16, in_y);
+
+    trans_coords[0].v = transform_bilinear((float2)(x * VEC_SIZE, y), (float2)(SCALE_X, SCALE_Y));
+    floor_coords[0].v = floor(trans_coords[0].v);
+
+    LOOP_UNROLLING(int, i, 0, 1, 4,
+    {
+        LOOP_UNROLLING(int, j, 0, 1, 4,
+        {
+            in_x[0].s[i * 4 + j] = floor_coords[0].s[i * 2 + 0] + (j % 2);
+            in_y[0].s[i * 4 + j] = floor_coords[0].s[i * 2 + 1] + (j > 1);
+        })
+    })
+
+#if defined(BORDER_MODE_CONSTANT)
+    TILE(SELECT_DATA_TYPE(DATA_TYPE), 1, 16, cond);
+    cond[0].v = CONVERT(((in_x[0].v < 0) || (in_x[0].v >= (int)SRC_WIDTH)) || ((in_y[0].v < 0) || (in_y[0].v >= (int)SRC_HEIGHT)), SELECT_VEC_DATA_TYPE(DATA_TYPE, 16));
+#endif // defined(BORDER_MODE_CONSTANT)
+
+    in_x[0].v = clamp(in_x[0].v, 0, (int16)((int)SRC_WIDTH - 1));
+    in_y[0].v = clamp(in_y[0].v, 0, (int16)((int)SRC_HEIGHT - 1));
+
+    TILE(DATA_TYPE, 1, 16, in_vals);
+
+    // Loads the values from the input image
+#if defined(BORDER_MODE_CONSTANT)
+    LOOP_UNROLLING(int, i, 0, 1, 16,
+    {
+        in_vals[0].s[i] = select(*((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * (int)in_stride_y)), (DATA_TYPE)CONSTANT_VALUE, cond[0].s[i]);
+    })
+#else  // defined(BORDER_MODE_CONSTANT)
+    LOOP_UNROLLING(int, i, 0, 1, 16,
+    {
+        in_vals[0].s[i] = *((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * (int)in_stride_y));
+    })
+#endif // defined(BORDER_MODE_CONSTANT)
+
+    TILE(float, 1, 8, a);
+    TILE(float, 1, 8, b);
+
+    a[0].v = trans_coords[0].v - floor_coords[0].v;
+    b[0].v = ((float8)(1.f)) - a[0].v;
+
+#if defined(OFFSET) && defined(SCALE)
+    TILE(float, 1, 16, in_vals_f32);
+    TILE(float, 1, 4, out_vals_f32);
+
+    in_vals_f32[0].v = convert_float16(convert_int16(in_vals[0].v) - (int16)OFFSET) * (float16)SCALE;
+
+    // Bilinear interpolation: (in0  * b0 * b1) + (in1  * a0 * b1) + (in2  * b0 * a1) + (in3  * a0 * a1)
+    //                         (in4  * b2 * b3) + (in5  * a2 * b3) + (in6  * b2 * a3) + (in7  * a2 * a3)
+    //                         (in8  * b4 * b5) + (in9  * a4 * b5) + (in10 * b4 * a5) + (in11 * a4 * a5)
+    //                         (in12 * b6 * b7) + (in13 * a6 * b7) + (in14 * b6 * a7) + (in15 * a6 * a7)
+    LOOP_UNROLLING(int, i, 0, 1, 4,
+    {
+        out_vals_f32[0].s[i] = (in_vals_f32[0].s[i * 4 + 0] * b[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 1] * a[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 2] * b[0].s[i * 2] * a[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 3] * a[0].s[i * 2] * a[0].s[i * 2 + 1]);
+    })
+
+    TILE(DATA_TYPE, 1, 4, out_vals_4);
+    TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
+
+    out_vals_4[0].v = CONVERT_SAT(convert_int4_sat_rtp(out_vals_f32[0].v / (float)SCALE) + OFFSET, VEC_DATA_TYPE(DATA_TYPE, 4));
+
+    LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
+    {
+        out_vals[0].s[i] = out_vals_4[0].s[i];
+    })
+#else  // defined(OFFSET) && defined(SCALE)
+
+    TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
+
+    // Bilinear interpolation: (in0  * b0 * b1) + (in1  * a0 * b1) + (in2  * b0 * a1) + (in3  * a0 * a1)
+    //                         (in4  * b2 * b3) + (in5  * a2 * b3) + (in6  * b2 * a3) + (in7  * a2 * a3)
+    //                         (in8  * b4 * b5) + (in9  * a4 * b5) + (in10 * b4 * a5) + (in11 * a4 * a5)
+    //                         (in12 * b6 * b7) + (in13 * a6 * b7) + (in14 * b6 * a7) + (in15 * a6 * a7)
+    LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
+    {
+        out_vals[0].s[i] = (in_vals[0].s[i * 4 + 0] * b[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 1] * a[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 2] * b[0].s[i * 2] * a[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 3] * a[0].s[i * 2] * a[0].s[i * 2 + 1]);
+    })
+#endif // defined(OFFSET) && defined(SCALE)
+
+    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_step_x + y * out_stride_y;
+
+    if(x == get_global_size(0) - 1)
+    {
+#if VEC_SIZE == 1
+        VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+        (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else  // VEC_SIZE == 1
+        VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+        (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+    }
+    else
+    {
+#if VEC_SIZE == 1
+        VSTORE(VEC_SIZE)
+        (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else  // VEC_SIZE == 1
+        VSTORE(VEC_SIZE)
+        (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+    }
 }
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale_quantized.cl b/src/core/CL/cl_kernels/nchw/scale_quantized.cl
deleted file mode 100644
index 946ad65c14..0000000000
--- a/src/core/CL/cl_kernels/nchw/scale_quantized.cl
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-#include "warp_helpers_quantized.h"
-
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
-{
-    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-#ifdef SAMPLING_POLICY_TOP_LEFT
-    const float4 new_x = in_x_coords * (float4)(scale.s0);
-    const float4 new_y = (float4)(coord.s1 * scale.s1);
-    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
-    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
-    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
-    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
-
-/** Performs an affine transformation on an image interpolating with the BILINEAR method.
- *
- * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
- * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  input_width                       Input image width
- * @param[in]  input_height                      Input image height
- * @param[in]  scale_x                           The scale factor along x dimension
- * @param[in]  scale_y                           The scale factor along y dimension
- */
-__kernel void scale_bilinear_quantized_nchw(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const float input_width,
-    const float input_height,
-    const float scale_x,
-    const float scale_y)
-{
-    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
-    const float2 r   = (float2)(scale_x, scale_y);
-    const float8 tc  = transform_bilinear_quantized(get_current_coords_quantized(), r);
-    vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
-}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
index 1ea5e73df1..69cbbcd5f3 100644
--- a/src/core/CL/cl_kernels/nhwc/scale.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "helpers.h"
-#include "warp_helpers.h"
 
 #if defined(DEPTH_OUT)
 /** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. (NHWC)
@@ -46,28 +45,20 @@
  * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
  * @param[in]  out_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  input_width                       Input image width
- * @param[in]  input_height                      Input image height
- * @param[in]  scale_x                           The scale factor along x dimension
- * @param[in]  scale_y                           The scale factor along y dimension
  */
 __kernel void scale_nearest_neighbour_nhwc(
     TENSOR4D_DECLARATION(in),
-    TENSOR4D_DECLARATION(out),
-    const float input_width,
-    const float input_height,
-    const float scale_x,
-    const float scale_y)
+    TENSOR4D_DECLARATION(out))
 {
     Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
 
 #ifdef SAMPLING_POLICY_TOP_LEFT
-    float new_x = get_global_id(1) * scale_x;
-    float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
+    float new_x = get_global_id(1) * SCALE_X;
+    float new_y = (get_global_id(2) % DEPTH_OUT) * SCALE_Y;
 #elif SAMPLING_POLICY_CENTER
-    float       new_x = (get_global_id(1) + 0.5f) * scale_x;
-    float       new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
+    float       new_x = (get_global_id(1) + 0.5f) * SCALE_X;
+    float       new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * SCALE_Y;
 #else /* SAMPLING_POLICY */
 #error("Unsupported sampling policy");
 #endif /* SAMPLING_POLICY */
@@ -75,8 +66,8 @@ __kernel void scale_nearest_neighbour_nhwc(
     new_x = round(new_x);
     new_y = round(new_y);
 #endif /* ALIGN_CORNERS */
-    const float clamped_x = clamp(new_x, 0.0f, input_width - 1);
-    const float clamped_y = clamp(new_y, 0.0f, input_height - 1);
+    const float clamped_x = clamp(new_x, 0.0f, (float)SRC_WIDTH - 1);
+    const float clamped_y = clamp(new_y, 0.0f, (float)SRC_HEIGHT - 1);
 
     *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)));
 }
@@ -104,71 +95,81 @@ __kernel void scale_nearest_neighbour_nhwc(
  * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
  * @param[in]  out_step_z                        dst_stride_y * number of elements along Z processed per workitem(in bytes)
  * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  input_width                       Input image width
- * @param[in]  input_height                      Input image height
- * @param[in]  scale_x                           The scale factor along x dimension
- * @param[in]  scale_y                           The scale factor along y dimension
  *
  */
 __kernel void scale_bilinear_nhwc(
     TENSOR4D_DECLARATION(in),
-    TENSOR4D_DECLARATION(out),
-    const float input_width,
-    const float input_height,
-    const float scale_x,
-    const float scale_y)
+    TENSOR4D_DECLARATION(out))
 {
     Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
 
 #ifdef SAMPLING_POLICY_TOP_LEFT
-    const float new_x = get_global_id(1) * scale_x;
-    const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
+    const float new_x = get_global_id(1) * SCALE_X;
+    const float new_y = (get_global_id(2) % DEPTH_OUT) * SCALE_Y;
 #elif SAMPLING_POLICY_CENTER
-    const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
-    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
+    const float new_x = (get_global_id(1) + 0.5f) * SCALE_X - 0.5f;
+    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * SCALE_Y - 0.5f;
 #else /* SAMPLING_POLICY */
 #error("Unsupported sampling policy");
 #endif /* SAMPLING_POLICY */
 
     const float new_xf     = floor(new_x);
     const float new_yf     = floor(new_y);
-    const float clamped_x  = clamp(new_xf, 0.0f, input_width - 1);
-    const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1);
-    const float clamped_y  = clamp(new_yf, 0.0f, input_height - 1);
-    const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
+    const float clamped_x  = clamp(new_xf, 0.0f, SRC_WIDTH - 1.f);
+    const float clamped_x1 = clamp(new_xf + 1, 0.0f, SRC_WIDTH - 1.f);
+    const float clamped_y  = clamp(new_yf, 0.0f, SRC_HEIGHT - 1.f);
+    const float clamped_y1 = clamp(new_yf + 1, 0.0f, SRC_HEIGHT - 1.f);
+
+#if defined(OFFSET) && defined(SCALE)
+#define IN_DATA_TYPE int
+#else // defined(OFFSET) && defined(SCALE)
+#define IN_DATA_TYPE float
+#endif // defined(OFFSET) && defined(SCALE)
 
 #ifndef BORDER_MODE_REPLICATE
-    const bool  check_x  = (0.f <= new_xf && new_xf < input_width);
-    const bool  check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
-    const bool  check_y  = (0.f <= new_yf && new_yf < input_height);
-    const bool  check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
-    const float ins_0    = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
-                                                                                                           (get_global_id(2) / DEPTH_OUT)))),
-                                  check_x && check_y);
-    const float ins_1 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
-                                                                                                        (get_global_id(2) / DEPTH_OUT)))),
-                               check_x1 && check_y);
-    const float ins_2 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
-                                                                                                        (get_global_id(2) / DEPTH_OUT)))),
-                               check_x && check_y1);
-    const float ins_3 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
-                                                                                                        (get_global_id(2) / DEPTH_OUT)))),
-                               check_x1 && check_y1);
-    float4 ins = (float4)(ins_0, ins_1, ins_2, ins_3);
+    const bool check_x  = (0.f <= new_xf && new_xf < (float)SRC_WIDTH);
+    const bool check_x1 = (-1.f <= new_xf && new_xf < SRC_WIDTH - 1.f);
+    const bool check_y  = (0.f <= new_yf && new_yf < (float)SRC_HEIGHT);
+    const bool check_y1 = (-1.f <= new_yf && new_yf < SRC_HEIGHT - 1.f);
+
+    const IN_DATA_TYPE ins_0 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
+                                                                                       (get_global_id(2) / DEPTH_OUT)))),
+                                      check_x && check_y);
+    const IN_DATA_TYPE ins_1 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
+                                                                                       (get_global_id(2) / DEPTH_OUT)))),
+                                      check_x1 && check_y);
+    const IN_DATA_TYPE ins_2 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
+                                                                                       (get_global_id(2) / DEPTH_OUT)))),
+                                      check_x && check_y1);
+    const IN_DATA_TYPE ins_3 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
+                                                                                       (get_global_id(2) / DEPTH_OUT)))),
+                                      check_x1 && check_y1);
+    VEC_DATA_TYPE(IN_DATA_TYPE, 4)
+    ins = (VEC_DATA_TYPE(IN_DATA_TYPE, 4))(ins_0, ins_1, ins_2, ins_3);
 #else  /* BORDER_MODE_REPLICATE */
-    float4 ins        = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
-                                 *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
-                                 *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
-                                 *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
+    VEC_DATA_TYPE(IN_DATA_TYPE, 4)
+    ins = (VEC_DATA_TYPE(IN_DATA_TYPE, 4))(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                                           *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                                           *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
+                                           *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
 #endif /* BORDER_MODE_REPLICATE */
 
     const float a  = new_x - new_xf;
     const float b  = 1.f - a;
     const float a1 = new_y - new_yf;
     const float b1 = 1.f - a1;
+
+#if defined(OFFSET) && defined(SCALE)
+    const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;
+    const float  fr     = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));
+    DATA_TYPE    res    = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE);
+
+    *((__global DATA_TYPE *)out.ptr) = res;
+#else  // defined(OFFSET) && defined(SCALE)
     const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1));
 
     *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
+#endif // defined(OFFSET) && defined(SCALE)
 }
 #endif /* defined(DEPTH_OUT) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/scale_quantized.cl b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl
deleted file mode 100644
index de9bb607b0..0000000000
--- a/src/core/CL/cl_kernels/nhwc/scale_quantized.cl
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-#include "warp_helpers_quantized.h"
-
-#if defined(DEPTH_OUT)
-/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
- * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
- * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
- * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
- * @note The value to be used at the edges of the images shoud be given as a preprocessor argument using -DCONSTANT_VALUE=value.
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_stride_z                       Stride of the source image in Z dimension (in bytes)
- * @param[in]  in_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
- * @param[in]  out_step_z                        dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  input_width                       Input image width
- * @param[in]  input_height                      Input image height
- * @param[in]  scale_x                           The scale factor along x dimension
- * @param[in]  scale_y                           The scale factor along y dimension
- * @param[in]  constant_border_value             Constant border value to use
- */
-__kernel void scale_bilinear_quantized_nhwc(
-    TENSOR4D_DECLARATION(in),
-    TENSOR4D_DECLARATION(out),
-    const float input_width,
-    const float input_height,
-    const float scale_x,
-    const float scale_y)
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
-
-#ifdef SAMPLING_POLICY_TOP_LEFT
-    const float new_x = get_global_id(1) * scale_x;
-    const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
-#elif SAMPLING_POLICY_CENTER
-    const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
-    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-
-    const float new_xf     = floor(new_x);
-    const float new_yf     = floor(new_y);
-    const float clamped_x  = clamp(new_xf, 0.0f, input_width - 1);
-    const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1);
-    const float clamped_y  = clamp(new_yf, 0.0f, input_height - 1);
-    const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
-
-#ifndef BORDER_MODE_REPLICATE
-    const bool check_x  = (0.f <= new_xf && new_xf < input_width);
-    const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
-    const bool check_y  = (0.f <= new_yf && new_yf < input_height);
-    const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
-    const int ins_0     = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
-                                                                                                      (get_global_id(2) / DEPTH_OUT)))),
-                                 check_x && check_y);
-    const int ins_1 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
-                                                                                                  (get_global_id(2) / DEPTH_OUT)))),
-                             check_x1 && check_y);
-    const int ins_2 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
-                                                                                                  (get_global_id(2) / DEPTH_OUT)))),
-                             check_x && check_y1);
-    const int ins_3 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
-                                                                                                  (get_global_id(2) / DEPTH_OUT)))),
-                             check_x1 && check_y1);
-    int4 ins = (int4)(ins_0, ins_1, ins_2, ins_3);
-#else  /* BORDER_MODE_REPLICATE */
-    int4 ins          = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
-                               *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
-                               *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
-                               *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-#endif /* BORDER_MODE_REPLICATE */
-
-    const float  a      = new_x - new_xf;
-    const float  b      = 1.f - a;
-    const float  a1     = new_y - new_yf;
-    const float  b1     = 1.f - a1;
-    const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;
-
-    const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));
-
-    DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE);
-
-    *((__global DATA_TYPE *)out.ptr) = res;
-}
-#endif /* defined(DEPTH_OUT) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 005861ddfa..642483ab3c 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -63,12 +63,6 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int
                                          *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
 }
 
-/** Returns the current thread coordinates. */
-inline const float2 get_current_coords()
-{
-    return (float2)(get_global_id(0) * 4, get_global_id(1));
-}
-
 /** Given a texel coordinates this function will return the following array of coordinates:
  * [ P, right neighbour, below neighbour, below right neighbour ]
  *
diff --git a/src/core/CL/cl_kernels/warp_helpers_quantized.h b/src/core/CL/cl_kernels/warp_helpers_quantized.h
deleted file mode 100644
index b10890aff0..0000000000
--- a/src/core/CL/cl_kernels/warp_helpers_quantized.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-/** Clamps the given coordinates to the borders according to the border size.
- *
- * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
- * @param[in] width       Width of the image
- * @param[in] height      Height of the image
- * @param[in] border_size Border size of the image
- *
- */
-inline const float8 clamp_to_border_with_size_quantized(float8 coords, const float width, const float height, const float border_size)
-{
-    const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
-    const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
-    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
-}
-
-/** Clamps the given coordinates to the borders.
- *
- * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
- * @param[in] width  Width of the image
- * @param[in] height Height of the image
- *
- */
-inline const float8 clamp_to_border_quantized(float8 coords, const float width, const float height)
-{
-    return clamp_to_border_with_size_quantized(coords, width, height, 1);
-}
-
-/** Given a texel coordinates this function will return the following array of coordinates:
- * [ P, right neighbour, below neighbour, below right neighbour ]
- *
- * @note No checks to see if the coordinates are out of the image are done here.
- *
- * @param[in] coord Input coordinates
- *
- * @return vector of 8 floats with the coordinates, even positions are x and odd y.
- */
-inline const float8 get_neighbour_coords_quantized(const float2 coord)
-{
-    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
-}
-
-/** Returns the current thread coordinates. */
-inline const float2 get_current_coords_quantized()
-{
-    return (float2)(get_global_id(0) * 4, get_global_id(1));
-}
-
-/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
- *
- * @param[in] in            Pointer to the source image.
- * @param[in] coords        Vector of four 2D coordinates. Even pos is x and odd y.
- * @param[in] width         Width of the image
- * @param[in] height        Height of the image
- * @param[in] border_size   Border size
- * @param[in] scale         Scale value
- * @param[in] offset_qasymm Offset value
- */
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border_quantized(const Image *in, const float8 coords, const float width, const float height, const float border_size,
-                                                                                    const float scale, const int offset_qasymm)
-{
-    // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
-
-    // Sets the 4x4 coordinates for each of the four input texels
-    const float8  fc = floor(coords);
-    const float16 c1 = (float16)(
-                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s0, fc.s1)), width, height, border_size),
-                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s2, fc.s3)), width, height, border_size));
-    const float16 c2 = (float16)(
-                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s4, fc.s5)), width, height, border_size),
-                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s6, fc.s7)), width, height, border_size));
-
-    // Loads the values from the input image
-    const int16 t = (int16)(
-                        /* tl, tr, bl, br */
-                        * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
-                        *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
-                        *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
-                        *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
-                        *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
-                        *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
-                        *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
-                        *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
-
-    const float16 inf32 = convert_float16(t - (int16)offset_qasymm) * (float16)scale;
-
-    const float8 a  = coords - fc;
-    const float8 b  = ((float8)(1.f)) - a;
-    const float4 fr = (float4)(
-                          ((inf32.s0 * b.s0 * b.s1) + (inf32.s1 * a.s0 * b.s1) + (inf32.s2 * b.s0 * a.s1) + (inf32.s3 * a.s0 * a.s1)),
-                          ((inf32.s4 * b.s2 * b.s3) + (inf32.s5 * a.s2 * b.s3) + (inf32.s6 * b.s2 * a.s3) + (inf32.s7 * a.s2 * a.s3)),
-                          ((inf32.s8 * b.s4 * b.s5) + (inf32.s9 * a.s4 * b.s5) + (inf32.sa * b.s4 * a.s5) + (inf32.sb * a.s4 * a.s5)),
-                          ((inf32.sc * b.s6 * b.s7) + (inf32.sd * a.s6 * b.s7) + (inf32.se * b.s6 * a.s7) + (inf32.sf * a.s6 * a.s7)));
-
-    const VEC_DATA_TYPE(DATA_TYPE, 4) res = CONVERT_SAT(convert_int4_sat_rtp(fr / scale) + offset_qasymm, VEC_DATA_TYPE(DATA_TYPE, 4));
-
-    return res;
-}
-
-/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
- *
- * @param[in] in            Pointer to the source image.
- * @param[in] coords        Vector of four 2D coordinates. Even pos is x and odd y.
- * @param[in] width         Width of the image
- * @param[in] height        Height of the image
- * @param[in] scale         Scale value
- * @param[in] offset_qasymm Offset value
- */
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_quantized(const Image *in, const float8 coords, const float width, const float height, const float scale, const int offset_qasymm)
-{
-    return bilinear_interpolate_with_border_quantized(in, coords, width, height, 1, scale, offset_qasymm);
-}
diff --git a/src/core/gpu/cl/ClKernelLibrary.cpp b/src/core/gpu/cl/ClKernelLibrary.cpp
index 05beb5f63d..4a9ba874b1 100644
--- a/src/core/gpu/cl/ClKernelLibrary.cpp
+++ b/src/core/gpu/cl/ClKernelLibrary.cpp
@@ -394,7 +394,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
     { "reorg_layer_nchw", "nchw/reorg_layer.cl" },
     { "scale_nearest_neighbour_nchw", "nchw/scale.cl" },
     { "scale_bilinear_nchw", "nchw/scale.cl" },
-    { "scale_bilinear_quantized_nchw", "nchw/scale_quantized.cl" },
     { "space_to_batch_nchw", "nchw/space_to_batch.cl" },
     { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" },
     { "space_to_depth_nchw", "nchw/space_to_depth.cl" },
@@ -455,7 +454,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
     { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" },
     { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" },
     { "scale_bilinear_nhwc", "nhwc/scale.cl" },
-    { "scale_bilinear_quantized_nhwc", "nhwc/scale_quantized.cl" },
     { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" },
     { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" },
     { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" },
@@ -825,10 +823,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
     {
         "nchw/scale.cl",
 #include "./cl_kernels/nchw/scale.clembed"
-    },
-    {
-        "nchw/scale_quantized.cl",
-#include "./cl_kernels/nchw/scale_quantized.clembed"
     },
     {
         "nchw/space_to_batch.cl",
@@ -924,10 +918,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
     {
         "nhwc/scale.cl",
 #include "./cl_kernels/nhwc/scale.clembed"
-    },
-    {
-        "nhwc/scale_quantized.cl",
-#include "./cl_kernels/nhwc/scale_quantized.clembed"
     },
     {
         "nhwc/space_to_batch.cl",
diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.cpp b/src/core/gpu/cl/kernels/ClScaleKernel.cpp
index 57ca331539..ee4ee22aa0 100644
--- a/src/core/gpu/cl/kernels/ClScaleKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClScaleKernel.cpp
@@ -50,10 +50,10 @@ inline std::pair<float, float> calculate_scale_factors(const ITensorInfo *src, c
     const unsigned int dst_width  = dst->dimension(idx_width);
     const unsigned int dst_height = dst->dimension(idx_height);
 
-    float wr = arm_compute::scale_utils::calculate_resize_ratio(src_width, dst_width, align_corners);
-    float hr = arm_compute::scale_utils::calculate_resize_ratio(src_height, dst_height, align_corners);
+    float scale_x = arm_compute::scale_utils::calculate_resize_ratio(src_width, dst_width, align_corners);
+    float scale_y = arm_compute::scale_utils::calculate_resize_ratio(src_height, dst_height, align_corners);
 
-    return std::make_pair(wr, hr);
+    return std::make_pair(scale_x, scale_y);
 }
 
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
@@ -65,78 +65,22 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
     ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && !is_data_type_quantized_asymmetric(src->data_type()));
 
-    float            wr          = 0.f;
-    float            hr          = 0.f;
+    float            scale_x     = 0.f;
+    float            scale_y     = 0.f;
     const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
-    std::tie(wr, hr) = calculate_scale_factors(src, dst, data_layout, info.align_corners);
+    std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (wr > 1.f || hr > 1.f));
+    ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (scale_x > 1.f || scale_y > 1.f));
 
     return Status{};
 }
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info, BorderSize &border)
-{
-    Window           win{};
-    bool             window_changed{};
-    unsigned int     num_elems_processed_per_iteration = 0;
-    const DataLayout data_layout                       = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
-
-    switch(data_layout)
-    {
-        case DataLayout::NCHW:
-        {
-            if(info.border_mode == BorderMode::UNDEFINED)
-            {
-                border = BorderSize(0);
-            }
-
-            num_elems_processed_per_iteration = 4;
-            // Configure kernel window
-            win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
-            AccessWindowStatic input_access(src,
-                                            -border.left, -border.top,
-                                            src->dimension(0) + border.right,
-                                            src->dimension(1) + border.bottom);
-            AccessWindowHorizontal output_access(dst, 0, num_elems_processed_per_iteration);
-
-            output_access.set_valid_region(win, calculate_valid_region_scale(*src,
-                                                                             dst->tensor_shape(),
-                                                                             info.interpolation_policy,
-                                                                             info.sampling_policy,
-                                                                             info.border_mode == BorderMode::UNDEFINED));
-
-            window_changed = update_window_and_padding(win, input_access, output_access);
-        }
-        break;
-        case DataLayout::NHWC:
-        {
-            // Configure kernel window
-            win = calculate_max_window(*dst, Steps());
-        }
-        break;
-        default:
-            ARM_COMPUTE_ERROR("Data layout not supported");
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
 } // namespace
 
-BorderSize ClScaleKernel::border_size() const
-{
-    return BorderSize(static_cast<size_t>(_data_layout == DataLayout::NCHW));
-}
-
 Status ClScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, info));
-    const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
-    BorderSize       border      = BorderSize(static_cast<size_t>(data_layout == DataLayout::NCHW));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), info, border).first);
-
     return Status{};
 }
 
@@ -153,37 +97,45 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
     // Info required for the static tuning
     _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
 
-    float wr = 0.f;
-    float hr = 0.f;
-    std::tie(wr, hr) = calculate_scale_factors(src, dst, _data_layout, info.align_corners);
-    const bool call_quantized_kernel = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR;
-
-    // Compute actual border size
-    BorderSize border  = border_size();
     const bool is_nhwc = _data_layout == DataLayout::NHWC;
 
+    float scale_x = 0.f;
+    float scale_y = 0.f;
+    std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners);
+    const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR;
+
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
     auto interpolation_policy_to_use = info.interpolation_policy;
-    if(info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+    if(info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f)
     {
         interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR;
     }
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(src, dst, info, border);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
     // Create kernel
+    const int          idx_width         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int          idx_height        = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int src_width         = src->dimension(idx_width);
+    const unsigned int src_height        = src->dimension(idx_height);
+    const unsigned int dst_width         = dst->dimension(idx_width);
+    const unsigned int vec_size          = adjust_vec_size(is_nhwc ? 1 : 4, dst_width);
+    const unsigned int vec_size_leftover = (dst_width % vec_size);
+
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
     build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
-    build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
+    build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
+    build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
+
     build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+    build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
+    build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+    build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover)));
     build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(dst->dimension(2)));
     build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
     build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
-    if(call_quantized_kernel)
+    if(is_qasymm_bilinear)
     {
         const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
         build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
@@ -191,26 +143,16 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
     }
     std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use);
     std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
-    std::string kernel_name = "scale_" + interpolation_name;
-    kernel_name += call_quantized_kernel ? "_quantized_" : "_";
+    std::string kernel_name = "scale_" + interpolation_name + "_";
     kernel_name += lower_string(string_from_data_layout(_data_layout));
 
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-    if(is_nhwc)
-    {
-        ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-    }
 
-    const int          idx_width  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const int          idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-    unsigned int       idx        = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    const unsigned int src_width  = src->dimension(idx_width);
-    const unsigned int dst_height = src->dimension(idx_height);
+    // Configure kernel window
+    Window win = calculate_max_window(*dst, Steps(vec_size));
+    ICLKernel::configure_internal(win);
 
-    _kernel.setArg<float>(idx++, src_width);
-    _kernel.setArg<float>(idx++, dst_height);
-    _kernel.setArg<float>(idx++, wr);
-    _kernel.setArg<float>(idx++, hr);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 
     // Set config_id for enabling LWS tuning
     _config_id = "scale_";
diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.h b/src/core/gpu/cl/kernels/ClScaleKernel.h
index 8333c7d6c0..6674931296 100644
--- a/src/core/gpu/cl/kernels/ClScaleKernel.h
+++ b/src/core/gpu/cl/kernels/ClScaleKernel.h
@@ -59,7 +59,6 @@ public:
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
 
     // Inherited methods overridden:
-    BorderSize border_size() const override;
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
diff --git a/src/runtime/gpu/cl/operators/ClScale.cpp b/src/runtime/gpu/cl/operators/ClScale.cpp
index 4730c8a16e..5c8d754c7e 100644
--- a/src/runtime/gpu/cl/operators/ClScale.cpp
+++ b/src/runtime/gpu/cl/operators/ClScale.cpp
@@ -41,12 +41,7 @@ void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *sr
     k->set_target(CLScheduler::get().target());
     k->configure(compile_context, src, dst, info);
     _kernel = std::move(k);
-    if(!_kernel->border_size().empty())
-    {
-        auto b = std::make_unique<CLFillBorderKernel>();
-        b->configure(compile_context, src, _kernel->border_size(), info.border_mode, info.constant_border_value);
-        _border_handler = std::move(b);
-    }
+
     // Tune kernel
     CLScheduler::get().tune_kernel_static(*_kernel);
 }
@@ -59,10 +54,6 @@ Status ClScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const S
 void ClScale::run(ITensorPack &tensors)
 {
     ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-    if(!_kernel->border_size().empty())
-    {
-        CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
-    }
     CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
 }
 } // namespace opencl
diff --git a/src/runtime/gpu/cl/operators/ClScale.h b/src/runtime/gpu/cl/operators/ClScale.h
index 905c43a41c..0ff78640f7 100644
--- a/src/runtime/gpu/cl/operators/ClScale.h
+++ b/src/runtime/gpu/cl/operators/ClScale.h
@@ -34,7 +34,6 @@ namespace opencl
 {
 /** Basic function to simulate a scale layer. This function calls the following OpenCL kernels:
  *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
  * -# @ref kernels::ClScaleKernel
  */
 class ClScale : public IClOperator
@@ -61,9 +60,6 @@ public:
 
     // Inherited method overridden
     void run(ITensorPack &tensors) override;
-
-protected:
-    std::unique_ptr<ICLKernel> _border_handler{ nullptr };
 };
 } // namespace opencl
 } // namespace arm_compute
diff --git a/tests/validation/CL/Scale.cpp b/tests/validation/CL/Scale.cpp
index 2b34f1f353..845408a0b2 100644
--- a/tests/validation/CL/Scale.cpp
+++ b/tests/validation/CL/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -186,16 +186,6 @@ TEST_CASE(AlignedCornerNotSupported, framework::DatasetMode::ALL)
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 
-TEST_CASE(WindowShrink, framework::DatasetMode::ALL)
-{
-    const auto input  = TensorInfo{ TensorShape(37U, 37U, 2U), 1, DataType::F32 };
-    const auto output = TensorInfo{ TensorShape(39U, 55U, 2U), 1, DataType::F32 };
-    Status     result{};
-
-    result = CLScale::validate(&input.clone()->set_is_resizable(false), &output.clone()->set_is_resizable(false), ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
-    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
-}
-
 TEST_CASE(IncorrectScaleFactor, framework::DatasetMode::ALL)
 {
     const auto     input                = TensorInfo{ TensorShape(28U, 33U, 2U), 1, DataType::F32 };
-- 
cgit v1.2.1