From ada6cbc057ff725e57d301a99a1816ce602485b9 Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Fri, 16 Apr 2021 17:03:39 +0100 Subject: Remove OpenCL padding: CLPixelWiseMultiplicationKernel - Change kernel's vec_size to 16 / sizeof(output) - Change ICLKernel.cpp to handle broadcast without padding Resolve COMPMID-3913 Signed-off-by: Giorgio Arena Change-Id: I03e884b250ef5784dc109bff8cf2c96b345d119f Signed-off-by: Giorgio Arena Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5450 Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Reviewed-by: Michele Di Giorgio Reviewed-by: Gian Marco Iodice --- src/core/CL/ICLKernel.cpp | 4 +- src/core/CL/cl_kernels/pixelwise_mul_float.cl | 37 +++--- src/core/CL/cl_kernels/pixelwise_mul_int.cl | 56 +++++---- src/core/CL/cl_kernels/tile_helpers.h | 14 +-- .../cl/kernels/ClPixelWiseMultiplicationKernel.cpp | 129 ++++----------------- .../cl/kernels/ClPixelWiseMultiplicationKernel.h | 12 +- .../CL/functions/CLPixelWiseMultiplication.cpp | 2 +- .../gpu/cl/operators/ClPixelWiseMultiplication.cpp | 65 ----------- .../gpu/cl/operators/ClPixelWiseMultiplication.h | 12 -- tests/validation/CL/PixelWiseMultiplication.cpp | 9 +- 10 files changed, 83 insertions(+), 257 deletions(-) diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index 1c6963f3f1..9ba17d0e03 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -105,8 +105,8 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons for(unsigned int d = 0; d < dimension_size; ++d) { - _kernel.setArg(idx++, strides[d]); - _kernel.setArg(idx++, strides[d] * window[d].step()); + _kernel.setArg(idx++, window.is_broadcasted(d) ? 0 : strides[d]); + _kernel.setArg(idx++, window.is_broadcasted(d) ? 0 : (strides[d] * window[d].step())); } _kernel.setArg(idx++, offset_first_element); diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl index 845e1c9860..0016775893 100644 --- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl +++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,6 +36,10 @@ #include "activation_float_helpers.h" #endif // defined(ACTIVATION_TYPE) +#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT) +#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT) + /** Performs a pixelwise multiplication with float scale of either integer or float inputs. 
* * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: @@ -77,31 +81,30 @@ __kernel void pixelwise_mul_float( const float scale) { // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); + size_t y = get_global_id(1); + size_t z = get_global_id(2); + + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; // Load data - VEC_DATA_TYPE(ACC_DATA_TYPE, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16)); - VEC_DATA_TYPE(ACC_DATA_TYPE, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16)); + VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_ACC_TYPE); + VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr)), VEC_ACC_TYPE); // Perform multiplication #ifdef DATA_TYPE_FLOAT - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - res = CONVERT(in1_data * in2_data * (ACC_DATA_TYPE)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); + VEC_OUT_TYPE res0 = CONVERT(in1_data * in2_data * (ACC_DATA_TYPE)scale, VEC_OUT_TYPE); #else /* DATA_TYPE_FLOAT */ - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data * in2_data) * scale), VEC_DATA_TYPE(ACC_DATA_TYPE, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND); + VEC_OUT_TYPE res0 = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((CONVERT(in1_data * in2_data, VEC_FLOAT) * scale), VEC_ACC_TYPE, ROUND), VEC_OUT_TYPE, ROUND); #endif /* DATA_TYPE_FLOAT */ #if defined(ACTIVATION_TYPE) - vstore16(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE, res, A_VAL, B_VAL), 0, (__global DATA_TYPE_OUT *)out.ptr); -#else // defined(ACTIVATION_TYPE) - // Store result - vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr); + res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE_OUT, res0, A_VAL, B_VAL); #endif // defined(ACTIVATION_TYPE) + + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); } #endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */ @@ -155,7 +158,7 @@ __kernel void pixelwise_mul_complex( res = { vin1.x *vin2.x - vin1.y * vin2.y, vin1.x *vin2.y + vin2.x * vin1.y }; #if defined(ACTIVATION_TYPE) - vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL), 0, (__global DATA_TYPE *)out.ptr); + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res, A_VAL, B_VAL), 0, (__global DATA_TYPE *)out.ptr); #else // defined(ACTIVATION_TYPE) // Store result vstore2(res, 0, (__global DATA_TYPE *)out.ptr); diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl index b0bd338147..92a7e6f94e 100644 --- 
a/src/core/CL/cl_kernels/pixelwise_mul_int.cl +++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,6 +36,10 @@ #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) #if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) + +#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT) +#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) + /** Performs a pixelwise multiplication with integer scale of integer inputs. * * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: @@ -75,27 +79,29 @@ __kernel void pixelwise_mul_int( TENSOR3D_DECLARATION(out), const uint scale) { - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); + size_t y = get_global_id(1); + size_t z = get_global_id(2); + + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; // Load data - VEC_DATA_TYPE(ACC_DATA_TYPE, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16)); - VEC_DATA_TYPE(ACC_DATA_TYPE, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16)); + VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr), VEC_ACC_TYPE); + VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr), VEC_ACC_TYPE); // Perform multiplication and store result - vstore16(MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr); + VEC_OUT_TYPE out_data0 = MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, VEC_SIZE_OUT); + STORE_VECTOR_SELECT(out_data, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); } #endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */ -#if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) +#if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT) -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT) +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) /** Performs a pixelwise multiplication with float scale of quantized inputs. 
* @@ -141,14 +147,17 @@ __kernel void pixelwise_mul_quantized( TENSOR3D_DECLARATION(out), const float scale) { - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); + size_t y = get_global_id(1); + size_t z = get_global_id(2); + + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; // Load data - VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in1.ptr), VEC_INT); - VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in2.ptr), VEC_INT); + VEC_INT in_a = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_OUT)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT); + VEC_INT in_b = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_OUT)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT); // Dequantize #if defined(OFFSET_IN1) @@ -165,10 +174,9 @@ __kernel void pixelwise_mul_quantized( #else // defined(OFFSET_OUT) const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT); #endif // defined(OFFSET_OUT) - const VEC_TYPE res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE); + const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE); // Store result - VSTORE(VEC_SIZE) - (res, 0, (__global DATA_TYPE_OUT *)out.ptr); + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); } -#endif /* defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */ +#endif /* defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT) */ diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index 496f2dd664..8b6d5309e3 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -83,18 +83,6 @@ */ #define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0)) -/** Offset (in bytes) calculation for a 1D BUFFER (cl_buffer) tensor */ -#define OFFSET1D(base, data_type, x) (base##_offset_first_element_in_bytes + x * sizeof(data_type)) - -/** Offset (in bytes) calculation for a 2D BUFFER (cl_buffer) tensor */ -#define OFFSET2D(base, data_type, x, y) (base##_offset_first_element_in_bytes + x * sizeof(data_type) + y * base##_stride_y) - -/** Offset (in bytes) calculation for a 3D BUFFER (cl_buffer) tensor */ -#define OFFSET3D(base, data_type, x, y, z) (base##_offset_first_element_in_bytes + x * sizeof(data_type) + y * base##_stride_y + z * base##_stride_z) - -/** Offset (in bytes) calculation for a 4D BUFFER (cl_buffer) tensor */ -#define OFFSET4D(base, data_type, x, y, z, w) (base##_offset_first_element_in_bytes + x * sizeof(data_type) + y * base##_stride_y + z * base##_stride_z + w * base##_stride_w) - /** Dot product integet 8bit function * * @note Performs: c += dot(a, b) @@ -184,7 +172,7 @@ LOOP_UNROLLING(int, _i, 0, HEIGHT, 1) \ { \ dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * 
(int)(YI_MULTIPLIER)), STRIDE_Y); \ - } \ + } \ }) /** Load a tile from global memory (tensor) using an indirect Y index tile diff --git a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp index 56997dc8ad..14e45b2e6d 100644 --- a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp +++ b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp @@ -42,8 +42,6 @@ namespace kernels { namespace { -constexpr unsigned int num_elems_processed_per_iteration = 16; - Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { @@ -92,60 +90,6 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) -{ - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - // Auto initialize dst if not initialized - { - set_shape_if_empty(*dst, out_shape); - - if(src1->data_type() == DataType::S16 || src2->data_type() == DataType::S16) - { - set_format_if_unknown(*dst, Format::S16); - } - else if(src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) - { - set_format_if_unknown(*dst, Format::F32); - } - else if(src1->data_type() == DataType::QASYMM8) - { - set_data_type_if_unknown(*dst, DataType::QASYMM8); - } - else if(src1->data_type() == DataType::QASYMM8_SIGNED) - { - set_data_type_if_unknown(*dst, DataType::QASYMM8_SIGNED); - } - else if(src1->data_type() == DataType::QSYMM16) - { - set_data_type_if_unknown(*dst, DataType::QSYMM16); - } - } - - Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*src1); - Window win_input2 = win.broadcast_if_dimension_le_one(*src2); - - AccessWindowHorizontal input1_access(src1, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(src2, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(dst, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) - || update_window_and_padding(win_input2, input2_access) - || update_window_and_padding(win, output_access); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -BorderSize calc_border_size(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) -{ - const unsigned int replicateSize = dst->dimension(0) - std::min(src1->dimension(0), src2->dimension(0)); - const unsigned int border = std::min(num_elems_processed_per_iteration - 1U, replicateSize); - - return BorderSize{ 0, border, 0, 0 }; -} } // namespace void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, @@ -155,12 +99,10 @@ void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); - // Calculate border size - _border_size = calc_border_size(src1, src2, dst); + auto padding_info = get_padding_info({ src1, src2, dst }); - // Configure kernel window - auto win_config = validate_and_configure_window(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); int scale_int = -1; // Extract sign, exponent and mantissa @@ -197,7 +139,9 @@ void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_ } } - const bool is_quantized = is_data_type_quantized(src1->data_type()); + const bool is_quantized = is_data_type_quantized(src1->data_type()); + const unsigned int vec_size = adjust_vec_size(16 / dst->element_size(), dst->dimension(0)); + const unsigned int vec_size_leftover = dst->dimension(0) % vec_size; // Set kernel build options std::string kernel_name = "pixelwise_mul"; @@ -205,7 +149,10 @@ void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_ build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type())); build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size))); + build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? 
"1" : support::cpp11::to_string(vec_size))); + build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); if(is_quantized && (dst->data_type() != DataType::S32)) { const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); @@ -252,7 +199,10 @@ void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_ _kernel.setArg(idx++, scale); } - ICLKernel::configure_internal(win_config.second); + Window win = calculate_max_window(*dst, Steps(vec_size)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status ClPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, @@ -260,7 +210,6 @@ Status ClPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src1->clone().get(), src2->clone().get(), dst->clone().get()).first); return Status{}; } @@ -312,14 +261,9 @@ void ClPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window while(collapsed.slide_window_slice_3D(slice)); } -BorderSize ClPixelWiseMultiplicationKernel::border_size() const -{ - return _border_size; -} - namespace { -constexpr unsigned int num_elems_processed_per_iteration_complex = 1; +constexpr unsigned int vec_size_complex = 1; Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) { @@ -342,30 +286,6 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr return Status{}; } - -std::pair validate_and_configure_window_complex(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) -{ - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - // Auto initialize dst if not initialized - const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type()); - auto_init_if_empty(*dst, out_info); - - Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration_complex)); - Window win_input1 = win.broadcast_if_dimension_le_one(*src1); - Window win_input2 = win.broadcast_if_dimension_le_one(*src2); - - AccessWindowHorizontal input1_access(src1, 0, num_elems_processed_per_iteration_complex); - AccessWindowHorizontal input2_access(src2, 0, num_elems_processed_per_iteration_complex); - AccessWindowHorizontal output_access(dst, 0, num_elems_processed_per_iteration_complex); - - bool window_changed = update_window_and_padding(win_input1, input1_access) - || update_window_and_padding(win_input2, input2_access) - || update_window_and_padding(win, output_access); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace void ClComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) @@ -373,12 +293,10 @@ void ClComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &c ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info)); - // Calculate border size - _border_size = calc_border_size(src1, src2, dst); + auto padding_info = get_padding_info({ src1, src2, dst }); - // Configure kernel window - auto win_config = validate_and_configure_window_complex(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); @@ -392,14 +310,16 @@ void ClComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &c // Create kernel _kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options()); - ICLKernel::configure_internal(win_config.second); + Window win = calculate_max_window(*dst, Steps(vec_size_complex)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status ClComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(src1->clone().get(), src2->clone().get(), dst->clone().get()).first); return Status{}; } @@ -450,11 +370,6 @@ void ClComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const } while(collapsed.slide_window_slice_3D(slice)); } - -BorderSize ClComplexPixelWiseMultiplicationKernel::border_size() const -{ - return _border_size; -} } // namespace kernels } // namespace opencl } // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h index 5889b84938..5b827262a1 100644 --- a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h +++ b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h @@ -41,7 +41,7 @@ public: /** Default constructor */ ClPixelWiseMultiplicationKernel() = default; ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPixelWiseMultiplicationKernel); - /** Initialise the kernel's src, dst and border mode. + /** Initialise the kernel's src and dst. * * Valid configurations (Input1,Input2) -> Output : * @@ -101,10 +101,6 @@ public: // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -public: - BorderSize _border_size{}; }; /** Interface for the complex pixelwise multiplication kernel. */ @@ -114,7 +110,7 @@ public: /** Default constructor */ ClComplexPixelWiseMultiplicationKernel() = default; ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClComplexPixelWiseMultiplicationKernel); - /** Initialise the kernel's src, dst and border mode. 
+ /** Initialise the kernel's src and dst. * * @param[in] compile_context The compile context to be used. * @param[in] src1 An src tensor info. Data types supported: F32. Number of channels supported: 2. @@ -136,10 +132,6 @@ public: // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -public: - BorderSize _border_size{}; }; } // namespace kernels } // namespace opencl diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp index 5ebaf5d122..efebf2b84c 100644 --- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -25,7 +25,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/ICLKernel.h" #include "src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h" #include diff --git a/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp b/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp index c4f11a4e29..137a0de6a7 100644 --- a/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp +++ b/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp @@ -24,7 +24,6 @@ #include "src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/gpu/cl/ClCompileContext.h" #include "src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h" @@ -32,44 +31,12 @@ namespace arm_compute { namespace opencl { -namespace -{ -ITensorPack select_border_input(ITensorPack &tensors) -{ - ITensorPack pack; - if(tensors.get_tensor(TensorType::ACL_DST)->info()->dimension(0) > 1) - { - if(tensors.get_const_tensor(TensorType::ACL_SRC_1)->info()->dimension(0) == 1) - { - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1)); - } - else - { - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_0)); - } - } - return pack; -} -} // namespace - void ClPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { auto k = std::make_unique(); k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); _kernel = std::move(k); - - if(dst->dimension(0) > 1) - { - ITensorInfo *broadcasted_info = (src1->dimension(0) == 1) ? 
src1 : src2; - - if(broadcasted_info->dimension(0) == 1) - { - auto b = std::make_unique(); - b->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - _border_handler = std::move(b); - } - } } Status ClPixelWiseMultiplication::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, @@ -78,48 +45,16 @@ Status ClPixelWiseMultiplication::validate(const ITensorInfo *src1, const ITenso return kernels::ClPixelWiseMultiplicationKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); } -void ClPixelWiseMultiplication::run(ITensorPack &tensors) -{ - if(_border_handler) - { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(*_border_handler, border_pack); - } - ICLOperator::run(tensors); -} - void ClComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) { auto k = std::make_unique(); k->configure(compile_context, src1, src2, dst, act_info); _kernel = std::move(k); - - if(dst->dimension(0) > 1) - { - ITensorInfo *broadcasted_info = (src1->dimension(0) == 1) ? src1 : src2; - - if(broadcasted_info->dimension(0) == 1) - { - auto b = std::make_unique(); - b->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - _border_handler = std::move(b); - } - } } Status ClComplexPixelWiseMultiplication::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) { return kernels::ClComplexPixelWiseMultiplicationKernel::validate(src1, src2, dst, act_info); } - -void ClComplexPixelWiseMultiplication::run(ITensorPack &tensors) -{ - if(_border_handler) - { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(*_border_handler, border_pack); - } - ICLOperator::run(tensors); -} } // namespace opencl } // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h b/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h index e9b3e4a5ef..e1598cb870 100644 --- a/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h +++ b/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h @@ -99,12 +99,6 @@ public: */ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - -private: - std::unique_ptr _border_handler{ nullptr }; }; /** Basic function to run @ref opencl::ClComplexPixelWiseMultiplication. */ @@ -132,12 +126,6 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - -private: - std::unique_ptr _border_handler{ nullptr }; }; } // namespace opencl } // namespace arm_compute diff --git a/tests/validation/CL/PixelWiseMultiplication.cpp b/tests/validation/CL/PixelWiseMultiplication.cpp index 70e618efa1..f466332e3c 100644 --- a/tests/validation/CL/PixelWiseMultiplication.cpp +++ b/tests/validation/CL/PixelWiseMultiplication.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -91,27 +91,24 @@ TEST_SUITE(PixelWiseMultiplication) DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Window shrink TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid scale TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes }), framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32), })), framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32), })), - framework::dataset::make("Scale",{ 2.f, 2.f, 2.f, -1.f, 1.f, 1.f})), - framework::dataset::make("Expected", { true, true, false, false, false, false})), + framework::dataset::make("Scale",{ 2.f, 2.f, -1.f, 1.f, 1.f})), + framework::dataset::make("Expected", { true, true, false, false, false})), input1_info, input2_info, output_info, scale, expected) { bool has_error = bool(CLPixelWiseMultiplication::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), scale, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO)); -- cgit v1.2.1
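
Note on the vector-size selection above: the kernel now processes 16 / sizeof(output element) values per work item and deals with the row tail through a leftover block instead of requiring padded tensors. The following standalone C++ sketch only models that host-side calculation; pick_vec_size and leftover_elements are illustrative stand-ins, not the Compute Library's adjust_vec_size() API, and the numbers are an assumed example configuration.

    #include <algorithm>
    #include <cstdio>

    // Hypothetical stand-in for the library's vector-size adjustment: cap the
    // preferred width at the innermost dimension so short rows still work.
    static unsigned int pick_vec_size(unsigned int preferred, unsigned int dim0)
    {
        return std::min(preferred, dim0);
    }

    int main()
    {
        const unsigned int element_size = 2;                                  // e.g. an F16 output
        const unsigned int dim0         = 37;                                 // innermost (x) dimension
        const unsigned int vec_size     = pick_vec_size(16 / element_size, dim0); // 8
        const unsigned int leftover     = dim0 % vec_size;                    // 5 elements in the tail block

        // These two values correspond to the -DVEC_SIZE_OUT and -DVEC_SIZE_LEFTOVER build options.
        std::printf("vec_size=%u leftover=%u\n", vec_size, leftover);
        return 0;
    }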
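The padding removal hinges on the x-offset formula used by the rewritten kernels: the first work item covers the (possibly partial) leftover block and every later work item starts on a full aligned vector, so no access ever runs past the end of the row. Below is a minimal C++ model of that indexing, mirroring max((int)(gid * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); the "stored" count is a simplified stand-in for what STORE_VECTOR_SELECT does, not the actual macro.

    #include <algorithm>
    #include <cstdio>

    // x offset of a work item for a row of dim0 elements processed vec_size at a
    // time, with the first work item handling the leftover block.
    static int x_offset(int gid, int vec_size, int leftover)
    {
        return std::max(gid * vec_size - (vec_size - leftover) % vec_size, 0);
    }

    int main()
    {
        const int dim0 = 37, vec_size = 8;
        const int leftover  = dim0 % vec_size;                  // 5
        const int num_items = (dim0 + vec_size - 1) / vec_size; // ceil -> 5 work items
        for(int gid = 0; gid < num_items; ++gid)
        {
            const int x = x_offset(gid, vec_size, leftover);
            // Only the first item writes a partial vector; all others write full ones.
            const int stored = (leftover != 0 && gid == 0) ? leftover : vec_size;
            std::printf("gid=%d -> x in [%d, %d)\n", gid, x, x + stored);
        }
        return 0;
    }

Running this prints the ranges [0,5), [5,13), [13,21), [21,29), [29,37): the row is covered exactly once with no out-of-bounds access, which is why the kernel no longer needs right padding on the output.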
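The ICLKernel.cpp change replaces border replication for broadcast inputs with zero strides: when a window dimension is broadcast, both the stride and the step-stride passed to the kernel become 0, so every work item along that axis resolves to the same source element. A reduced C++ sketch of that addressing, with hypothetical names and an assumed 4-byte element size:

    #include <cstdio>

    // Byte offset of element (x, y) given per-dimension strides; a broadcast
    // dimension is expressed simply by passing stride 0 for it.
    static unsigned int element_offset(unsigned int x, unsigned int y,
                                       unsigned int stride_x, unsigned int stride_y)
    {
        return x * stride_x + y * stride_y;
    }

    int main()
    {
        // A (1 x 4) tensor broadcast along x against a wider tensor: stride_x is
        // forced to 0, so every x maps back to the single column of that row.
        const unsigned int stride_x_broadcast = 0, stride_y = 4;
        for(unsigned int x = 0; x < 3; ++x)
        {
            std::printf("x=%u y=1 -> offset=%u bytes\n",
                        x, element_offset(x, 1, stride_x_broadcast, stride_y));
        }
        return 0;
    }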