aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGiorgio Arena <giorgio.arena@arm.com>2021-04-16 17:03:39 +0100
committerGiorgio Arena <giorgio.arena@arm.com>2021-04-20 09:26:59 +0000
commitada6cbc057ff725e57d301a99a1816ce602485b9 (patch)
treef869994cb2b061de0bc4731d720336413b81d32a
parent031d6a97de79fc3ca3eb6fca1611f03aa9b5893b (diff)
downloadComputeLibrary-ada6cbc057ff725e57d301a99a1816ce602485b9.tar.gz
Remove OpenCL padding: CLPixelWiseMultiplicationKernel
- Change kernel's vec_size to 16 / sizeof(output) - Change ICLKernel.cpp to handle broadcast without padding Resolve COMPMID-3913 Signed-off-by: Giorgio Arena <giorgio.arena@arm.com> Change-Id: I03e884b250ef5784dc109bff8cf2c96b345d119f Signed-off-by: Giorgio Arena <giorgio.arena@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5450 Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
-rw-r--r--src/core/CL/ICLKernel.cpp4
-rw-r--r--src/core/CL/cl_kernels/pixelwise_mul_float.cl37
-rw-r--r--src/core/CL/cl_kernels/pixelwise_mul_int.cl56
-rw-r--r--src/core/CL/cl_kernels/tile_helpers.h14
-rw-r--r--src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp129
-rw-r--r--src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h12
-rw-r--r--src/runtime/CL/functions/CLPixelWiseMultiplication.cpp2
-rw-r--r--src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp65
-rw-r--r--src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h12
-rw-r--r--tests/validation/CL/PixelWiseMultiplication.cpp9
10 files changed, 83 insertions, 257 deletions
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 1c6963f3f1..9ba17d0e03 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -105,8 +105,8 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
for(unsigned int d = 0; d < dimension_size; ++d)
{
- _kernel.setArg<cl_uint>(idx++, strides[d]);
- _kernel.setArg<cl_uint>(idx++, strides[d] * window[d].step());
+ _kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : strides[d]);
+ _kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : (strides[d] * window[d].step()));
}
_kernel.setArg<cl_uint>(idx++, offset_first_element);
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
index 845e1c9860..0016775893 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,6 +36,10 @@
#include "activation_float_helpers.h"
#endif // defined(ACTIVATION_TYPE)
+#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT)
+#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
+
/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
*
* @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
@@ -77,31 +81,30 @@ __kernel void pixelwise_mul_float(
const float scale)
{
// Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+ size_t y = get_global_id(1);
+ size_t z = get_global_id(2);
+
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
// Load data
- VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+ VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_ACC_TYPE);
+ VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr)), VEC_ACC_TYPE);
// Perform multiplication
#ifdef DATA_TYPE_FLOAT
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- res = CONVERT(in1_data * in2_data * (ACC_DATA_TYPE)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_OUT_TYPE res0 = CONVERT(in1_data * in2_data * (ACC_DATA_TYPE)scale, VEC_OUT_TYPE);
#else /* DATA_TYPE_FLOAT */
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data * in2_data) * scale), VEC_DATA_TYPE(ACC_DATA_TYPE, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
+ VEC_OUT_TYPE res0 = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((CONVERT(in1_data * in2_data, VEC_FLOAT) * scale), VEC_ACC_TYPE, ROUND), VEC_OUT_TYPE, ROUND);
#endif /* DATA_TYPE_FLOAT */
#if defined(ACTIVATION_TYPE)
- vstore16(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE, res, A_VAL, B_VAL), 0, (__global DATA_TYPE_OUT *)out.ptr);
-#else // defined(ACTIVATION_TYPE)
- // Store result
- vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
+ res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE_OUT, res0, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */
@@ -155,7 +158,7 @@ __kernel void pixelwise_mul_complex(
res = { vin1.x *vin2.x - vin1.y * vin2.y, vin1.x *vin2.y + vin2.x * vin1.y };
#if defined(ACTIVATION_TYPE)
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL), 0, (__global DATA_TYPE *)out.ptr);
+ vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res, A_VAL, B_VAL), 0, (__global DATA_TYPE *)out.ptr);
#else // defined(ACTIVATION_TYPE)
// Store result
vstore2(res, 0, (__global DATA_TYPE *)out.ptr);
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
index b0bd338147..92a7e6f94e 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,6 +36,10 @@
#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT)
+
+#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT)
+#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+
/** Performs a pixelwise multiplication with integer scale of integer inputs.
*
* @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
@@ -75,27 +79,29 @@ __kernel void pixelwise_mul_int(
TENSOR3D_DECLARATION(out),
const uint scale)
{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+ size_t y = get_global_id(1);
+ size_t z = get_global_id(2);
+
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
// Load data
- VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+ VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr), VEC_ACC_TYPE);
+ VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr), VEC_ACC_TYPE);
// Perform multiplication and store result
- vstore16(MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
+ VEC_OUT_TYPE out_data0 = MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, VEC_SIZE_OUT);
+ STORE_VECTOR_SELECT(out_data, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */
-#if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE)
+#if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT)
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
/** Performs a pixelwise multiplication with float scale of quantized inputs.
*
@@ -141,14 +147,17 @@ __kernel void pixelwise_mul_quantized(
TENSOR3D_DECLARATION(out),
const float scale)
{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+ size_t y = get_global_id(1);
+ size_t z = get_global_id(2);
+
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
// Load data
- VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in1.ptr), VEC_INT);
- VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in2.ptr), VEC_INT);
+ VEC_INT in_a = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_OUT)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT);
+ VEC_INT in_b = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_OUT)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT);
// Dequantize
#if defined(OFFSET_IN1)
@@ -165,10 +174,9 @@ __kernel void pixelwise_mul_quantized(
#else // defined(OFFSET_OUT)
const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT);
#endif // defined(OFFSET_OUT)
- const VEC_TYPE res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
+ const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
// Store result
- VSTORE(VEC_SIZE)
- (res, 0, (__global DATA_TYPE_OUT *)out.ptr);
+ STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
-#endif /* defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */
+#endif /* defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT) */
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index 496f2dd664..8b6d5309e3 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -83,18 +83,6 @@
*/
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))
-/** Offset (in bytes) calculation for a 1D BUFFER (cl_buffer) tensor */
-#define OFFSET1D(base, data_type, x) (base##_offset_first_element_in_bytes + x * sizeof(data_type))
-
-/** Offset (in bytes) calculation for a 2D BUFFER (cl_buffer) tensor */
-#define OFFSET2D(base, data_type, x, y) (base##_offset_first_element_in_bytes + x * sizeof(data_type) + y * base##_stride_y)
-
-/** Offset (in bytes) calculation for a 3D BUFFER (cl_buffer) tensor */
-#define OFFSET3D(base, data_type, x, y, z) (base##_offset_first_element_in_bytes + x * sizeof(data_type) + y * base##_stride_y + z * base##_stride_z)
-
-/** Offset (in bytes) calculation for a 4D BUFFER (cl_buffer) tensor */
-#define OFFSET4D(base, data_type, x, y, z, w) (base##_offset_first_element_in_bytes + x * sizeof(data_type) + y * base##_stride_y + z * base##_stride_z + w * base##_stride_w)
-
/** Dot product integet 8bit function
*
* @note Performs: c += dot(a, b)
@@ -184,7 +172,7 @@
LOOP_UNROLLING(int, _i, 0, HEIGHT, 1) \
{ \
dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
- } \
+ } \
})
/** Load a tile from global memory (tensor) using an indirect Y index tile
diff --git a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp
index 56997dc8ad..14e45b2e6d 100644
--- a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp
@@ -42,8 +42,6 @@ namespace kernels
{
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
{
@@ -92,60 +90,6 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
-{
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- // Auto initialize dst if not initialized
- {
- set_shape_if_empty(*dst, out_shape);
-
- if(src1->data_type() == DataType::S16 || src2->data_type() == DataType::S16)
- {
- set_format_if_unknown(*dst, Format::S16);
- }
- else if(src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32)
- {
- set_format_if_unknown(*dst, Format::F32);
- }
- else if(src1->data_type() == DataType::QASYMM8)
- {
- set_data_type_if_unknown(*dst, DataType::QASYMM8);
- }
- else if(src1->data_type() == DataType::QASYMM8_SIGNED)
- {
- set_data_type_if_unknown(*dst, DataType::QASYMM8_SIGNED);
- }
- else if(src1->data_type() == DataType::QSYMM16)
- {
- set_data_type_if_unknown(*dst, DataType::QSYMM16);
- }
- }
-
- Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*src1);
- Window win_input2 = win.broadcast_if_dimension_le_one(*src2);
-
- AccessWindowHorizontal input1_access(src1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(src2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(dst, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-BorderSize calc_border_size(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
-{
- const unsigned int replicateSize = dst->dimension(0) - std::min(src1->dimension(0), src2->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-
- return BorderSize{ 0, border, 0, 0 };
-}
} // namespace
void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
@@ -155,12 +99,10 @@ void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst,
scale, overflow_policy, rounding_policy, act_info));
- // Calculate border size
- _border_size = calc_border_size(src1, src2, dst);
+ auto padding_info = get_padding_info({ src1, src2, dst });
- // Configure kernel window
- auto win_config = validate_and_configure_window(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+ auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
int scale_int = -1;
// Extract sign, exponent and mantissa
@@ -197,7 +139,9 @@ void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_
}
}
- const bool is_quantized = is_data_type_quantized(src1->data_type());
+ const bool is_quantized = is_data_type_quantized(src1->data_type());
+ const unsigned int vec_size = adjust_vec_size(16 / dst->element_size(), dst->dimension(0));
+ const unsigned int vec_size_leftover = dst->dimension(0) % vec_size;
// Set kernel build options
std::string kernel_name = "pixelwise_mul";
@@ -205,7 +149,10 @@ void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_
build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type()));
build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type()));
build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
+ build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
+ build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
if(is_quantized && (dst->data_type() != DataType::S32))
{
const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
@@ -252,7 +199,10 @@ void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_
_kernel.setArg(idx++, scale);
}
- ICLKernel::configure_internal(win_config.second);
+ Window win = calculate_max_window(*dst, Steps(vec_size));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
Status ClPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
@@ -260,7 +210,6 @@ Status ClPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src1->clone().get(), src2->clone().get(), dst->clone().get()).first);
return Status{};
}
@@ -312,14 +261,9 @@ void ClPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window
while(collapsed.slide_window_slice_3D(slice));
}
-BorderSize ClPixelWiseMultiplicationKernel::border_size() const
-{
- return _border_size;
-}
-
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration_complex = 1;
+constexpr unsigned int vec_size_complex = 1;
Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
{
@@ -342,30 +286,6 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
-{
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- // Auto initialize dst if not initialized
- const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
- auto_init_if_empty(*dst, out_info);
-
- Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration_complex));
- Window win_input1 = win.broadcast_if_dimension_le_one(*src1);
- Window win_input2 = win.broadcast_if_dimension_le_one(*src2);
-
- AccessWindowHorizontal input1_access(src1, 0, num_elems_processed_per_iteration_complex);
- AccessWindowHorizontal input2_access(src2, 0, num_elems_processed_per_iteration_complex);
- AccessWindowHorizontal output_access(dst, 0, num_elems_processed_per_iteration_complex);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
} // namespace
void ClComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
@@ -373,12 +293,10 @@ void ClComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &c
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info));
- // Calculate border size
- _border_size = calc_border_size(src1, src2, dst);
+ auto padding_info = get_padding_info({ src1, src2, dst });
- // Configure kernel window
- auto win_config = validate_and_configure_window_complex(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+ auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
@@ -392,14 +310,16 @@ void ClComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &c
// Create kernel
_kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options());
- ICLKernel::configure_internal(win_config.second);
+ Window win = calculate_max_window(*dst, Steps(vec_size_complex));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
Status ClComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(src1->clone().get(), src2->clone().get(), dst->clone().get()).first);
return Status{};
}
@@ -450,11 +370,6 @@ void ClComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const
}
while(collapsed.slide_window_slice_3D(slice));
}
-
-BorderSize ClComplexPixelWiseMultiplicationKernel::border_size() const
-{
- return _border_size;
-}
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h
index 5889b84938..5b827262a1 100644
--- a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h
+++ b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h
@@ -41,7 +41,7 @@ public:
/** Default constructor */
ClPixelWiseMultiplicationKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPixelWiseMultiplicationKernel);
- /** Initialise the kernel's src, dst and border mode.
+ /** Initialise the kernel's src and dst.
*
* Valid configurations (Input1,Input2) -> Output :
*
@@ -101,10 +101,6 @@ public:
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-public:
- BorderSize _border_size{};
};
/** Interface for the complex pixelwise multiplication kernel. */
@@ -114,7 +110,7 @@ public:
/** Default constructor */
ClComplexPixelWiseMultiplicationKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClComplexPixelWiseMultiplicationKernel);
- /** Initialise the kernel's src, dst and border mode.
+ /** Initialise the kernel's src and dst.
*
* @param[in] compile_context The compile context to be used.
* @param[in] src1 An src tensor info. Data types supported: F32. Number of channels supported: 2.
@@ -136,10 +132,6 @@ public:
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-public:
- BorderSize _border_size{};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 5ebaf5d122..efebf2b84c 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/ICLKernel.h"
#include "src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h"
#include <utility>
diff --git a/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp b/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp
index c4f11a4e29..137a0de6a7 100644
--- a/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp
+++ b/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp
@@ -24,7 +24,6 @@
#include "src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/gpu/cl/ClCompileContext.h"
#include "src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h"
@@ -32,44 +31,12 @@ namespace arm_compute
{
namespace opencl
{
-namespace
-{
-ITensorPack select_border_input(ITensorPack &tensors)
-{
- ITensorPack pack;
- if(tensors.get_tensor(TensorType::ACL_DST)->info()->dimension(0) > 1)
- {
- if(tensors.get_const_tensor(TensorType::ACL_SRC_1)->info()->dimension(0) == 1)
- {
- pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1));
- }
- else
- {
- pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_0));
- }
- }
- return pack;
-}
-} // namespace
-
void ClPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
{
auto k = std::make_unique<kernels::ClPixelWiseMultiplicationKernel>();
k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
_kernel = std::move(k);
-
- if(dst->dimension(0) > 1)
- {
- ITensorInfo *broadcasted_info = (src1->dimension(0) == 1) ? src1 : src2;
-
- if(broadcasted_info->dimension(0) == 1)
- {
- auto b = std::make_unique<CLFillBorderKernel>();
- b->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- _border_handler = std::move(b);
- }
- }
}
Status ClPixelWiseMultiplication::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
@@ -78,48 +45,16 @@ Status ClPixelWiseMultiplication::validate(const ITensorInfo *src1, const ITenso
return kernels::ClPixelWiseMultiplicationKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
}
-void ClPixelWiseMultiplication::run(ITensorPack &tensors)
-{
- if(_border_handler)
- {
- auto border_pack = select_border_input(tensors);
- CLScheduler::get().enqueue_op(*_border_handler, border_pack);
- }
- ICLOperator::run(tensors);
-}
-
void ClComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
{
auto k = std::make_unique<kernels::ClComplexPixelWiseMultiplicationKernel>();
k->configure(compile_context, src1, src2, dst, act_info);
_kernel = std::move(k);
-
- if(dst->dimension(0) > 1)
- {
- ITensorInfo *broadcasted_info = (src1->dimension(0) == 1) ? src1 : src2;
-
- if(broadcasted_info->dimension(0) == 1)
- {
- auto b = std::make_unique<CLFillBorderKernel>();
- b->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- _border_handler = std::move(b);
- }
- }
}
Status ClComplexPixelWiseMultiplication::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
{
return kernels::ClComplexPixelWiseMultiplicationKernel::validate(src1, src2, dst, act_info);
}
-
-void ClComplexPixelWiseMultiplication::run(ITensorPack &tensors)
-{
- if(_border_handler)
- {
- auto border_pack = select_border_input(tensors);
- CLScheduler::get().enqueue_op(*_border_handler, border_pack);
- }
- ICLOperator::run(tensors);
-}
} // namespace opencl
} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h b/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h
index e9b3e4a5ef..e1598cb870 100644
--- a/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h
+++ b/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h
@@ -99,12 +99,6 @@ public:
*/
static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-
-private:
- std::unique_ptr<ICLKernel> _border_handler{ nullptr };
};
/** Basic function to run @ref opencl::ClComplexPixelWiseMultiplication. */
@@ -132,12 +126,6 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-
-private:
- std::unique_ptr<ICLKernel> _border_handler{ nullptr };
};
} // namespace opencl
} // namespace arm_compute
diff --git a/tests/validation/CL/PixelWiseMultiplication.cpp b/tests/validation/CL/PixelWiseMultiplication.cpp
index 70e618efa1..f466332e3c 100644
--- a/tests/validation/CL/PixelWiseMultiplication.cpp
+++ b/tests/validation/CL/PixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,27 +91,24 @@ TEST_SUITE(PixelWiseMultiplication)
DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
- TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Window shrink
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid scale
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
}),
framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
- TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
})),
framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
- TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
})),
- framework::dataset::make("Scale",{ 2.f, 2.f, 2.f, -1.f, 1.f, 1.f})),
- framework::dataset::make("Expected", { true, true, false, false, false, false})),
+ framework::dataset::make("Scale",{ 2.f, 2.f, -1.f, 1.f, 1.f})),
+ framework::dataset::make("Expected", { true, true, false, false, false})),
input1_info, input2_info, output_info, scale, expected)
{
bool has_error = bool(CLPixelWiseMultiplication::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), scale, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO));