From a387e271b1e02ffd5c2993702b9a21c1ed5c95fa Mon Sep 17 00:00:00 2001 From: Sheri Zhang Date: Tue, 29 Jun 2021 17:34:06 +0100 Subject: Add in-place calculation support for CL elementwise arithmetic kernels - Add in-place calculation support in ClArithmeticKernel, ClSaturatedArithmeticKernel and ClMulKernel - Add in-place test cases Resolves: COMPMID-4431 Signed-off-by: Sheri Zhang Change-Id: Id484bdb76b74478a33fedb471ae0c7f799c599f6 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5885 Comments-Addressed: Arm Jenkins Reviewed-by: SiCong Li Tested-by: Arm Jenkins --- src/core/CL/cl_kernels/elementwise_operation.cl | 21 +++++++-- .../cl_kernels/elementwise_operation_quantized.cl | 21 +++++++-- src/core/CL/cl_kernels/pixelwise_mul_float.cl | 13 +++++- src/core/CL/cl_kernels/pixelwise_mul_int.cl | 26 ++++++++++- src/core/gpu/cl/kernels/ClElementwiseKernel.cpp | 50 +++++++++++++++++++--- src/core/gpu/cl/kernels/ClElementwiseKernel.h | 2 + src/core/gpu/cl/kernels/ClMulKernel.cpp | 32 ++++++++++++-- src/core/gpu/cl/kernels/ClMulKernel.h | 6 ++- 8 files changed, 153 insertions(+), 18 deletions(-) (limited to 'src/core') diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/elementwise_operation.cl index 99f725645d..45dcbfc6e2 100644 --- a/src/core/CL/cl_kernels/elementwise_operation.cl +++ b/src/core/CL/cl_kernels/elementwise_operation.cl @@ -97,8 +97,12 @@ */ __kernel void OP_FUN_NAME(OP)( TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) + TENSOR3D_DECLARATION(in2) +#if !defined(IN_PLACE) + , + TENSOR3D_DECLARATION(out) +#endif // !defined(IN_PLACE) +) { #if VEC_SIZE_IN1 == 1 uint in1_x_offs = 0; @@ -110,12 +114,23 @@ __kernel void OP_FUN_NAME(OP)( #else // VEC_SIZE_IN2 == 1 uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0); #endif // VEC_SIZE_IN2 == 1 +#if !defined(IN_PLACE) uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); +#endif // !defined(IN_PLACE) // Get pixels pointer __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z; __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z; - __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; + __global uchar * +#if !defined(IN_PLACE) + out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; +#else // !defined(IN_PLACE) +#if defined(SRC1_IN_PLACE) + out_addr = in1_addr; +#else //defined(SRC1_IN_PLACE) + out_addr = in2_addr; +#endif //defined(SRC1_IN_PLACE) +#endif // !defined(IN_PLACE) // Load values VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT) diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl index 0051babf03..a11be80875 100644 --- a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl +++ b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl @@ -86,8 +86,12 @@ */ __kernel void OP_FUN_NAME(OP)( TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) + TENSOR3D_DECLARATION(in2) +#if !defined(IN_PLACE) + , + TENSOR3D_DECLARATION(out) +#endif // !defined(IN_PLACE) +) { #if VEC_SIZE_IN1 == 1 uint in1_x_offs = 0; @@ -99,12 +103,23 @@ __kernel void OP_FUN_NAME(OP)( #else // VEC_SIZE_IN2 == 1 uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0); #endif // VEC_SIZE_IN2 == 1 +#if !defined(IN_PLACE) uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); +#endif // !defined(IN_PLACE) // Get pixels pointer __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z; __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z; - __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; + __global uchar * +#if !defined(IN_PLACE) + out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; +#else // !defined(IN_PLACE) +#if defined(SRC1_IN_PLACE) + out_addr = in1_addr; +#else //defined(SRC1_IN_PLACE) + out_addr = in2_addr; +#endif //defined(SRC1_IN_PLACE) +#endif // !defined(IN_PLACE) VEC_INT in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_addr)), VEC_INT); VEC_INT in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_addr)), VEC_INT); diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl index 0016775893..10875293a9 100644 --- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl +++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl @@ -77,7 +77,9 @@ __kernel void pixelwise_mul_float( TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), +#if !defined(IN_PLACE) TENSOR3D_DECLARATION(out), +#endif // !defined(IN_PLACE) const float scale) { // Get pixels pointer @@ -87,7 +89,16 @@ __kernel void pixelwise_mul_float( __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z; __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z; - __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; + __global uchar * +#if !defined(IN_PLACE) + out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; +#else // !defined(IN_PLACE) +#if defined(SRC1_IN_PLACE) + out_addr = in1_addr; +#else //defined(SRC1_IN_PLACE) + out_addr = in2_addr; +#endif //defined(SRC1_IN_PLACE) +#endif // !defined(IN_PLACE) // Load data VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_ACC_TYPE); diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl index ac5cabcb8c..6d1c2d0c79 100644 --- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl +++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl @@ -76,7 +76,9 @@ __kernel void pixelwise_mul_int( TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), +#if !defined(IN_PLACE) TENSOR3D_DECLARATION(out), +#endif // !defined(IN_PLACE) const uint scale) { size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); @@ -85,7 +87,16 @@ __kernel void pixelwise_mul_int( __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z; __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z; - __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; + __global uchar * +#if !defined(IN_PLACE) + out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; +#else // !defined(IN_PLACE) +#if defined(SRC1_IN_PLACE) + out_addr = in1_addr; +#else //defined(SRC1_IN_PLACE) + out_addr = in2_addr; +#endif //defined(SRC1_IN_PLACE) +#endif // !defined(IN_PLACE) // Load data VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr), VEC_ACC_TYPE); @@ -143,7 +154,9 @@ __kernel void pixelwise_mul_int( __kernel void pixelwise_mul_quantized( TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), +#if !defined(IN_PLACE) TENSOR3D_DECLARATION(out), +#endif // !defined(IN_PLACE) const float scale) { size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); @@ -152,7 +165,16 @@ __kernel void pixelwise_mul_quantized( __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z; __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z; - __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; + __global uchar * +#if !defined(IN_PLACE) + out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; +#else // !defined(IN_PLACE) +#if defined(SRC1_IN_PLACE) + out_addr = in1_addr; +#else //defined(SRC1_IN_PLACE) + out_addr = in2_addr; +#endif //defined(SRC1_IN_PLACE) +#endif // !defined(IN_PLACE) // Load data VEC_INT in_a = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT); diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp index f005e9226e..3d9f0b6fcf 100644 --- a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp +++ b/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp @@ -75,6 +75,21 @@ std::string generate_id_for_tuning_common(const std::string &kernel_name, const return config_id; } +Status validate_in_place_output_shape(const bool in_place, const bool src1_in_place, const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const TensorShape &out_shape) +{ + if(in_place) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for dst"); + } + return Status{}; +} + Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst); @@ -82,6 +97,10 @@ Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2); + // Check whether it is in_place calculation + const bool in_place = (&src1 == &dst) || (&src2 == &dst); + const bool src1_in_place = in_place && (&src1 == &dst); + const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -91,8 +110,7 @@ Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), - "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); } return Status{}; @@ -105,6 +123,10 @@ Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITenso ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::F16, DataType::F32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); + // Check whether it is in_place calculation + const bool in_place = (src1 == dst) || (src2 == dst); + const bool src1_in_place = in_place && (src1 == dst); + const TensorShape out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -114,8 +136,7 @@ Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITenso { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), - "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape)); } return Status{}; @@ -137,6 +158,10 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I ARM_COMPUTE_RETURN_ERROR_ON_MSG(in2_offset != 0, "For quantized symmetric, offset must be zero"); } + // Check whether it is in_place calculation + const bool in_place = (&src1 == &dst) || (&src2 == &dst); + const bool src1_in_place = in_place && (&src1 == &dst); + const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -145,6 +170,7 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); if(is_data_type_quantized_symmetric(dst.data_type())) { @@ -182,6 +208,12 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &s } build_opts.add_option_if(src1.data_type() == DataType::S32, "-DS32"); + // Check whether it is in_place calculation + const bool in_place = (&src1 == &dst) || (&src2 == &dst); + const bool src1_in_place = in_place && (&src1 == &dst); + build_opts.add_option_if(in_place, "-DIN_PLACE"); + build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE"); + return build_opts; } @@ -267,6 +299,8 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); + const TensorShape &in_shape1 = src_0->info()->tensor_shape(); const TensorShape &in_shape2 = src_1->info()->tensor_shape(); const TensorShape &out_shape = dst->info()->tensor_shape(); @@ -291,12 +325,18 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c Window slice = collapsed.first_slice_window_3D(); Window slice_src1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); Window slice_src2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + // Check whether it is in_place calculation + const bool in_place = (src_0 == dst) || (src_1 == dst); do { unsigned int idx = 0; add_3D_tensor_argument(idx, src_0, slice_src1); add_3D_tensor_argument(idx, src_1, slice_src2); - add_3D_tensor_argument(idx, dst, slice); + if(!in_place) + { + add_3D_tensor_argument(idx, dst, slice); + } enqueue(queue, *this, slice, lws_hint()); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1)); diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.h b/src/core/gpu/cl/kernels/ClElementwiseKernel.h index ab5c777ae6..4525cec55b 100644 --- a/src/core/gpu/cl/kernels/ClElementwiseKernel.h +++ b/src/core/gpu/cl/kernels/ClElementwiseKernel.h @@ -40,6 +40,8 @@ namespace kernels * Element-wise operation is computed by: * @f[ dst(x,y) = OP(src1(x,y), src2(x,y))@f] * + * For binary elementwise ops in-place cannot be enabled by passing nullptr to dst, it can only be enabled by passing either src1 or src2 to dst instead. + * */ class ClElementwiseKernel : public IClKernel { diff --git a/src/core/gpu/cl/kernels/ClMulKernel.cpp b/src/core/gpu/cl/kernels/ClMulKernel.cpp index 65f3bec099..7c4dddc20e 100644 --- a/src/core/gpu/cl/kernels/ClMulKernel.cpp +++ b/src/core/gpu/cl/kernels/ClMulKernel.cpp @@ -63,6 +63,10 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); + // Check whether it is in_place calculation + const bool in_place = (src1 == dst) || (src2 == dst); + const bool src1_in_place = in_place && (src1 == dst); + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -85,7 +89,16 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons "Dst can only be QSYMM16 if both src are QSYMM16"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32), "Dst must be S32 if source tensors are S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + if(in_place) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst"); + } } return Status{}; @@ -194,11 +207,17 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo } } + // Check whether it is in_place calculation + const bool in_place = (src1 == dst) || (src2 == dst); + const bool src1_in_place = in_place && (src1 == dst); + build_opts.add_option_if(in_place, "-DIN_PLACE"); + build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE"); + // Create kernel _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set scale argument - unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters + unsigned int idx = (in_place ? 2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters if(scale_int >= 0 && !is_quantized) { @@ -256,6 +275,8 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); + const TensorShape &in_shape1 = src_0->info()->tensor_shape(); const TensorShape &in_shape2 = src_1->info()->tensor_shape(); const TensorShape &out_shape = dst->info()->tensor_shape(); @@ -280,12 +301,17 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + // Check whether it is in_place calculation + const bool in_place = (src_0 == dst) || (src_1 == dst); do { unsigned int idx = 0; add_3D_tensor_argument(idx, src_0, slice_input1); add_3D_tensor_argument(idx, src_1, slice_input2); - add_3D_tensor_argument(idx, dst, slice); + if(!in_place) + { + add_3D_tensor_argument(idx, dst, slice); + } enqueue(queue, *this, slice, lws_hint()); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); diff --git a/src/core/gpu/cl/kernels/ClMulKernel.h b/src/core/gpu/cl/kernels/ClMulKernel.h index dec8dba61c..2ee182b932 100644 --- a/src/core/gpu/cl/kernels/ClMulKernel.h +++ b/src/core/gpu/cl/kernels/ClMulKernel.h @@ -34,7 +34,11 @@ namespace opencl { namespace kernels { -/** Interface for the pixelwise multiplication kernel. */ +/** Interface for the pixelwise multiplication kernel. + * + * For binary elementwise ops in-place cannot be enabled by passing nullptr to dst, it can only be enabled by passing either src1 or src2 to dst instead. + * +*/ class ClMulKernel : public IClKernel { public: -- cgit v1.2.1