diff options
author | Sheri Zhang <sheri.zhang@arm.com> | 2021-06-29 17:34:06 +0100 |
---|---|---|
committer | Sheri Zhang <sheri.zhang@arm.com> | 2021-07-13 15:36:03 +0000 |
commit | a387e271b1e02ffd5c2993702b9a21c1ed5c95fa (patch) | |
tree | f53416756c70c85d962218168ad3cd3359d9f5c8 /src/core/gpu/cl/kernels/ClMulKernel.cpp | |
parent | 6fc7d528382716de9e417c9dcf0fddf109446e9f (diff) | |
download | ComputeLibrary-a387e271b1e02ffd5c2993702b9a21c1ed5c95fa.tar.gz |
Add in-place calculation support for CL elementwise arithmetic kernels
- Add in-place calculation support in ClArithmeticKernel, ClSaturatedArithmeticKernel and ClMulKernel
- Add in-place test cases
Resolves: COMPMID-4431
Signed-off-by: Sheri Zhang <sheri.zhang@arm.com>
Change-Id: Id484bdb76b74478a33fedb471ae0c7f799c599f6
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5885
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/gpu/cl/kernels/ClMulKernel.cpp')
-rw-r--r-- | src/core/gpu/cl/kernels/ClMulKernel.cpp | 32 |
1 file changed, 29 insertions(+), 3 deletions(-)
diff --git a/src/core/gpu/cl/kernels/ClMulKernel.cpp b/src/core/gpu/cl/kernels/ClMulKernel.cpp index 65f3bec099..7c4dddc20e 100644 --- a/src/core/gpu/cl/kernels/ClMulKernel.cpp +++ b/src/core/gpu/cl/kernels/ClMulKernel.cpp @@ -63,6 +63,10 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); + // Check whether it is in_place calculation + const bool in_place = (src1 == dst) || (src2 == dst); + const bool src1_in_place = in_place && (src1 == dst); + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -85,7 +89,16 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons "Dst can only be QSYMM16 if both src are QSYMM16"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32), "Dst must be S32 if source tensors are S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + if(in_place) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? 
src1->tensor_shape() : src2->tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst"); + } } return Status{}; @@ -194,11 +207,17 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo } } + // Check whether it is in_place calculation + const bool in_place = (src1 == dst) || (src2 == dst); + const bool src1_in_place = in_place && (src1 == dst); + build_opts.add_option_if(in_place, "-DIN_PLACE"); + build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE"); + // Create kernel _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set scale argument - unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters + unsigned int idx = (in_place ? 2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters if(scale_int >= 0 && !is_quantized) { @@ -256,6 +275,8 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); + const TensorShape &in_shape1 = src_0->info()->tensor_shape(); const TensorShape &in_shape2 = src_1->info()->tensor_shape(); const TensorShape &out_shape = dst->info()->tensor_shape(); @@ -280,12 +301,17 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + // Check whether it is in_place calculation + const bool in_place = (src_0 == dst) || (src_1 == dst); do { unsigned int idx = 0; add_3D_tensor_argument(idx, src_0, 
slice_input1); add_3D_tensor_argument(idx, src_1, slice_input2); - add_3D_tensor_argument(idx, dst, slice); + if(!in_place) + { + add_3D_tensor_argument(idx, dst, slice); + } enqueue(queue, *this, slice, lws_hint()); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); |