aboutsummaryrefslogtreecommitdiff
path: root/src/core/gpu
diff options
context:
space:
mode:
author Sheri Zhang <sheri.zhang@arm.com> 2021-06-29 17:34:06 +0100
committer Sheri Zhang <sheri.zhang@arm.com> 2021-07-13 15:36:03 +0000
commit a387e271b1e02ffd5c2993702b9a21c1ed5c95fa (patch)
tree f53416756c70c85d962218168ad3cd3359d9f5c8 /src/core/gpu
parent 6fc7d528382716de9e417c9dcf0fddf109446e9f (diff)
download ComputeLibrary-a387e271b1e02ffd5c2993702b9a21c1ed5c95fa.tar.gz
Add in-place calculation support for CL elementwise arithmetic kernels
- Add in-place calculation support in ClArithmeticKernel, ClSaturatedArithmeticKernel and ClMulKernel
- Add in-place test cases

Resolves: COMPMID-4431
Signed-off-by: Sheri Zhang <sheri.zhang@arm.com>
Change-Id: Id484bdb76b74478a33fedb471ae0c7f799c599f6
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5885
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/gpu')
-rw-r--r-- src/core/gpu/cl/kernels/ClElementwiseKernel.cpp 50
-rw-r--r-- src/core/gpu/cl/kernels/ClElementwiseKernel.h 2
-rw-r--r-- src/core/gpu/cl/kernels/ClMulKernel.cpp 32
-rw-r--r-- src/core/gpu/cl/kernels/ClMulKernel.h 6
4 files changed, 81 insertions, 9 deletions
diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp
index f005e9226e..3d9f0b6fcf 100644
--- a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp
@@ -75,6 +75,21 @@ std::string generate_id_for_tuning_common(const std::string &kernel_name, const
return config_id;
}
+Status validate_in_place_output_shape(const bool in_place, const bool src1_in_place, const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const TensorShape &out_shape) // Checks out_shape against the shape of the tensor that receives the result: one of the sources when in_place is set, otherwise dst.
+{
+ if(in_place) // in-place: compare out_shape with src1's shape when src1_in_place is set, otherwise with src2's shape
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0),
+ "Wrong shape for dst, cannot do in_place calculation"); // NOTE(review): presumably returns an error Status carrying this message when the shapes differ - confirm against the macro definition
+ }
+ else // separate destination: compare out_shape with dst's own shape
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
+ "Wrong shape for dst");
+ }
+ return Status{}; // reached when neither check above returned early (assumes the macros return on failure); yields a default-constructed Status
+}
+
Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst);
@@ -82,6 +97,10 @@ Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2);
+ // Check whether it is in_place calculation
+ const bool in_place = (&src1 == &dst) || (&src2 == &dst);
+ const bool src1_in_place = in_place && (&src1 == &dst);
+
const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
@@ -91,8 +110,7 @@ Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
- "Wrong shape for dst");
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
}
return Status{};
@@ -105,6 +123,10 @@ Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITenso
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::F16, DataType::F32, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+ // Check whether it is in_place calculation
+ const bool in_place = (src1 == dst) || (src2 == dst);
+ const bool src1_in_place = in_place && (src1 == dst);
+
const TensorShape out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
@@ -114,8 +136,7 @@ Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITenso
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
- "Wrong shape for dst");
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape));
}
return Status{};
@@ -137,6 +158,10 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I
ARM_COMPUTE_RETURN_ERROR_ON_MSG(in2_offset != 0, "For quantized symmetric, offset must be zero");
}
+ // Check whether it is in_place calculation
+ const bool in_place = (&src1 == &dst) || (&src2 == &dst);
+ const bool src1_in_place = in_place && (&src1 == &dst);
+
const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
@@ -145,6 +170,7 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for dst");
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
if(is_data_type_quantized_symmetric(dst.data_type()))
{
@@ -182,6 +208,12 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &s
}
build_opts.add_option_if(src1.data_type() == DataType::S32, "-DS32");
+ // Check whether it is in_place calculation
+ const bool in_place = (&src1 == &dst) || (&src2 == &dst);
+ const bool src1_in_place = in_place && (&src1 == &dst);
+ build_opts.add_option_if(in_place, "-DIN_PLACE");
+ build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE");
+
return build_opts;
}
@@ -267,6 +299,8 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c
const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst);
+
const TensorShape &in_shape1 = src_0->info()->tensor_shape();
const TensorShape &in_shape2 = src_1->info()->tensor_shape();
const TensorShape &out_shape = dst->info()->tensor_shape();
@@ -291,12 +325,18 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c
Window slice = collapsed.first_slice_window_3D();
Window slice_src1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
Window slice_src2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ // Check whether it is in_place calculation
+ const bool in_place = (src_0 == dst) || (src_1 == dst);
do
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, src_0, slice_src1);
add_3D_tensor_argument(idx, src_1, slice_src2);
- add_3D_tensor_argument(idx, dst, slice);
+ if(!in_place)
+ {
+ add_3D_tensor_argument(idx, dst, slice);
+ }
enqueue(queue, *this, slice, lws_hint());
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1));
diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.h b/src/core/gpu/cl/kernels/ClElementwiseKernel.h
index ab5c777ae6..4525cec55b 100644
--- a/src/core/gpu/cl/kernels/ClElementwiseKernel.h
+++ b/src/core/gpu/cl/kernels/ClElementwiseKernel.h
@@ -40,6 +40,8 @@ namespace kernels
* Element-wise operation is computed by:
* @f[ dst(x,y) = OP(src1(x,y), src2(x,y))@f]
*
+ * For binary elementwise ops, in-place computation cannot be enabled by passing nullptr as dst; it can only be enabled by passing either src1 or src2 as dst.
+ *
*/
class ClElementwiseKernel : public IClKernel
{
diff --git a/src/core/gpu/cl/kernels/ClMulKernel.cpp b/src/core/gpu/cl/kernels/ClMulKernel.cpp
index 65f3bec099..7c4dddc20e 100644
--- a/src/core/gpu/cl/kernels/ClMulKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClMulKernel.cpp
@@ -63,6 +63,10 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons
ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
+ // Check whether it is in_place calculation
+ const bool in_place = (src1 == dst) || (src2 == dst);
+ const bool src1_in_place = in_place && (src1 == dst);
+
const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
@@ -85,7 +89,16 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons
"Dst can only be QSYMM16 if both src are QSYMM16");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32),
"Dst must be S32 if source tensors are S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+ if(in_place)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0),
+ "Wrong shape for dst, cannot do in_place calculation");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst");
+ }
}
return Status{};
@@ -194,11 +207,17 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
}
}
+ // Check whether it is in_place calculation
+ const bool in_place = (src1 == dst) || (src2 == dst);
+ const bool src1_in_place = in_place && (src1 == dst);
+ build_opts.add_option_if(in_place, "-DIN_PLACE");
+ build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE");
+
// Create kernel
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set scale argument
- unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
+ unsigned int idx = (in_place ? 2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
if(scale_int >= 0 && !is_quantized)
{
@@ -256,6 +275,8 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command
const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst);
+
const TensorShape &in_shape1 = src_0->info()->tensor_shape();
const TensorShape &in_shape2 = src_1->info()->tensor_shape();
const TensorShape &out_shape = dst->info()->tensor_shape();
@@ -280,12 +301,17 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command
Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+ // Check whether it is in_place calculation
+ const bool in_place = (src_0 == dst) || (src_1 == dst);
do
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, src_0, slice_input1);
add_3D_tensor_argument(idx, src_1, slice_input2);
- add_3D_tensor_argument(idx, dst, slice);
+ if(!in_place)
+ {
+ add_3D_tensor_argument(idx, dst, slice);
+ }
enqueue(queue, *this, slice, lws_hint());
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
diff --git a/src/core/gpu/cl/kernels/ClMulKernel.h b/src/core/gpu/cl/kernels/ClMulKernel.h
index dec8dba61c..2ee182b932 100644
--- a/src/core/gpu/cl/kernels/ClMulKernel.h
+++ b/src/core/gpu/cl/kernels/ClMulKernel.h
@@ -34,7 +34,11 @@ namespace opencl
{
namespace kernels
{
-/** Interface for the pixelwise multiplication kernel. */
+/** Interface for the pixelwise multiplication kernel.
+ *
+ * For binary elementwise ops, in-place computation cannot be enabled by passing nullptr as dst; it can only be enabled by passing either src1 or src2 as dst.
+ *
+*/
class ClMulKernel : public IClKernel
{
public: