aboutsummaryrefslogtreecommitdiff
path: root/src/core/gpu
diff options
context:
space:
mode:
author Giorgio Arena <giorgio.arena@arm.com> 2021-04-16 17:03:39 +0100
committer Giorgio Arena <giorgio.arena@arm.com> 2021-04-20 09:26:59 +0000
commit ada6cbc057ff725e57d301a99a1816ce602485b9 (patch)
tree f869994cb2b061de0bc4731d720336413b81d32a /src/core/gpu
parent 031d6a97de79fc3ca3eb6fca1611f03aa9b5893b (diff)
download ComputeLibrary-ada6cbc057ff725e57d301a99a1816ce602485b9.tar.gz
Remove OpenCL padding: CLPixelWiseMultiplicationKernel
- Change kernel's vec_size to 16 / sizeof(output)
- Change ICLKernel.cpp to handle broadcast without padding

Resolve COMPMID-3913

Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Change-Id: I03e884b250ef5784dc109bff8cf2c96b345d119f
Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5450
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'src/core/gpu')
-rw-r--r-- src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp 129
-rw-r--r-- src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h 12
2 files changed, 24 insertions, 117 deletions
diff --git a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp
index 56997dc8ad..14e45b2e6d 100644
--- a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp
@@ -42,8 +42,6 @@ namespace kernels
{
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
{
@@ -92,60 +90,6 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
-{
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- // Auto initialize dst if not initialized
- {
- set_shape_if_empty(*dst, out_shape);
-
- if(src1->data_type() == DataType::S16 || src2->data_type() == DataType::S16)
- {
- set_format_if_unknown(*dst, Format::S16);
- }
- else if(src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32)
- {
- set_format_if_unknown(*dst, Format::F32);
- }
- else if(src1->data_type() == DataType::QASYMM8)
- {
- set_data_type_if_unknown(*dst, DataType::QASYMM8);
- }
- else if(src1->data_type() == DataType::QASYMM8_SIGNED)
- {
- set_data_type_if_unknown(*dst, DataType::QASYMM8_SIGNED);
- }
- else if(src1->data_type() == DataType::QSYMM16)
- {
- set_data_type_if_unknown(*dst, DataType::QSYMM16);
- }
- }
-
- Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*src1);
- Window win_input2 = win.broadcast_if_dimension_le_one(*src2);
-
- AccessWindowHorizontal input1_access(src1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(src2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(dst, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-BorderSize calc_border_size(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
-{
- const unsigned int replicateSize = dst->dimension(0) - std::min(src1->dimension(0), src2->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-
- return BorderSize{ 0, border, 0, 0 };
-}
} // namespace
void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
@@ -155,12 +99,10 @@ void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst,
scale, overflow_policy, rounding_policy, act_info));
- // Calculate border size
- _border_size = calc_border_size(src1, src2, dst);
+ auto padding_info = get_padding_info({ src1, src2, dst });
- // Configure kernel window
- auto win_config = validate_and_configure_window(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+ auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
int scale_int = -1;
// Extract sign, exponent and mantissa
@@ -197,7 +139,9 @@ void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_
}
}
- const bool is_quantized = is_data_type_quantized(src1->data_type());
+ const bool is_quantized = is_data_type_quantized(src1->data_type());
+ const unsigned int vec_size = adjust_vec_size(16 / dst->element_size(), dst->dimension(0));
+ const unsigned int vec_size_leftover = dst->dimension(0) % vec_size;
// Set kernel build options
std::string kernel_name = "pixelwise_mul";
@@ -205,7 +149,10 @@ void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_
build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type()));
build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type()));
build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
+ build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
+ build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
if(is_quantized && (dst->data_type() != DataType::S32))
{
const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
@@ -252,7 +199,10 @@ void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_
_kernel.setArg(idx++, scale);
}
- ICLKernel::configure_internal(win_config.second);
+ Window win = calculate_max_window(*dst, Steps(vec_size));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
Status ClPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
@@ -260,7 +210,6 @@ Status ClPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src1->clone().get(), src2->clone().get(), dst->clone().get()).first);
return Status{};
}
@@ -312,14 +261,9 @@ void ClPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window
while(collapsed.slide_window_slice_3D(slice));
}
-BorderSize ClPixelWiseMultiplicationKernel::border_size() const
-{
- return _border_size;
-}
-
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration_complex = 1;
+constexpr unsigned int vec_size_complex = 1;
Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
{
@@ -342,30 +286,6 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
-{
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- // Auto initialize dst if not initialized
- const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
- auto_init_if_empty(*dst, out_info);
-
- Window win = calculate_max_window(out_shape, Steps(num_elems_processed_per_iteration_complex));
- Window win_input1 = win.broadcast_if_dimension_le_one(*src1);
- Window win_input2 = win.broadcast_if_dimension_le_one(*src2);
-
- AccessWindowHorizontal input1_access(src1, 0, num_elems_processed_per_iteration_complex);
- AccessWindowHorizontal input2_access(src2, 0, num_elems_processed_per_iteration_complex);
- AccessWindowHorizontal output_access(dst, 0, num_elems_processed_per_iteration_complex);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
} // namespace
void ClComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
@@ -373,12 +293,10 @@ void ClComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &c
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info));
- // Calculate border size
- _border_size = calc_border_size(src1, src2, dst);
+ auto padding_info = get_padding_info({ src1, src2, dst });
- // Configure kernel window
- auto win_config = validate_and_configure_window_complex(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+ auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
@@ -392,14 +310,16 @@ void ClComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &c
// Create kernel
_kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options());
- ICLKernel::configure_internal(win_config.second);
+ Window win = calculate_max_window(*dst, Steps(vec_size_complex));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
Status ClComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(src1->clone().get(), src2->clone().get(), dst->clone().get()).first);
return Status{};
}
@@ -450,11 +370,6 @@ void ClComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const
}
while(collapsed.slide_window_slice_3D(slice));
}
-
-BorderSize ClComplexPixelWiseMultiplicationKernel::border_size() const
-{
- return _border_size;
-}
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h
index 5889b84938..5b827262a1 100644
--- a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h
+++ b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h
@@ -41,7 +41,7 @@ public:
/** Default constructor */
ClPixelWiseMultiplicationKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPixelWiseMultiplicationKernel);
- /** Initialise the kernel's src, dst and border mode.
+ /** Initialise the kernel's src and dst.
*
* Valid configurations (Input1,Input2) -> Output :
*
@@ -101,10 +101,6 @@ public:
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-public:
- BorderSize _border_size{};
};
/** Interface for the complex pixelwise multiplication kernel. */
@@ -114,7 +110,7 @@ public:
/** Default constructor */
ClComplexPixelWiseMultiplicationKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClComplexPixelWiseMultiplicationKernel);
- /** Initialise the kernel's src, dst and border mode.
+ /** Initialise the kernel's src and dst.
*
* @param[in] compile_context The compile context to be used.
* @param[in] src1 An src tensor info. Data types supported: F32. Number of channels supported: 2.
@@ -136,10 +132,6 @@ public:
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-public:
- BorderSize _border_size{};
};
} // namespace kernels
} // namespace opencl