From ad7515d231acb075a9585e52f257373b1a1b5d1f Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Fri, 24 Jul 2020 00:02:23 +0100 Subject: COMPMID-3385: Async support to CLArithmetic* kernels/functions Pt.1 Signed-off-by: Michalis Spyrou Change-Id: I94007565e688f8a0aead4f14c9fc30bfd9f9f7eb Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3613 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas --- .../CL/kernels/CLElementwiseOperationKernel.cpp | 45 ++++++++++-------- src/core/CL/kernels/CLFillBorderKernel.cpp | 55 +++++++++++++++++----- 2 files changed, 69 insertions(+), 31 deletions(-) (limited to 'src/core/CL/kernels') diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp index ecac1e0c86..7cc6fb38b1 100644 --- a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp +++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/utils/misc/Cast.h" #include "support/StringSupport.h" #include @@ -241,15 +242,15 @@ CLElementwiseOperationKernel::CLElementwiseOperationKernel() { } -void CLElementwiseOperationKernel::configure_common(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLElementwiseOperationKernel::configure_common(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) { configure_common(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLElementwiseOperationKernel::configure_common(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLElementwiseOperationKernel::configure_common(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) { // Configure kernel window - auto win_config = validate_and_configure_window(*input1->info(), 
*input2->info(), *output->info()); + auto win_config = validate_and_configure_window(*input1, *input2, *output); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); _input1 = input1; @@ -257,13 +258,13 @@ void CLElementwiseOperationKernel::configure_common(const CLCompileContext &comp _output = output; std::string kernel_name = "elementwise_operation_" + name(); - if(is_data_type_quantized(input1->info()->data_type())) + if(is_data_type_quantized(input1->data_type())) { kernel_name += "_quantized"; } // Set kernel build options - CLBuildOptions build_opts = generate_build_options(*input1->info(), *input2->info(), *output->info()); + CLBuildOptions build_opts = generate_build_options(*input1, *input2, *output); if(_act_info.enabled()) { build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation()))); @@ -276,17 +277,21 @@ void CLElementwiseOperationKernel::configure_common(const CLCompileContext &comp ICLKernel::configure_internal(win_config.second); - _config_id = generate_id_for_tuning(kernel_name, *input1->info(), *output->info()); + _config_id = generate_id_for_tuning(kernel_name, *input1, *output); } -void CLElementwiseOperationKernel::run(const Window &window, cl::CommandQueue &queue) +void CLElementwiseOperationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const TensorShape &in_shape1 = _input1->info()->tensor_shape(); - const TensorShape &in_shape2 = _input2->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); + const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(inputs.at(TensorType::ACL_SRC_0)); + const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(inputs.at(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(outputs.at(TensorType::ACL_DST)); + + const
TensorShape &in_shape1 = src_0->info()->tensor_shape(); + const TensorShape &in_shape2 = src_1->info()->tensor_shape(); + const TensorShape &out_shape = dst->info()->tensor_shape(); bool can_collapse = true; const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1; @@ -313,9 +318,9 @@ void CLElementwiseOperationKernel::run(const Window &window, cl::CommandQueue &q { unsigned int idx = 0; - add_3D_tensor_argument(idx, _input1, slice_input1); - add_3D_tensor_argument(idx, _input2, slice_input2); - add_3D_tensor_argument(idx, _output, slice); + add_3D_tensor_argument(idx, src_0, slice_input1); + add_3D_tensor_argument(idx, src_1, slice_input2); + add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); @@ -327,25 +332,25 @@ void CLElementwiseOperationKernel::run(const Window &window, cl::CommandQueue &q BorderSize CLElementwiseOperationKernel::border_size() const { - const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int replicateSize = _output->dimension(0) - std::min(_input1->dimension(0), _input2->dimension(0)); const unsigned int border = std::min(num_elems_processed_per_iteration - 1U, replicateSize); return BorderSize{ 0, border, 0, 0 }; } /** Arithmetic operations with saturation*/ -void CLSaturatedArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy, +void CLSaturatedArithmeticOperationKernel::configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), op, input1, input2, output, policy, act_info); } -void CLSaturatedArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor 
*input1, const ICLTensor *input2, ICLTensor *output, +void CLSaturatedArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(CLSaturatedArithmeticOperationKernel::validate(op, input1->info(), input2->info(), output->info(), policy, act_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLSaturatedArithmeticOperationKernel::validate(op, input1, input2, output, policy, act_info)); _policy = policy; _op = op; @@ -392,16 +397,16 @@ std::string CLSaturatedArithmeticOperationKernel::name() /** Arithmetic operations*/ -void CLArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLArithmeticOperationKernel::configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), op, input1, input2, output, act_info); } -void CLArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, +void CLArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(CLArithmeticOperationKernel::validate(op, input1->info(), input2->info(), output->info(), act_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLArithmeticOperationKernel::validate(op, input1, input2, output, act_info)); _op = op; _act_info = act_info; diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp 
b/src/core/CL/kernels/CLFillBorderKernel.cpp index 67dac3280e..1fca646129 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.cpp +++ b/src/core/CL/kernels/CLFillBorderKernel.cpp @@ -33,6 +33,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Cast.h" #include "support/StringSupport.h" namespace arm_compute @@ -61,11 +62,17 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo } void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +{ + _tensor = tensor; + configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value); +} + +void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON(tensor == nullptr); - ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1); + ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1); - border_size.limit(tensor->info()->padding()); + border_size.limit(tensor->padding()); // If there is no border: early exit if(border_size.empty() || border_mode == BorderMode::UNDEFINED) @@ -76,7 +83,7 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLT // Select appropriate kernel std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode)); - const DataType dt = tensor->info()->data_type(); + const DataType dt = tensor->data_type(); // Define build options CLBuildOptions build_opts; @@ -88,16 +95,15 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLT // Create kernel _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - _tensor = tensor; // Create static kernel arguments - const unsigned 
int valid_width = tensor->info()->valid_region().shape[0]; - const unsigned int valid_height = tensor->info()->valid_region().shape[1]; + const unsigned int valid_width = tensor->valid_region().shape[0]; + const unsigned int valid_height = tensor->valid_region().shape[1]; const cl_int2 valid_region_coords = { { - static_cast<cl_int>(tensor->info()->valid_region().anchor[0]), - static_cast<cl_int>(tensor->info()->valid_region().anchor[1]), + static_cast<cl_int>(tensor->valid_region().anchor[0]), + static_cast<cl_int>(tensor->valid_region().anchor[1]), } }; const unsigned int total_valid_width = border_size.left + valid_width + border_size.right; @@ -149,7 +155,7 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLT Window win; win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height)); win.set(Window::DimY, Window::Dimension(0, 1, 1)); - win.use_tensor_dimensions(tensor->info()->tensor_shape(), Window::DimZ); + win.use_tensor_dimensions(tensor->tensor_shape(), Window::DimZ); ICLKernel::configure_internal(win); // Set config_id for enabling LWS tuning @@ -157,13 +163,40 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLT _config_id += "_"; _config_id += lower_string(string_from_data_type(dt)); _config_id += "_"; - _config_id += support::cpp11::to_string(tensor->info()->dimension(0)); + _config_id += support::cpp11::to_string(tensor->dimension(0)); _config_id += "_"; - _config_id += support::cpp11::to_string(tensor->info()->dimension(1)); + _config_id += support::cpp11::to_string(tensor->dimension(1)); _config_id += "_"; _config_id += lower_string(string_from_border_mode(border_mode)); } +void CLFillBorderKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_UNUSED(outputs); + + // Border mode undefined or border width == 0 + if(_kernel() == nullptr) + { + return; + } + + const auto tensor =
utils::cast::polymorphic_downcast<const ICLTensor *>(inputs.at(TensorType::ACL_SRC)); + + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, tensor, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} + void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue) { // Border mode undefined or border width == 0 -- cgit v1.2.1