From ed2a8ed7efb54ae0ec539b8735820b2c30756c44 Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Wed, 7 Oct 2020 17:17:16 +0100 Subject: COMPMID-3715: Remove OpenCL padding - CLElementwiseOperationKernel Change-Id: I8a685d4ac5de747a0f775bd10be9c411cf394953 Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4140 Comments-Addressed: Arm Jenkins Reviewed-by: Michele Di Giorgio Reviewed-by: Giorgio Arena Reviewed-by: Sang-Hoon Park Tested-by: Arm Jenkins --- .../CL/kernels/CLElementwiseOperationKernel.cpp | 50 +++++++--------------- 1 file changed, 15 insertions(+), 35 deletions(-) (limited to 'src/core/CL/kernels/CLElementwiseOperationKernel.cpp') diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp index f0712f5863..da28f3d886 100644 --- a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp +++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp @@ -36,7 +36,7 @@ namespace arm_compute { namespace { -constexpr unsigned int num_elems_processed_per_iteration = 16; +constexpr unsigned int vector_size_byte_opencl = 16; std::map supported_arithmetic_ops = { @@ -152,10 +152,15 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &i { CLBuildOptions build_opts; + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output.element_size(), output.dimension(0)); + build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1.data_type())); build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2.data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output.data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(input1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(input2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output.dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DOP=" + operation_string); if(is_data_type_quantized(input1.data_type())) { @@ -173,31 +178,17 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &i return build_opts; } -std::pair configure_window_arithmetic_common(const ValidRegion &valid_region, ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) +std::pair configure_window_arithmetic_common(ITensorInfo &output) { - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(input1); - Window win_input2 = win.broadcast_if_dimension_le_one(input2); - - AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) - || update_window_and_padding(win_input2, input2_access) - || update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output.element_size(), output.dimension(0)); + Window win = calculate_max_window(output, Steps(num_elems_processed_per_iteration)); + return std::make_pair(Status{}, win); } std::pair validate_and_configure_window_for_arithmetic_operators(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) { const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; + const TensorShape &out_shape = broadcast_pair.first; set_shape_if_empty(output, out_shape); @@ -226,16 +217,15 @@ std::pair validate_and_configure_window_for_arithmetic_operators set_data_type_if_unknown(output, DataType::QSYMM16); } - return configure_window_arithmetic_common(valid_region, input1, input2, output); + return configure_window_arithmetic_common(output); } std::pair validate_and_configure_window_for_division(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) { const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; + const TensorShape &out_shape = broadcast_pair.first; auto_init_if_empty(output, out_shape, 1, input1.data_type()); - return configure_window_arithmetic_common(valid_region, input1, input2, output); + return configure_window_arithmetic_common(output); } } // namespace @@ -315,30 +305,20 @@ void CLElementwiseOperationKernel::run_op(ITensorPack &tensors, const Window &wi Window slice = collapsed.first_slice_window_3D(); Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - do { unsigned int idx = 0; - add_3D_tensor_argument(idx, src_0, slice_input1); add_3D_tensor_argument(idx, src_1, slice_input2); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); } while(collapsed.slide_window_slice_3D(slice)); } -BorderSize CLElementwiseOperationKernel::border_size() const -{ - const unsigned int replicateSize = _output->dimension(0) - std::min(_input1->dimension(0), _input2->dimension(0)); - const unsigned int border = std::min(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize{ 0, border, 0, 0 }; -} - /** Arithmetic operations with saturation*/ void CLSaturatedArithmeticOperationKernel::configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, -- cgit v1.2.1