From ed2a8ed7efb54ae0ec539b8735820b2c30756c44 Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Wed, 7 Oct 2020 17:17:16 +0100 Subject: COMPMID-3715: Remove OpenCL padding - CLElementwiseOperationKernel Change-Id: I8a685d4ac5de747a0f775bd10be9c411cf394953 Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4140 Comments-Addressed: Arm Jenkins Reviewed-by: Michele Di Giorgio Reviewed-by: Giorgio Arena Reviewed-by: Sang-Hoon Park Tested-by: Arm Jenkins --- src/core/CL/cl_kernels/elementwise_operation.cl | 49 ++++++++++------- .../cl_kernels/elementwise_operation_quantized.cl | 63 +++++++++++++--------- .../CL/kernels/CLElementwiseOperationKernel.cpp | 50 ++++++----------- .../CL/functions/CLElementwiseOperations.cpp | 62 --------------------- src/runtime/CL/functions/CLPReluLayer.cpp | 37 ------------- 5 files changed, 84 insertions(+), 177 deletions(-) (limited to 'src') diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/elementwise_operation.cl index 26826e9b8a..211eb38dca 100644 --- a/src/core/CL/cl_kernels/elementwise_operation.cl +++ b/src/core/CL/cl_kernels/elementwise_operation.cl @@ -43,7 +43,7 @@ #define OP_FUN_NAME_STR(op) elementwise_operation_##op #define OP_FUN_NAME(op) OP_FUN_NAME_STR(op) -#if defined(OP) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) +#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) #if defined(ACTIVATION_TYPE) #include "activation_float_helpers.h" @@ -51,11 +51,12 @@ /** This function executes an element-wise operation among two tensors. * - * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT. + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_OUT=3. It is defined as the remainder between the input's first dimension and VEC_SIZE_OUT + * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short - * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD) + * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD) * * @param[in] in1_ptr Pointer to the source tensor. 
Supported data types: U8/S16/F16/F32 * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) @@ -87,24 +88,36 @@ __kernel void OP_FUN_NAME(OP)( TENSOR3D_DECLARATION(in2), TENSOR3D_DECLARATION(out)) { +#if VEC_SIZE_IN1 == 1 + uint in1_x_offs = 0; +#else // VEC_SIZE_IN1 == 1 + uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0); +#endif // VEC_SIZE_IN1 == 1 +#if VEC_SIZE_IN2 == 1 + uint in2_x_offs = 0; +#else // VEC_SIZE_IN2 == 1 + uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0); +#endif // VEC_SIZE_IN2 == 1 + uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); + // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE_IN1) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE_IN2) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; // Load values - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) - in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) - in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) + in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)); + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) + in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr)), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)); // Calculate and store result + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) + res0 = OP(in_a, in_b); #if defined(ACTIVATION_TYPE) - VSTORE(VEC_SIZE) - (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE, CONVERT(OP(in_a, in_b), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), A_VAL, B_VAL), 0, (__global DATA_TYPE_OUT *)out.ptr); -#else // defined(ACTIVATION_TYPE) - VSTORE(VEC_SIZE) - (OP(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr); + res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE_OUT, res0, A_VAL, B_VAL); #endif // defined(ACTIVATION_TYPE) + + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } -#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */ +#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) */ diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl index eb57da828d..9edfb4f9a2 100644 --- a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl +++ b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 
2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,25 +37,27 @@ #define OP_FUN_NAME_STR(op) elementwise_operation_##op##_quantized #define OP_FUN_NAME(op) OP_FUN_NAME_STR(op) -#if defined(OP) && defined(VEC_SIZE) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) +#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT) +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) /** This function executes an element-wise operation among two tensors. * - * @attention The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10 - * @attention The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10 - * @attention The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10 - * @attention The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10 - * @attention The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10 - * @attention The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10 - * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD) - * @attention For QSYMM16 operations OFFSET_IN1, OFFSET_IN2 and OFFSET_OUT must be set to zero - * @attention The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar + * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT. + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note In case of broadcasting along the X dimension the proper preprocessor argument should be passed depending on the input (e.g. -DIS_IN1_X_BROADCASTING, -DIS_IN2_X_BROADCASTING) + * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10 + * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10 + * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10 + * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10 + * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. 
-DSCALE_IN2=10 + * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10 + * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD) + * @note For QSYMM16 operations OFFSET_IN1, OFFSET_IN2 and OFFSET_OUT must be set to zero + * @note The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar * * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8/QSYMM16 * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) @@ -87,13 +89,25 @@ __kernel void OP_FUN_NAME(OP)( TENSOR3D_DECLARATION(in2), TENSOR3D_DECLARATION(out)) { +#if VEC_SIZE_IN1 == 1 + uint in1_x_offs = 0; +#else // VEC_SIZE_IN1 == 1 + uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0); +#endif // VEC_SIZE_IN1 == 1 +#if VEC_SIZE_IN2 == 1 + uint in2_x_offs = 0; +#else // VEC_SIZE_IN2 == 1 + uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0); +#endif // VEC_SIZE_IN2 == 1 + uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); + // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; - VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in1.ptr), VEC_INT); - VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in2.ptr), VEC_INT); + VEC_INT in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT); + VEC_INT in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT); in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1)); in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2)); @@ -101,10 +115,9 @@ __kernel void OP_FUN_NAME(OP)( const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); const VEC_FLOAT qresf32 = OP(in1f32, in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT)); - const VEC_TYPE res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE); + const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE); // Store result - VSTORE(VEC_SIZE) - (res, 0, (__global DATA_TYPE_OUT *)out.ptr); + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } -#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) */ +#endif /* 
defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) */ diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp index f0712f5863..da28f3d886 100644 --- a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp +++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp @@ -36,7 +36,7 @@ namespace arm_compute { namespace { -constexpr unsigned int num_elems_processed_per_iteration = 16; +constexpr unsigned int vector_size_byte_opencl = 16; std::map supported_arithmetic_ops = { @@ -152,10 +152,15 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &i { CLBuildOptions build_opts; + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output.element_size(), output.dimension(0)); + build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1.data_type())); build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2.data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output.data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(input1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(input2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output.dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DOP=" + operation_string); if(is_data_type_quantized(input1.data_type())) { @@ -173,31 +178,17 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &i return build_opts; } -std::pair configure_window_arithmetic_common(const ValidRegion &valid_region, ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) +std::pair configure_window_arithmetic_common(ITensorInfo &output) { - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(input1); - Window win_input2 = win.broadcast_if_dimension_le_one(input2); - - AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win_input1, input1_access) - || update_window_and_padding(win_input2, input2_access) - || update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output.element_size(), output.dimension(0)); + Window win = calculate_max_window(output, Steps(num_elems_processed_per_iteration)); + return std::make_pair(Status{}, win); } std::pair validate_and_configure_window_for_arithmetic_operators(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) { const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; + const TensorShape &out_shape = broadcast_pair.first; set_shape_if_empty(output, out_shape); @@ -226,16 +217,15 @@ std::pair validate_and_configure_window_for_arithmetic_operators set_data_type_if_unknown(output, DataType::QSYMM16); } - return configure_window_arithmetic_common(valid_region, input1, input2, output); + return configure_window_arithmetic_common(output); } std::pair validate_and_configure_window_for_division(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) { const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; + const TensorShape &out_shape = broadcast_pair.first; auto_init_if_empty(output, out_shape, 1, input1.data_type()); - return configure_window_arithmetic_common(valid_region, input1, input2, output); + return configure_window_arithmetic_common(output); } } // namespace @@ -315,30 +305,20 @@ void CLElementwiseOperationKernel::run_op(ITensorPack &tensors, const Window &wi Window slice = collapsed.first_slice_window_3D(); Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - do { unsigned int idx = 0; - add_3D_tensor_argument(idx, src_0, slice_input1); add_3D_tensor_argument(idx, src_1, slice_input2); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); } while(collapsed.slide_window_slice_3D(slice)); } -BorderSize CLElementwiseOperationKernel::border_size() const -{ - const unsigned int replicateSize = _output->dimension(0) - std::min(_input1->dimension(0), _input2->dimension(0)); - const unsigned int border = std::min(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize{ 0, border, 0, 0 }; -} - /** Arithmetic operations with saturation*/ void CLSaturatedArithmeticOperationKernel::configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp index 6f664725c5..7b4d3c629d 100644 --- a/src/runtime/CL/functions/CLElementwiseOperations.cpp +++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp @@ -32,43 +32,9 @@ namespace arm_compute { -namespace -{ -void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ITensorInfo *input1, ITensorInfo *input2, const ITensorInfo *output) -{ - if(output->dimension(0) > 1) - { - ITensorInfo 
*broadcasted_info = (input1->dimension(0) == 1) ? input1 : input2; - - if(broadcasted_info->dimension(0) == 1) - { - border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE); - } - } -} - -ITensorPack select_border_input(ITensorPack &tensors) -{ - ITensorPack pack; - if(tensors.get_tensor(TensorType::ACL_DST)->info()->dimension(0) > 1) - { - if(tensors.get_const_tensor(TensorType::ACL_SRC_1)->info()->dimension(0) == 1) - { - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1)); - } - else - { - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_0)); - } - } - return pack; -} -} // namespace - namespace experimental { CLArithmeticAddition::CLArithmeticAddition() - : _border_handler() { } @@ -77,7 +43,6 @@ void CLArithmeticAddition::configure(const CLCompileContext &compile_context, IT auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::ADD, input1, input2, output, policy, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) @@ -87,13 +52,10 @@ Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorIn void CLArithmeticAddition::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } CLArithmeticSubtraction::CLArithmeticSubtraction() - : _border_handler() { } void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, @@ -102,7 +64,6 @@ void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::SUB, input1, input2, output, policy, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) @@ -113,13 +74,10 @@ Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITenso void CLArithmeticSubtraction::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } CLArithmeticDivision::CLArithmeticDivision() - : _border_handler() { } @@ -128,7 +86,6 @@ void CLArithmeticDivision::configure(const CLCompileContext &compile_context, IT auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::DIV, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -138,13 +95,10 @@ Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorIn void CLArithmeticDivision::run(ITensorPack &tensors) { - auto 
border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } CLElementwiseMax::CLElementwiseMax() - : _border_handler() { } @@ -153,7 +107,6 @@ void CLElementwiseMax::configure(const CLCompileContext &compile_context, ITenso auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::MAX, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -163,13 +116,10 @@ Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo * void CLElementwiseMax::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } CLElementwiseMin::CLElementwiseMin() - : _border_handler() { } @@ -178,7 +128,6 @@ void CLElementwiseMin::configure(const CLCompileContext &compile_context, ITenso auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::MIN, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -188,13 +137,10 @@ Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo * void CLElementwiseMin::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() - : _border_handler() { } @@ -203,7 +149,6 @@ void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -213,13 +158,10 @@ Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITens void CLElementwiseSquaredDiff::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } CLElementwisePower::CLElementwisePower() - : _border_handler() { } @@ -228,7 +170,6 @@ void CLElementwisePower::configure(const CLCompileContext &compile_context, ITen auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::POWER, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -238,8 +179,6 @@ Status CLElementwisePower::validate(const ITensorInfo *input1, const 
ITensorInfo void CLElementwisePower::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } } // namespace experimental @@ -477,7 +416,6 @@ struct CLElementwiseSquaredDiff::Impl const ICLTensor *src_1{ nullptr }; ICLTensor *dst{ nullptr }; std::unique_ptr op{ nullptr }; - std::unique_ptr _border_handler{ nullptr }; }; CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp index e03bd13284..aaddd46071 100644 --- a/src/runtime/CL/functions/CLPReluLayer.cpp +++ b/src/runtime/CL/functions/CLPReluLayer.cpp @@ -30,43 +30,9 @@ namespace arm_compute { -namespace -{ -void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ITensorInfo *input1, ITensorInfo *input2, const ITensorInfo *output) -{ - if(output->dimension(0) > 1) - { - ITensorInfo *broadcasted_info = (input1->dimension(0) == 1) ? input1 : input2; - - if(broadcasted_info->dimension(0) == 1) - { - border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE); - } - } -} - -ITensorPack select_border_input(ITensorPack &tensors) -{ - ITensorPack pack; - if(tensors.get_tensor(TensorType::ACL_DST)->info()->dimension(0) > 1) - { - if(tensors.get_const_tensor(TensorType::ACL_SRC_1)->info()->dimension(0) == 1) - { - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1)); - } - else - { - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_0)); - } - } - return pack; -} -} // namespace - namespace experimental { CLPReluLayer::CLPReluLayer() - : _border_handler() { } @@ -75,7 +41,6 @@ void CLPReluLayer::configure(const CLCompileContext &compile_context, ITensorInf auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, output); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input, alpha, output); } Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) @@ -85,8 +50,6 @@ Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha void CLPReluLayer::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } } // namespace experimental -- cgit v1.2.1
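Note on the padding-free scheme introduced by this patch: the host now passes VEC_SIZE_LEFTOVER = dim0 % VEC_SIZE, the first work-item along X performs a partial store of only the leftover elements (STORE_VECTOR_SELECT with the "VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0" condition), and every later work-item is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE so that its full-width store still ends inside the tensor; inputs broadcast along X (dimension(0) == 1) keep VEC_SIZE_INx == 1 and a zero offset. The stand-alone C++ sketch below only illustrates that offset arithmetic: simulate_row is a hypothetical helper, not Compute Library code, and the vector-size clamp in main is an assumption about what adjust_vec_size does.

// Stand-alone illustration of the leftover-vector scheme used by the padding-free
// element-wise kernels above. Hypothetical helper, not Compute Library code.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

// Mirrors the kernel-side computation:
//   VEC_SIZE_LEFTOVER = dim0 % VEC_SIZE
//   x_offs            = max((int)(gid0 * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0)
static void simulate_row(unsigned dim0, unsigned vec_size)
{
    const unsigned leftover  = dim0 % vec_size;
    const unsigned num_items = (dim0 + vec_size - 1) / vec_size; // global size rounded up, no padding
    std::vector<int> written(dim0, 0);

    for(unsigned gid = 0; gid < num_items; ++gid)
    {
        const int offs = std::max(static_cast<int>(gid * vec_size) - static_cast<int>((vec_size - leftover) % vec_size), 0);
        // STORE_VECTOR_SELECT: only the first work-item does a partial store of 'leftover'
        // elements; all others store a full vector that, thanks to the shift above,
        // never runs past dim0.
        const unsigned count = (leftover != 0 && gid == 0) ? leftover : vec_size;
        for(unsigned i = 0; i < count; ++i)
        {
            assert(static_cast<unsigned>(offs) + i < dim0); // no out-of-bounds write -> no padding needed
            written[offs + i]++;
        }
    }
    for(unsigned x = 0; x < dim0; ++x)
    {
        assert(written[x] == 1); // every output element is produced exactly once
    }
    std::printf("dim0=%u vec_size=%u leftover=%u: fully covered, no out-of-bounds store\n",
                dim0, vec_size, leftover);
}

int main()
{
    simulate_row(10, 4);                // fp32 with 16 bytes per work-item -> VEC_SIZE 4
    simulate_row(33, 8);                // fp16 -> VEC_SIZE 8
    simulate_row(7, std::min(16u, 7u)); // u8 with dim0 < 16: vector size clamped (adjust_vec_size-style, assumed)
    return 0;
}

Because every store stays inside dimension 0, the output no longer needs a replicate border, which is why border_size() on the kernel and the _border_handler members plus the CLFillBorderKernel plumbing in CLElementwiseOperations.cpp and CLPReluLayer.cpp are removed above.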
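The quantized variant keeps the same store pattern but also performs the dequantize, operate, requantize sequence visible in the in1f32/in2f32/qresf32 lines of elementwise_operation_quantized.cl. The scalar sketch below spells that arithmetic out for a QASYMM8 addition; quantized_add_qasymm8 is an illustrative helper, and the round-to-nearest plus clamp at the end only approximates what the kernel's CONVERT_DOWN/CONVERT_SAT macros do on whole vectors.

// Scalar illustration (assumed semantics) of the quantized element-wise path:
//   dequantize both inputs, apply the op in fp32, requantize with saturation.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Per-element math mirrored from the kernel:
//   in1f32  = (in1 - OFFSET_IN1) * SCALE_IN1
//   in2f32  = (in2 - OFFSET_IN2) * SCALE_IN2
//   qresf32 = OP(in1f32, in2f32) / SCALE_OUT + OFFSET_OUT
//   res     = saturate_to_output_type(round(qresf32))
static uint8_t quantized_add_qasymm8(uint8_t a, uint8_t b,
                                     int off_in1, float scale_in1,
                                     int off_in2, float scale_in2,
                                     int off_out, float scale_out)
{
    const float af   = (static_cast<int>(a) - off_in1) * scale_in1;
    const float bf   = (static_cast<int>(b) - off_in2) * scale_in2;
    const float qres = (af + bf) / scale_out + static_cast<float>(off_out);
    // The exact rounding mode is an assumption here; the kernel's CONVERT_DOWN/CONVERT_SAT
    // handle the float->int conversion and the clamp to the output type's range.
    const int rounded = static_cast<int>(std::lround(qres));
    return static_cast<uint8_t>(std::min(255, std::max(0, rounded)));
}

int main()
{
    // Example quantization: inputs and output all use scale 0.1 and zero-point 10,
    // so 1.5 + 2.3 should come back as roughly 3.8 once dequantized again.
    const uint8_t a = 25; // represents (25 - 10) * 0.1 = 1.5
    const uint8_t b = 33; // represents (33 - 10) * 0.1 = 2.3
    const uint8_t r = quantized_add_qasymm8(a, b, 10, 0.1f, 10, 0.1f, 10, 0.1f);
    std::printf("q(a)=%d q(b)=%d -> q(res)=%d (dequantized: %.2f)\n", a, b, r, (r - 10) * 0.1f);
    return 0;
}

For QSYMM16 the same flow applies with the three offsets forced to zero, as the kernel documentation above notes.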