From 4112eed70d110376674609af92e76c68ae8b3a39 Mon Sep 17 00:00:00 2001
From: Giorgio Arena
Date: Fri, 23 Oct 2020 14:24:26 +0100
Subject: COMPMID-3731 Remove OpenCL padding: CLHeightConcatenateLayerKernel

Signed-off-by: Giorgio Arena
Change-Id: I004128fdcc1207c25d2b959f17f04f9e1a8b4cb5
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4247
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
---
 .../CL/kernels/CLHeightConcatenateLayerKernel.h    |  1 -
 arm_compute/core/Utils.h                           | 18 +++++++----
 src/core/CL/cl_kernels/concatenate.cl              | 23 +++++++-------
 .../CL/kernels/CLHeightConcatenateLayerKernel.cpp  | 35 ++++++++--------------
 src/core/Utils.cpp                                 | 27 +++++++++++++----
 5 files changed, 59 insertions(+), 45 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
index 4fa2b40881..f362441944 100644
--- a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
@@ -72,7 +72,6 @@ public:
 
 private:
     unsigned int _height_offset;
-    unsigned int _num_elems_processed_per_iteration;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H */
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 681a1a708e..1c02e89ab6 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -45,6 +45,7 @@ namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;
 
 /** Calculate the rounded up quotient of val / m.
  *
@@ -1094,20 +1095,27 @@
  * @return DataType
  */
 DataType data_type_from_name(const std::string &name);
+/** Stores padding information before configuring a kernel
+ *
+ * @param[in] infos list of tensor infos to store the padding info for
+ *
+ * @return An unordered map where each tensor info pointer is paired with its original padding info
+ */
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensorInfo *> infos);
 /** Stores padding information before configuring a kernel
  *
  * @param[in] tensors list of tensors to store the padding info for
  *
- * @return An unordered map where each tensor pointer is paired with its original padding info
+ * @return An unordered map where each tensor info pointer is paired with its original padding info
  */
-std::unordered_map<const ITensor *, PaddingSize> get_padding_info(std::initializer_list<const ITensor *> tensors);
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensor *> tensors);
 /** Check if the previously stored padding info has changed after configuring a kernel
  *
- * @param[in] padding_map an unordered map where each tensor pointer is paired with its original padding info
+ * @param[in] padding_map an unordered map where each tensor info pointer is paired with its original padding info
  *
- * @return true if any of the tensors has changed its paddings
+ * @return true if any of the tensor infos has changed its paddings
  */
-bool has_padding_changed(const std::unordered_map<const ITensor *, PaddingSize> &padding_map);
+bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map);
 
 /** Input Stream operator for @ref DataType
  *
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index 0b211a6d1f..0f4b5afe2c 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -330,6 +330,8 @@ __kernel void concatenate_width(
 
 #endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */
 
+#if defined(VEC_SIZE_LEFTOVER)
+
 #if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE)
 /** This kernel concatenates the input tensor into the output tensor along the second dimension
  *
@@ -338,6 +340,7 @@ __kernel void concatenate_width(
  * @note Vector sizes supported are 2,4,8 and 16.
  * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
  * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
  *
  * @param[in] src_ptr      Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -365,26 +368,26 @@ __kernel void concatenate_height(
     TENSOR4D_DECLARATION(src),
     TENSOR4D_DECLARATION(dst))
 {
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH);
-    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+    const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + (get_global_id(2) % DEPTH) * src_stride_z + (get_global_id(
+                                   2) / DEPTH) * src_stride_w;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + (get_global_id(2) % DEPTH) * dst_stride_z + (get_global_id(
+                                   2) / DEPTH) * dst_stride_w;
 
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+    source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
 
 #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
-    const VEC_QUANT out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
-    VSTORE(VEC_SIZE)
-    (out, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y));
+    const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 #else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-    VSTORE(VEC_SIZE)
-    (source_values, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y));
+    STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
 }
 
 #endif /* defined(HEIGHT_OFFSET) && defined(DEPTH) */
 
-#if defined(VEC_SIZE_LEFTOVER)
-
 /** This kernel concatenates the input tensor into the output tensor along the third dimension
  *
  * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
index 1ae2599721..3f5e91e5a1 100644
--- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
@@ -39,20 +39,6 @@ namespace arm_compute
 {
 namespace
 {
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
-{
-    num_elems_processed_per_iteration = 4;
-    // The window needs to be based on input as we copy all the heights of input
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-
-    Window win_collapsed = win.collapse(win, Window::DimZ);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win_collapsed);
-}
 Status validate_arguments(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -72,15 +58,13 @@ Status validate_arguments(const ITensorInfo *input, unsigned int height_offset,
 } // namespace
 
 CLHeightConcatenateLayerKernel::CLHeightConcatenateLayerKernel()
-    : _height_offset(0), _num_elems_processed_per_iteration()
+    : _height_offset(0)
 {
 }
 
 Status CLHeightConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
 {
-    unsigned int num_elems_processed_per_iteration;
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, height_offset, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first);
     return Status{};
 }
 
@@ -89,16 +73,19 @@ void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_c
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, height_offset, output));
 
-    _height_offset = height_offset;
+    auto padding_info = get_padding_info({ input, output });
 
-    auto win_config = validate_and_configure_window(input, output, _num_elems_processed_per_iteration);
+    _height_offset = height_offset;
 
     // Add build options
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->dimension(0));
+
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->element_size()));
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset));
     build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->dimension(2)));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration));
 
     if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info())
     {
@@ -115,12 +102,14 @@ void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_c
     _kernel = create_kernel(compile_context, "concatenate_height", build_opts.options());
 
     // Configure kernel window
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
-    ICLKernel::configure_internal(std::get<1>(win_config));
+    // The window needs to be based on input as we copy all the heights of input
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
 
     // Set output valid region
     output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
 void CLHeightConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index c877e8fd1f..babf1c4b91 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -495,26 +495,41 @@ std::pair<int32_t, int32_t> get_quantized_activation_min_max(ActivationLayerInfo
     return std::make_pair(min_activation, max_activation);
 }
 
-std::unordered_map<const ITensor *, PaddingSize> get_padding_info(std::initializer_list<const ITensor *> tensors)
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensor *> tensors)
 {
-    std::unordered_map<const ITensor *, PaddingSize> res;
+    std::unordered_map<const ITensorInfo *, PaddingSize> res;
 
     for(const ITensor *tensor : tensors)
     {
         if(tensor)
         {
-            res.insert({ tensor, tensor->info()->padding() });
+            res.insert({ tensor->info(), tensor->info()->padding() });
         }
     }
 
     return res;
 }
 
-bool has_padding_changed(const std::unordered_map<const ITensor *, PaddingSize> &padding_map)
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensorInfo *> infos)
 {
-    return std::find_if(padding_map.begin(), padding_map.end(), [](const std::pair<const ITensor *, PaddingSize> &padding_info)
+    std::unordered_map<const ITensorInfo *, PaddingSize> res;
+
+    for(const ITensorInfo *info : infos)
+    {
+        if(info)
+        {
+            res.insert({ info, info->padding() });
+        }
+    }
+
+    return res;
+}
+
+bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map)
+{
+    return std::find_if(padding_map.begin(), padding_map.end(), [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info)
     {
-        return (padding_info.first->info()->padding() != padding_info.second);
+        return (padding_info.first->padding() != padding_info.second);
     }) != padding_map.end();
 }
--
cgit v1.2.1
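
The indexing scheme this patch relies on can be checked in isolation: VEC_SIZE_LEFTOVER is the remainder of the row width divided by VEC_SIZE, work-item 0 stores only those leftover elements, and every other work-item is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE so it can store a full vector without running past the end of the row. The standalone C++ sketch below is not part of the patch; the helper names are invented for illustration and it only mirrors the x_offs and STORE_VECTOR_SELECT arithmetic, verifying that every element of a row is written exactly once with no out-of-bounds store.

// Standalone sketch (not part of the patch): models the kernel's per-work-item
// indexing for a single row of `width` elements processed VEC_SIZE at a time.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

// Mirrors x_offs (in elements rather than bytes): every work-item except the
// first is shifted back by (vec_size - leftover) % vec_size so that its full
// vector access stays inside the row.
int start_element(int gid, int vec_size, int leftover)
{
    return std::max(gid * vec_size - (vec_size - leftover) % vec_size, 0);
}

// Mirrors the STORE_VECTOR_SELECT condition: when there is a leftover,
// work-item 0 stores only `leftover` elements, everyone else stores a full vector.
int stored_elements(int gid, int vec_size, int leftover)
{
    return (leftover != 0 && gid == 0) ? leftover : vec_size;
}

int main()
{
    const int width      = 10;               // first dimension of the tensor
    const int vec_size   = 4;                // what adjust_vec_size(4, width) keeps when width >= 4
    const int leftover   = width % vec_size; // value passed as -DVEC_SIZE_LEFTOVER
    const int work_items = (width + vec_size - 1) / vec_size;

    std::vector<int> writes(width, 0);
    for(int gid = 0; gid < work_items; ++gid)
    {
        const int start = start_element(gid, vec_size, leftover);
        const int count = stored_elements(gid, vec_size, leftover);
        for(int x = start; x < start + count; ++x)
        {
            assert(x < width); // no store lands past the row, so no output padding is required
            ++writes[x];
        }
    }
    for(int x = 0; x < width; ++x)
    {
        assert(writes[x] == 1); // every element is written exactly once
    }
    std::printf("width=%d vec_size=%d leftover=%d: covered exactly once\n", width, vec_size, leftover);
    return 0;
}

Loads use the same offsets, so as long as VEC_SIZE does not exceed the row width, which is what adjust_vec_size(4, input->dimension(0)) is there to ensure, the full-width VLOAD of work-item 0 also stays in bounds.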