diff options
author | Renato Arantes <renato.arantes@arm.com> | 2023-04-24 07:19:59 +0000 |
---|---|---|
committer | Renato Barros Arantes <renato.arantes@arm.com> | 2023-05-03 16:12:48 +0000 |
commit | 57132943e0df00aa008b90614ea5a9fa8b2dc18a (patch) | |
tree | a6db3f93c399a6832ca7d487c38572bba0aea22a /src/cpu/kernels | |
parent | cdd1e039ad598aec10d8c1b81e08de9412324bf2 (diff) | |
download | ComputeLibrary-57132943e0df00aa008b90614ea5a9fa8b2dc18a.tar.gz |
Fix im2col for fast-maths mode with padding.
Following the investigation proposed by ONCPUML-1193, padding
is implemented in im2col when the input channel is not a multiple of
blocks requested by the weight format.
Partially resolves: ONCPUML-1193
Signed-off-by: Renato Arantes <renato.arantes@arm.com>
Change-Id: I350c7a1b2dcae63f8d94f5b6f1f86e948eab1f09
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9508
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/cpu/kernels')
-rw-r--r-- | src/cpu/kernels/CpuIm2ColKernel.cpp | 170 | ||||
-rw-r--r-- | src/cpu/kernels/CpuIm2ColKernel.h | 44 |
2 files changed, 161 insertions, 53 deletions
diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp index 25ff6c291c..9ac291549b 100644 --- a/src/cpu/kernels/CpuIm2ColKernel.cpp +++ b/src/cpu/kernels/CpuIm2ColKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -52,7 +52,7 @@ namespace kernels namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups) + bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); @@ -70,7 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c if(output->total_size() > 0) { - TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false)); + TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -246,6 +246,86 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, *out_ptr = static_cast<T>(1); } } + +template <typename T, bool has_pads> +inline void linearize_volume_nhwc(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int start_x, + int start_y, + int kernel_width, + int kernel_height, + int input_w, + int input_h, + int input_c, + int input_stride_y, + int input_stride_z, + int pad_value, + int dilation_x, + int dilation_y, + int pad_right) +{ + const int end_x = start_x + kernel_width * dilation_x; + const int end_y = start_y + kernel_height * dilation_y; + const int pad_quant = kernel_width * (input_c + pad_right); + const int element_size = static_cast<int>(sizeof(T)); + const int channel_chunk_size = input_c * element_size; + + if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == channel_chunk_size)) + { + for(int y = start_y; y < end_y; y += dilation_y) + { + const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); + for(int e = 0; e < kernel_width; e++) + { + memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size); + out_ptr += input_c + pad_right; + } + } + } + else + { + for(int y = start_y; y < end_y; y += dilation_y) + { + if(y < 0 || y >= input_h) + { + memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size); + out_ptr += pad_quant; + } + else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size) + { + for(int x = start_x; x < end_x; x += dilation_x) + { + if(x < 0 || x >= input_w) + { + memset(static_cast<void *>(out_ptr), pad_value, (input_c + pad_right) * element_size); + out_ptr += input_c + pad_right; + } + else + { + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), channel_chunk_size); + out_ptr += input_c + pad_right; + } + } + } + else + { + const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); + for(int e = 0; e < kernel_width; e++) + { + memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size); + out_ptr += input_c + pad_right; + } + } + } + } + // Append 1 if the convolution layer has biases + if(has_bias) + { + *out_ptr = static_cast<T>(1); + } +} + } // namespace template <typename T, bool has_pads, bool is_nchw> @@ -280,7 +360,8 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window Iterator in(src, window_in_out); Iterator out(dst, window_in_out); - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop( + window, [&](const Coordinates & id) { const int start_w = id[width_idx] * stride_x - pad_left; const int start_h = id[height_idx] * stride_y - pad_top; @@ -311,31 +392,53 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window } else { - linearize_volume_nhwc<T, has_pads>(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_w, - input_h, - input_c, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y()); + if(_input_pad_right > 0) + { + linearize_volume_nhwc<T, has_pads>(input_ptr, + output_ptr, + _has_bias, + start_w, + start_h, + _kernel_width, + _kernel_height, + input_w, + input_h, + input_c, + input_stride_y, + input_stride_z, + pad_value, + _dilation.x(), + _dilation.y(), + _input_pad_right); + } + else + { + linearize_volume_nhwc<T, has_pads>(input_ptr, + output_ptr, + _has_bias, + start_w, + start_h, + _kernel_width, + _kernel_height, + input_w, + input_h, + input_c, + input_stride_y, + input_stride_z, + pad_value, + _dilation.x(), + _dilation.y()); + } } }, in, out); } void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups) + bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); ARM_COMPUTE_UNUSED(num_groups); _data_layout = src->data_layout(); @@ -343,14 +446,15 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - _conv_info = conv_info; - _kernel_width = kernel_dims.width; - _kernel_height = kernel_dims.height; - _dilation = dilation; - _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), - _kernel_width, _kernel_height, - _conv_info, _dilation); - _has_bias = has_bias; + _conv_info = conv_info; + _kernel_width = kernel_dims.width; + _kernel_height = kernel_dims.height; + _input_pad_right = input_pad_right; + _dilation = dilation; + _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), + _kernel_width, _kernel_height, + _conv_info, _dilation); + _has_bias = has_bias; if(_data_layout == DataLayout::NCHW) { @@ -408,7 +512,7 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const } // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false))); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false, num_groups, _input_pad_right))); std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), kernel_dims.width, kernel_dims.height, @@ -423,9 +527,9 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const } Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups) + bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); return Status{}; } @@ -437,8 +541,10 @@ void CpuIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, const T auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); + (this->*_func)(src, dst, window); } + const char *CpuIm2ColKernel::name() const { return "CpuIm2ColKernel"; diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h index 8160310da6..d133f8dc2d 100644 --- a/src/cpu/kernels/CpuIm2ColKernel.h +++ b/src/cpu/kernels/CpuIm2ColKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -66,19 +66,20 @@ public: ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuIm2ColKernel); /** Set the input and output of the kernel. * - * @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32 - * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false - * @param[out] dst The output tensor info. Data types supported: Same as @p input - * @param[in] kernel_dims The kernel dimensions (width and height). - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] has_bias In case biases are provided expands the matrix with 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported + * @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32 + * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false + * @param[out] dst The output tensor info. Data types supported: Same as @p input + * @param[in] kernel_dims The kernel dimensions (width and height). + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] has_bias In case biases are provided expands the matrix with 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported + * @param[in] input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary */ void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1); + bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuIm2ColKernel::configure() @@ -86,10 +87,10 @@ public: * @return a status */ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1); + bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Return minimum workload size of the relevant kernel * @@ -116,14 +117,15 @@ private: */ using Im2ColFunctionPtr = void (CpuIm2ColKernel::*)(const ITensor *src, ITensor *dst, const Window &window); - Im2ColFunctionPtr _func{ nullptr }; + Im2ColFunctionPtr _func{ nullptr }; std::pair<unsigned int, unsigned int> _convolved_dims{}; - PadStrideInfo _conv_info{}; - unsigned int _kernel_width{ 0 }; - unsigned int _kernel_height{ 0 }; - bool _has_bias{ false }; - Size2D _dilation{ 1U, 1U }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; + PadStrideInfo _conv_info{}; + unsigned int _kernel_width{ 0 }; + unsigned int _kernel_height{ 0 }; + unsigned int _input_pad_right{ 0 }; + bool _has_bias{ false }; + Size2D _dilation{ 1U, 1U }; + DataLayout _data_layout{ DataLayout::UNKNOWN }; }; } // namespace kernels } // namespace cpu |