From 4e2bbbbb23e6f4bd452f7f865e51228e1f51efec Mon Sep 17 00:00:00 2001 From: Pablo Marquez Tello Date: Mon, 9 Jan 2023 17:21:01 +0000 Subject: Add support for dilation > 1 in assembly DepthwiseConvolution * Resolve COMPMID-5689 Change-Id: I81a3791ad054db59562b76d1c729f2b2168aee8b Signed-off-by: Pablo Marquez Tello Signed-off-by: Andrew Mundy Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8919 Reviewed-by: Jakub Sujak Reviewed-by: Viet-Hoa Do Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- Android.bp | 1 + filelist.json | 1 + .../arm_conv/depthwise/depthfirst_driver.hpp | 46 +++--- .../arm_conv/depthwise/depthwise_common.cpp | 65 ++++++++ .../arm_conv/depthwise/depthwise_depthfirst.hpp | 59 ++++---- .../depthwise/depthwise_depthfirst_generic.hpp | 21 +-- .../depthwise/depthwise_depthfirst_multiplier.hpp | 27 ++-- .../depthwise/depthwise_implementation.hpp | 13 +- .../arm_conv/depthwise/depthwise_planar.hpp | 26 ++-- src/core/NEON/kernels/assembly/depthwise.hpp | 168 +++++++++++++++------ .../NEON/kernels/assembly/depthwise_common.hpp | 27 +++- .../CpuDepthwiseConv2dAssemblyWrapperKernel.cpp | 13 +- 12 files changed, 320 insertions(+), 147 deletions(-) create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp diff --git a/Android.bp b/Android.bp index 3af432218b..50dc160abe 100644 --- a/Android.bp +++ b/Android.bp @@ -301,6 +301,7 @@ cc_library_static { "src/core/NEON/kernels/NEStridedSliceKernel.cpp", "src/core/NEON/kernels/NETileKernel.cpp", "src/core/NEON/kernels/arm_conv/addressing.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp", "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp", "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp", "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp", diff --git a/filelist.json b/filelist.json index ea75c4ada1..1d72ddf280 100644 --- a/filelist.json +++ b/filelist.json @@ -1203,6 +1203,7 @@ "src/core/NEON/kernels/convolution/common/qsymm8.cpp", "src/core/NEON/kernels/convolution/common/utils.cpp", "src/core/NEON/kernels/arm_conv/addressing.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp", "src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp", "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp", "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp", diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp index e02998f5a0..c305835107 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -79,6 +79,7 @@ class DepthfirstDriver : public DepthwiseCommon /* Compute a portion of the output tensor with padding. */ virtual void compute_tile_padded( + const DepthwiseArgs &args, unsigned int output_i, unsigned int output_j, unsigned int output_channel_start, unsigned int output_channel_end, const TensorSpec &input, @@ -93,6 +94,7 @@ class DepthfirstDriver : public DepthwiseCommon * variant. 
*/ virtual void compute_row_padded_tile_row( + const DepthwiseArgs &args, const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols, const unsigned int output_channel_start, const unsigned int output_channel_end, const TensorSpec &input, @@ -104,6 +106,7 @@ class DepthfirstDriver : public DepthwiseCommon for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols()) { this->compute_tile_padded( + args, output_i, output_j, output_channel_start, output_channel_end, input, output, parameters, working_space ); @@ -116,6 +119,7 @@ class DepthfirstDriver : public DepthwiseCommon * variant. */ virtual void compute_tiles_unpadded( + const DepthwiseArgs &args, unsigned int start_output_i, unsigned int start_output_j, unsigned int n_tile_rows, unsigned int n_tile_cols, unsigned int output_channel_start, unsigned int output_channel_end, @@ -131,6 +135,7 @@ class DepthfirstDriver : public DepthwiseCommon for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++) { this->compute_tile_padded( + args, start_output_i, row_start_output_j, output_channel_start, output_channel_end, input, output, parameters, working_space @@ -142,18 +147,12 @@ class DepthfirstDriver : public DepthwiseCommon } void execute_internal( - unsigned int n_batches, - unsigned int input_height, - unsigned int input_width, - unsigned int n_input_channels, - const PaddingValues &padding, + const DepthwiseArgs &args, const void *input, size_t ld_input_col, size_t ld_input_row, size_t ld_input_batch, const void *parameters, - unsigned int output_height, - unsigned int output_width, void *output, size_t ld_output_col, size_t ld_output_row, @@ -165,40 +164,40 @@ class DepthfirstDriver : public DepthwiseCommon { // Get and initialise the working space for this thread. void *thread_working_space = - static_cast(working_space) + thread_id * this->get_working_size_per_thread(n_input_channels); - this->initialise_working_space(thread_working_space, n_input_channels); + static_cast(working_space) + thread_id * this->get_working_size_per_thread(args.input_channels); + this->initialise_working_space(thread_working_space, args.input_channels); // Construct convenient representations of the input/output tensors. TensorSpec input_tensor(reinterpret_cast(input), ld_input_row, ld_input_col); TensorSpec output_tensor(reinterpret_cast(output), ld_output_row, ld_output_col); - const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier; + const auto n_output_channels = args.input_channels * args.channel_multiplier; - for (unsigned int batch = 0; batch < n_batches; batch++) + for (unsigned int batch = 0; batch < args.n_batches; batch++) { // Iterate over rows of the output tensor; we stripe over the tiles. for (unsigned int start_output_i = thread_id * m_strat->get_output_rows(); - start_output_i < output_height; + start_output_i < args.output_rows; start_output_i += n_threads * m_strat->get_output_rows()) { // Determine what (if any padding) is required on the top/bottom of // this row of the convolution. 
const auto end_output_i = start_output_i + m_strat->get_output_rows(); - const bool pad_output_bottom = output_height < end_output_i; + const bool pad_output_bottom = args.output_rows < end_output_i; - const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top; + const int start_input_i = start_output_i * args.stride_rows - args.padding.top; const bool pad_input_top = start_input_i < 0; const int end_input_i = start_input_i + m_strat->get_input_rows(); - const bool pad_input_bottom = static_cast(input_height) < end_input_i; + const bool pad_input_bottom = static_cast(args.input_rows) < end_input_i; const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom; // Iterate over the columns of the output tensor; we attempt to grab as // much as possible of the unpadded regions, so the loop structure is a // bit odd. unsigned int start_output_j = 0; - while (start_output_j < output_width) + while (start_output_j < args.output_cols) { - const int start_in_j = start_output_j * this->m_args.stride_cols - padding.left; + const int start_in_j = start_output_j * args.stride_cols - args.padding.left; const bool pad_input_left = start_in_j < 0; // Determine if we can process a number of unpadded tiles in one go. @@ -206,16 +205,16 @@ class DepthfirstDriver : public DepthwiseCommon if (!pad_input_left) { // Determine the maximum number of tiles we could handle. - n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols(); + n_unpadded_tiles = (args.output_cols - start_output_j) / m_strat->get_output_cols(); // Handle padding on the right hand edge - const int tile_stride = m_strat->get_output_cols() * this->m_args.stride_cols; + const int tile_stride = m_strat->get_output_cols() * args.stride_cols; int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols(); int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride; while (n_unpadded_tiles > 0 && - (static_cast(output_width) < end_output_j || - static_cast(input_width) < end_input_j)) + (static_cast(args.output_cols) < end_output_j || + static_cast(args.input_cols) < end_input_j)) { n_unpadded_tiles--; end_output_j -= m_strat->get_output_cols(); @@ -230,6 +229,7 @@ class DepthfirstDriver : public DepthwiseCommon { // Completely unpadded execution this->compute_tiles_unpadded( + args, start_output_i, start_output_j, 1, n_unpadded_tiles, // Compute a row of unpadded tiles 0, n_output_channels, // Compute all channels @@ -240,6 +240,7 @@ class DepthfirstDriver : public DepthwiseCommon { // Top/bottom padding only this->compute_row_padded_tile_row( + args, start_output_i, start_output_j, n_unpadded_tiles, 0, n_output_channels, // Compute all channels input_tensor, output_tensor, parameters, thread_working_space @@ -250,6 +251,7 @@ class DepthfirstDriver : public DepthwiseCommon else { this->compute_tile_padded( + args, start_output_i, start_output_j, 0, n_output_channels, // Compute all channels input_tensor, output_tensor, parameters, thread_working_space diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp new file mode 100644 index 0000000000..c2b861000c --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "depthwise_common.hpp" + +using arm_gemm::iceildiv; + +namespace arm_conv { +namespace depthwise { + +std::tuple +get_reduced_view_for_dilation(size_t out_size, size_t in_size, const size_t d, + const size_t dilation_factor, + const size_t kernel_size, const size_t stride, + const size_t orig_pad_before) { + // Get the valid output range + out_size = iceildiv(out_size - d, dilation_factor); + + // Compute the start offset and the amount of padding which applies to this + // portion of the work. + size_t start_pos = d * stride, pad_before = 0; + if (start_pos < orig_pad_before) { + pad_before = iceildiv(orig_pad_before - start_pos, dilation_factor); + } + start_pos += pad_before * dilation_factor - orig_pad_before; + + // Hence compute the valid input range + in_size = start_pos < in_size + ? iceildiv(in_size - start_pos, dilation_factor) + : 0; + + // Finally, compute the "after" padding + const size_t reqd_input = (out_size - 1) * stride + kernel_size; + size_t pad_after = 0; + if (reqd_input > (pad_before + in_size)) { + pad_after = reqd_input - (pad_before + in_size); + } + + return std::make_tuple(out_size, in_size, start_pos, pad_before, pad_after); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp index 70b12919b0..2620b48e17 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -367,6 +367,7 @@ class DepthwiseDepthfirst protected: void compute_tile_padded( + const DepthwiseArgs &args, unsigned int output_i, unsigned int output_j, unsigned int output_channel_start, unsigned int output_channel_end, const TensorSpec &input, @@ -379,13 +380,13 @@ class DepthwiseDepthfirst auto ws = reinterpret_cast(working_space_raw); // Compute the input pointer array - const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier; + const auto input_channel_start = output_channel_start / args.channel_multiplier; - const int ii = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; + const int ii = static_cast(output_i * args.stride_rows) - args.padding.top; const auto input_pad_top = static_cast(ii < 0 ? -ii : 0); const auto input_i = static_cast(ii < 0 ? 0 : ii); - const int ij = static_cast(output_j * this->m_args.stride_cols) - this->m_args.padding.left; + const int ij = static_cast(output_j * args.stride_cols) - args.padding.left; const auto input_pad_left = static_cast(ij < 0 ? -ij : 0); const auto input_j = static_cast(ij < 0 ? 0 : ij); @@ -394,8 +395,8 @@ class DepthwiseDepthfirst input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start, input.ld_row, input.ld_col, ws->input_buffer, - input_pad_top, this->m_args.input_rows - input_i, - input_pad_left, this->m_args.input_cols - input_j + input_pad_top, args.input_rows - input_i, + input_pad_left, args.input_cols - input_j ); // Compute the output pointer array @@ -404,8 +405,8 @@ class DepthwiseDepthfirst output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start, output.ld_row, output.ld_col, ws->output_buffer, - 0, this->m_args.output_rows - output_i, // Top padding, # valid rows - 0, this->m_args.output_cols - output_j // Left padding, # valid columns + 0, args.output_rows - output_i, // Top padding, # valid rows + 0, args.output_cols - output_j // Left padding, # valid columns ); // Execute the kernel @@ -416,6 +417,7 @@ class DepthwiseDepthfirst } void compute_row_padded_tile_row( + const DepthwiseArgs &args, const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols, const unsigned int output_channel_start, const unsigned int output_channel_end, const TensorSpec &input, @@ -430,19 +432,19 @@ class DepthwiseDepthfirst const auto os = this->get_output_stage(); // Compute top and bottom padding; hence fill in the initial pointer arrays. - const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier; - const int ii = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; + const auto input_channel_start = output_channel_start / args.channel_multiplier; + const int ii = static_cast(output_i * args.stride_rows) - args.padding.top; const auto input_pad_top = static_cast(ii < 0 ? -ii : 0); const auto input_i = static_cast(ii < 0 ? 0 : ii); - const auto input_j = output_j * this->m_args.stride_cols - this->m_args.padding.left; + const auto input_j = output_j * args.stride_cols - args.padding.left; // Valid input rows is the smallest of the input rows that aren't padding for this tile, and the number of rows // available. 
- const auto valid_input_rows = std::min(strat->get_input_rows() - input_pad_top, this->m_args.input_rows - input_i); - const auto valid_output_rows = std::min(strat->get_output_rows(), this->m_args.output_rows - output_i); + const auto valid_input_rows = std::min(strat->get_input_rows() - input_pad_top, args.input_rows - input_i); + const auto valid_output_rows = std::min(strat->get_output_rows(), args.output_rows - output_i); - const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.stride_cols; + const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols; const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols(); fill_pointer_array( @@ -450,8 +452,8 @@ class DepthwiseDepthfirst input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start, input.ld_row, input.ld_col, ws->input_buffer, - input_pad_top, this->m_args.input_rows - input_i, - 0, this->m_args.input_cols - input_j // No left padding + input_pad_top, args.input_rows - input_i, + 0, args.input_cols - input_j // No left padding ); fill_pointer_array( @@ -459,8 +461,8 @@ class DepthwiseDepthfirst output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start, output.ld_row, output.ld_col, ws->output_buffer, - 0, this->m_args.output_rows - output_i, // Top padding, # valid rows - 0, this->m_args.output_cols - output_j // Left padding, # valid columns + 0, args.output_rows - output_i, // Top padding, # valid rows + 0, args.output_cols - output_j // Left padding, # valid columns ); for (; n_tile_cols; n_tile_cols--) @@ -492,6 +494,7 @@ class DepthwiseDepthfirst } void compute_tiles_unpadded( + const DepthwiseArgs &args, unsigned int output_i, const unsigned int output_j, unsigned int n_tile_rows, unsigned int n_tile_cols, unsigned int output_channel_start, unsigned int output_channel_end, @@ -511,8 +514,8 @@ class DepthwiseDepthfirst // If the direct kernel is supported, then use it. // Compute the base pointers we'll use in the tile. auto outptr = output.base + output_channel_start + output_i * output.ld_row + output_j * output.ld_col; - const int start_input_i = output_i * this->m_args.stride_rows - this->m_args.padding.top; - const int start_input_j = output_j * this->m_args.stride_cols - this->m_args.padding.left; + const int start_input_i = output_i * args.stride_rows - args.padding.top; + const int start_input_j = output_j * args.stride_cols - args.padding.left; auto inptr = input.base + output_channel_start + start_input_i * input.ld_row + start_input_j * input.ld_col; // Execute the kernel @@ -528,10 +531,10 @@ class DepthwiseDepthfirst { // Otherwise, we repeatedly call the padded kernel but use our knowledge // of the tensor structure to avoid recomputing the pointer array. 
- const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier; + const auto input_channel_start = output_channel_start / args.channel_multiplier; const auto n_input_pointers = this->m_strat->get_input_rows() * this->m_strat->get_input_cols(); - const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.stride_cols; + const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols; const auto n_output_pointers = this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols(); @@ -539,16 +542,16 @@ class DepthwiseDepthfirst // each subsequent tile we simply update the pointers. for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++) { - const int input_i = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; - const int input_j = static_cast(output_j * this->m_args.stride_cols) - this->m_args.padding.left; + const int input_i = static_cast(output_i * args.stride_rows) - args.padding.top; + const int input_j = static_cast(output_j * args.stride_cols) - args.padding.left; fill_pointer_array( ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(), input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start, input.ld_row, input.ld_col, ws->input_buffer, - 0, this->m_args.input_rows, - 0, this->m_args.input_cols + 0, args.input_rows, + 0, args.input_cols ); // Compute the output pointer array @@ -557,8 +560,8 @@ class DepthwiseDepthfirst output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start, output.ld_row, output.ld_col, ws->output_buffer, - 0, this->m_args.output_rows, - 0, this->m_args.output_cols + 0, args.output_rows, + 0, args.output_cols ); for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp index 9f53f7cc6f..b058ce26f2 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -248,6 +248,7 @@ class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon &input, @@ -259,24 +260,24 @@ class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon(working_space_raw); - const int ii = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; + const int ii = static_cast(output_i * args.stride_rows) - args.padding.top; const auto input_pad_top = static_cast(ii < 0 ? -ii : 0); const auto input_i = static_cast(ii < 0 ? 0 : ii); - const int ij = static_cast(output_j * this->m_args.stride_cols) - this->m_args.padding.left; + const int ij = static_cast(output_j * args.stride_cols) - args.padding.left; const auto input_pad_left = static_cast(ij < 0 ? -ij : 0); const auto input_j = static_cast(ij < 0 ? 
0 : ij); fill_pointer_array_generic_kernel( ws->inptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), - this->m_args.kernel_rows, this->m_args.kernel_cols, - this->m_args.stride_rows, this->m_args.stride_cols, + args.kernel_rows, args.kernel_cols, + args.stride_rows, args.stride_cols, input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start, input.ld_row, input.ld_col, ws->input_buffer, - input_pad_top, this->m_args.input_rows - input_i, - input_pad_left, this->m_args.input_cols - input_j + input_pad_top, args.input_rows - input_i, + input_pad_left, args.input_cols - input_j ); // Compute the output pointer array @@ -285,15 +286,15 @@ class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommonoutput_buffer, - 0, this->m_args.output_rows - output_i, // Top padding, # valid rows - 0, this->m_args.output_cols - output_j // Left padding, # valid columns + 0, args.output_rows - output_i, // Top padding, # valid rows + 0, args.output_cols - output_j // Left padding, # valid columns ); // Execute the kernel DepthwiseDepthfirstGenericKernelCall::execute( reinterpret_cast(this->m_strat.get()), ws, this->get_output_stage(), m_bias, parameters, - this->m_args.kernel_rows * this->m_args.kernel_cols, + args.kernel_rows * args.kernel_cols, channel_end - channel_start ); } diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp index e58467b0f4..cef568fadd 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -525,6 +525,7 @@ class DepthwiseDepthfirstMultiplier : public DepthfirstDriver &input, @@ -536,11 +537,11 @@ class DepthwiseDepthfirstMultiplier : public DepthfirstDriver(working_space_raw); - const int ii = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; + const int ii = static_cast(output_i * args.stride_rows) - args.padding.top; const auto input_pad_top = static_cast(ii < 0 ? -ii : 0); const auto input_i = static_cast(ii < 0 ? 0 : ii); - const int ij = static_cast(output_j * this->m_args.stride_cols) - this->m_args.padding.left; + const int ij = static_cast(output_j * args.stride_cols) - args.padding.left; const auto input_pad_left = static_cast(ij < 0 ? -ij : 0); const auto input_j = static_cast(ij < 0 ? 
0 : ij); @@ -551,40 +552,40 @@ class DepthwiseDepthfirstMultiplier : public DepthfirstDriveroutput_buffer, - 0, this->m_args.output_rows - output_i, // Top padding, # valid rows - 0, this->m_args.output_cols - output_j // Left padding, # valid columns + 0, args.output_rows - output_i, // Top padding, # valid rows + 0, args.output_cols - output_j // Left padding, # valid columns ); // Compute the parameter stride - DepthwiseArgs single_iter(this->m_args); + DepthwiseArgs single_iter(args); single_iter.input_channels = 1; const size_t parameter_stride = reinterpret_cast(this->m_strat.get()) ->get_storage_size(single_iter); for (; output_channel_start < output_channel_end; - output_channel_start += this->m_args.channel_multiplier) + output_channel_start += args.channel_multiplier) { // Compute the input pointer array - const auto input_channel = output_channel_start / this->m_args.channel_multiplier; + const auto input_channel = output_channel_start / args.channel_multiplier; // Construct the input patch depthfirst_multiplier::PrepareInputSample::execute( - this->m_args, ws, this->m_strat.get(), + args, ws, this->m_strat.get(), input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col, - input_pad_top, this->m_args.input_rows - input_i, - input_pad_left, this->m_args.input_cols - input_j + input_pad_top, args.input_rows - input_i, + input_pad_left, args.input_cols - input_j ); // Execute the kernel depthfirst_multiplier::StrategyType::execute( - this->m_args, ws, reinterpret_cast(this->m_strat.get()), m_os, output_channel_start, + args, ws, reinterpret_cast(this->m_strat.get()), m_os, output_channel_start, parameters, m_bias ); // Update the output pointers for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++) { - ws->outptr_array[n] += this->m_args.channel_multiplier; + ws->outptr_array[n] += args.channel_multiplier; } // Progress the parameters diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp index 1ee19e5075..0f91fe363c 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp @@ -55,7 +55,9 @@ struct DepthwiseImplementation DepthwiseCommon *get_instance(const DepthwiseArgs &args, const OutputStage &os) const { - return initialise(args, os); + auto impl = initialise(args, os); + impl->set_name(std::string(name)); + return impl; } }; @@ -136,14 +138,7 @@ UniqueDepthwiseCommon depthwise(const DepthwiseArgs &a { const DepthwiseImplementation *impl = nullptr; const bool success = find_implementation(args, os, impl); - - if(success) - { - auto i = impl->get_instance(args, os); - i->set_name(impl->name); - return UniqueDepthwiseCommon(i); - } - return nullptr; + return UniqueDepthwiseCommon(success ? impl->get_instance(args, os) : nullptr); } } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp index f3160fba27..2b2e6f3555 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -332,18 +332,12 @@ class DepthwisePlanar : public DepthwiseCommon } void execute_internal( - unsigned int batches, - unsigned int input_height, - unsigned int input_width, - unsigned int n_input_channels, - const PaddingValues &padding, + const DepthwiseArgs &args, const void *input, size_t ld_input_col, size_t ld_input_row, size_t ld_input_batch, const void *parameters, - unsigned int output_height, - unsigned int output_width, void *output, size_t ld_output_col, size_t ld_output_row, @@ -359,7 +353,7 @@ class DepthwisePlanar : public DepthwiseCommon this->initialise_working_space(thread_working_space); auto ws = reinterpret_cast(thread_working_space); - const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier; + const auto n_output_channels = args.input_channels * args.channel_multiplier; const auto vl = get_vector_length(m_strat->get_vl_type()); // Get typed pointers @@ -368,23 +362,23 @@ class DepthwisePlanar : public DepthwiseCommon auto weights = reinterpret_cast(parameters); // Iterate over batches - for (; batches; batches--) + for (auto batches = args.n_batches; batches; batches--) { // NOTE: Other loop orderings are possible and it would be worth // investigating them. // Within a batch, stripe threads across rows. for (auto start_output_i = thread_id * m_strat->get_output_rows(); - start_output_i < output_height; + start_output_i < args.output_rows; start_output_i += n_threads * m_strat->get_output_rows()) { // Determine what (if any padding) is required on the top/bottom of // this row of the convolution. - const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top; + const int start_input_i = start_output_i * args.stride_rows - args.padding.top; const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0; const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i; - const unsigned int valid_input_rows = input_i > input_height ? 0 : input_height - input_i; - const unsigned int valid_output_rows = output_height - start_output_i; + const unsigned int valid_input_rows = input_i > args.input_rows ? 0 : args.input_rows - input_i; + const unsigned int valid_output_rows = args.output_rows - start_output_i; auto inptr_row = input_batch + input_i*ld_input_row; auto outptr_row = output_batch + start_output_i * ld_output_row; @@ -392,10 +386,10 @@ class DepthwisePlanar : public DepthwiseCommon // Execute the kernel this->execute_kernel( inptr_row, ld_input_row, ld_input_col, vl, - input_pad_top, valid_input_rows, padding.left, input_width, + input_pad_top, valid_input_rows, args.padding.left, args.input_cols, weights, this->m_bias, outptr_row, ld_output_row, ld_output_col, vl, - valid_output_rows, output_width, + valid_output_rows, args.output_cols, 0 /* first channel */, n_output_channels, ws ); diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp index 3998dfbc9a..8eb278c22e 100644 --- a/src/core/NEON/kernels/assembly/depthwise.hpp +++ b/src/core/NEON/kernels/assembly/depthwise.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -38,8 +38,8 @@ struct DepthwiseConfig std::string filter = ""; DepthwiseConfig(DepthwiseMethod method) - : method(method) {}; - DepthwiseConfig() {}; + : method(method){}; + DepthwiseConfig(){}; }; struct DepthwiseArgs @@ -48,6 +48,7 @@ struct DepthwiseArgs unsigned int kernel_rows, kernel_cols; unsigned int stride_rows, stride_cols; + unsigned int dilation_rows, dilation_cols; unsigned int n_batches, input_rows, input_cols, input_channels; unsigned int output_rows, output_cols; @@ -61,6 +62,38 @@ struct DepthwiseArgs bool fast_mode = false; + DepthwiseArgs( + const CPUInfo *cpu_info, + unsigned int kernel_rows, unsigned int kernel_cols, + unsigned int stride_rows, unsigned int stride_cols, + unsigned int dilation_rows, unsigned int dilation_cols, + unsigned int n_batches, unsigned int input_rows, unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, arm_gemm::Activation activation, + + const DepthwiseConfig *config) + : cpu_info(cpu_info), + kernel_rows(kernel_rows), + kernel_cols(kernel_cols), + stride_rows(stride_rows), + stride_cols(stride_cols), + dilation_rows(dilation_rows), + dilation_cols(dilation_cols), + n_batches(n_batches), + input_rows(input_rows), + input_cols(input_cols), + input_channels(input_channels), + output_rows(output_rows), + output_cols(output_cols), + channel_multiplier(channel_multiplier), + padding(padding), + activation(activation), + config(config) + { + } + DepthwiseArgs( const CPUInfo *cpu_info, unsigned int kernel_rows, unsigned int kernel_cols, @@ -71,8 +104,10 @@ struct DepthwiseArgs unsigned int channel_multiplier, PaddingValues padding, arm_gemm::Activation activation, const DepthwiseConfig *config) - : cpu_info(cpu_info), kernel_rows(kernel_rows), kernel_cols(kernel_cols), stride_rows(stride_rows), stride_cols(stride_cols), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols), - input_channels(input_channels), output_rows(output_rows), output_cols(output_cols), channel_multiplier(channel_multiplier), padding(padding), activation(activation), config(config) + : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows, + stride_cols, 1, 1, n_batches, input_rows, input_cols, + input_channels, output_rows, output_cols, + channel_multiplier, padding, activation, config) { } }; @@ -80,28 +115,30 @@ struct DepthwiseArgs template class DepthwiseCommon : public IDepthwiseCommon { -private: - std::string _name{}; - -protected: + protected: const DepthwiseArgs m_args; // Copy of arguments + std::string m_name{}; -public: - std::string name() const + public: + DepthwiseCommon(const DepthwiseArgs &args) + : m_args(args){}; + DepthwiseCommon(DepthwiseCommon &) = delete; + DepthwiseCommon &operator=(DepthwiseCommon &) = delete; + + std::string name() const override { - return _name; + return m_name; } - void set_name(const std::string &n) + void set_name(std::string name) { - _name = n; + // Only allow the name to be set once + if (m_name.empty()) + { + m_name = name; + } } - DepthwiseCommon(const DepthwiseArgs &args) - : m_args(args) {}; - DepthwiseCommon(DepthwiseCommon &) = delete; - DepthwiseCommon &operator=(DepthwiseCommon &) = delete; - void execute( const void *const input, const void *const parameters, @@ -168,34 +205,77 @@ public: unsigned int thread_id, unsigned int n_threads) const override final { - this->execute_internal( - batches, input_height, input_width, channels, padding, 
input, - ld_input_col, ld_input_row, ld_input_batch, parameters, output_height, - output_width, output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, n_threads); + // Construct a new set of arguments to reflect that we might have been + // passed different input/output tensors. Dilation is handled at this + // level; so we set the dilation in the arguments to zero. + DepthwiseArgs args(this->m_args); + args.n_batches = batches; + args.input_rows = input_height; + args.input_cols = input_width; + args.input_channels = channels; + args.output_rows = output_height; + args.output_cols = output_width; + args.padding = padding; + args.dilation_rows = args.dilation_cols = 1; + + auto ld_input_col_d = ld_input_col * m_args.dilation_cols; + auto ld_input_row_d = ld_input_row * m_args.dilation_rows; + auto ld_output_col_d = ld_output_col * m_args.dilation_cols; + auto ld_output_row_d = ld_output_row * m_args.dilation_rows; + + for (size_t drow = 0; drow < m_args.dilation_rows; drow++) + { + size_t start_i; + std::tie(args.output_rows, args.input_rows, start_i, + args.padding.top, args.padding.bottom) = + get_reduced_view_for_dilation( + output_height, input_height, drow, m_args.dilation_rows, + m_args.kernel_rows, m_args.stride_rows, padding.top); + + auto input_row = static_cast(input) + start_i * ld_input_row; + auto output_row = static_cast(output) + drow * ld_output_row; + + if (args.output_rows) + { + for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++) + { + size_t start_j; + std::tie(args.output_cols, args.input_cols, start_j, + args.padding.left, args.padding.right) = + get_reduced_view_for_dilation( + output_width, input_width, dcol, m_args.dilation_cols, + m_args.kernel_cols, m_args.stride_cols, padding.left); + + const TInput *input_col = input_row + start_j * ld_input_col; + TOutput *output_col = output_row + dcol * ld_output_col; + + if (args.output_cols) + { + this->execute_internal( + args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters, + output_col, ld_output_col_d, ld_output_row_d, ld_output_batch, + working_space, thread_id, n_threads); + } + } + } + } } -protected: + protected: virtual void execute_internal( - unsigned int batches, - unsigned int input_height, - unsigned int input_width, - unsigned int channels, - const PaddingValues &, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - unsigned int output_height, - unsigned int output_width, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; + const DepthwiseArgs &instance_args, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; template diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp index 52963ab357..fea6326897 100644 --- a/src/core/NEON/kernels/assembly/depthwise_common.hpp +++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,8 @@ #include "arm_gemm.hpp" #include "common.hpp" +#include +#include namespace arm_conv { @@ -64,6 +66,9 @@ class IDepthwiseCommon public: virtual ~IDepthwiseCommon() = default; + // Get the name of the depthwise implementation + virtual std::string name() const = 0; + // Determine the amount of storage space required for the rearranged weights // and bias. virtual size_t get_storage_size(void) const = 0; @@ -127,5 +132,25 @@ public: unsigned int n_threads) const = 0; }; +// To handle a dilation factor of D execute the kernel once for each d in +// [0..D). Each `d` corresponds to a portion or "view" of the input and output +// tensors. The output view corresponds to every Dth pixel starting from `d`; +// this function computes how many pixels are covered. The input view consists +// of an amount of before padding, every Dth pixel starting from an offset, and +// some after padding. This function computes the start padding, input offset, +// number of valid input pixels, and the after padding. +// +// Returns +// - Number of valid output pixels corresponding to `d` +// - Number of valid input pixels corresponding to `d` +// - Offset of the first pixel corresponding to `d` +// - Amount of padding in the view for `d` +std::tuple +get_reduced_view_for_dilation( + size_t out_size, size_t in_size, + size_t d, size_t dilation_factor, + size_t kernel_size, size_t stride, + size_t pad_before); + } // namespace depthwise } // namespace arm_conv diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp index 73bf7dcb8a..5360abf5ac 100644 --- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -62,6 +62,9 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI unsigned int stride_rows{}; std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride(); + unsigned int dilation_cols = info.dilation.x(); + unsigned int dilation_rows = info.dilation.y(); + const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info); const unsigned int n_batches = src->dimension(idx_batches); @@ -76,7 +79,7 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); - arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr); @@ -103,6 +106,9 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT unsigned int stride_rows{}; std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride(); + unsigned int dilation_cols = info.dilation.x(); + unsigned int dilation_rows = info.dilation.y(); + const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info); const unsigned int n_batches = src->dimension(idx_batches); @@ -117,7 +123,7 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); - arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr); @@ -265,7 +271,6 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.dilation != Size2D(1, 1), "Assembly kernels do not support dilation != (1, 1)"); if(is_data_type_quantized_per_channel(weights->data_type())) { -- cgit v1.2.1
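
A minimal standalone sketch of the dilation handling this patch introduces (appended after the mail signature; it is not part of the library sources). The helper body mirrors the logic added in depthwise_common.cpp, while the iceildiv stub, the main() driver and the concrete 1-D geometry are assumptions added purely for illustration: for a dilation factor D, DepthwiseCommon::execute() now runs the undilated machinery once per (drow, dcol) view, using this helper to work out each view's geometry.

#include <cstdio>
#include <cstddef>
#include <tuple>

// Local stand-in for arm_gemm::iceildiv (assumption for this sketch).
static size_t iceildiv(size_t a, size_t b) { return (a + b - 1) / b; }

// Mirror of the helper added by this patch: for the d-th view of a dilated
// convolution, compute the view's output size, valid input size, input start
// offset and the before/after padding, all expressed in "reduced"
// (every-Dth-pixel) units.
static std::tuple<size_t, size_t, size_t, size_t, size_t>
get_reduced_view_for_dilation(size_t out_size, size_t in_size, size_t d,
                              size_t dilation_factor, size_t kernel_size,
                              size_t stride, size_t orig_pad_before)
{
  out_size = iceildiv(out_size - d, dilation_factor);  // outputs d, d+D, d+2D, ...

  size_t start_pos = d * stride, pad_before = 0;
  if (start_pos < orig_pad_before)
  {
    pad_before = iceildiv(orig_pad_before - start_pos, dilation_factor);
  }
  start_pos += pad_before * dilation_factor - orig_pad_before;

  in_size = start_pos < in_size ? iceildiv(in_size - start_pos, dilation_factor) : 0;

  const size_t reqd_input = (out_size - 1) * stride + kernel_size;
  size_t pad_after = 0;
  if (reqd_input > pad_before + in_size)
  {
    pad_after = reqd_input - (pad_before + in_size);
  }

  return std::make_tuple(out_size, in_size, start_pos, pad_before, pad_after);
}

int main()
{
  // Illustrative 1-D geometry (assumed, not taken from the patch):
  // 3-tap kernel, stride 1, dilation 2, 9 input pixels, padding 2 on the
  // leading edge, 9 output pixels.
  const size_t kernel = 3, stride = 1, dilation = 2;
  const size_t input = 9, output = 9, pad_before = 2;

  for (size_t d = 0; d < dilation; d++)
  {
    size_t out_sz, in_sz, start, before, after;
    std::tie(out_sz, in_sz, start, before, after) =
        get_reduced_view_for_dilation(output, input, d, dilation, kernel,
                                      stride, pad_before);

    // In the real driver this is done per (drow, dcol) pair: the base
    // pointers are offset by the start position / view index, and the
    // leading dimensions are multiplied by the dilation factor, so the
    // inner kernels see an ordinary, undilated convolution.
    std::printf("view d=%zu: out=%zu in=%zu start=%zu pad=(%zu,%zu)\n",
                d, out_sz, in_sz, start, before, after);
  }
  return 0;
}

For this geometry the sketch prints two views: d=0 covers 5 output and 5 input pixels starting at offset 0 with one pixel of padding on each side, and d=1 covers 4 output and 4 input pixels starting at offset 1, again with one pixel of padding on each side; each view is then handed to the existing dilation-1 depthfirst or planar implementation unchanged.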