diff options
author | Pablo Marquez Tello <pablo.tello@arm.com> | 2023-01-09 17:21:01 +0000 |
---|---|---|
committer | Pablo Marquez Tello <pablo.tello@arm.com> | 2023-02-08 11:05:08 +0000 |
commit | 4e2bbbbb23e6f4bd452f7f865e51228e1f51efec (patch) | |
tree | 36469f45f17d94f13bc1206e3a5975ba6cbccad5 /src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp | |
parent | fbe94da93b5be8745727ba7624b3d011e2bfa383 (diff) | |
download | ComputeLibrary-4e2bbbbb23e6f4bd452f7f865e51228e1f51efec.tar.gz |
Add support for dilation > 1 in assembly DepthwiseConvolution
* Resolve COMPMID-5689
Change-Id: I81a3791ad054db59562b76d1c729f2b2168aee8b
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Signed-off-by: Andrew Mundy <andrew.mundy@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8919
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp | 46 |
1 files changed, 24 insertions, 22 deletions
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp index e02998f5a0..c305835107 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -79,6 +79,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> /* Compute a portion of the output tensor with padding. */ virtual void compute_tile_padded( + const DepthwiseArgs &args, unsigned int output_i, unsigned int output_j, unsigned int output_channel_start, unsigned int output_channel_end, const TensorSpec<const TInput *> &input, @@ -93,6 +94,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> * variant. */ virtual void compute_row_padded_tile_row( + const DepthwiseArgs &args, const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols, const unsigned int output_channel_start, const unsigned int output_channel_end, const TensorSpec<const TInput *> &input, @@ -104,6 +106,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols()) { this->compute_tile_padded( + args, output_i, output_j, output_channel_start, output_channel_end, input, output, parameters, working_space ); @@ -116,6 +119,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> * variant. */ virtual void compute_tiles_unpadded( + const DepthwiseArgs &args, unsigned int start_output_i, unsigned int start_output_j, unsigned int n_tile_rows, unsigned int n_tile_cols, unsigned int output_channel_start, unsigned int output_channel_end, @@ -131,6 +135,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++) { this->compute_tile_padded( + args, start_output_i, row_start_output_j, output_channel_start, output_channel_end, input, output, parameters, working_space @@ -142,18 +147,12 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> } void execute_internal( - unsigned int n_batches, - unsigned int input_height, - unsigned int input_width, - unsigned int n_input_channels, - const PaddingValues &padding, + const DepthwiseArgs &args, const void *input, size_t ld_input_col, size_t ld_input_row, size_t ld_input_batch, const void *parameters, - unsigned int output_height, - unsigned int output_width, void *output, size_t ld_output_col, size_t ld_output_row, @@ -165,40 +164,40 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> { // Get and initialise the working space for this thread. void *thread_working_space = - static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(n_input_channels); - this->initialise_working_space(thread_working_space, n_input_channels); + static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(args.input_channels); + this->initialise_working_space(thread_working_space, args.input_channels); // Construct convenient representations of the input/output tensors. TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col); TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col); - const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier; + const auto n_output_channels = args.input_channels * args.channel_multiplier; - for (unsigned int batch = 0; batch < n_batches; batch++) + for (unsigned int batch = 0; batch < args.n_batches; batch++) { // Iterate over rows of the output tensor; we stripe over the tiles. for (unsigned int start_output_i = thread_id * m_strat->get_output_rows(); - start_output_i < output_height; + start_output_i < args.output_rows; start_output_i += n_threads * m_strat->get_output_rows()) { // Determine what (if any padding) is required on the top/bottom of // this row of the convolution. const auto end_output_i = start_output_i + m_strat->get_output_rows(); - const bool pad_output_bottom = output_height < end_output_i; + const bool pad_output_bottom = args.output_rows < end_output_i; - const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top; + const int start_input_i = start_output_i * args.stride_rows - args.padding.top; const bool pad_input_top = start_input_i < 0; const int end_input_i = start_input_i + m_strat->get_input_rows(); - const bool pad_input_bottom = static_cast<int>(input_height) < end_input_i; + const bool pad_input_bottom = static_cast<int>(args.input_rows) < end_input_i; const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom; // Iterate over the columns of the output tensor; we attempt to grab as // much as possible of the unpadded regions, so the loop structure is a // bit odd. unsigned int start_output_j = 0; - while (start_output_j < output_width) + while (start_output_j < args.output_cols) { - const int start_in_j = start_output_j * this->m_args.stride_cols - padding.left; + const int start_in_j = start_output_j * args.stride_cols - args.padding.left; const bool pad_input_left = start_in_j < 0; // Determine if we can process a number of unpadded tiles in one go. @@ -206,16 +205,16 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> if (!pad_input_left) { // Determine the maximum number of tiles we could handle. - n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols(); + n_unpadded_tiles = (args.output_cols - start_output_j) / m_strat->get_output_cols(); // Handle padding on the right hand edge - const int tile_stride = m_strat->get_output_cols() * this->m_args.stride_cols; + const int tile_stride = m_strat->get_output_cols() * args.stride_cols; int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols(); int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride; while (n_unpadded_tiles > 0 && - (static_cast<int>(output_width) < end_output_j || - static_cast<int>(input_width) < end_input_j)) + (static_cast<int>(args.output_cols) < end_output_j || + static_cast<int>(args.input_cols) < end_input_j)) { n_unpadded_tiles--; end_output_j -= m_strat->get_output_cols(); @@ -230,6 +229,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> { // Completely unpadded execution this->compute_tiles_unpadded( + args, start_output_i, start_output_j, 1, n_unpadded_tiles, // Compute a row of unpadded tiles 0, n_output_channels, // Compute all channels @@ -240,6 +240,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> { // Top/bottom padding only this->compute_row_padded_tile_row( + args, start_output_i, start_output_j, n_unpadded_tiles, 0, n_output_channels, // Compute all channels input_tensor, output_tensor, parameters, thread_working_space @@ -250,6 +251,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> else { this->compute_tile_padded( + args, start_output_i, start_output_j, 0, n_output_channels, // Compute all channels input_tensor, output_tensor, parameters, thread_working_space |