aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels
diff options
context:
space:
mode:
author Pablo Marquez Tello <pablo.tello@arm.com> 2023-01-09 17:21:01 +0000
committer Pablo Marquez Tello <pablo.tello@arm.com> 2023-02-08 11:05:08 +0000
commit 4e2bbbbb23e6f4bd452f7f865e51228e1f51efec (patch)
tree 36469f45f17d94f13bc1206e3a5975ba6cbccad5 /src/core/NEON/kernels
parent fbe94da93b5be8745727ba7624b3d011e2bfa383 (diff)
download ComputeLibrary-4e2bbbbb23e6f4bd452f7f865e51228e1f51efec.tar.gz
Add support for dilation > 1 in assembly DepthwiseConvolution
* Resolve COMPMID-5689

Change-Id: I81a3791ad054db59562b76d1c729f2b2168aee8b
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Signed-off-by: Andrew Mundy <andrew.mundy@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8919
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels')
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp 46
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp 65
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp 59
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp 21
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp 27
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp 13
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp 26
-rw-r--r-- src/core/NEON/kernels/assembly/depthwise.hpp 168
-rw-r--r-- src/core/NEON/kernels/assembly/depthwise_common.hpp 27
9 files changed, 309 insertions, 143 deletions
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
index e02998f5a0..c305835107 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -79,6 +79,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
/* Compute a portion of the output tensor with padding. */
virtual void compute_tile_padded(
+ const DepthwiseArgs &args,
unsigned int output_i, unsigned int output_j,
unsigned int output_channel_start, unsigned int output_channel_end,
const TensorSpec<const TInput *> &input,
@@ -93,6 +94,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
* variant.
*/
virtual void compute_row_padded_tile_row(
+ const DepthwiseArgs &args,
const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
const unsigned int output_channel_start, const unsigned int output_channel_end,
const TensorSpec<const TInput *> &input,
@@ -104,6 +106,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
{
this->compute_tile_padded(
+ args,
output_i, output_j, output_channel_start, output_channel_end,
input, output, parameters, working_space
);
@@ -116,6 +119,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
* variant.
*/
virtual void compute_tiles_unpadded(
+ const DepthwiseArgs &args,
unsigned int start_output_i, unsigned int start_output_j,
unsigned int n_tile_rows, unsigned int n_tile_cols,
unsigned int output_channel_start, unsigned int output_channel_end,
@@ -131,6 +135,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
{
this->compute_tile_padded(
+ args,
start_output_i, row_start_output_j,
output_channel_start, output_channel_end,
input, output, parameters, working_space
@@ -142,18 +147,12 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
}
void execute_internal(
- unsigned int n_batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int n_input_channels,
- const PaddingValues &padding,
+ const DepthwiseArgs &args,
const void *input,
size_t ld_input_col,
size_t ld_input_row,
size_t ld_input_batch,
const void *parameters,
- unsigned int output_height,
- unsigned int output_width,
void *output,
size_t ld_output_col,
size_t ld_output_row,
@@ -165,40 +164,40 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
{
// Get and initialise the working space for this thread.
void *thread_working_space =
- static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(n_input_channels);
- this->initialise_working_space(thread_working_space, n_input_channels);
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(args.input_channels);
+ this->initialise_working_space(thread_working_space, args.input_channels);
// Construct convenient representations of the input/output tensors.
TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);
- const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier;
+ const auto n_output_channels = args.input_channels * args.channel_multiplier;
- for (unsigned int batch = 0; batch < n_batches; batch++)
+ for (unsigned int batch = 0; batch < args.n_batches; batch++)
{
// Iterate over rows of the output tensor; we stripe over the tiles.
for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
- start_output_i < output_height;
+ start_output_i < args.output_rows;
start_output_i += n_threads * m_strat->get_output_rows())
{
// Determine what (if any padding) is required on the top/bottom of
// this row of the convolution.
const auto end_output_i = start_output_i + m_strat->get_output_rows();
- const bool pad_output_bottom = output_height < end_output_i;
+ const bool pad_output_bottom = args.output_rows < end_output_i;
- const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top;
+ const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
const bool pad_input_top = start_input_i < 0;
const int end_input_i = start_input_i + m_strat->get_input_rows();
- const bool pad_input_bottom = static_cast<int>(input_height) < end_input_i;
+ const bool pad_input_bottom = static_cast<int>(args.input_rows) < end_input_i;
const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom;
// Iterate over the columns of the output tensor; we attempt to grab as
// much as possible of the unpadded regions, so the loop structure is a
// bit odd.
unsigned int start_output_j = 0;
- while (start_output_j < output_width)
+ while (start_output_j < args.output_cols)
{
- const int start_in_j = start_output_j * this->m_args.stride_cols - padding.left;
+ const int start_in_j = start_output_j * args.stride_cols - args.padding.left;
const bool pad_input_left = start_in_j < 0;
// Determine if we can process a number of unpadded tiles in one go.
@@ -206,16 +205,16 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
if (!pad_input_left)
{
// Determine the maximum number of tiles we could handle.
- n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols();
+ n_unpadded_tiles = (args.output_cols - start_output_j) / m_strat->get_output_cols();
// Handle padding on the right hand edge
- const int tile_stride = m_strat->get_output_cols() * this->m_args.stride_cols;
+ const int tile_stride = m_strat->get_output_cols() * args.stride_cols;
int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;
while (n_unpadded_tiles > 0 &&
- (static_cast<int>(output_width) < end_output_j ||
- static_cast<int>(input_width) < end_input_j))
+ (static_cast<int>(args.output_cols) < end_output_j ||
+ static_cast<int>(args.input_cols) < end_input_j))
{
n_unpadded_tiles--;
end_output_j -= m_strat->get_output_cols();
@@ -230,6 +229,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
{
// Completely unpadded execution
this->compute_tiles_unpadded(
+ args,
start_output_i, start_output_j,
1, n_unpadded_tiles, // Compute a row of unpadded tiles
0, n_output_channels, // Compute all channels
@@ -240,6 +240,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
{
// Top/bottom padding only
this->compute_row_padded_tile_row(
+ args,
start_output_i, start_output_j, n_unpadded_tiles,
0, n_output_channels, // Compute all channels
input_tensor, output_tensor, parameters, thread_working_space
@@ -250,6 +251,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
else
{
this->compute_tile_padded(
+ args,
start_output_i, start_output_j,
0, n_output_channels, // Compute all channels
input_tensor, output_tensor, parameters, thread_working_space
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
new file mode 100644
index 0000000000..c2b861000c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "depthwise_common.hpp"
+
+using arm_gemm::iceildiv;
+
+namespace arm_conv {
+namespace depthwise {
+/*
+ * Compute the reduced problem description for one offset `d` along a single
+ * spatial dimension of a dilated convolution.
+ *
+ * NOTE(review): presumably invoked once per offset in [0, dilation_factor)
+ * and once per dimension (rows, columns) - the caller is not visible here.
+ *
+ * Parameters:
+ *   out_size        - output extent along this dimension; taken by value and
+ *                     overwritten with the reduced extent.
+ *   in_size         - input extent along this dimension; taken by value and
+ *                     overwritten with the reduced extent.
+ *   d               - offset being processed; subtracted from out_size and
+ *                     scaled by stride to form the initial start position.
+ *   dilation_factor - divisor used by every iceildiv call below.
+ *   kernel_size     - kernel extent, used to derive the required input size.
+ *   stride          - stride, used for the start position and required input.
+ *   orig_pad_before - leading padding of the original problem.
+ *
+ * Returns the tuple (out_size, in_size, start_pos, pad_before, pad_after).
+ *
+ * iceildiv comes from arm_gemm/utils.hpp and is not shown here; it is
+ * presumably a rounding-up integer division - TODO confirm.
+ */
+std::tuple<size_t, size_t, size_t, size_t, size_t>
+get_reduced_view_for_dilation(size_t out_size, size_t in_size, const size_t d,
+ const size_t dilation_factor,
+ const size_t kernel_size, const size_t stride,
+ const size_t orig_pad_before) {
+ // Reduced output extent: (out_size - d) divided by dilation_factor via iceildiv.
+ out_size = iceildiv(out_size - d, dilation_factor);
+
+ // Start at d * stride. If that position is still inside the leading padding,
+ // convert the remaining padding into units of dilation_factor (via iceildiv).
+ size_t start_pos = d * stride, pad_before = 0;
+ if (start_pos < orig_pad_before) {
+ pad_before = iceildiv(orig_pad_before - start_pos, dilation_factor);
+ }
+ start_pos += pad_before * dilation_factor - orig_pad_before;  // size_t maths: the right-hand side may wrap (e.g. pad_before == 0); the sum is the intended offset
+
+ // Reduced input extent from start_pos onwards; 0 if start_pos is at or beyond in_size.
+ in_size = start_pos < in_size
+ ? iceildiv(in_size - start_pos, dilation_factor)
+ : 0;
+
+ /* Trailing padding: the part of the required input, (out_size - 1) * stride
+ * + kernel_size, that pad_before + in_size does not cover.
+ * NOTE(review): if the reduced out_size is 0 then (out_size - 1) wraps in
+ * size_t, so pad_after is not meaningful in that case - confirm the caller
+ * skips zero-sized outputs.
+ */
+ const size_t reqd_input = (out_size - 1) * stride + kernel_size;
+ size_t pad_after = 0;
+ if (reqd_input > (pad_before + in_size)) {
+ pad_after = reqd_input - (pad_before + in_size);
+ }
+
+ return std::make_tuple(out_size, in_size, start_pos, pad_before, pad_after);
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
index 70b12919b0..2620b48e17 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -367,6 +367,7 @@ class DepthwiseDepthfirst
protected:
void compute_tile_padded(
+ const DepthwiseArgs &args,
unsigned int output_i, unsigned int output_j,
unsigned int output_channel_start, unsigned int output_channel_end,
const TensorSpec<const TInput *> &input,
@@ -379,13 +380,13 @@ class DepthwiseDepthfirst
auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
// Compute the input pointer array
- const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier;
+ const auto input_channel_start = output_channel_start / args.channel_multiplier;
- const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
- const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
+ const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
@@ -394,8 +395,8 @@ class DepthwiseDepthfirst
input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
input.ld_row, input.ld_col,
ws->input_buffer,
- input_pad_top, this->m_args.input_rows - input_i,
- input_pad_left, this->m_args.input_cols - input_j
+ input_pad_top, args.input_rows - input_i,
+ input_pad_left, args.input_cols - input_j
);
// Compute the output pointer array
@@ -404,8 +405,8 @@ class DepthwiseDepthfirst
output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
output.ld_row, output.ld_col,
ws->output_buffer,
- 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
- 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
);
// Execute the kernel
@@ -416,6 +417,7 @@ class DepthwiseDepthfirst
}
void compute_row_padded_tile_row(
+ const DepthwiseArgs &args,
const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
const unsigned int output_channel_start, const unsigned int output_channel_end,
const TensorSpec<const TInput *> &input,
@@ -430,19 +432,19 @@ class DepthwiseDepthfirst
const auto os = this->get_output_stage();
// Compute top and bottom padding; hence fill in the initial pointer arrays.
- const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier;
- const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+ const auto input_channel_start = output_channel_start / args.channel_multiplier;
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
- const auto input_j = output_j * this->m_args.stride_cols - this->m_args.padding.left;
+ const auto input_j = output_j * args.stride_cols - args.padding.left;
// Valid input rows is the smallest of the input rows that aren't padding for this tile, and the number of rows
// available.
- const auto valid_input_rows = std::min(strat->get_input_rows() - input_pad_top, this->m_args.input_rows - input_i);
- const auto valid_output_rows = std::min(strat->get_output_rows(), this->m_args.output_rows - output_i);
+ const auto valid_input_rows = std::min(strat->get_input_rows() - input_pad_top, args.input_rows - input_i);
+ const auto valid_output_rows = std::min(strat->get_output_rows(), args.output_rows - output_i);
- const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.stride_cols;
+ const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols;
const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
fill_pointer_array<const TInput>(
@@ -450,8 +452,8 @@ class DepthwiseDepthfirst
input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
input.ld_row, input.ld_col,
ws->input_buffer,
- input_pad_top, this->m_args.input_rows - input_i,
- 0, this->m_args.input_cols - input_j // No left padding
+ input_pad_top, args.input_rows - input_i,
+ 0, args.input_cols - input_j // No left padding
);
fill_pointer_array(
@@ -459,8 +461,8 @@ class DepthwiseDepthfirst
output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
output.ld_row, output.ld_col,
ws->output_buffer,
- 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
- 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
);
for (; n_tile_cols; n_tile_cols--)
@@ -492,6 +494,7 @@ class DepthwiseDepthfirst
}
void compute_tiles_unpadded(
+ const DepthwiseArgs &args,
unsigned int output_i, const unsigned int output_j,
unsigned int n_tile_rows, unsigned int n_tile_cols,
unsigned int output_channel_start, unsigned int output_channel_end,
@@ -511,8 +514,8 @@ class DepthwiseDepthfirst
// If the direct kernel is supported, then use it.
// Compute the base pointers we'll use in the tile.
auto outptr = output.base + output_channel_start + output_i * output.ld_row + output_j * output.ld_col;
- const int start_input_i = output_i * this->m_args.stride_rows - this->m_args.padding.top;
- const int start_input_j = output_j * this->m_args.stride_cols - this->m_args.padding.left;
+ const int start_input_i = output_i * args.stride_rows - args.padding.top;
+ const int start_input_j = output_j * args.stride_cols - args.padding.left;
auto inptr = input.base + output_channel_start + start_input_i * input.ld_row + start_input_j * input.ld_col;
// Execute the kernel
@@ -528,10 +531,10 @@ class DepthwiseDepthfirst
{
// Otherwise, we repeatedly call the padded kernel but use our knowledge
// of the tensor structure to avoid recomputing the pointer array.
- const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier;
+ const auto input_channel_start = output_channel_start / args.channel_multiplier;
const auto n_input_pointers = this->m_strat->get_input_rows() * this->m_strat->get_input_cols();
- const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.stride_cols;
+ const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols;
const auto n_output_pointers = this->m_strat->get_output_rows() * this->m_strat->get_output_cols();
const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
@@ -539,16 +542,16 @@ class DepthwiseDepthfirst
// each subsequent tile we simply update the pointers.
for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
{
- const int input_i = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
- const int input_j = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
+ const int input_i = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const int input_j = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
fill_pointer_array<const TInput>(
ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
input.ld_row, input.ld_col,
ws->input_buffer,
- 0, this->m_args.input_rows,
- 0, this->m_args.input_cols
+ 0, args.input_rows,
+ 0, args.input_cols
);
// Compute the output pointer array
@@ -557,8 +560,8 @@ class DepthwiseDepthfirst
output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
output.ld_row, output.ld_col,
ws->output_buffer,
- 0, this->m_args.output_rows,
- 0, this->m_args.output_cols
+ 0, args.output_rows,
+ 0, args.output_cols
);
for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
index 9f53f7cc6f..b058ce26f2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -248,6 +248,7 @@ class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon<TInput, TWei
protected:
void compute_tile_padded(
+ const DepthwiseArgs &args,
unsigned int output_i, unsigned int output_j,
unsigned int channel_start, unsigned int channel_end,
const TensorSpec<const TInput *> &input,
@@ -259,24 +260,24 @@ class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon<TInput, TWei
// Get the working space
WorkingSpace *ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
- const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
- const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
+ const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
fill_pointer_array_generic_kernel<const TInput>(
ws->inptr_array,
this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
- this->m_args.kernel_rows, this->m_args.kernel_cols,
- this->m_args.stride_rows, this->m_args.stride_cols,
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols,
input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
input.ld_row, input.ld_col,
ws->input_buffer,
- input_pad_top, this->m_args.input_rows - input_i,
- input_pad_left, this->m_args.input_cols - input_j
+ input_pad_top, args.input_rows - input_i,
+ input_pad_left, args.input_cols - input_j
);
// Compute the output pointer array
@@ -285,15 +286,15 @@ class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon<TInput, TWei
output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
output.ld_row, output.ld_col,
ws->output_buffer,
- 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
- 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
);
// Execute the kernel
DepthwiseDepthfirstGenericKernelCall<OutputStage>::execute(
reinterpret_cast<const StratType *>(this->m_strat.get()), ws,
this->get_output_stage(), m_bias, parameters,
- this->m_args.kernel_rows * this->m_args.kernel_cols,
+ args.kernel_rows * args.kernel_cols,
channel_end - channel_start
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
index e58467b0f4..cef568fadd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -525,6 +525,7 @@ class DepthwiseDepthfirstMultiplier : public DepthfirstDriver<TInput, TWeight, T
}
void compute_tile_padded(
+ const DepthwiseArgs &args,
unsigned int output_i, unsigned int output_j,
unsigned int output_channel_start, unsigned int output_channel_end,
const TensorSpec<const TInput *> &input,
@@ -536,11 +537,11 @@ class DepthwiseDepthfirstMultiplier : public DepthfirstDriver<TInput, TWeight, T
// Get the working space
auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
- const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
- const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
+ const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
@@ -551,40 +552,40 @@ class DepthwiseDepthfirstMultiplier : public DepthfirstDriver<TInput, TWeight, T
output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
output.ld_row, output.ld_col,
ws->output_buffer,
- 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
- 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
);
// Compute the parameter stride
- DepthwiseArgs single_iter(this->m_args);
+ DepthwiseArgs single_iter(args);
single_iter.input_channels = 1;
const size_t parameter_stride = reinterpret_cast<const StratType *>(this->m_strat.get())
->get_storage_size(single_iter);
for (; output_channel_start < output_channel_end;
- output_channel_start += this->m_args.channel_multiplier)
+ output_channel_start += args.channel_multiplier)
{
// Compute the input pointer array
- const auto input_channel = output_channel_start / this->m_args.channel_multiplier;
+ const auto input_channel = output_channel_start / args.channel_multiplier;
// Construct the input patch
depthfirst_multiplier::PrepareInputSample<is_generic>::execute(
- this->m_args, ws, this->m_strat.get(),
+ args, ws, this->m_strat.get(),
input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col,
- input_pad_top, this->m_args.input_rows - input_i,
- input_pad_left, this->m_args.input_cols - input_j
+ input_pad_top, args.input_rows - input_i,
+ input_pad_left, args.input_cols - input_j
);
// Execute the kernel
depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::execute(
- this->m_args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
+ args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
parameters, m_bias
);
// Update the output pointers
for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++)
{
- ws->outptr_array[n] += this->m_args.channel_multiplier;
+ ws->outptr_array[n] += args.channel_multiplier;
}
// Progress the parameters
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
index 1ee19e5075..0f91fe363c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
@@ -55,7 +55,9 @@ struct DepthwiseImplementation
DepthwiseCommon<TInput, TWeight, TOutput> *get_instance(const DepthwiseArgs &args, const OutputStage &os) const
{
- return initialise(args, os);
+ auto impl = initialise(args, os);
+ impl->set_name(std::string(name));
+ return impl;
}
};
@@ -136,14 +138,7 @@ UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &a
{
const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr;
const bool success = find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, impl);
-
- if(success)
- {
- auto i = impl->get_instance(args, os);
- i->set_name(impl->name);
- return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(i);
- }
- return nullptr;
+ return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr);
}
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
index f3160fba27..2b2e6f3555 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -332,18 +332,12 @@ class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
}
void execute_internal(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int n_input_channels,
- const PaddingValues &padding,
+ const DepthwiseArgs &args,
const void *input,
size_t ld_input_col,
size_t ld_input_row,
size_t ld_input_batch,
const void *parameters,
- unsigned int output_height,
- unsigned int output_width,
void *output,
size_t ld_output_col,
size_t ld_output_row,
@@ -359,7 +353,7 @@ class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
this->initialise_working_space(thread_working_space);
auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);
- const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier;
+ const auto n_output_channels = args.input_channels * args.channel_multiplier;
const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());
// Get typed pointers
@@ -368,23 +362,23 @@ class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
auto weights = reinterpret_cast<const TWeight *>(parameters);
// Iterate over batches
- for (; batches; batches--)
+ for (auto batches = args.n_batches; batches; batches--)
{
// NOTE: Other loop orderings are possible and it would be worth
// investigating them.
// Within a batch, stripe threads across rows.
for (auto start_output_i = thread_id * m_strat->get_output_rows();
- start_output_i < output_height;
+ start_output_i < args.output_rows;
start_output_i += n_threads * m_strat->get_output_rows())
{
// Determine what (if any padding) is required on the top/bottom of
// this row of the convolution.
- const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top;
+ const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
- const unsigned int valid_input_rows = input_i > input_height ? 0 : input_height - input_i;
- const unsigned int valid_output_rows = output_height - start_output_i;
+ const unsigned int valid_input_rows = input_i > args.input_rows ? 0 : args.input_rows - input_i;
+ const unsigned int valid_output_rows = args.output_rows - start_output_i;
auto inptr_row = input_batch + input_i*ld_input_row;
auto outptr_row = output_batch + start_output_i * ld_output_row;
@@ -392,10 +386,10 @@ class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
// Execute the kernel
this->execute_kernel(
inptr_row, ld_input_row, ld_input_col, vl,
- input_pad_top, valid_input_rows, padding.left, input_width,
+ input_pad_top, valid_input_rows, args.padding.left, args.input_cols,
weights, this->m_bias,
outptr_row, ld_output_row, ld_output_col, vl,
- valid_output_rows, output_width,
+ valid_output_rows, args.output_cols,
0 /* first channel */, n_output_channels,
ws
);
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index 3998dfbc9a..8eb278c22e 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,8 +38,8 @@ struct DepthwiseConfig
std::string filter = "";
DepthwiseConfig(DepthwiseMethod method)
- : method(method) {};
- DepthwiseConfig() {};
+ : method(method){};
+ DepthwiseConfig(){};
};
struct DepthwiseArgs
@@ -48,6 +48,7 @@ struct DepthwiseArgs
unsigned int kernel_rows, kernel_cols;
unsigned int stride_rows, stride_cols;
+ unsigned int dilation_rows, dilation_cols;
unsigned int n_batches, input_rows, input_cols, input_channels;
unsigned int output_rows, output_cols;
@@ -65,14 +66,48 @@ struct DepthwiseArgs
const CPUInfo *cpu_info,
unsigned int kernel_rows, unsigned int kernel_cols,
unsigned int stride_rows, unsigned int stride_cols,
+ unsigned int dilation_rows, unsigned int dilation_cols,
+ unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding, arm_gemm::Activation activation,
+
+ const DepthwiseConfig *config)
+ : cpu_info(cpu_info),
+ kernel_rows(kernel_rows),
+ kernel_cols(kernel_cols),
+ stride_rows(stride_rows),
+ stride_cols(stride_cols),
+ dilation_rows(dilation_rows),
+ dilation_cols(dilation_cols),
+ n_batches(n_batches),
+ input_rows(input_rows),
+ input_cols(input_cols),
+ input_channels(input_channels),
+ output_rows(output_rows),
+ output_cols(output_cols),
+ channel_multiplier(channel_multiplier),
+ padding(padding),
+ activation(activation),
+ config(config)
+ {
+ }
+
+ DepthwiseArgs(
+ const CPUInfo *cpu_info,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
unsigned int input_channels,
unsigned int output_rows, unsigned int output_cols,
unsigned int channel_multiplier,
PaddingValues padding, arm_gemm::Activation activation,
const DepthwiseConfig *config)
- : cpu_info(cpu_info), kernel_rows(kernel_rows), kernel_cols(kernel_cols), stride_rows(stride_rows), stride_cols(stride_cols), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
- input_channels(input_channels), output_rows(output_rows), output_cols(output_cols), channel_multiplier(channel_multiplier), padding(padding), activation(activation), config(config)
+ : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows,
+ stride_cols, 1, 1, n_batches, input_rows, input_cols,
+ input_channels, output_rows, output_cols,
+ channel_multiplier, padding, activation, config)
{
}
};
@@ -80,28 +115,30 @@ struct DepthwiseArgs
template <typename TInput, typename TWeight, typename TOutput>
class DepthwiseCommon : public IDepthwiseCommon
{
-private:
- std::string _name{};
-
-protected:
+ protected:
const DepthwiseArgs m_args; // Copy of arguments
+ std::string m_name{};
-public:
- std::string name() const
+ public:
+ DepthwiseCommon(const DepthwiseArgs &args)
+ : m_args(args){};
+ DepthwiseCommon(DepthwiseCommon &) = delete;
+ DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
+
+ std::string name() const override
{
- return _name;
+ return m_name;
}
- void set_name(const std::string &n)
+ void set_name(std::string name)
{
- _name = n;
+ // Only allow the name to be set once
+ if (m_name.empty())
+ {
+ m_name = name;
+ }
}
- DepthwiseCommon(const DepthwiseArgs &args)
- : m_args(args) {};
- DepthwiseCommon(DepthwiseCommon &) = delete;
- DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
-
void execute(
const void *const input,
const void *const parameters,
@@ -168,34 +205,77 @@ public:
unsigned int thread_id,
unsigned int n_threads) const override final
{
- this->execute_internal(
- batches, input_height, input_width, channels, padding, input,
- ld_input_col, ld_input_row, ld_input_batch, parameters, output_height,
- output_width, output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, n_threads);
+ // Construct a new set of arguments to reflect that we might have been
+ // passed different input/output tensors. Dilation is handled at this
+ // level, so we set the dilation in the arguments to one (i.e. no dilation).
+ DepthwiseArgs args(this->m_args);
+ args.n_batches = batches;
+ args.input_rows = input_height;
+ args.input_cols = input_width;
+ args.input_channels = channels;
+ args.output_rows = output_height;
+ args.output_cols = output_width;
+ args.padding = padding;
+ args.dilation_rows = args.dilation_cols = 1;
+
+ auto ld_input_col_d = ld_input_col * m_args.dilation_cols;
+ auto ld_input_row_d = ld_input_row * m_args.dilation_rows;
+ auto ld_output_col_d = ld_output_col * m_args.dilation_cols;
+ auto ld_output_row_d = ld_output_row * m_args.dilation_rows;
+
+ for (size_t drow = 0; drow < m_args.dilation_rows; drow++)
+ {
+ size_t start_i;
+ std::tie(args.output_rows, args.input_rows, start_i,
+ args.padding.top, args.padding.bottom) =
+ get_reduced_view_for_dilation(
+ output_height, input_height, drow, m_args.dilation_rows,
+ m_args.kernel_rows, m_args.stride_rows, padding.top);
+
+ auto input_row = static_cast<const TInput *>(input) + start_i * ld_input_row;
+ auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row;
+
+ if (args.output_rows)
+ {
+ for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
+ {
+ size_t start_j;
+ std::tie(args.output_cols, args.input_cols, start_j,
+ args.padding.left, args.padding.right) =
+ get_reduced_view_for_dilation(
+ output_width, input_width, dcol, m_args.dilation_cols,
+ m_args.kernel_cols, m_args.stride_cols, padding.left);
+
+ const TInput *input_col = input_row + start_j * ld_input_col;
+ TOutput *output_col = output_row + dcol * ld_output_col;
+
+ if (args.output_cols)
+ {
+ this->execute_internal(
+ args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters,
+ output_col, ld_output_col_d, ld_output_row_d, ld_output_batch,
+ working_space, thread_id, n_threads);
+ }
+ }
+ }
+ }
}
-protected:
+ protected:
virtual void execute_internal(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const PaddingValues &,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- unsigned int output_height,
- unsigned int output_width,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
+ const DepthwiseArgs &instance_args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
template <typename TInput, typename TWeight = TInput, typename TOutput = TInput>
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
index 52963ab357..fea6326897 100644
--- a/src/core/NEON/kernels/assembly/depthwise_common.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_gemm.hpp"
#include "common.hpp"
+#include <cstddef>
+#include <tuple>
namespace arm_conv
{
@@ -64,6 +66,9 @@ class IDepthwiseCommon
public:
virtual ~IDepthwiseCommon() = default;
+ // Get the name of the depthwise implementation
+ virtual std::string name() const = 0;
+
// Determine the amount of storage space required for the rearranged weights
// and bias.
virtual size_t get_storage_size(void) const = 0;
@@ -127,5 +132,25 @@ public:
unsigned int n_threads) const = 0;
};
+// To handle a dilation factor of D execute the kernel once for each d in
+// [0..D). Each `d` corresponds to a portion or "view" of the input and output
+// tensors. The output view corresponds to every Dth pixel starting from `d`;
+// this function computes how many pixels are covered. The input view consists
+// of an amount of before padding, every Dth pixel starting from an offset, and
+// some after padding. This function computes the start padding, input offset,
+// number of valid input pixels, and the after padding.
+//
+// Returns a tuple of, in order:
+// - Number of valid output pixels corresponding to `d`
+// - Number of valid input pixels corresponding to `d`
+// - Offset of the first input pixel corresponding to `d`
+// - Amount of padding before the view for `d`
+// - Amount of padding after the view for `d`
+std::tuple<size_t, size_t, size_t, size_t, size_t>
+get_reduced_view_for_dilation(
+ size_t out_size, size_t in_size,
+ size_t d, size_t dilation_factor,
+ size_t kernel_size, size_t stride,
+ size_t pad_before);
+
} // namespace depthwise
} // namespace arm_conv