aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/assembly/depthwise.hpp
diff options
context:
space:
mode:
authorPablo Marquez Tello <pablo.tello@arm.com>2023-01-09 17:21:01 +0000
committerPablo Marquez Tello <pablo.tello@arm.com>2023-02-08 11:05:08 +0000
commit4e2bbbbb23e6f4bd452f7f865e51228e1f51efec (patch)
tree36469f45f17d94f13bc1206e3a5975ba6cbccad5 /src/core/NEON/kernels/assembly/depthwise.hpp
parentfbe94da93b5be8745727ba7624b3d011e2bfa383 (diff)
downloadComputeLibrary-4e2bbbbb23e6f4bd452f7f865e51228e1f51efec.tar.gz
Add support for dilation > 1 in assembly DepthwiseConvolution
* Resolve COMPMID-5689 Change-Id: I81a3791ad054db59562b76d1c729f2b2168aee8b Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com> Signed-off-by: Andrew Mundy <andrew.mundy@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8919 Reviewed-by: Jakub Sujak <jakub.sujak@arm.com> Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/assembly/depthwise.hpp')
-rw-r--r--src/core/NEON/kernels/assembly/depthwise.hpp168
1 files changed, 124 insertions, 44 deletions
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index 3998dfbc9a..8eb278c22e 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,8 +38,8 @@ struct DepthwiseConfig
std::string filter = "";
DepthwiseConfig(DepthwiseMethod method)
- : method(method) {};
- DepthwiseConfig() {};
+ : method(method){};
+ DepthwiseConfig(){};
};
struct DepthwiseArgs
@@ -48,6 +48,7 @@ struct DepthwiseArgs
unsigned int kernel_rows, kernel_cols;
unsigned int stride_rows, stride_cols;
+ unsigned int dilation_rows, dilation_cols;
unsigned int n_batches, input_rows, input_cols, input_channels;
unsigned int output_rows, output_cols;
@@ -65,14 +66,48 @@ struct DepthwiseArgs
const CPUInfo *cpu_info,
unsigned int kernel_rows, unsigned int kernel_cols,
unsigned int stride_rows, unsigned int stride_cols,
+ unsigned int dilation_rows, unsigned int dilation_cols,
+ unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding, arm_gemm::Activation activation,
+
+ const DepthwiseConfig *config)
+ : cpu_info(cpu_info),
+ kernel_rows(kernel_rows),
+ kernel_cols(kernel_cols),
+ stride_rows(stride_rows),
+ stride_cols(stride_cols),
+ dilation_rows(dilation_rows),
+ dilation_cols(dilation_cols),
+ n_batches(n_batches),
+ input_rows(input_rows),
+ input_cols(input_cols),
+ input_channels(input_channels),
+ output_rows(output_rows),
+ output_cols(output_cols),
+ channel_multiplier(channel_multiplier),
+ padding(padding),
+ activation(activation),
+ config(config)
+ {
+ }
+
+ DepthwiseArgs(
+ const CPUInfo *cpu_info,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
unsigned int input_channels,
unsigned int output_rows, unsigned int output_cols,
unsigned int channel_multiplier,
PaddingValues padding, arm_gemm::Activation activation,
const DepthwiseConfig *config)
- : cpu_info(cpu_info), kernel_rows(kernel_rows), kernel_cols(kernel_cols), stride_rows(stride_rows), stride_cols(stride_cols), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
- input_channels(input_channels), output_rows(output_rows), output_cols(output_cols), channel_multiplier(channel_multiplier), padding(padding), activation(activation), config(config)
+ : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows,
+ stride_cols, 1, 1, n_batches, input_rows, input_cols,
+ input_channels, output_rows, output_cols,
+ channel_multiplier, padding, activation, config)
{
}
};
@@ -80,28 +115,30 @@ struct DepthwiseArgs
template <typename TInput, typename TWeight, typename TOutput>
class DepthwiseCommon : public IDepthwiseCommon
{
-private:
- std::string _name{};
-
-protected:
+ protected:
const DepthwiseArgs m_args; // Copy of arguments
+ std::string m_name{};
-public:
- std::string name() const
+ public:
+ DepthwiseCommon(const DepthwiseArgs &args)
+ : m_args(args){};
+ DepthwiseCommon(DepthwiseCommon &) = delete;
+ DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
+
+ std::string name() const override
{
- return _name;
+ return m_name;
}
- void set_name(const std::string &n)
+ void set_name(std::string name)
{
- _name = n;
+ // Only allow the name to be set once
+ if (m_name.empty())
+ {
+ m_name = name;
+ }
}
- DepthwiseCommon(const DepthwiseArgs &args)
- : m_args(args) {};
- DepthwiseCommon(DepthwiseCommon &) = delete;
- DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
-
void execute(
const void *const input,
const void *const parameters,
@@ -168,34 +205,77 @@ public:
unsigned int thread_id,
unsigned int n_threads) const override final
{
- this->execute_internal(
- batches, input_height, input_width, channels, padding, input,
- ld_input_col, ld_input_row, ld_input_batch, parameters, output_height,
- output_width, output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, n_threads);
+ // Construct a new set of arguments to reflect that we might have been
+ // passed different input/output tensors. Dilation is handled at this
+ // level, so we set the dilation in the arguments to one (no dilation).
+ DepthwiseArgs args(this->m_args);
+ args.n_batches = batches;
+ args.input_rows = input_height;
+ args.input_cols = input_width;
+ args.input_channels = channels;
+ args.output_rows = output_height;
+ args.output_cols = output_width;
+ args.padding = padding;
+ args.dilation_rows = args.dilation_cols = 1;
+
+ auto ld_input_col_d = ld_input_col * m_args.dilation_cols;
+ auto ld_input_row_d = ld_input_row * m_args.dilation_rows;
+ auto ld_output_col_d = ld_output_col * m_args.dilation_cols;
+ auto ld_output_row_d = ld_output_row * m_args.dilation_rows;
+
+ for (size_t drow = 0; drow < m_args.dilation_rows; drow++)
+ {
+ size_t start_i;
+ std::tie(args.output_rows, args.input_rows, start_i,
+ args.padding.top, args.padding.bottom) =
+ get_reduced_view_for_dilation(
+ output_height, input_height, drow, m_args.dilation_rows,
+ m_args.kernel_rows, m_args.stride_rows, padding.top);
+
+ auto input_row = static_cast<const TInput *>(input) + start_i * ld_input_row;
+ auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row;
+
+ if (args.output_rows)
+ {
+ for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
+ {
+ size_t start_j;
+ std::tie(args.output_cols, args.input_cols, start_j,
+ args.padding.left, args.padding.right) =
+ get_reduced_view_for_dilation(
+ output_width, input_width, dcol, m_args.dilation_cols,
+ m_args.kernel_cols, m_args.stride_cols, padding.left);
+
+ const TInput *input_col = input_row + start_j * ld_input_col;
+ TOutput *output_col = output_row + dcol * ld_output_col;
+
+ if (args.output_cols)
+ {
+ this->execute_internal(
+ args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters,
+ output_col, ld_output_col_d, ld_output_row_d, ld_output_batch,
+ working_space, thread_id, n_threads);
+ }
+ }
+ }
+ }
}
-protected:
+ protected:
virtual void execute_internal(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const PaddingValues &,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- unsigned int output_height,
- unsigned int output_width,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
+ const DepthwiseArgs &instance_args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
template <typename TInput, typename TWeight = TInput, typename TOutput = TInput>