Diffstat (limited to 'src/core/NEON/kernels/assembly')
-rw-r--r--  src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h |  88
-rw-r--r--  src/core/NEON/kernels/assembly/common.hpp                                    |  34
-rw-r--r--  src/core/NEON/kernels/assembly/depthwise.hpp                                 | 351
-rw-r--r--  src/core/NEON/kernels/assembly/depthwise_common.hpp                          | 146
-rw-r--r--  src/core/NEON/kernels/assembly/pool_common.hpp                               |  85
-rw-r--r--  src/core/NEON/kernels/assembly/pooling.hpp                                   | 168
-rw-r--r--  src/core/NEON/kernels/assembly/premultiply.hpp                               |  78
-rw-r--r--  src/core/NEON/kernels/assembly/winograd.hpp                                  | 265
8 files changed, 1048 insertions(+), 167 deletions(-)
diff --git a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
deleted file mode 100644
index a956898403..0000000000
--- a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-#define SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-
-#include "src/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** This class is a wrapper for the depthwise convolution assembly kernels. */
-class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDepthwiseConvolutionAssemblyKernelWrapper";
- }
-
- /** Default constructor */
- NEDepthwiseConvolutionAssemblyKernelWrapper()
- : _kernel(nullptr)
- {
- }
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionAssemblyKernelWrapper(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
- /** Default Move Constructor. */
- NEDepthwiseConvolutionAssemblyKernelWrapper(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
- /** Default move assignment operator */
- NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] kernel Pointer to an assembly kernel implementation.
- */
- void configure(depthwise::IDepthwiseConvolution *kernel)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
- _kernel = kernel;
- Window win;
- win.set(Window::DimX, Window::Dimension(0, _kernel->get_window(), 1));
- INEKernel::configure(win);
- }
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- auto first = window.x().start();
- auto last = window.x().end();
- _kernel->run(first, last, info.thread_id);
- }
-
-private:
- depthwise::IDepthwiseConvolution *_kernel;
-};
-} // namespace arm_compute
-#endif /* SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H */
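For context (illustrative only, not part of the patch), a minimal sketch of how the removed wrapper was typically driven: configure() registers a 1-D window spanning [0, asm_kernel->get_window()), and run() forwards the window slice and thread id to the assembly kernel. The helper name, the single-threaded invocation and the include paths are assumptions.

    #include "arm_compute/core/CPP/CPPTypes.h"
    #include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"

    // Hypothetical helper: wrap an existing assembly kernel and run it over the
    // whole configured window on the calling thread.
    void run_wrapped_depthwise(depthwise::IDepthwiseConvolution *asm_kernel)
    {
        arm_compute::NEDepthwiseConvolutionAssemblyKernelWrapper wrapper;
        wrapper.configure(asm_kernel);        // window covers [0, asm_kernel->get_window())

        arm_compute::ThreadInfo info{};       // thread_id 0, single thread
        wrapper.run(wrapper.window(), info);  // forwards [start, end) to asm_kernel->run()
    }
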
diff --git a/src/core/NEON/kernels/assembly/common.hpp b/src/core/NEON/kernels/assembly/common.hpp
new file mode 100644
index 0000000000..d82d11cae0
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/common.hpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+namespace arm_conv
+{
+struct PaddingValues
+{
+ unsigned int left, top, right, bottom;
+};
+
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
new file mode 100644
index 0000000000..13c2d314e4
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "arm_gemm_local.hpp"
+#include "depthwise_common.hpp"
+#include "premultiply.hpp"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+struct DepthwiseConfig
+{
+ DepthwiseMethod method = DepthwiseMethod::DEFAULT;
+ std::string filter = "";
+
+ DepthwiseConfig(DepthwiseMethod method) : method(method){};
+ DepthwiseConfig(){};
+};
+
+struct DepthwiseArgs
+{
+ const CPUInfo *cpu_info;
+
+ unsigned int kernel_rows, kernel_cols;
+ unsigned int stride_rows, stride_cols;
+ unsigned int dilation_rows, dilation_cols;
+
+ unsigned int n_batches, input_rows, input_cols, input_channels;
+ unsigned int output_rows, output_cols;
+ unsigned int channel_multiplier;
+
+ PaddingValues padding;
+
+ arm_gemm::Activation activation;
+
+ const DepthwiseConfig *config;
+
+ bool fast_mode = false;
+
+ DepthwiseArgs(const CPUInfo *cpu_info,
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ unsigned int stride_rows,
+ unsigned int stride_cols,
+ unsigned int dilation_rows,
+ unsigned int dilation_cols,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding,
+ arm_gemm::Activation activation,
+
+ const DepthwiseConfig *config)
+ : cpu_info(cpu_info),
+ kernel_rows(kernel_rows),
+ kernel_cols(kernel_cols),
+ stride_rows(stride_rows),
+ stride_cols(stride_cols),
+ dilation_rows(dilation_rows),
+ dilation_cols(dilation_cols),
+ n_batches(n_batches),
+ input_rows(input_rows),
+ input_cols(input_cols),
+ input_channels(input_channels),
+ output_rows(output_rows),
+ output_cols(output_cols),
+ channel_multiplier(channel_multiplier),
+ padding(padding),
+ activation(activation),
+ config(config)
+ {
+ }
+
+ DepthwiseArgs(const CPUInfo *cpu_info,
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ unsigned int stride_rows,
+ unsigned int stride_cols,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding,
+ arm_gemm::Activation activation,
+ const DepthwiseConfig *config)
+ : DepthwiseArgs(cpu_info,
+ kernel_rows,
+ kernel_cols,
+ stride_rows,
+ stride_cols,
+ 1,
+ 1,
+ n_batches,
+ input_rows,
+ input_cols,
+ input_channels,
+ output_rows,
+ output_cols,
+ channel_multiplier,
+ padding,
+ activation,
+ config)
+ {
+ }
+};
+
+template <typename TInput>
+struct Tile
+{
+ TInput *array;
+
+ unsigned int tile_rows = 0;
+ unsigned int tile_cols = 0;
+ unsigned int tile_channels = 0;
+
+ Tile(TInput *array, unsigned int tile_rows, unsigned int tile_cols, unsigned int tile_channels)
+ : array(array), tile_rows(tile_rows), tile_cols(tile_cols), tile_channels(tile_channels)
+ {
+ }
+
+ Tile() : Tile(nullptr, 0, 0, 0)
+ {
+ }
+
+ void load_from(const TInput *input,
+ const unsigned int ld_row,
+ const unsigned int ld_col,
+ const unsigned int n_rows,
+ const unsigned int n_cols,
+ const int input_i,
+ const int input_j,
+ const unsigned int channel_multiplier) const
+ {
+ const auto pad_top = input_i < 0 ? -input_i : 0;
+ const auto pad_left = input_j < 0 ? -input_j : 0;
+
+ const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top;
+ const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left;
+
+ if (padded_rows < tile_rows || padded_cols < tile_cols)
+ {
+ memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput));
+ }
+
+ do_premultiply<TInput>((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row,
+ ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
+ tile_cols * tile_channels, tile_channels, padded_rows, padded_cols,
+ tile_channels / channel_multiplier, channel_multiplier);
+ }
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthwiseCommon : public IDepthwiseCommon
+{
+protected:
+ const DepthwiseArgs m_args; // Copy of arguments
+ std::string m_name{};
+
+public:
+ DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){};
+ DepthwiseCommon(DepthwiseCommon &) = delete;
+ DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
+
+ std::string name() const override
+ {
+ return m_name;
+ }
+
+ void set_name(std::string name)
+ {
+ // Only allow the name to be set once
+ if (m_name.empty())
+ {
+ m_name = name;
+ }
+ }
+
+ void execute(const void *const input,
+ const void *const parameters,
+ void *const output,
+ void *const working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads) const override final
+ {
+ const size_t ld_input_col = m_args.input_channels;
+ const size_t ld_input_row = ld_input_col * m_args.input_cols;
+ const size_t ld_input_batch = ld_input_row * m_args.input_rows;
+ const size_t ld_output_col = m_args.input_channels * m_args.channel_multiplier;
+ const size_t ld_output_row = ld_output_col * m_args.output_cols;
+ const size_t ld_output_batch = ld_output_row * m_args.output_rows;
+
+ execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row,
+ ld_output_batch, working_space, thread_id, n_threads);
+ }
+
+ void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *const parameters,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *const working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads) const override final
+ {
+ execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input,
+ ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output,
+ ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads);
+ }
+
+ void execute(unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &padding,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const override final
+ {
+ // Construct a new set of arguments to reflect that we might have been
+ // passed different input/output tensors. Dilation is handled at this
+ // level, so the dilation in the arguments is reset to one.
+ DepthwiseArgs args(this->m_args);
+ args.n_batches = batches;
+ args.input_rows = input_height;
+ args.input_cols = input_width;
+ args.input_channels = channels;
+ args.output_rows = output_height;
+ args.output_cols = output_width;
+ args.padding = padding;
+ args.dilation_rows = args.dilation_cols = 1;
+
+ auto ld_input_col_d = ld_input_col * m_args.dilation_cols;
+ auto ld_input_row_d = ld_input_row * m_args.dilation_rows;
+ auto ld_output_col_d = ld_output_col * m_args.dilation_cols;
+ auto ld_output_row_d = ld_output_row * m_args.dilation_rows;
+
+ for (size_t drow = 0; drow < m_args.dilation_rows; drow++)
+ {
+ size_t start_i;
+ std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) =
+ get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows,
+ m_args.kernel_rows, m_args.stride_rows, padding.top);
+
+ auto input_row = static_cast<const TInput *>(input) + start_i * ld_input_row;
+ auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row;
+
+ if (args.output_rows)
+ {
+ for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
+ {
+ size_t start_j;
+ std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) =
+ get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols,
+ m_args.kernel_cols, m_args.stride_cols, padding.left);
+
+ const TInput *input_col = input_row + start_j * ld_input_col;
+ TOutput *output_col = output_row + dcol * ld_output_col;
+
+ if (args.output_cols)
+ {
+ this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch,
+ parameters, output_col, ld_output_col_d, ld_output_row_d,
+ ld_output_batch, working_space, thread_id, n_threads);
+ }
+ }
+ }
+ }
+ }
+
+protected:
+ virtual void execute_internal(const DepthwiseArgs &instance_args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual bool uses_premultiply() const
+ {
+ return true;
+ }
+};
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput>
+using UniqueDepthwiseCommon = std::unique_ptr<DepthwiseCommon<TInput, TWeight, TOutput>>;
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+KernelDescription get_depthwise_method(const DepthwiseArgs &, const OutputStage & = {});
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &, const OutputStage & = {});
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+std::vector<KernelDescription> get_compatible_kernels(const DepthwiseArgs &, const OutputStage & = {});
+
+} // namespace depthwise
+} // namespace arm_conv
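To show how the pieces above fit together, here is a hedged usage sketch (not part of the patch): build a DepthwiseArgs with the simpler constructor, obtain a kernel through the depthwise<TInput>() factory declared above, pack the weights and bias, then execute. The shapes, the NHWC fp32 layout, the helper name and the include path are illustrative assumptions; buffer handling is simplified with std::vector.

    #include "depthwise.hpp"

    #include <cstdint>
    #include <vector>

    using namespace arm_conv::depthwise;

    // Hypothetical helper: 3x3 stride-1 depthwise, 1 batch, 56x56 input, 64 channels,
    // same-size output, channel multiplier 1, 1-pixel padding on every side.
    void run_depthwise_fp32(const CPUInfo *ci,
                            const float *src, const float *weights, const float *bias,
                            float *dst)
    {
        DepthwiseConfig cfg(DepthwiseMethod::DEFAULT);
        DepthwiseArgs   args(ci, 3, 3, 1, 1,
                             1, 56, 56, 64, 56, 56, 1,
                             arm_conv::PaddingValues{1, 1, 1, 1},
                             arm_gemm::Activation(), &cfg);

        auto dwc = depthwise<float>(args); // select a kernel for these arguments

        std::vector<uint8_t> packed(dwc->get_storage_size());
        dwc->pack_parameters(packed.data(), bias, weights); // weights in HWIO order

        std::vector<uint8_t> scratch(dwc->get_working_size(1));
        dwc->execute(src, packed.data(), dst, scratch.data(), 0 /*thread_id*/, 1 /*n_threads*/);
    }
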
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
new file mode 100644
index 0000000000..5ff848e281
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "common.hpp"
+#include <cstddef>
+#include <tuple>
+
+namespace arm_conv
+{
+namespace depthwise
+{
+using arm_gemm::Nothing;
+
+enum class DepthwiseMethod
+{
+ DEFAULT,
+ DEPTHFIRST,
+ PLANAR,
+};
+
+struct KernelDescription
+{
+ DepthwiseMethod method = DepthwiseMethod::DEFAULT;
+ std::string name = "";
+ bool is_default = false;
+ uint64_t cycle_estimate = 0;
+
+ KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate)
+ : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate)
+ {
+ }
+
+ KernelDescription() noexcept {};
+};
+
+class IDepthwiseCommon
+{
+public:
+ virtual ~IDepthwiseCommon() = default;
+
+ // Get the name of the depthwise implementation
+ virtual std::string name() const = 0;
+
+ // Determine the amount of storage space required for the rearranged weights
+ // and bias.
+ virtual size_t get_storage_size(void) const = 0;
+
+ // Rearrange the weights and biases into a storage buffer.
+ // Accepts a pointer to a buffer into which to store the packed parameters, a
+ // pointer to the bias vector (which may be nullptr in the case of no bias) and
+ // a pointer to the array of weights (stored in HWIO order).
+ virtual void pack_parameters(
+ void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0;
+
+ // Determine the amount of working space required
+ virtual size_t get_working_size(unsigned int n_threads) const = 0;
+
+ // Execute the convolution over the specified area of memory.
+ virtual void execute(const void *input, // Pointer to input tensor
+ const void *parameters, // Packed parameters buffer
+ void *output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual void execute(const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual void execute(unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+};
+
+// To handle a dilation factor of D, execute the kernel once for each d in
+// [0..D). Each `d` corresponds to a portion, or "view", of the input and output
+// tensors. The output view covers every Dth pixel starting from `d`; this
+// function computes how many pixels that is. The input view consists of some
+// padding before, every Dth pixel starting from an offset, and some padding
+// after. This function computes the before padding, the input offset, the
+// number of valid input pixels, and the after padding.
+//
+// Returns
+// - Number of valid output pixels corresponding to `d`
+// - Number of valid input pixels corresponding to `d`
+// - Offset of the first input pixel corresponding to `d`
+// - Amount of padding before the view for `d`
+// - Amount of padding after the view for `d`
+std::tuple<size_t, size_t, size_t, size_t, size_t> get_reduced_view_for_dilation(size_t out_size,
+ size_t in_size,
+ size_t d,
+ size_t dilation_factor,
+ size_t kernel_size,
+ size_t stride,
+ size_t pad_before);
+
+} // namespace depthwise
+} // namespace arm_conv
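The comment above describes how a dilated problem is decomposed; the sketch below (illustrative, not part of the patch) shows the row-dimension bookkeeping that DepthwiseCommon::execute() in depthwise.hpp performs with get_reduced_view_for_dilation(). Everything except that function is an assumption introduced for illustration.

    #include "depthwise_common.hpp"

    #include <cstddef>
    #include <tuple>

    // Decompose the row dimension into one sub-problem per dilation offset `d`.
    void split_rows_for_dilation(std::size_t output_rows, std::size_t input_rows,
                                 std::size_t dilation, std::size_t kernel_rows,
                                 std::size_t stride, std::size_t pad_top)
    {
        for (std::size_t d = 0; d < dilation; d++)
        {
            std::size_t view_out, view_in, start_i, view_pad_top, view_pad_bottom;
            std::tie(view_out, view_in, start_i, view_pad_top, view_pad_bottom) =
                arm_conv::depthwise::get_reduced_view_for_dilation(
                    output_rows, input_rows, d, dilation, kernel_rows, stride, pad_top);

            // The sub-problem for offset `d` sees `view_in` input rows starting at row
            // `start_i`, produces `view_out` output rows (every `dilation`-th row of the
            // full output, starting at `d`), and uses the recomputed top/bottom padding.
            (void)view_pad_top;
            (void)view_pad_bottom;
        }
    }
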
diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp
index fdc18aef39..045f9f95d3 100644
--- a/src/core/NEON/kernels/assembly/pool_common.hpp
+++ b/src/core/NEON/kernels/assembly/pool_common.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,10 +23,9 @@
*/
#pragma once
-
#ifdef CYCLE_PROFILING
#include "profiler.hpp"
-#endif // CYCLE_PROFILING
+#endif
namespace arm_conv
{
@@ -69,54 +68,42 @@ public:
virtual size_t get_working_size(unsigned int num_threads) const = 0;
// Execute pooling over the specified area of memory.
- virtual void execute(
- const void *const input,
- void *const output,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
-
- virtual void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(const void *const input,
+ void *const output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
- virtual void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
-};
+ virtual void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
-struct Nothing
-{
-};
-
-template <typename TInput, typename TOutput, class OutputStage = Nothing>
-class PoolingCommon : public IPoolingCommon
-{
+ virtual void execute(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const PaddingValues &,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp
index 2325bd08ca..89d594298e 100644
--- a/src/core/NEON/kernels/assembly/pooling.hpp
+++ b/src/core/NEON/kernels/assembly/pooling.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,8 +27,6 @@
#include "arm_gemm_local.hpp"
#include "pool_common.hpp"
-#include <memory>
-
namespace arm_conv
{
namespace pooling
@@ -38,9 +36,8 @@ struct PoolingConfig
PoolingMethod method = PoolingMethod::DEFAULT;
std::string filter = "";
- PoolingConfig(PoolingMethod method)
- : method(method) {};
- PoolingConfig() {};
+ PoolingConfig(PoolingMethod method) : method(method){};
+ PoolingConfig(){};
};
struct PoolingArgs
@@ -59,36 +56,50 @@ struct PoolingArgs
const PoolingConfig *config;
- PoolingArgs(
- const CPUInfo *cpu_info,
- PoolingType pool_type,
- const PoolingWindow &window,
- const PoolingStride &stride,
- bool exclude_padding,
- unsigned int n_batches,
- unsigned int input_rows,
- unsigned int input_cols,
- unsigned int n_channels,
- unsigned int output_rows,
- unsigned int output_cols,
- const PaddingValues &padding,
- const PoolingConfig *cfg)
- : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
- n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg)
+ PoolingArgs(const CPUInfo *cpu_info,
+ PoolingType pool_type,
+ const PoolingWindow &window,
+ const PoolingStride &stride,
+ bool exclude_padding,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int n_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ const PaddingValues &padding,
+ const PoolingConfig *cfg)
+ : cpu_info(cpu_info),
+ pool_type(pool_type),
+ pool_window(window),
+ pool_stride(stride),
+ exclude_padding(exclude_padding),
+ n_batches(n_batches),
+ input_rows(input_rows),
+ input_cols(input_cols),
+ n_channels(n_channels),
+ output_rows(output_rows),
+ output_cols(output_cols),
+ padding(padding),
+ config(cfg)
{
// If either of the pooling window dimensions are set to zero, meaning
// "pool everything", then replace with the corresponding input dimension.
- if(pool_window.rows == 0)
+ if (pool_window.rows == 0)
{
pool_window.rows = input_rows;
}
- if(pool_window.cols == 0)
+ if (pool_window.cols == 0)
{
pool_window.cols = input_cols;
}
}
};
+struct Nothing
+{
+};
+
struct Requantize32
{
int32_t input_offset = 0;
@@ -98,20 +109,117 @@ struct Requantize32
int32_t per_layer_right_shift = 0;
int32_t per_layer_mul = 0;
- Requantize32(int32_t input_offset, int32_t output_offset,
- int32_t per_layer_left_shift, int32_t per_layer_right_shift,
+ Requantize32(int32_t input_offset,
+ int32_t output_offset,
+ int32_t per_layer_left_shift,
+ int32_t per_layer_right_shift,
int32_t per_layer_mul)
- : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul)
+ : input_offset(input_offset),
+ output_offset(output_offset),
+ per_layer_left_shift(per_layer_left_shift),
+ per_layer_right_shift(per_layer_right_shift),
+ per_layer_mul(per_layer_mul)
{
}
};
-template <typename TInput, typename TOutput, class OutputStage = Nothing>
-using UniquePoolingCommon = std::unique_ptr<PoolingCommon<TInput, TOutput, OutputStage>>;
+template <typename TInput, typename TOutput>
+class PoolingCommon : public IPoolingCommon
+{
+protected:
+ const PoolingArgs m_args;
+
+public:
+ PoolingCommon(const PoolingArgs &args) : m_args(args)
+ {
+ }
+ PoolingCommon(PoolingCommon &) = delete;
+ PoolingCommon &operator=(PoolingCommon &) = delete;
+
+ size_t get_working_size(unsigned int) const override = 0;
+
+ // Execute pooling over the specified area of memory.
+ void execute(const void *const input,
+ void *const output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
+ {
+ this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols,
+ m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels,
+ m_args.n_channels * m_args.output_cols,
+ m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id,
+ num_threads);
+ }
+
+ void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
+ {
+ this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col,
+ ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output,
+ ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads);
+ }
+
+ void execute(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const PaddingValues &padding,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
+ {
+ this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row,
+ ld_input_batch, output_height, output_width, output, ld_output_col, ld_output_row,
+ ld_output_batch, working_space, thread_id, num_threads);
+ }
+
+protected:
+ virtual void execute_internal(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const PaddingValues &,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
+};
+
+template <typename TInput, typename TOutput>
+using UniquePoolingCommon = std::unique_ptr<PoolingCommon<TInput, TOutput>>;
// Get a pooling engine
template <typename TInput, typename TOutput = TInput, class OutputStage = Nothing>
-UniquePoolingCommon<TInput, TOutput, OutputStage> pooling(const PoolingArgs &, const OutputStage & = {});
+UniquePoolingCommon<TInput, TOutput> pooling(const PoolingArgs &, const OutputStage & = {});
} // namespace pooling
} // namespace arm_conv
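A hedged usage sketch of the reworked pooling interface (not part of the patch): construct PoolingArgs, obtain a kernel from the pooling<TInput>() factory and run it through the simplest execute() overload. PoolingType, PoolingWindow and PoolingStride come from pool_common.hpp, which is only partially shown here, so the MAX enumerator and the aggregate initialisers are assumptions; shapes, helper name and include path are illustrative.

    #include "pooling.hpp"

    #include <cstdint>
    #include <vector>

    using namespace arm_conv::pooling;

    // Hypothetical helper: 2x2 max pooling with stride 2 over a 1x56x56x64 fp32 tensor.
    void run_max_pool_fp32(const CPUInfo *ci, const float *src, float *dst)
    {
        PoolingWindow window{2, 2}; // assumed aggregate: rows, cols
        PoolingStride stride{2, 2};

        PoolingArgs args(ci, PoolingType::MAX, window, stride,
                         /* exclude_padding   */ true,
                         /* n_batches         */ 1,
                         /* input_rows, cols  */ 56, 56,
                         /* n_channels        */ 64,
                         /* output_rows, cols */ 28, 28,
                         arm_conv::PaddingValues{0, 0, 0, 0},
                         /* config            */ nullptr);

        auto pool = pooling<float>(args); // returns a UniquePoolingCommon<float, float>

        std::vector<uint8_t> scratch(pool->get_working_size(1));
        pool->execute(src, dst, scratch.data(), 0 /*thread_id*/, 1 /*num_threads*/);
    }
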
diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp
new file mode 100644
index 0000000000..fb97cf8baf
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/premultiply.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+void do_premultiply_float_6(const float *in_ptr,
+ const unsigned int ld_row,
+ const unsigned int ld_col,
+ float *out_ptr,
+ const unsigned int out_ld_row,
+ const unsigned int out_ld_col,
+ const unsigned int tile_rows,
+ const unsigned int tile_cols,
+ const unsigned input_channels);
+
+template <typename T>
+void do_premultiply(const T *in_ptr,
+ const unsigned int ld_row,
+ const unsigned int ld_col,
+ T *out_ptr,
+ const unsigned int out_ld_row,
+ const unsigned int out_ld_col,
+ const unsigned int tile_rows,
+ const unsigned int tile_cols,
+ const unsigned input_channels,
+ const unsigned int channel_multiplier)
+{
+ if (sizeof(T) == 4 && channel_multiplier == 6)
+ {
+ do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col,
+ tile_rows, tile_cols, input_channels);
+ }
+ else
+ {
+ for (unsigned int i = 0; i < tile_rows; i++)
+ {
+ const T *ip2 = in_ptr + i * ld_row;
+ T *op2 = out_ptr + i * out_ld_row;
+ for (unsigned int j = 0; j < tile_cols; j++)
+ {
+ const T *ip = ip2;
+ T *op = op2;
+ for (unsigned int c = 0; c < input_channels; c++)
+ {
+ T val = *ip;
+ ip++;
+
+ for (unsigned int r = 0; r < channel_multiplier; r++)
+ {
+ op[r] = val;
+ }
+ op += channel_multiplier;
+ }
+ ip2 += ld_col;
+ op2 += out_ld_col;
+ }
+ }
+ }
+}
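A small illustration of what do_premultiply() above does (not part of the patch): every channel value of a pixel is replicated channel_multiplier times, so an input tile of C channels becomes an output tile of C * channel_multiplier channels. The sizes below are arbitrary, and because the multiplier is not 6 the generic path is taken rather than do_premultiply_float_6().

    #include "premultiply.hpp"

    #include <vector>

    void premultiply_example()
    {
        constexpr unsigned int rows = 2, cols = 2, in_ch = 3, mult = 2;

        std::vector<float> in(rows * cols * in_ch, 1.0f);
        std::vector<float> out(rows * cols * in_ch * mult);

        do_premultiply<float>(in.data(),
                              /* ld_row     */ cols * in_ch, /* ld_col     */ in_ch,
                              out.data(),
                              /* out_ld_row */ cols * in_ch * mult, /* out_ld_col */ in_ch * mult,
                              rows, cols, in_ch, mult);

        // `out` now holds, per pixel, each input channel value repeated `mult` times.
    }
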
diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp
new file mode 100644
index 0000000000..dbf95d23cd
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/winograd.hpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+
+namespace arm_conv
+{
+struct Shape2D
+{
+ unsigned int rows, cols;
+};
+
+struct ConvolutionArgs
+{
+ unsigned int n_batches;
+ Shape2D input_shape;
+ unsigned int n_input_channels;
+ unsigned int pad_top, pad_left;
+ Shape2D output_shape;
+ unsigned int n_output_channels;
+ Shape2D kernel_shape;
+ arm_gemm::Activation activation;
+
+ ConvolutionArgs(unsigned int n_batches,
+ const Shape2D &input_shape,
+ unsigned int n_input_channels,
+ unsigned int pad_top,
+ unsigned int pad_left,
+ const Shape2D &output_shape,
+ unsigned int n_output_channels,
+ const Shape2D kernel_shape,
+ const arm_gemm::Activation &activation = {})
+ : n_batches(n_batches),
+ input_shape(input_shape),
+ n_input_channels(n_input_channels),
+ pad_top(pad_top),
+ pad_left(pad_left),
+ output_shape(output_shape),
+ n_output_channels(n_output_channels),
+ kernel_shape(kernel_shape),
+ activation(activation)
+ {
+ }
+};
+
+namespace winograd
+{
+/* Constrain the selected Winograd implementation.
+ */
+struct WinogradConfig
+{
+ unsigned int output_rows = 0, output_cols = 0;
+ std::string input_transform_filter = "";
+ std::string output_transform_filter = "";
+ std::string weight_transform_filter = "";
+};
+
+/* Struct describing (suggested) memory layout within the Winograd domain.
+ */
+struct WinogradDomainSpec
+{
+ size_t weight_matrix_size_bytes, input_matrix_size_bytes, output_matrix_size_bytes;
+
+ size_t weight_ld_matrix, weight_ld_row;
+ size_t input_ld_batch, input_ld_matrix, input_ld_row;
+ size_t output_ld_batch, output_ld_matrix, output_ld_row;
+};
+
+class ITransformCommon
+{
+public:
+ virtual ~ITransformCommon() = default;
+
+ // Get the name of the transform
+ virtual const std::string &get_name(void) const = 0;
+};
+
+namespace weight_transform
+{
+class ITransform : public ITransformCommon
+{
+public:
+ ~ITransform() = default;
+
+ virtual unsigned int get_kernel_rows(void) const = 0;
+ virtual unsigned int get_kernel_cols(void) const = 0;
+
+ virtual unsigned int get_transformed_tile_rows(void) const = 0;
+ virtual unsigned int get_transformed_tile_cols(void) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_input_channel,
+ void *outptr,
+ const WinogradDomainSpec &wds,
+ unsigned int thread_id,
+ unsigned int n_threads) const
+ {
+ this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix,
+ wds.weight_ld_row, thread_id, n_threads);
+ }
+
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_input_channel,
+ void *outptr,
+ size_t ld_out_matrix,
+ size_t ld_out_row,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+};
+
+} // namespace weight_transform
+
+namespace input_transform
+{
+class ITransform : public ITransformCommon
+{
+public:
+ ~ITransform() = default;
+
+ virtual unsigned int get_input_rows(void) const = 0;
+ virtual unsigned int get_input_cols(void) const = 0;
+
+ virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ void *outptr,
+ const WinogradDomainSpec &wds,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const
+ {
+ this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix,
+ wds.input_ld_row, working_space, thread_id, n_threads);
+ }
+
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_matrix,
+ size_t ld_out_row,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+};
+
+} // namespace input_transform
+
+namespace output_transform
+{
+class ITransform : public ITransformCommon
+{
+public:
+ ~ITransform() = default;
+
+ virtual unsigned int get_input_rows(void) const = 0;
+ virtual unsigned int get_input_cols(void) const = 0;
+
+ virtual unsigned int get_output_rows(void) const = 0;
+ virtual unsigned int get_output_cols(void) const = 0;
+
+ virtual unsigned int get_kernel_rows(void) const = 0;
+ virtual unsigned int get_kernel_cols(void) const = 0;
+
+ virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ const WinogradDomainSpec &wds,
+ const void *bias,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_row,
+ size_t ld_out_col,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const
+ {
+ this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr,
+ ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads);
+ }
+
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_matrix,
+ size_t ld_in_row,
+ const void *bias,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_row,
+ size_t ld_out_col,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+};
+
+} // namespace output_transform
+
+struct WinogradImpl
+{
+ const output_transform::ITransform *output_transform = nullptr;
+ const weight_transform::ITransform *weight_transform = nullptr;
+ const input_transform::ITransform *input_transform = nullptr;
+ std::unique_ptr<arm_gemm::GemmArgs> gemm_args;
+ WinogradDomainSpec winograd_spec;
+};
+
+/* Get pointers to Winograd transforms for the given convolution problem.
+ *
+ * Assigns to the pointers in the `dest` struct and returns true if the given
+ * problem can be executed, false otherwise.
+ */
+template <typename TIn,
+ typename TWeight = TIn,
+ typename TOut = TIn,
+ typename TWinogradIn = TIn,
+ typename TWinogradOut = TOut>
+bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation
+ const CPUInfo *,
+ const ConvolutionArgs &,
+ int max_threads,
+ bool fast_mode,
+ const WinogradConfig *,
+ const arm_gemm::GemmConfig *);
+
+} // namespace winograd
+} // namespace arm_conv
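Finally, a hedged sketch of selecting a Winograd implementation with the interface above (illustrative, not part of the patch): fill a ConvolutionArgs, then call get_implementation<TIn>(), which populates the transform pointers and GEMM arguments in a WinogradImpl and reports whether the problem is supported. The shapes, the helper name and the include path are assumptions; passing null configs leaves the choice to the library.

    #include "winograd.hpp"

    // Hypothetical helper: pick transforms for a 3x3 fp32 convolution, 56x56, 64 -> 64 channels.
    bool pick_winograd(const CPUInfo *ci, arm_conv::winograd::WinogradImpl &impl)
    {
        arm_conv::ConvolutionArgs conv_args(
            /* n_batches         */ 1,
            /* input_shape       */ arm_conv::Shape2D{56, 56},
            /* n_input_channels  */ 64,
            /* pad_top, pad_left */ 1, 1,
            /* output_shape      */ arm_conv::Shape2D{56, 56},
            /* n_output_channels */ 64,
            /* kernel_shape      */ arm_conv::Shape2D{3, 3});

        // On success the weight/input/output transform pointers and the GEMM arguments
        // in `impl` describe a viable Winograd decomposition for this problem.
        return arm_conv::winograd::get_implementation<float>(
            impl, ci, conv_args, /* max_threads */ 1, /* fast_mode */ false,
            /* winograd_cfg */ nullptr, /* gemm_cfg */ nullptr);
    }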