8 files changed, 1048 insertions, 167 deletions
diff --git a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
deleted file mode 100644
index a956898403..0000000000
--- a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-#define SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-
-#include "src/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** This class is a wrapper for the depthwise convolution assembly kernels.  */
-class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDepthwiseConvolutionAssemblyKernelWrapper";
-    }
-
-    /** Default constructor */
-    NEDepthwiseConvolutionAssemblyKernelWrapper()
-        : _kernel(nullptr)
-    {
-    }
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionAssemblyKernelWrapper(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
-    /** Default Move Constructor. */
-    NEDepthwiseConvolutionAssemblyKernelWrapper(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
-    /** Default move assignment operator */
-    NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in] kernel Pointer to an assembly kernel implementation.
-     */
-    void configure(depthwise::IDepthwiseConvolution *kernel)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
-        _kernel = kernel;
-        Window win;
-        win.set(Window::DimX, Window::Dimension(0, _kernel->get_window(), 1));
-        INEKernel::configure(win);
-    }
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
-        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-        auto first = window.x().start();
-        auto last  = window.x().end();
-        _kernel->run(first, last, info.thread_id);
-    }
-
-private:
-    depthwise::IDepthwiseConvolution *_kernel;
-};
-} // namespace arm_compute
-#endif /* SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H */
diff --git a/src/core/NEON/kernels/assembly/common.hpp b/src/core/NEON/kernels/assembly/common.hpp
new file mode 100644
index 0000000000..d82d11cae0
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/common.hpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+namespace arm_conv
+{
+struct PaddingValues
+{
+    unsigned int left, top, right, bottom;
+};
+
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
new file mode 100644
index 0000000000..13c2d314e4
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "arm_gemm_local.hpp"
+#include "depthwise_common.hpp"
+#include "premultiply.hpp"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+struct DepthwiseConfig
+{
+    DepthwiseMethod method = DepthwiseMethod::DEFAULT;
+    std::string     filter = "";
+
+    DepthwiseConfig(DepthwiseMethod method) : method(method){};
+    DepthwiseConfig(){};
+};
+
+struct DepthwiseArgs
+{
+    const CPUInfo *cpu_info;
+
+    unsigned int kernel_rows, kernel_cols;
+    unsigned int stride_rows, stride_cols;
+    unsigned int dilation_rows, dilation_cols;
+
+    unsigned int n_batches, input_rows, input_cols, input_channels;
+    unsigned int output_rows, output_cols;
+    unsigned int channel_multiplier;
+
+    PaddingValues padding;
+
+    arm_gemm::Activation activation;
+
+    const DepthwiseConfig *config;
+
+    bool fast_mode = false;
+
+    DepthwiseArgs(const CPUInfo       *cpu_info,
+                  unsigned int         kernel_rows,
+                  unsigned int         kernel_cols,
+                  unsigned int         stride_rows,
+                  unsigned int         stride_cols,
+                  unsigned int         dilation_rows,
+                  unsigned int         dilation_cols,
+                  unsigned int         n_batches,
+                  unsigned int         input_rows,
+                  unsigned int         input_cols,
+                  unsigned int         input_channels,
+                  unsigned int         output_rows,
+                  unsigned int         output_cols,
+                  unsigned int         channel_multiplier,
+                  PaddingValues        padding,
+                  arm_gemm::Activation activation,
+
+                  const DepthwiseConfig *config)
+        : cpu_info(cpu_info),
+          kernel_rows(kernel_rows),
+          kernel_cols(kernel_cols),
+          stride_rows(stride_rows),
+          stride_cols(stride_cols),
+          dilation_rows(dilation_rows),
+          dilation_cols(dilation_cols),
+          n_batches(n_batches),
+          input_rows(input_rows),
+          input_cols(input_cols),
+          input_channels(input_channels),
+          output_rows(output_rows),
+          output_cols(output_cols),
+          channel_multiplier(channel_multiplier),
+          padding(padding),
+          activation(activation),
+          config(config)
+    {
+    }
+
+    DepthwiseArgs(const CPUInfo         *cpu_info,
+                  unsigned int           kernel_rows,
+                  unsigned int           kernel_cols,
+                  unsigned int           stride_rows,
+                  unsigned int           stride_cols,
+                  unsigned int           n_batches,
+                  unsigned int           input_rows,
+                  unsigned int           input_cols,
+                  unsigned int           input_channels,
+                  unsigned int           output_rows,
+                  unsigned int           output_cols,
+                  unsigned int           channel_multiplier,
+                  PaddingValues          padding,
+                  arm_gemm::Activation   activation,
+                  const DepthwiseConfig *config)
+        : DepthwiseArgs(cpu_info,
+                        kernel_rows,
+                        kernel_cols,
+                        stride_rows,
+                        stride_cols,
+                        1,
+                        1,
+                        n_batches,
+                        input_rows,
+                        input_cols,
+                        input_channels,
+                        output_rows,
+                        output_cols,
+                        channel_multiplier,
+                        padding,
+                        activation,
+                        config)
+    {
+    }
+};
+
+template <typename TInput>
+struct Tile
+{
+    TInput *array;
+
+    unsigned int tile_rows     = 0;
+    unsigned int tile_cols     = 0;
+    unsigned int tile_channels = 0;
+
+    Tile(TInput *array, unsigned int tile_rows, unsigned int tile_cols, unsigned int tile_channels)
+        : array(array), tile_rows(tile_rows), tile_cols(tile_cols), tile_channels(tile_channels)
+    {
+    }
+
+    Tile() : Tile(nullptr, 0, 0, 0)
+    {
+    }
+
+    void load_from(const TInput      *input,
+                   const unsigned int ld_row,
+                   const unsigned int ld_col,
+                   const unsigned int n_rows,
+                   const unsigned int n_cols,
+                   const int          input_i,
+                   const int          input_j,
+                   const unsigned int channel_multiplier) const
+    {
+        const auto pad_top  = input_i < 0 ? -input_i : 0;
+        const auto pad_left = input_j < 0 ? -input_j : 0;
+
+        const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top;
+        const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left;
+
+        if (padded_rows < tile_rows || padded_cols < tile_cols)
+        {
+            memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput));
+        }
+
+        do_premultiply<TInput>((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row,
+                               ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
+                               tile_cols * tile_channels, tile_channels, padded_rows, padded_cols,
+                               tile_channels / channel_multiplier, channel_multiplier);
+    }
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthwiseCommon : public IDepthwiseCommon
+{
+protected:
+    const DepthwiseArgs m_args; // Copy of arguments
+    std::string         m_name{};
+
+public:
+    DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){};
+    DepthwiseCommon(DepthwiseCommon &)            = delete;
+    DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
+
+    std::string name() const override
+    {
+        return m_name;
+    }
+
+    void set_name(std::string name)
+    {
+        // Only allow the name to be set once
+        if (m_name.empty())
+        {
+            m_name = name;
+        }
+    }
+
+    void execute(const void *const  input,
+                 const void *const  parameters,
+                 void *const        output,
+                 void *const        working_space,
+                 const unsigned int thread_id,
+                 const unsigned int n_threads) const override final
+    {
+        const size_t ld_input_col    = m_args.input_channels;
+        const size_t ld_input_row    = ld_input_col * m_args.input_cols;
+        const size_t ld_input_batch  = ld_input_row * m_args.input_rows;
+        const size_t ld_output_col   = m_args.input_channels * m_args.channel_multiplier;
+        const size_t ld_output_row   = ld_output_col * m_args.output_cols;
+        const size_t ld_output_batch = ld_output_row * m_args.output_rows;
+
+        execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row,
+                ld_output_batch, working_space, thread_id, n_threads);
+    }
+
+    void execute(const void *const  input,
+                 size_t             ld_input_col,
+                 size_t             ld_input_row,
+                 size_t             ld_input_batch,
+                 const void *const  parameters,
+                 void *const        output,
+                 size_t             ld_output_col,
+                 size_t             ld_output_row,
+                 size_t             ld_output_batch,
+                 void *const        working_space,
+                 const unsigned int thread_id,
+                 const unsigned int n_threads) const override final
+    {
+        execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input,
+                ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output,
+                ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads);
+    }
+
+    void execute(unsigned int         batches,
+                 unsigned int         input_height,
+                 unsigned int         input_width,
+                 unsigned int         channels,
+                 const PaddingValues &padding,
+                 const void          *input,
+                 size_t               ld_input_col,
+                 size_t               ld_input_row,
+                 size_t               ld_input_batch,
+                 const void          *parameters,
+                 unsigned int         output_height,
+                 unsigned int         output_width,
+                 void                *output,
+                 size_t               ld_output_col,
+                 size_t               ld_output_row,
+                 size_t               ld_output_batch,
+                 void                *working_space,
+                 unsigned int         thread_id,
+                 unsigned int         n_threads) const override final
+    {
+        // Construct a new set of arguments to reflect that we might have been
+        // passed different input/output tensors. Dilation is handled at this
+        // level; so we set the dilation in the arguments to zero.
+        DepthwiseArgs args(this->m_args);
+        args.n_batches      = batches;
+        args.input_rows     = input_height;
+        args.input_cols     = input_width;
+        args.input_channels = channels;
+        args.output_rows    = output_height;
+        args.output_cols    = output_width;
+        args.padding        = padding;
+        args.dilation_rows = args.dilation_cols = 1;
+
+        auto ld_input_col_d  = ld_input_col * m_args.dilation_cols;
+        auto ld_input_row_d  = ld_input_row * m_args.dilation_rows;
+        auto ld_output_col_d = ld_output_col * m_args.dilation_cols;
+        auto ld_output_row_d = ld_output_row * m_args.dilation_rows;
+
+        for (size_t drow = 0; drow < m_args.dilation_rows; drow++)
+        {
+            size_t start_i;
+            std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) =
+                get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows,
+                                              m_args.kernel_rows, m_args.stride_rows, padding.top);
+
+            auto input_row  = static_cast<const TInput *>(input) + start_i * ld_input_row;
+            auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row;
+
+            if (args.output_rows)
+            {
+                for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
+                {
+                    size_t start_j;
+                    std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) =
+                        get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols,
+                                                      m_args.kernel_cols, m_args.stride_cols, padding.left);
+
+                    const TInput *input_col  = input_row + start_j * ld_input_col;
+                    TOutput      *output_col = output_row + dcol * ld_output_col;
+
+                    if (args.output_cols)
+                    {
+                        this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch,
+                                               parameters, output_col, ld_output_col_d, ld_output_row_d,
+                                               ld_output_batch, working_space, thread_id, n_threads);
+                    }
+                }
+            }
+        }
+    }
+
+protected:
+    virtual void execute_internal(const DepthwiseArgs &instance_args,
+                                  const void          *input,
+                                  size_t               ld_input_col,
+                                  size_t               ld_input_row,
+                                  size_t               ld_input_batch,
+                                  const void          *parameters,
+                                  void                *output,
+                                  size_t               ld_output_col,
+                                  size_t               ld_output_row,
+                                  size_t               ld_output_batch,
+                                  void                *working_space,
+                                  unsigned int         thread_id,
+                                  unsigned int         n_threads) const = 0;
+
+    virtual bool uses_premultiply() const
+    {
+        return true;
+    }
+};
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput>
+using UniqueDepthwiseCommon = std::unique_ptr<DepthwiseCommon<TInput, TWeight, TOutput>>;
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+KernelDescription get_depthwise_method(const DepthwiseArgs &, const OutputStage & = {});
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &, const OutputStage & = {});
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+std::vector<KernelDescription> get_compatible_kernels(const DepthwiseArgs &, const OutputStage & = {});
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
new file mode 100644
index 0000000000..5ff848e281
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "common.hpp"
+#include <cstddef>
+#include <tuple>
+
+namespace arm_conv
+{
+namespace depthwise
+{
+using arm_gemm::Nothing;
+
+enum class DepthwiseMethod
+{
+    DEFAULT,
+    DEPTHFIRST,
+    PLANAR,
+};
+
+struct KernelDescription
+{
+    DepthwiseMethod method         = DepthwiseMethod::DEFAULT;
+    std::string     name           = "";
+    bool            is_default     = false;
+    uint64_t        cycle_estimate = 0;
+
+    KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate)
+        : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate)
+    {
+    }
+
+    KernelDescription() noexcept {};
+};
+
+class IDepthwiseCommon
+{
+public:
+    virtual ~IDepthwiseCommon() = default;
+
+    // Get the name of the depthwise implementation
+    virtual std::string name() const = 0;
+
+    // Determine the amount of storage space required for the rearranged weights
+    // and bias.
+    virtual size_t get_storage_size(void) const = 0;
+
+    // Rearrange the weights and biases into a storage buffer.
+    // Accepts a pointer to a buffer into which to store the packed parameters, a
+    // pointer the bias vector (which may be nullptr in the case of no bias) and
+    // a pointer to the array of weights (stored in HWIO order).
+    virtual void pack_parameters(
+        void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0;
+
+    // Determine the amount of working space required
+    virtual size_t get_working_size(unsigned int n_threads) const = 0;
+
+    // Execute the convolution over the specified area of memory.
+    virtual void execute(const void  *input,      // Pointer to input tensor
+                         const void  *parameters, // Packed parameters buffer
+                         void        *output,
+                         void        *working_space,
+                         unsigned int thread_id,
+                         unsigned int n_threads) const = 0;
+
+    virtual void execute(const void  *input,
+                         size_t       ld_input_col,
+                         size_t       ld_input_row,
+                         size_t       ld_input_batch,
+                         const void  *parameters,
+                         void        *output,
+                         size_t       ld_output_col,
+                         size_t       ld_output_row,
+                         size_t       ld_output_batch,
+                         void        *working_space,
+                         unsigned int thread_id,
+                         unsigned int n_threads) const = 0;
+
+    virtual void execute(unsigned int batches,
+                         unsigned int input_height,
+                         unsigned int input_width,
+                         unsigned int channels,
+                         const PaddingValues &,
+                         const void  *input,
+                         size_t       ld_input_col,
+                         size_t       ld_input_row,
+                         size_t       ld_input_batch,
+                         const void  *parameters,
+                         unsigned int output_height,
+                         unsigned int output_width,
+                         void        *output,
+                         size_t       ld_output_col,
+                         size_t       ld_output_row,
+                         size_t       ld_output_batch,
+                         void        *working_space,
+                         unsigned int thread_id,
+                         unsigned int n_threads) const = 0;
+};
+
+// To handle a dilation factor of D execute the kernel once for each d in
+// [0..D). Each `d` corresponds to a portion or "view" of the input and output
+// tensors. The output view corresponds to every Dth pixel starting from `d`;
+// this function computes how many pixels are covered. The input view consists
+// of an amount of before padding, every Dth pixel starting from an offset, and
+// some after padding.  This function computes the start padding, input offset,
+// number of valid input pixels, and the after padding.
+//
+// Returns
+// - Number of valid output pixels corresponding to `d`
+// - Number of valid input pixels corresponding to `d`
+// - Offset of the first pixel corresponding to `d`
+// - Amount of padding in the view for `d`
+std::tuple<size_t, size_t, size_t, size_t, size_t> get_reduced_view_for_dilation(size_t out_size,
+                                                                                 size_t in_size,
+                                                                                 size_t d,
+                                                                                 size_t dilation_factor,
+                                                                                 size_t kernel_size,
+                                                                                 size_t stride,
+                                                                                 size_t pad_before);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp
index fdc18aef39..045f9f95d3 100644
--- a/src/core/NEON/kernels/assembly/pool_common.hpp
+++ b/src/core/NEON/kernels/assembly/pool_common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,10 +23,9 @@
  */
 
 #pragma once
-
 #ifdef CYCLE_PROFILING
 #include "profiler.hpp"
-#endif // CYCLE_PROFILING
+#endif
 
 namespace arm_conv
 {
@@ -69,54 +68,42 @@ public:
     virtual size_t get_working_size(unsigned int num_threads) const = 0;
 
     // Execute pooling over the specified area of memory.
-    virtual void execute(
-        const void *const input,
-        void *const       output,
-        void             *working_space,
-        unsigned int      thread_id,
-        unsigned int      num_threads) const = 0;
-
-    virtual void execute(
-        const void *const input,
-        size_t            ld_input_col,
-        size_t            ld_input_row,
-        size_t            ld_input_batch,
-        void *const       output,
-        size_t            ld_output_col,
-        size_t            ld_output_row,
-        size_t            ld_output_batch,
-        void             *working_space,
-        unsigned int      thread_id,
-        unsigned int      num_threads) const = 0;
+    virtual void execute(const void *const input,
+                         void *const       output,
+                         void             *working_space,
+                         unsigned int      thread_id,
+                         unsigned int      num_threads) const = 0;
 
-    virtual void execute(
-        unsigned int      batches,
-        unsigned int      height,
-        unsigned int      width,
-        unsigned int      channels,
-        const void *const input,
-        size_t            ld_input_col,
-        size_t            ld_input_row,
-        size_t            ld_input_batch,
-        const PaddingValues &,
-        unsigned int output_height,
-        unsigned int output_width,
-        void *const  output,
-        size_t       ld_output_col,
-        size_t       ld_output_row,
-        size_t       ld_output_batch,
-        void        *working_space,
-        unsigned int thread_id,
-        unsigned int num_threads) const = 0;
-};
+    virtual void execute(const void *const input,
+                         size_t            ld_input_col,
+                         size_t            ld_input_row,
+                         size_t            ld_input_batch,
+                         void *const       output,
+                         size_t            ld_output_col,
+                         size_t            ld_output_row,
+                         size_t            ld_output_batch,
+                         void             *working_space,
+                         unsigned int      thread_id,
+                         unsigned int      num_threads) const = 0;
 
-struct Nothing
-{
-};
-
-template <typename TInput, typename TOutput, class OutputStage = Nothing>
-class PoolingCommon : public IPoolingCommon
-{
+    virtual void execute(unsigned int      batches,
+                         unsigned int      height,
+                         unsigned int      width,
+                         unsigned int      channels,
+                         const void *const input,
+                         size_t            ld_input_col,
+                         size_t            ld_input_row,
+                         size_t            ld_input_batch,
+                         const PaddingValues &,
+                         unsigned int output_height,
+                         unsigned int output_width,
+                         void *const  output,
+                         size_t       ld_output_col,
+                         size_t       ld_output_row,
+                         size_t       ld_output_batch,
+                         void        *working_space,
+                         unsigned int thread_id,
+                         unsigned int num_threads) const = 0;
 };
 
 } // namespace pooling
diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp
index 2325bd08ca..89d594298e 100644
--- a/src/core/NEON/kernels/assembly/pooling.hpp
+++ b/src/core/NEON/kernels/assembly/pooling.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,8 +27,6 @@
 #include "arm_gemm_local.hpp"
 #include "pool_common.hpp"
 
-#include <memory>
-
 namespace arm_conv
 {
 namespace pooling
@@ -38,9 +36,8 @@ struct PoolingConfig
     PoolingMethod method = PoolingMethod::DEFAULT;
     std::string   filter = "";
 
-    PoolingConfig(PoolingMethod method)
-        : method(method) {};
-    PoolingConfig() {};
+    PoolingConfig(PoolingMethod method) : method(method){};
+    PoolingConfig(){};
 };
 
 struct PoolingArgs
@@ -59,36 +56,50 @@ struct PoolingArgs
 
     const PoolingConfig *config;
 
-    PoolingArgs(
-        const CPUInfo       *cpu_info,
-        PoolingType          pool_type,
-        const PoolingWindow &window,
-        const PoolingStride &stride,
-        bool                 exclude_padding,
-        unsigned int         n_batches,
-        unsigned int         input_rows,
-        unsigned int         input_cols,
-        unsigned int         n_channels,
-        unsigned int         output_rows,
-        unsigned int         output_cols,
-        const PaddingValues &padding,
-        const PoolingConfig *cfg)
-        : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
-          n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg)
+    PoolingArgs(const CPUInfo       *cpu_info,
+                PoolingType          pool_type,
+                const PoolingWindow &window,
+                const PoolingStride &stride,
+                bool                 exclude_padding,
+                unsigned int         n_batches,
+                unsigned int         input_rows,
+                unsigned int         input_cols,
+                unsigned int         n_channels,
+                unsigned int         output_rows,
+                unsigned int         output_cols,
+                const PaddingValues &padding,
+                const PoolingConfig *cfg)
+        : cpu_info(cpu_info),
+          pool_type(pool_type),
+          pool_window(window),
+          pool_stride(stride),
+          exclude_padding(exclude_padding),
+          n_batches(n_batches),
+          input_rows(input_rows),
+          input_cols(input_cols),
+          n_channels(n_channels),
+          output_rows(output_rows),
+          output_cols(output_cols),
+          padding(padding),
+          config(cfg)
     {
         // If either of the pooling window dimensions are set to zero, meaning
         // "pool everything", then replace with the corresponding input dimension.
-        if(pool_window.rows == 0)
+        if (pool_window.rows == 0)
         {
             pool_window.rows = input_rows;
         }
-        if(pool_window.cols == 0)
+        if (pool_window.cols == 0)
         {
             pool_window.cols = input_cols;
         }
     }
 };
 
+struct Nothing
+{
+};
+
 struct Requantize32
 {
     int32_t input_offset  = 0;
@@ -98,20 +109,117 @@ struct Requantize32
     int32_t per_layer_right_shift = 0;
     int32_t per_layer_mul         = 0;
 
-    Requantize32(int32_t input_offset, int32_t output_offset,
-                 int32_t per_layer_left_shift, int32_t per_layer_right_shift,
+    Requantize32(int32_t input_offset,
+                 int32_t output_offset,
+                 int32_t per_layer_left_shift,
+                 int32_t per_layer_right_shift,
                  int32_t per_layer_mul)
-        : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul)
+        : input_offset(input_offset),
+          output_offset(output_offset),
+          per_layer_left_shift(per_layer_left_shift),
+          per_layer_right_shift(per_layer_right_shift),
+          per_layer_mul(per_layer_mul)
     {
     }
 };
 
-template <typename TInput, typename TOutput, class OutputStage = Nothing>
-using UniquePoolingCommon = std::unique_ptr<PoolingCommon<TInput, TOutput, OutputStage>>;
+template <typename TInput, typename TOutput>
+class PoolingCommon : public IPoolingCommon
+{
+protected:
+    const PoolingArgs m_args;
+
+public:
+    PoolingCommon(const PoolingArgs &args) : m_args(args)
+    {
+    }
+    PoolingCommon(PoolingCommon &)            = delete;
+    PoolingCommon &operator=(PoolingCommon &) = delete;
+
+    size_t get_working_size(unsigned int) const override = 0;
+
+    // Execute pooling over the specified area of memory.
+    void execute(const void *const input,
+                 void *const       output,
+                 void             *working_space,
+                 unsigned int      thread_id,
+                 unsigned int      num_threads) const override
+    {
+        this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols,
+                      m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels,
+                      m_args.n_channels * m_args.output_cols,
+                      m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id,
+                      num_threads);
+    }
+
+    void execute(const void *const input,
+                 size_t            ld_input_col,
+                 size_t            ld_input_row,
+                 size_t            ld_input_batch,
+                 void *const       output,
+                 size_t            ld_output_col,
+                 size_t            ld_output_row,
+                 size_t            ld_output_batch,
+                 void             *working_space,
+                 unsigned int      thread_id,
+                 unsigned int      num_threads) const override
+    {
+        this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col,
+                      ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output,
+                      ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads);
+    }
+
+    void execute(unsigned int         batches,
+                 unsigned int         height,
+                 unsigned int         width,
+                 unsigned int         channels,
+                 const void *const    input,
+                 size_t               ld_input_col,
+                 size_t               ld_input_row,
+                 size_t               ld_input_batch,
+                 const PaddingValues &padding,
+                 unsigned int         output_height,
+                 unsigned int         output_width,
+                 void *const          output,
+                 size_t               ld_output_col,
+                 size_t               ld_output_row,
+                 size_t               ld_output_batch,
+                 void                *working_space,
+                 unsigned int         thread_id,
+                 unsigned int         num_threads) const override
+    {
+        this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row,
+                               ld_input_batch, output_height, output_width, output, ld_output_col, ld_output_row,
+                               ld_output_batch, working_space, thread_id, num_threads);
+    }
+
+protected:
+    virtual void execute_internal(unsigned int batches,
+                                  unsigned int height,
+                                  unsigned int width,
+                                  unsigned int channels,
+                                  const PaddingValues &,
+                                  const void *const input,
+                                  size_t            ld_input_col,
+                                  size_t            ld_input_row,
+                                  size_t            ld_input_batch,
+                                  unsigned int      output_height,
+                                  unsigned int      output_width,
+                                  void *const       output,
+                                  size_t            ld_output_col,
+                                  size_t            ld_output_row,
+                                  size_t            ld_output_batch,
+                                  void             *working_space,
+                                  unsigned int      thread_id,
+                                  unsigned int      num_threads) const = 0;
+};
+
+template <typename TInput, typename TOutput>
+using UniquePoolingCommon = std::unique_ptr<PoolingCommon<TInput, TOutput>>;
 
 // Get a pooling engine
 template <typename TInput, typename TOutput = TInput, class OutputStage = Nothing>
-UniquePoolingCommon<TInput, TOutput, OutputStage> pooling(const PoolingArgs &, const OutputStage & = {});
+UniquePoolingCommon<TInput, TOutput> pooling(const PoolingArgs &, const OutputStage & = {});
 
 } // namespace pooling
 } // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp
new file mode 100644
index 0000000000..fb97cf8baf
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/premultiply.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+void do_premultiply_float_6(const float       *in_ptr,
+                            const unsigned int ld_row,
+                            const unsigned int ld_col,
+                            float             *out_ptr,
+                            const unsigned int out_ld_row,
+                            const unsigned int out_ld_col,
+                            const unsigned int tile_rows,
+                            const unsigned int tile_cols,
+                            const unsigned     input_channels);
+
+template <typename T>
+void do_premultiply(const T           *in_ptr,
+                    const unsigned int ld_row,
+                    const unsigned int ld_col,
+                    T                 *out_ptr,
+                    const unsigned int out_ld_row,
+                    const unsigned int out_ld_col,
+                    const unsigned int tile_rows,
+                    const unsigned int tile_cols,
+                    const unsigned     input_channels,
+                    const unsigned int channel_multiplier)
+{
+    if (sizeof(T) == 4 && channel_multiplier == 6)
+    {
+        do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col,
+                               tile_rows, tile_cols, input_channels);
+    }
+    else
+    {
+        for (unsigned int i = 0; i < tile_rows; i++)
+        {
+            const T *ip2 = in_ptr + i * ld_row;
+            T       *op2 = out_ptr + i * out_ld_row;
+            for (unsigned int j = 0; j < tile_cols; j++)
+            {
+                const T *ip = ip2;
+                T       *op = op2;
+                for (unsigned int c = 0; c < input_channels; c++)
+                {
+                    T val = *ip;
+                    ip++;
+
+                    for (unsigned int r = 0; r < channel_multiplier; r++)
+                    {
+                        op[r] = val;
+                    }
+                    op += channel_multiplier;
+                }
+                ip2 += ld_col;
+                op2 += out_ld_col;
+            }
+        }
+    }
+}
diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp
new file mode 100644
index 0000000000..dbf95d23cd
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/winograd.hpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+
+namespace arm_conv
+{
+struct Shape2D
+{
+    unsigned int rows, cols;
+};
+
+struct ConvolutionArgs
+{
+    unsigned int         n_batches;
+    Shape2D              input_shape;
+    unsigned int         n_input_channels;
+    unsigned int         pad_top, pad_left;
+    Shape2D              output_shape;
+    unsigned int         n_output_channels;
+    Shape2D              kernel_shape;
+    arm_gemm::Activation activation;
+
+    ConvolutionArgs(unsigned int                n_batches,
+                    const Shape2D              &input_shape,
+                    unsigned int                n_input_channels,
+                    unsigned int                pad_top,
+                    unsigned int                pad_left,
+                    const Shape2D              &output_shape,
+                    unsigned int                n_output_channels,
+                    const Shape2D               kernel_shape,
+                    const arm_gemm::Activation &activation = {})
+        : n_batches(n_batches),
+          input_shape(input_shape),
+          n_input_channels(n_input_channels),
+          pad_top(pad_top),
+          pad_left(pad_left),
+          output_shape(output_shape),
+          n_output_channels(n_output_channels),
+          kernel_shape(kernel_shape),
+          activation(activation)
+    {
+    }
+};
+
+namespace winograd
+{
+/* Constrain the selected Winograd implementation.
+ */
+struct WinogradConfig
+{
+    unsigned int output_rows = 0, output_cols = 0;
+    std::string  input_transform_filter  = "";
+    std::string  output_transform_filter = "";
+    std::string  weight_transform_filter = "";
+};
+
+/* Struct describing (suggested) memory layout within the Winograd domain.
+ */
+struct WinogradDomainSpec
+{
+    size_t weight_matrix_size_bytes, input_matrix_size_bytes, output_matrix_size_bytes;
+
+    size_t weight_ld_matrix, weight_ld_row;
+    size_t input_ld_batch, input_ld_matrix, input_ld_row;
+    size_t output_ld_batch, output_ld_matrix, output_ld_row;
+};
+
+class ITransformCommon
+{
+public:
+    virtual ~ITransformCommon() = default;
+
+    // Get the name of the transform
+    virtual const std::string &get_name(void) const = 0;
+};
+
+namespace weight_transform
+{
+class ITransform : public ITransformCommon
+{
+public:
+    ~ITransform() = default;
+
+    virtual unsigned int get_kernel_rows(void) const = 0;
+    virtual unsigned int get_kernel_cols(void) const = 0;
+
+    virtual unsigned int get_transformed_tile_rows(void) const = 0;
+    virtual unsigned int get_transformed_tile_cols(void) const = 0;
+
+    void execute(const ConvolutionArgs    &args,
+                 const void               *inptr,
+                 size_t                    ld_in_row,
+                 size_t                    ld_in_col,
+                 size_t                    ld_input_channel,
+                 void                     *outptr,
+                 const WinogradDomainSpec &wds,
+                 unsigned int              thread_id,
+                 unsigned int              n_threads) const
+    {
+        this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix,
+                      wds.weight_ld_row, thread_id, n_threads);
+    }
+
+    virtual void execute(const ConvolutionArgs &args,
+                         const void            *inptr,
+                         size_t                 ld_in_row,
+                         size_t                 ld_in_col,
+                         size_t                 ld_input_channel,
+                         void                  *outptr,
+                         size_t                 ld_out_matrix,
+                         size_t                 ld_out_row,
+                         unsigned int           thread_id,
+                         unsigned int           n_threads) const = 0;
+};
+
+} // namespace weight_transform
+
+namespace input_transform
+{
+class ITransform : public ITransformCommon
+{
+public:
+    ~ITransform() = default;
+
+    virtual unsigned int get_input_rows(void) const = 0;
+    virtual unsigned int get_input_cols(void) const = 0;
+
+    virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+    void execute(const ConvolutionArgs    &args,
+                 const void               *inptr,
+                 size_t                    ld_in_batch,
+                 size_t                    ld_in_row,
+                 size_t                    ld_in_col,
+                 void                     *outptr,
+                 const WinogradDomainSpec &wds,
+                 void                     *working_space,
+                 unsigned int              thread_id,
+                 unsigned int              n_threads) const
+    {
+        this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix,
+                      wds.input_ld_row, working_space, thread_id, n_threads);
+    }
+
+    virtual void execute(const ConvolutionArgs &args,
+                         const void            *inptr,
+                         size_t                 ld_in_batch,
+                         size_t                 ld_in_row,
+                         size_t                 ld_in_col,
+                         void                  *outptr,
+                         size_t                 ld_out_batch,
+                         size_t                 ld_out_matrix,
+                         size_t                 ld_out_row,
+                         void                  *working_space,
+                         unsigned int           thread_id,
+                         unsigned int           n_threads) const = 0;
+};
+
+} // namespace input_transform
+
+namespace output_transform
+{
+class ITransform : public ITransformCommon
+{
+public:
+    ~ITransform() = default;
+
+    virtual unsigned int get_input_rows(void) const = 0;
+    virtual unsigned int get_input_cols(void) const = 0;
+
+    virtual unsigned int get_output_rows(void) const = 0;
+    virtual unsigned int get_output_cols(void) const = 0;
+
+    virtual unsigned int get_kernel_rows(void) const = 0;
+    virtual unsigned int get_kernel_cols(void) const = 0;
+
+    virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+    void execute(const ConvolutionArgs    &args,
+                 const void               *inptr,
+                 const WinogradDomainSpec &wds,
+                 const void               *bias,
+                 void                     *outptr,
+                 size_t                    ld_out_batch,
+                 size_t                    ld_out_row,
+                 size_t                    ld_out_col,
+                 void                     *working_space,
+                 unsigned int              thread_id,
+                 unsigned int              n_threads) const
+    {
+        this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr,
+                      ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads);
+    }
+
+    virtual void execute(const ConvolutionArgs &args,
+                         const void            *inptr,
+                         size_t                 ld_in_batch,
+                         size_t                 ld_in_matrix,
+                         size_t                 ld_in_row,
+                         const void            *bias,
+                         void                  *outptr,
+                         size_t                 ld_out_batch,
+                         size_t                 ld_out_row,
+                         size_t                 ld_out_col,
+                         void                  *working_space,
+                         unsigned int           thread_id,
+                         unsigned int           n_threads) const = 0;
+};
+
+} // namespace output_transform
+
+struct WinogradImpl
+{
+    const output_transform::ITransform *output_transform = nullptr;
+    const weight_transform::ITransform *weight_transform = nullptr;
+    const input_transform::ITransform  *input_transform  = nullptr;
+    std::unique_ptr<arm_gemm::GemmArgs> gemm_args;
+    WinogradDomainSpec                  winograd_spec;
+};
+
+/* Get pointers to Winograd transforms for the given convolution problem.
+ *
+ * Assigns to the pointers in the `dest` struct and returns true or false to
+ * indicate whether the given problem can be executed or not.
+ */
+template <typename TIn,
+          typename TWeight      = TIn,
+          typename TOut         = TIn,
+          typename TWinogradIn  = TIn,
+          typename TWinogradOut = TOut>
+bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation
+                        const CPUInfo *,
+                        const ConvolutionArgs &,
+                        int  max_threads,
+                        bool fast_mode,
+                        const WinogradConfig *,
+                        const arm_gemm::GemmConfig *);
+
+} // namespace winograd
+} // namespace arm_conv