From e1553374d037dbf84999258d5bc88927891770cc Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Mon, 16 Jul 2018 18:53:52 +0100 Subject: COMPMID-1357: Stop passing around raw pointers in NEWinogradConvolution First step to allow us to enable the memory manager in this function Change-Id: Ic42fdac4c74cd21973c71130b59883e4a87d3dca Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140167 Reviewed-by: Pablo Tello Reviewed-by: Vidhya Sudhan Loganathan Tested-by: Jenkins --- .../kernels/NEWinogradConvolutionLayerKernel.h | 163 +++------------------ .../NEON/functions/NEWinogradConvolutionLayer.h | 2 +- docs/00_introduction.dox | 2 +- .../kernels/NEWinogradConvolutionLayerKernel.cpp | 125 ++-------------- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 84 ++++++----- 5 files changed, 81 insertions(+), 295 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h index 68c133ee37..9cdd69a70a 100644 --- a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h @@ -74,7 +74,7 @@ public: * @param[in] matrix_stride Stride between output matrices. */ virtual void configure(const ITensor *input_nhwc, const int num_batches, const int num_rows, const int num_cols, const int num_channels, - const PaddingType padding, T *const output, const int matrix_stride) = 0; + const PaddingType padding, ITensor *output, const int matrix_stride) = 0; /** Destructor */ virtual ~INEWinogradLayerTransformInputKernel() @@ -152,7 +152,7 @@ public: const int num_cols, const int num_channels, const PaddingType padding, - T *const output, + ITensor *output, const int matrix_stride) override; // Inherited methods overridden: @@ -181,7 +181,7 @@ private: int _num_cols; /**< Number of columns in input tensor. */ int _num_channels; /**< Number of channels in input tensor. */ PaddingType _padding; /**< Padding type. */ - T *_output; /**< Base of output matrices. */ + ITensor *_output; /**< Base of output matrices. */ int _matrix_stride; /**< Stride between output matrices. */ }; @@ -236,9 +236,9 @@ public: */ virtual void configure( const ITensor *biases, - const T *const output_workingspace, + const ITensor *output_workingspace, const int matrix_stride, - ITensor *const output_nhwc, + ITensor *output_nhwc, const int num_batches, const int num_rows, const int num_cols, @@ -318,9 +318,9 @@ public: */ void configure( const ITensor *biases, - const T *const output_workingspace, + const ITensor *output_workingspace, const int matrix_stride, - ITensor *const output_nhwc, + ITensor *output_nhwc, const int num_batches, const int num_rows, const int num_cols, @@ -345,7 +345,7 @@ private: using OutputTransform = typename WinogradBase::template OutputTransform; const ITensor *_biases; - const T *_output_workspace; + const ITensor *_output_workspace; int _matrix_stride; int _matrix_row_stride; ITensor *_output_nhwc; @@ -379,14 +379,14 @@ public: /** Configure the weights transform kernel. * - * @param[in] weights_hwio Pointer to the weights tensor - * @param[in] output Pointer to working space for the output tensor in the Winograd domain. - * @param[in] matrix_stride Stride across matrices in the output workspace. - * @param[in] num_output_channels Number of filters. - * @param[in] num_input_channels Number of channels in each filter. 
+ * @param[in] weights_hwio Pointer to the weights tensor + * @param[out] output Pointer to working space for the output tensor in the Winograd domain. + * @param[in] matrix_stride Stride across matrices in the output workspace. + * @param[in] num_output_channels Number of filters. + * @param[in] num_input_channels Number of channels in each filter. */ - virtual void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0; + virtual void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0; virtual ~INEWinogradLayerTransformWeightsKernel() { @@ -428,7 +428,7 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info); // Inherited methods overridden: - void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override; + void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override; unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const override; int get_matrix_stride(const KernelShape &kernel_shape) const override; void run(const Window &window, const ThreadInfo &info) override; @@ -440,147 +440,26 @@ private: using WeightsTransform = typename WinogradBase::template WeightsTransform; const ITensor *_weights_hwio; - T *_output; + ITensor *_output; int _matrix_stride; int _num_output_channels; int _num_input_channels; }; -/** Interface for the NEON kernel to perform Winograd. */ -template -class INEWinogradLayerBatchedGEMMKernel : public INEKernel -{ -public: - /** Get the number of GEMMs to compute - */ - virtual unsigned int get_number_gemms() const = 0; - /** Initialise the kernel - * - * @param[in] n_gemms Number of GEMMs to compute. - * @param[in] M in_shape.num_batches * tile_rows * tile_cols. - * @param[in] K Number of channels in the input tensor. - * @param[in] N Number of channels in the output tensor. - * @param[in] a_matrix_stride Stride between input matrices. - * @param[in] a_row_stride Row stride inside input matrix. - * @param[in] b_matrix_stride Stride between weights matrices. - * @param[in] b_row_stride Row stride inside the weights matrix. - * @param[in] c_matrix_stride Stride between output matrices. - * @param[in] c_row_stride Row stride inside the output matrix. - * @param[out] a_ptr Input workspace. - * @param[out] b_ptr Kernel workspace. - * @param[out] c_ptr Output workspace. - */ - virtual void configure( - const unsigned int n_gemms, - const int M, const int K, const int N, - const int a_matrix_stride, - const int a_row_stride, - const int b_matrix_stride, - const int b_row_stride, - const int c_matrix_stride, - const int c_row_stride, - const TIn *const a_ptr, - const TIn *const b_ptr, - TOut *const c_ptr) = 0; - - /** Get the number of tiles per row - */ - virtual int get_output_tile_rows() const = 0; - /** Get the number of tiles per columns - */ - virtual int get_output_tile_cols() const = 0; - /** Get the number of blocks - */ - virtual int get_number_blocks() const = 0; -}; - /** NEON kernel to perform Winograd. 
*/ template -class NEWinogradLayerBatchedGEMMKernel : public INEWinogradLayerBatchedGEMMKernel +class NEWinogradLayerConfiguration { public: /** Winograd base kernel */ using WinogradBase = winograd::WinogradGEMM; /** Winograd convolution kernel */ - using WinogradConv = typename WinogradBase::template Convolution; - /** Winograd batched blocked GEMM operator */ - using MultiGEMM = winograd::BatchedBlockedGemm; - - const char *name() const override - { - return "NEWinogradLayerBatchedGEMMKernel"; - } - /** Constructor */ - NEWinogradLayerBatchedGEMMKernel(); - - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradLayerBatchedGEMMKernel(const NEWinogradLayerBatchedGEMMKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradLayerBatchedGEMMKernel &operator=(const NEWinogradLayerBatchedGEMMKernel &) = delete; - /** Allow instances of this class to be moved */ - NEWinogradLayerBatchedGEMMKernel(NEWinogradLayerBatchedGEMMKernel &&) = default; - /** Allow instances of this class to be moved */ - NEWinogradLayerBatchedGEMMKernel &operator=(NEWinogradLayerBatchedGEMMKernel &&) = default; - /** Default destructor. */ - ~NEWinogradLayerBatchedGEMMKernel() = default; - - // Inherited methods overridden: - - unsigned int get_number_gemms() const override; - int get_output_tile_rows() const override; - int get_output_tile_cols() const override; - int get_number_blocks() const override; - - /** Initialise the kernel - * - * @param[in] n_gemms Number of GEMMs to compute. - * @param[in] M in_shape.num_batches * tile_rows * tile_cols. - * @param[in] K Number of channels in the input tensor. - * @param[in] N Number of channels in the output tensor. - * @param[in] a_matrix_stride Stride between input matrices. - * @param[in] a_row_stride Row stride inside input matrix. - * @param[in] b_matrix_stride Stride between weights matrices. - * @param[in] b_row_stride Row stride inside the weights matrix. - * @param[in] c_matrix_stride Stride between output matrices. - * @param[in] c_row_stride Row stride inside the output matrix. - * @param[out] a_ptr Input workspace. - * @param[out] b_ptr Kernel workspace. - * @param[out] c_ptr Output workspace. - */ - void configure( - const unsigned int n_gemms, - const int M, const int K, const int N, - const int a_matrix_stride, - const int a_row_stride, - const int b_matrix_stride, - const int b_row_stride, - const int c_matrix_stride, - const int c_row_stride, - const TIn *const a_ptr, - const TIn *const b_ptr, - TOut *const c_ptr) override; - void run(const Window &window, const ThreadInfo &info) override; - - /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerBatchedGEMMKernel. - * - * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F32 - * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a. - * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a. - * @param[out] output Output tensor. 
Data type supported: same as @p a - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of matrix C - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and - * if the reshape of matrix B should happen only for the first run - * - * @return a status - */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo()); + using WinogradConv = typename WinogradBase::template Convolution; -private: - static const int _output_tile_rows = OutputTileRows; - static const int _output_tile_cols = OutputTileCols; - std::unique_ptr _gemms; + using TransformInputKernel = NEWinogradLayerTransformInputKernel; + using TransformWeightsKernel = NEWinogradLayerTransformWeightsKernel; + using TransformOutputKernel = NEWinogradLayerTransformOutputKernel; }; } // namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h index d897ae00e7..384fbf893b 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h @@ -43,7 +43,7 @@ class ITensor; * -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method ) * -# @ref NEWinogradLayerTransformInputKernel * -# @ref NEWinogradLayerTransformOutputKernel - * -# @ref NEWinogradLayerBatchedGEMMKernel + * -# @ref NEGEMMAssemblyDispatchF32 * -# @ref CPPPermute (three times: weights, input and output) * * @note Some Winograd configurations (i.e. F(2x2, 5x5), F(4x4, 5x5)) are supported only with enable_fast_math = true diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index 7f693c901c..4afd6d85f4 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -315,7 +315,7 @@ v18.02 Public major release - @ref NEWinogradLayerTransformInputKernel / NEWinogradLayer - @ref NEWinogradLayerTransformOutputKernel / NEWinogradLayer - @ref NEWinogradLayerTransformWeightsKernel / NEWinogradLayer - - Renamed NEWinogradLayerKernel into @ref NEWinogradLayerBatchedGEMMKernel + - Renamed NEWinogradLayerKernel into NEWinogradLayerBatchedGEMMKernel - New GLES kernels / functions: - @ref GCTensorShiftKernel / @ref GCTensorShift diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp index 50e69a8adf..b295a0c685 100644 --- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp @@ -40,38 +40,6 @@ namespace arm_compute namespace { -Status validate_arguments_winograd_gemm(const ITensorInfo *a, const ITensorInfo *b, const ITensor *c, const ITensorInfo *output, const float alpha, const float beta, - const GEMMInfo &gemm_info = GEMMInfo()) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(b); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - - if(c != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info()); 
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The matrix C must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The matrix C must have the same number of columns as the matrix B"); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() != a->num_dimensions()); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_UNUSED(alpha, beta); - return Status{}; -} - Status validate_arguments_winograd_weight_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); @@ -232,78 +200,6 @@ std::pair validate_and_configure_window_winograd_output_trans(IT return std::make_pair(err, win); } } // namespace -template -NEWinogradLayerBatchedGEMMKernel::NEWinogradLayerBatchedGEMMKernel() - : _gemms() -{ -} - -template -void NEWinogradLayerBatchedGEMMKernel::configure( - const unsigned int n_gemms, - const int M, const int K, const int N, - const int a_matrix_stride, - const int a_row_stride, - const int b_matrix_stride, - const int b_row_stride, - const int c_matrix_stride, - const int c_row_stride, - const TIn *const a_ptr, - const TIn *const b_ptr, - TOut *const c_ptr) -{ - _gemms = support::cpp14::make_unique(n_gemms, M, K, N, a_matrix_stride, a_row_stride, b_matrix_stride, b_row_stride, c_matrix_stride, c_row_stride, a_ptr, b_ptr, c_ptr); - Window win; - auto win_last = _gemms->get_window(); - win.set(Window::DimX, Window::Dimension(0, win_last, 1)); - INEKernel::configure(win); -} - -template -void NEWinogradLayerBatchedGEMMKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - const size_t first_gemm = window.x().start(); - const size_t last_gemm = window.x().end(); - _gemms->run(first_gemm, last_gemm); -} - -template -unsigned int NEWinogradLayerBatchedGEMMKernel::get_number_gemms() const -{ - return WinogradBase::N_GEMMS; -} - -template -int NEWinogradLayerBatchedGEMMKernel::get_output_tile_rows() const -{ - return _output_tile_rows; -} - -template -int NEWinogradLayerBatchedGEMMKernel::get_output_tile_cols() const -{ - return _output_tile_cols; -} - -template -int NEWinogradLayerBatchedGEMMKernel::get_number_blocks() const -{ - return WinogradConv::N_BLOCK; -} - -template -Status NEWinogradLayerBatchedGEMMKernel::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensor *c, - const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_gemm(a, b, c, output, alpha, beta, gemm_info)); - return Status{}; -} - -template class NEWinogradLayerBatchedGEMMKernel; -template class NEWinogradLayerBatchedGEMMKernel; -template class NEWinogradLayerBatchedGEMMKernel; // Weights transform @@ -332,7 +228,7 @@ int NEWinogradLayerTransformWeightsKernel void NEWinogradLayerTransformWeightsKernel::configure( const 
ITensor *weights_hwio, - T *const output, + ITensor *output, const int matrix_stride, /** Stride across matrices in the output. */ const int num_output_channels, /** Number of filters. */ const int num_input_channels) /** Number of channels in each filter. */ @@ -344,7 +240,7 @@ void NEWinogradLayerTransformWeightsKernel(_weights_hwio->buffer()), _output, _matrix_stride, matrix_row_stride, _num_output_channels, _num_input_channels); + WeightsTransform transform(reinterpret_cast(_weights_hwio->buffer()), reinterpret_cast(_output->buffer()), _matrix_stride, matrix_row_stride, _num_output_channels, _num_input_channels); const size_t fst = window.x().start(); const size_t lst = window.x().end(); transform.run(fst, lst); @@ -423,7 +319,7 @@ void NEWinogradLayerTransformInputKernel(_input_nhwc->buffer()), _num_batches, _num_rows, _num_cols, _num_channels, _padding, _output, _matrix_stride, _num_channels); + InputTransform input_transform(reinterpret_cast(_input_nhwc->buffer()), _num_batches, _num_rows, _num_cols, _num_channels, _padding, reinterpret_cast(_output->buffer()), + _matrix_stride, _num_channels); // The code below cannot be moved to configure because biases hasn't been allocated at that point const size_t fst = window.x().start(); @@ -511,9 +408,9 @@ Tensor4DShape NEWinogradLayerTransformOutputKernel void NEWinogradLayerTransformOutputKernel::configure( const ITensor *biases, - const T *const output_workingspace, + const ITensor *output_workingspace, const int matrix_stride, - ITensor *const output_nhwc, + ITensor *output_nhwc, const int num_batches, const int num_rows, const int num_cols, @@ -529,7 +426,7 @@ void NEWinogradLayerTransformOutputKernel(_output_workspace->buffer()), _matrix_stride, _matrix_row_stride, (_biases ? reinterpret_cast(_biases->buffer()) : nullptr), reinterpret_cast(_output_nhwc->buffer()), _num_batches, _num_rows, _num_cols, _num_channels, 0, _output_nhwc->info()->strides_in_bytes()[2] / sizeof(T), _output_nhwc->info()->strides_in_bytes()[1] / sizeof(T)); diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 29da0803a3..a71eade9a1 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -155,29 +155,32 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * { if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4) { - transform_input_kernel = support::cpp14::make_unique>(); - transform_weights_kernel = support::cpp14::make_unique>(); - transform_output_kernel = support::cpp14::make_unique>(); - n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; - N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; + using config = NEWinogradLayerConfiguration; + transform_input_kernel = support::cpp14::make_unique(); + transform_weights_kernel = support::cpp14::make_unique(); + transform_output_kernel = support::cpp14::make_unique(); + n_gemms = config::WinogradBase::N_GEMMS; + N_BLOCK = config::WinogradConv::N_BLOCK; } else { - transform_input_kernel = support::cpp14::make_unique>(); - transform_weights_kernel = support::cpp14::make_unique>(); - transform_output_kernel = support::cpp14::make_unique>(); - n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; - N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; + using config = NEWinogradLayerConfiguration; + transform_input_kernel = 
support::cpp14::make_unique(); + transform_weights_kernel = support::cpp14::make_unique(); + transform_output_kernel = support::cpp14::make_unique(); + n_gemms = config::WinogradBase::N_GEMMS; + N_BLOCK = config::WinogradConv::N_BLOCK; } break; } case 5: { - transform_input_kernel = support::cpp14::make_unique>(); - transform_weights_kernel = support::cpp14::make_unique>(); - transform_output_kernel = support::cpp14::make_unique>(); - n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; - N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; + using config = NEWinogradLayerConfiguration; + transform_input_kernel = support::cpp14::make_unique(); + transform_weights_kernel = support::cpp14::make_unique(); + transform_output_kernel = support::cpp14::make_unique(); + n_gemms = config::WinogradBase::N_GEMMS; + N_BLOCK = config::WinogradConv::N_BLOCK; break; } default: @@ -195,21 +198,28 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * const int out_channels = output->info()->dimension(channel_idx); const Tensor4DShape in_shape(internal_get_input_shape(input)); + const DataType data_type = input->info()->data_type(); const size_t data_type_size = input->info()->element_size(); // Get the memory required to instantiate a new Winograd operator. constexpr size_t storage_alignment = 64; // Kernel Storage const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, - in_channels) * data_type_size + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + in_channels) + * data_type_size + + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ // Input storage const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, - use_same_padding) * data_type_size + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + use_same_padding) + * data_type_size + + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ // Output storage const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, - use_same_padding) * data_type_size + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + use_same_padding) + * data_type_size + + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ ; const KernelShape kernel_shape({ out_channels, static_cast(kernel_size.height), static_cast(kernel_size.width), in_channels }); const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape); @@ -229,28 +239,28 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * const int output_matrix_row_stride = kernel_matrix_row_stride; TensorShape a_shape(k, m, 1, n_gemms); - Strides a_strides(element_size_from_data_type(DataType::F32)); + Strides a_strides(data_type_size); a_strides.set(1, a_strides[0] * k); + //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. 
a_strides.set(2, 0); - //a_strides.set(2, element_size_from_data_type(DataType::F32) * input_matrix_stride / n_gemms); - a_strides.set(3, element_size_from_data_type(DataType::F32) * input_matrix_stride); + a_strides.set(3, data_type_size * input_matrix_stride); TensorShape b_shape(n, k, n_gemms); - Strides b_strides(element_size_from_data_type(DataType::F32)); - b_strides.set(1, element_size_from_data_type(DataType::F32) * kernel_matrix_row_stride); - b_strides.set(2, element_size_from_data_type(DataType::F32) * kernel_matrix_stride); + Strides b_strides(data_type_size); + b_strides.set(1, data_type_size * kernel_matrix_row_stride); + b_strides.set(2, data_type_size * kernel_matrix_stride); TensorShape d_shape(n, m, 1, n_gemms); - Strides d_strides(element_size_from_data_type(DataType::F32)); - d_strides.set(1, element_size_from_data_type(DataType::F32) * output_matrix_row_stride); + Strides d_strides(data_type_size); + d_strides.set(1, data_type_size * output_matrix_row_stride); + //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. d_strides.set(2, 0); - //d_strides.set(2, element_size_from_data_type(DataType::F32) * output_matrix_stride / n_gemms); - d_strides.set(3, element_size_from_data_type(DataType::F32) * output_matrix_stride); + d_strides.set(3, data_type_size * output_matrix_stride); TensorInfo a_info, b_info, d_info; - a_info.init(a_shape, 1, DataType::F32, a_strides, 0, input_storage_size); - b_info.init(b_shape, 1, DataType::F32, b_strides, 0, kernel_storage_size); - d_info.init(d_shape, 1, DataType::F32, d_strides, 0, output_storage_size); + a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size); + b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size); + d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size); _input_workspace.allocator()->init(a_info, storage_alignment); _input_workspace.allocator()->allocate(); @@ -276,12 +286,12 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); _input_nhwc.allocator()->allocate(); transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); + &_input_workspace, input_matrix_stride); } else { transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); + &_input_workspace, input_matrix_stride); } // Configure WeightsTransform @@ -290,14 +300,14 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U)); - transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); } else { // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] 
_permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U)); - transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); } _weights_hwio.allocator()->allocate(); @@ -306,13 +316,13 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * if(data_layout == DataLayout::NCHW) { - transform_output_kernel->configure(biases, reinterpret_cast(_output_workspace.buffer()), + transform_output_kernel->configure(biases, &_output_workspace, output_matrix_stride, &_output_nhwc, in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); } else { - transform_output_kernel->configure(biases, reinterpret_cast(_output_workspace.buffer()), + transform_output_kernel->configure(biases, &_output_workspace, output_matrix_stride, _output, in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); } -- cgit v1.2.1
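
Note on the pattern this patch applies (illustrative only, not part of the commit): the kernels stop caching a raw `T *` captured at configure() time and instead store the `ITensor *` itself, calling `tensor->buffer()` only inside run() — see for example `reinterpret_cast<T *>(_output->buffer())` in the transformed kernels above. The sketch below is a minimal stand-alone rendering of that idea; `ITensor` and `ExampleTransformKernel` here are simplified stand-ins written for this note, not the library's real classes or signatures.

// Minimal sketch of the configure-with-handle / resolve-buffer-in-run pattern.
// Storage can be (re)allocated by a memory manager between configure() and run().
#include <cstdint>
#include <cstdio>
#include <vector>

// Simplified stand-in for arm_compute::ITensor: the buffer may not exist yet at configure() time.
class ITensor
{
public:
    void allocate(size_t bytes) { _storage.resize(bytes); }
    uint8_t *buffer() { return _storage.data(); }
private:
    std::vector<uint8_t> _storage;
};

// Hypothetical kernel: before the change it would have stored "float *_output" captured
// at configure() time; now it stores the tensor handle and casts lazily.
class ExampleTransformKernel
{
public:
    void configure(ITensor *output, int num_elements)
    {
        _output       = output;       // no buffer() call here: backing memory may be unallocated
        _num_elements = num_elements;
    }
    void run()
    {
        auto *out = reinterpret_cast<float *>(_output->buffer()); // resolved at execution time
        for(int i = 0; i < _num_elements; ++i)
        {
            out[i] = 0.f;
        }
    }
private:
    ITensor *_output{ nullptr };
    int      _num_elements{ 0 };
};

int main()
{
    ITensor                workspace;
    ExampleTransformKernel kernel;
    kernel.configure(&workspace, 8);       // workspace storage not yet allocated
    workspace.allocate(8 * sizeof(float)); // e.g. performed later by a memory manager
    kernel.run();
    std::printf("first element: %f\n", reinterpret_cast<float *>(workspace.buffer())[0]);
    return 0;
}

Because the buffer address is no longer baked in at configure() time, a memory manager can defer, reuse, or move the workspace allocations — which is exactly the follow-up this commit describes as its goal.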