2 files changed, 10 insertions, 5 deletions
diff --git a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h
index c1343044a6..9ab2e4de11 100644
--- a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h
@@ -31,6 +31,7 @@ namespace arm_compute
 {
 class ITensor;
 class NEWinogradLayerKernel;
+
 class Winograd3x3F32
 {
 public:
@@ -68,10 +69,9 @@ public:
 
     /** Initialise the kernel
      *
-     * @param[in,out] output    Output tensor to store the result of matrix multiplication. Data type supported: F32.
-     * @param[in]     convolver A pointer to the winograd convolver, this object must have been configured and is ready to execute 16 GEMMS .
+     * @param[in] convolver A pointer to the Winograd convolver; this object must already be configured and ready to execute 16 GEMMs.
      */
-    void configure(ITensor *output, Winograd3x3F32 *convolver);
+    void configure(Winograd3x3F32 *convolver);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -90,7 +90,6 @@ public:
 
 protected:
     Winograd3x3F32 *_convolver;
-    ITensor        *_output;
 };
 
 } // namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h
index 77707060ec..6fecf082a2 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h
@@ -28,6 +28,7 @@
 
 #include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/functions/CPPPermute.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -69,9 +70,14 @@ public:
 private:
     MemoryGroup                     _memory_group;
     NEWinogradLayerKernel           _winograd_kernel;
-    Tensor                          _weights_workspace;
+    CPPPermute                      _permute_input;
+    CPPPermute                      _permute_weights;
+    CPPPermute                      _permute_output;
     Tensor                          _workspace;
     Tensor                          _kernel_storage;
+    Tensor                          _input_nhwc;
+    Tensor                          _output_nhwc;
+    Tensor                          _weights_hwio;
     const ITensor                  *_input;
     const ITensor                  *_weights;
     ITensor                        *_output;