From beabe3bdf47306d0940ddf2ddf52ada6903a0875 Mon Sep 17 00:00:00 2001
From: Moritz Pflanzer
Date: Thu, 31 Aug 2017 14:56:32 +0100
Subject: COMPMID-481: Add AArch64 GEMM

Change-Id: I34f94f99cb05f0eabafee13c5e623ee779b72360
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/83741
Tested-by: Kaizen
Reviewed-by: Anthony Barbier
Reviewed-by: Pablo Tello
---
 tests/networks/AlexNetNetwork.h                    | 158 +++++++++++----
 .../validation/fixtures/ConvolutionLayerFixture.h  |  10 +-
 2 files changed, 92 insertions(+), 76 deletions(-)

diff --git a/tests/networks/AlexNetNetwork.h b/tests/networks/AlexNetNetwork.h
index 1e99503792..0c06c1860f 100644
--- a/tests/networks/AlexNetNetwork.h
+++ b/tests/networks/AlexNetNetwork.h
@@ -24,6 +24,7 @@
 #ifndef __ARM_COMPUTE_TEST_MODEL_OBJECTS_ALEXNET_H__
 #define __ARM_COMPUTE_TEST_MODEL_OBJECTS_ALEXNET_H__
 
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include "tests/AssetsLibrary.h"
@@ -80,55 +81,64 @@ public:
             w[7].allocator()->init(TensorInfo(TensorShape(4096U, 1000U), 1, _data_type, _fixed_point_position));
             b[7].allocator()->init(TensorInfo(TensorShape(1000U), 1, _data_type, _fixed_point_position));
 
-            w21 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates()));
-            w22 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
-            b21 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates()));
-            b22 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates(128)));
+            w11 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates()));
+            w12 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
+            b11 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates()));
+            b12 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates(128)));
 
-            w41 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates()));
-            w42 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
-            b41 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates()));
-            b42 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates(192)));
+            w31 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates()));
+            w32 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
+            b31 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates()));
+            b32 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates(192)));
 
-            w51 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates()));
-            w52 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
-            b51 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates()));
-            b52 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates(128)));
+            w41 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates()));
+            w42 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
+            b41 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates()));
+            b42 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates(128)));
         }
         else
         {
-            auto reshape = [&](unsigned int width, unsigned int height) -> TensorShape
+            auto reshape = [&](unsigned int width, unsigned int height, bool convolution_layer) -> TensorShape
             {
-                const int interleave_width = 16 / arm_compute::data_size_from_type(_data_type);
+                const bool is_optimised = std::is_same::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && data_type == DataType::F32;
 
-                return TensorShape{ width * interleave_width, static_cast<unsigned int>(std::ceil(static_cast<float>(height) / interleave_width)) };
+                if(convolution_layer && is_optimised)
+                {
+                    return TensorShape{ height, width };
+                }
+                else
+                {
+                    const int interleave_width = 16 / arm_compute::data_size_from_type(_data_type);
+
+                    return TensorShape{ width * interleave_width, static_cast<unsigned int>(std::ceil(static_cast<float>(height) / interleave_width)) };
+                }
             };
 
             // Create tensor for the reshaped weights
-            w[0].allocator()->init(TensorInfo(reshape(366U, 96U), 1, _data_type, _fixed_point_position));
+            w[0].allocator()->init(TensorInfo(reshape(366U, 96U, true), 1, _data_type, _fixed_point_position));
 
             // Configure the direct convolution's weights. Direct convolution doesn't need reshape weights
             if(!_is_direct_conv)
             {
-                auto w21_tensor = std::unique_ptr<TensorType>(new TensorType());
-                auto w22_tensor = std::unique_ptr<TensorType>(new TensorType());
+                auto w11_tensor = std::unique_ptr<TensorType>(new TensorType());
+                auto w12_tensor = std::unique_ptr<TensorType>(new TensorType());
+                auto w31_tensor = std::unique_ptr<TensorType>(new TensorType());
+                auto w32_tensor = std::unique_ptr<TensorType>(new TensorType());
                 auto w41_tensor = std::unique_ptr<TensorType>(new TensorType());
                 auto w42_tensor = std::unique_ptr<TensorType>(new TensorType());
-                auto w51_tensor = std::unique_ptr<TensorType>(new TensorType());
-                auto w52_tensor = std::unique_ptr<TensorType>(new TensorType());
-                w21_tensor->allocator()->init(TensorInfo(reshape(1248U, 128U), 1, _data_type, _fixed_point_position));
-                w22_tensor->allocator()->init(TensorInfo(reshape(1248U, 128U), 1, _data_type, _fixed_point_position));
-                w41_tensor->allocator()->init(TensorInfo(reshape(1920U, 192U), 1, _data_type, _fixed_point_position));
-                w42_tensor->allocator()->init(TensorInfo(reshape(1920U, 192U), 1, _data_type, _fixed_point_position));
-                w51_tensor->allocator()->init(TensorInfo(reshape(1920U, 128U), 1, _data_type, _fixed_point_position));
-                w52_tensor->allocator()->init(TensorInfo(reshape(1920U, 128U), 1, _data_type, _fixed_point_position));
-                w[2].allocator()->init(TensorInfo(reshape(2560U, 384U), 1, _data_type, _fixed_point_position));
-                w21 = std::move(w21_tensor);
-                w22 = std::move(w22_tensor);
+                w11_tensor->allocator()->init(TensorInfo(reshape(1248U, 128U, true), 1, _data_type, _fixed_point_position));
+                w12_tensor->allocator()->init(TensorInfo(reshape(1248U, 128U, true), 1, _data_type, _fixed_point_position));
+                w31_tensor->allocator()->init(TensorInfo(reshape(1920U, 192U, true), 1, _data_type, _fixed_point_position));
+                w32_tensor->allocator()->init(TensorInfo(reshape(1920U, 192U, true), 1, _data_type, _fixed_point_position));
+                w41_tensor->allocator()->init(TensorInfo(reshape(1920U, 128U, true), 1, _data_type, _fixed_point_position));
+                w42_tensor->allocator()->init(TensorInfo(reshape(1920U, 128U, true), 1, _data_type, _fixed_point_position));
+                w[2].allocator()->init(TensorInfo(reshape(2560U, 384U, true), 1, _data_type, _fixed_point_position));
+                w11 = std::move(w11_tensor);
+                w12 = std::move(w12_tensor);
+                w31 = std::move(w31_tensor);
+                w32 = std::move(w32_tensor);
                 w41 = std::move(w41_tensor);
                 w42 = std::move(w42_tensor);
-                w51 = std::move(w51_tensor);
-                w52 = std::move(w52_tensor);
             }
             else
             {
@@ -140,20 +150,20 @@ public:
                 b[3].allocator()->init(TensorInfo(TensorShape(384U), 1, _data_type, _fixed_point_position));
                 w[4].allocator()->init(TensorInfo(TensorShape(3U, 3U, 192U, 256U), 1, _data_type, _fixed_point_position));
                 b[4].allocator()->init(TensorInfo(TensorShape(256U), 1, _data_type, _fixed_point_position));
-                w21 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates()));
-                w22 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
-                b21 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates()));
-                b22 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates(128)));
-
-                w41 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates()));
-                w42 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
-                b41 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates()));
-                b42 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates(192)));
-
-                w51 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates()));
-                w52 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
-                b51 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates()));
-                b52 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates(128)));
+                w11 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates()));
+                w12 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
+                b11 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates()));
+                b12 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates(128)));
+
+                w31 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates()));
+                w32 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
+                b31 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates()));
+                b32 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates(192)));
+
+                w41 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates()));
+                w42 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
+                b41 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates()));
+                b42 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates(128)));
             }
 
             b[5].allocator()->init(TensorInfo(TensorShape(4096U), 1, _data_type, _fixed_point_position));
@@ -162,9 +172,9 @@ public:
 
         if(_batches > 1 && std::is_same::value)
         {
-            w[5].allocator()->init(TensorInfo(reshape(9216U, 4096U), 1, _data_type, _fixed_point_position));
-            w[6].allocator()->init(TensorInfo(reshape(4096U, 4096U), 1, _data_type, _fixed_point_position));
-            w[7].allocator()->init(TensorInfo(reshape(4096U, 1000U), 1, _data_type, _fixed_point_position));
+            w[5].allocator()->init(TensorInfo(reshape(9216U, 4096U, false), 1, _data_type, _fixed_point_position));
+            w[6].allocator()->init(TensorInfo(reshape(4096U, 4096U, false), 1, _data_type, _fixed_point_position));
+            w[7].allocator()->init(TensorInfo(reshape(4096U, 1000U, false), 1, _data_type, _fixed_point_position));
         }
         else
         {
@@ -230,8 +240,8 @@ public:
         norm1.configure(&act1_out, &norm1_out, NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f));
         pool1.configure(&norm1_out, &pool1_out, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
         // Layer 2
-        conv21.configure(pool11_out.get(), w21.get(), b21.get(), conv21_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U, 5U, 128U));
-        conv22.configure(pool12_out.get(), w22.get(), b22.get(), conv22_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U, 5U, 128U));
+        conv21.configure(pool11_out.get(), w11.get(), b11.get(), conv21_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U, 5U, 128U));
+        conv22.configure(pool12_out.get(), w12.get(), b12.get(), conv22_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U, 5U, 128U));
         act2.configure(&conv2_out, &act2_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
         norm2.configure(&act2_out, &norm2_out, NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f));
         pool2.configure(&norm2_out, &pool2_out, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
@@ -240,12 +250,12 @@ public:
         conv3.configure(&pool2_out, &w[2], b2, &conv3_out, PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 384U));
         act3.configure(&conv3_out, &act3_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
         // Layer 4
-        conv41.configure(act31_out.get(), w41.get(), b41.get(), conv41_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 192U));
-        conv42.configure(act32_out.get(), w42.get(), b42.get(), conv42_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 192U));
+        conv41.configure(act31_out.get(), w31.get(), b31.get(), conv41_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 192U));
+        conv42.configure(act32_out.get(), w32.get(), b32.get(), conv42_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 192U));
         act4.configure(&conv4_out, &act4_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
         // Layer 5
-        conv51.configure(act41_out.get(), w51.get(), b51.get(), conv51_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 128U));
-        conv52.configure(act42_out.get(), w52.get(), b52.get(), conv52_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 128U));
+        conv51.configure(act41_out.get(), w41.get(), b41.get(), conv51_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 128U));
+        conv52.configure(act42_out.get(), w42.get(), b42.get(), conv52_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 128U));
         act5.configure(&conv5_out, &act5_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
         pool5.configure(&act5_out, &pool5_out, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
         // Layer 6
@@ -291,12 +301,12 @@ public:
 
         if(!_is_direct_conv)
         {
-            dynamic_cast<TensorType *>(w21.get())->allocator()->allocate();
-            dynamic_cast<TensorType *>(w22.get())->allocator()->allocate();
+            dynamic_cast<TensorType *>(w11.get())->allocator()->allocate();
+            dynamic_cast<TensorType *>(w12.get())->allocator()->allocate();
+            dynamic_cast<TensorType *>(w31.get())->allocator()->allocate();
+            dynamic_cast<TensorType *>(w32.get())->allocator()->allocate();
             dynamic_cast<TensorType *>(w41.get())->allocator()->allocate();
             dynamic_cast<TensorType *>(w42.get())->allocator()->allocate();
-            dynamic_cast<TensorType *>(w51.get())->allocator()->allocate();
-            dynamic_cast<TensorType *>(w52.get())->allocator()->allocate();
         }
         else
         {
@@ -359,12 +369,12 @@ public:
 
         if(!_is_direct_conv)
         {
-            library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w21.get())), 9);
-            library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w22.get())), 10);
-            library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w41.get())), 11);
-            library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w42.get())), 12);
-            library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w51.get())), 13);
-            library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w52.get())), 14);
+            library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w11.get())), 9);
+            library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w12.get())), 10);
+            library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w31.get())), 11);
+            library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w32.get())), 12);
+            library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w41.get())), 13);
+            library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w42.get())), 14);
         }
         else
         {
@@ -481,18 +491,18 @@ public:
             }
         }
 
-        w21.reset();
-        w22.reset();
-        b21.reset();
-        b21.reset();
+        w11.reset();
+        w12.reset();
+        b11.reset();
+        b12.reset();
+        w31.reset();
+        w32.reset();
+        b31.reset();
+        b32.reset();
         w41.reset();
         w42.reset();
         b41.reset();
         b42.reset();
-        w51.reset();
-        w52.reset();
-        b51.reset();
-        b52.reset();
 
         conv1_out.allocator()->free();
         act1_out.allocator()->free();
@@ -595,9 +605,9 @@ private:
 
     TensorType input{}, output{};
     std::array<TensorType, 8> w{ {} }, b{ {} };
-    std::unique_ptr<ITensorType> w21{ nullptr }, w22{ nullptr }, b21{ nullptr }, b22{ nullptr };
+    std::unique_ptr<ITensorType> w11{ nullptr }, w12{ nullptr }, b11{ nullptr }, b12{ nullptr };
+    std::unique_ptr<ITensorType> w31{ nullptr }, w32{ nullptr }, b31{ nullptr }, b32{ nullptr };
     std::unique_ptr<ITensorType> w41{ nullptr }, w42{ nullptr }, b41{ nullptr }, b42{ nullptr };
-    std::unique_ptr<ITensorType> w51{ nullptr }, w52{ nullptr }, b51{ nullptr }, b52{ nullptr };
 
     TensorType conv1_out{}, act1_out{}, norm1_out{}, pool1_out{};
     TensorType conv2_out{}, act2_out{}, pool2_out{}, norm2_out{};
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index dd2df727e9..fcaf4ef42b 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/IAccessor.h"
@@ -39,6 +40,8 @@
 
 namespace arm_compute
 {
+class NEConvolutionLayer;
+
 namespace test
 {
 namespace validation
@@ -85,6 +88,8 @@ protected:
     {
         // Check if its a "fully connected" convolution
        const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
+        const bool is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && data_type == DataType::F32;
+
         reshaped_weights_shape.collapse(3);
 
         if(bias_shape.total_size() > 0)
         {
             reshaped_weights_shape.set(0, reshaped_weights_shape.x() + 1);
         }
 
-        if(is_fully_connected_convolution)
+        if(is_fully_connected_convolution || is_optimised)
         {
             const size_t shape_x = reshaped_weights_shape.x();
             reshaped_weights_shape.set(0, reshaped_weights_shape.y());
@@ -138,6 +143,7 @@ protected:
         if(!reshape_weights)
         {
             const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
+            const bool is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && data_type == DataType::F32;
 
             TensorShape tmp_weights_shape(weights_shape);
             SimpleTensor<T> tmp_weights(tmp_weights_shape, data_type, 1, fixed_point_position);
 
             tmp_weights = linearise_weights(tmp_weights, &tmp_bias);
 
-            if(!is_fully_connected_convolution)
+            if(!is_fully_connected_convolution && !is_optimised)
             {
                 // Transpose with interleave
                 const int interleave_size = 16 / tmp_weights.element_size();
--
cgit v1.2.1
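Note (editor's illustration, not part of the patch): both files above gate the same layout
decision. On the optimised AArch64 GEMM path (a NEON convolution function, an ARMv8-class
CPU and F32 data) the reshaped weights are simply a transpose of the 2D weight matrix,
since the optimised kernel does its own operand packing; on the legacy path the matrix is
transposed and interleaved in blocks of 16 / element_size values, which is what the
ceil-based arithmetic in the reshape lambda computes. The self-contained C++ sketch below
reproduces only that shape arithmetic; every name in it is invented for illustration and
none of it comes from the library.

    // Sketch of the two reshaped-weights layouts switched on by this patch.
    #include <cmath>
    #include <cstddef>
    #include <iostream>
    #include <utility>

    // Returns the {x, y} dimensions of the reshaped 2D weight matrix.
    // * Legacy path: transpose plus interleave in blocks of (16 / element_size) values.
    // * Optimised AArch64 GEMM path: a plain transpose is enough.
    std::pair<unsigned int, unsigned int> reshaped_weights_shape(unsigned int width,
                                                                 unsigned int height,
                                                                 std::size_t  element_size,
                                                                 bool         optimised_gemm)
    {
        if(optimised_gemm)
        {
            // Mirrors the patch's TensorShape{ height, width }.
            return { height, width };
        }

        const unsigned int interleave = 16 / element_size;
        // Mirrors TensorShape{ width * interleave_width, ceil(height / interleave_width) }.
        return { width * interleave,
                 static_cast<unsigned int>(std::ceil(static_cast<float>(height) / interleave)) };
    }

    int main()
    {
        // FP32 weights (element_size == 4) for the 2560x384 matrix used for w[2] above.
        const auto legacy    = reshaped_weights_shape(2560U, 384U, 4, false); // {10240, 96}
        const auto optimised = reshaped_weights_shape(2560U, 384U, 4, true);  // {384, 2560}

        std::cout << "legacy:    " << legacy.first << " x " << legacy.second << "\n"
                  << "optimised: " << optimised.first << " x " << optimised.second << "\n";
        return 0;
    }

Compiled on its own, the sketch prints 10240 x 96 for the legacy layout and 384 x 2560 for
the optimised one, i.e. the two results reshape(2560U, 384U, true) can produce in the
network above depending on whether the optimised GEMM path is active.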