-rw-r--r--  SConscript | 3
-rw-r--r--  arm_compute/core/CPP/CPPTypes.h | 13
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h | 85
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h | 44
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/asmlib.hpp | 121
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemm_common.hpp | 33
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp | 176
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp | 72
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp | 367
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp | 358
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/mergeresults.hpp | 59
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp | 236
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/merges/list.hpp | 24
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/profiler.hpp | 97
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transform.hpp | 110
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp | 174
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/list.hpp | 32
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp | 139
-rw-r--r--  arm_compute/runtime/IScheduler.h | 26
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConvolutionLayer.h | 34
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMM.h | 22
-rwxr-xr-x  scripts/add_copyright.py | 2
-rwxr-xr-x  scripts/check_bad_style.sh | 16
-rwxr-xr-x  scripts/clang_tidy_rules.py | 6
-rwxr-xr-x  scripts/fix_code_formatting.sh | 5
-rw-r--r--  src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 16
-rw-r--r--  src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp | 127
-rw-r--r--  src/runtime/CPP/CPPScheduler.cpp | 2
-rw-r--r--  src/runtime/CPP/SingleThreadScheduler.cpp | 7
-rw-r--r--  src/runtime/IScheduler.cpp | 159
-rw-r--r--  src/runtime/NEON/functions/NEConvolutionLayer.cpp | 150
-rw-r--r--  src/runtime/NEON/functions/NEGEMM.cpp | 152
-rw-r--r--  support/ToolchainSupport.h | 17
-rw-r--r--  tests/networks/AlexNetNetwork.h | 158
-rw-r--r--  tests/validation/fixtures/ConvolutionLayerFixture.h | 10
35 files changed, 2827 insertions(+), 225 deletions(-)
diff --git a/SConscript b/SConscript
index 00c366e890..26bcc4a6ec 100644
--- a/SConscript
+++ b/SConscript
@@ -169,6 +169,9 @@ if env['neon']:
core_files += Glob('src/core/NEON/*.cpp')
core_files += Glob('src/core/NEON/kernels/*.cpp')
+ if "arm64-v8" in env['arch']:
+ core_files += Glob('src/core/NEON/kernels/arm64/*.cpp')
+
runtime_files += Glob('src/runtime/NEON/*.cpp')
runtime_files += Glob('src/runtime/NEON/functions/*.cpp')
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index adad00f8c4..cff49db0ac 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -48,11 +48,18 @@ enum class CPUTarget
A75_DOT = (A75 | DOT),
};
+struct CPUInfo
+{
+ CPUTarget CPU{ CPUTarget::INTRINSICS };
+ int L1_size{ 0 };
+ int L2_size{ 0 };
+};
+
struct ThreadInfo
{
- int thread_id{ 0 };
- int num_threads{ 1 };
- CPUTarget cpu{ CPUTarget::INTRINSICS };
+ int thread_id{ 0 };
+ int num_threads{ 1 };
+ CPUInfo cpu_info{};
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_CPP_TYPES_H__ */
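The hunk above replaces the bare CPUTarget field of ThreadInfo with a CPUInfo block that also carries cache sizes. As a minimal sketch (not part of the patch) of how a caller might populate it — the A53 target and the cache sizes below are assumed values:

    // Hypothetical example: describe the CPU a worker thread runs on so that
    // kernels can pick a tuned code path through ThreadInfo::cpu_info.
    arm_compute::CPUInfo cpu{};
    cpu.CPU     = arm_compute::CPUTarget::A53; // illustrative target
    cpu.L1_size = 32 * 1024;                   // assumed cache sizes
    cpu.L2_size = 512 * 1024;

    arm_compute::ThreadInfo info{};
    info.thread_id   = 0;
    info.num_threads = 1;
    info.cpu_info    = cpu;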
diff --git a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
new file mode 100644
index 0000000000..e298bfdebd
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMASSEMBLYBASE_H__
+#define __ARM_COMPUTE_NEGEMMASSEMBLYBASE_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Base class for assembly NEON kernels which multiply two input matrices "A" and "B". */
+class NEGEMMAssemblyBaseKernel : public INEKernel
+{
+public:
+ /** Constructor */
+ NEGEMMAssemblyBaseKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _transform_0(true), _transform_1(true)
+ {
+ }
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMAssemblyBaseKernel(const NEGEMMAssemblyBaseKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMAssemblyBaseKernel &operator=(const NEGEMMAssemblyBaseKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMAssemblyBaseKernel(NEGEMMAssemblyBaseKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMAssemblyBaseKernel &operator=(NEGEMMAssemblyBaseKernel &&) = default;
+
+ virtual ~NEGEMMAssemblyBaseKernel() = default;
+
+ /** Initialise the kernel's input and output.
+ *
+ * The computed function is C = alpha * A * B + beta * C.
+ *
+ * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F32
+ * @param[in] input1 Input tensor containing the Matrix B. Data types supported: same as @p input0
+ * @param[in,out] output Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0.
+ * @param[out] workspace Space for intermediate results.
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of the accumulation.
+ * @param[in] transform_0 If true the kernel will transform @p input0 prior to the multiplication.
+ * @param[in] transform_1 If true the kernel will transform @p input1 prior to the multiplication.
+ */
+ void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool transform_0 = true, bool transform_1 = true)
+ {
+ internal_configure(input0, input1, output, workspace, alpha, beta, transform_0, transform_1);
+ }
+
+protected:
+ virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) = 0;
+
+ const ITensor *_input0;
+ const ITensor *_input1;
+ ITensor *_output;
+ ITensor *_workspace;
+ float _alpha;
+ float _beta;
+ bool _transform_0;
+ bool _transform_1;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEGEMMASSEMBLYBASE_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h
new file mode 100644
index 0000000000..77431d2bc8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__
+#define __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__
+
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
+class NEGEMMAArch64Kernel : public NEGEMMAssemblyBaseKernel
+{
+public:
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+protected:
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__*/
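A sketch of how the two new kernels are meant to be set up through the non-virtual configure() wrapper of the base class; the helper function and tensor names are hypothetical, and the tensors are assumed to be already-allocated F32 tensors of compatible shapes:

    #include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"

    // Hypothetical set-up of the AArch64 assembly GEMM kernel.
    void configure_assembly_gemm(const arm_compute::ITensor *a, const arm_compute::ITensor *b,
                                 arm_compute::ITensor *c, arm_compute::ITensor *workspace)
    {
        arm_compute::NEGEMMAArch64Kernel kernel;
        // configure() simply forwards its arguments to the derived class's
        // internal_configure() override (alpha = 1, beta = 0 here).
        kernel.configure(a, b, c, workspace, 1.0f, 0.0f);
    }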
diff --git a/arm_compute/core/NEON/kernels/assembly/asmlib.hpp b/arm_compute/core/NEON/kernels/assembly/asmlib.hpp
new file mode 100644
index 0000000000..fa1d6e37a9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/asmlib.hpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+// Macro to use in assembler to get a preload. Needed because of various
+// workarounds needed to get working preload behaviour.
+//
+// Code using these macros needs to clobber x20 and x21 as they might be
+// used by the workaround.
+
+#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n"
+#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n"
+#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
+#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
+
+#else
+
+#define ASM_PREFETCH(address) "PLD " address "\n"
+#define ASM_PREFETCHW(address) "PLDW " address "\n"
+
+#endif
+
+/*
+ * Do some prefetches.
+ */
+template <typename T>
+static inline void prefetch_6x(const T *pfp) {
+ __asm __volatile (
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ ASM_PREFETCH("[%[pfp], #128]")
+ ASM_PREFETCH("[%[pfp], #192]")
+ ASM_PREFETCH("[%[pfp], #256]")
+ ASM_PREFETCH("[%[pfp], #320]")
+ :
+ : [pfp] "r" (pfp)
+ : "memory"
+ );
+}
+
+template <typename T>
+static inline void prefetch_5x(const T *pfp) {
+ __asm __volatile (
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ ASM_PREFETCH("[%[pfp], #128]")
+ ASM_PREFETCH("[%[pfp], #192]")
+ ASM_PREFETCH("[%[pfp], #256]")
+ :
+ : [pfp] "r" (pfp)
+ : "memory"
+ );
+}
+
+template <typename T>
+static inline void prefetch_4x(const T *pfp) {
+ __asm __volatile (
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ ASM_PREFETCH("[%[pfp], #128]")
+ ASM_PREFETCH("[%[pfp], #192]")
+ :
+ : [pfp] "r" (pfp)
+ : "memory"
+ );
+}
+
+template <typename T>
+static inline void prefetch_3x(const T *pfp) {
+ __asm __volatile (
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ ASM_PREFETCH("[%[pfp], #128]")
+ :
+ : [pfp] "r" (pfp)
+ : "memory"
+ );
+}
+
+template <typename T>
+static inline void prefetch_2x(const T *pfp) {
+ __asm __volatile (
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ :
+ : [pfp] "r" (pfp)
+ : "memory"
+ );
+}
+
+template <typename T>
+static inline void prefetch_1x(const T *pfp) {
+ __asm __volatile (
+ ASM_PREFETCH("[%[pfp]]")
+ :
+ : [pfp] "r" (pfp)
+ : "memory"
+ );
+}
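A small usage sketch for the helpers above (the buffer is illustrative); each prefetch_Nx() issues N ASM_PREFETCH operations at consecutive 64-byte offsets from the given address:

    // Hypothetical example: warm the cache ahead of data we are about to read.
    float buffer[256] = {};
    prefetch_6x(buffer);       // first 6 cache lines (384 bytes)
    prefetch_2x(buffer + 96);  // 96 floats = 384 bytes in, 2 more lines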
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
new file mode 100644
index 0000000000..00974436ff
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+// Abstract class for a GEMM function
+template<typename To, typename Tr>
+class GemmCommon {
+public:
+ virtual size_t get_working_size() const = 0;
+ virtual void execute(const To *, const int, const To *, const int, Tr *, const int, const Tr, const Tr, void *working_space = NULL) const = 0;
+ virtual ~GemmCommon() { }
+};
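A sketch of how a caller might drive any concrete GemmCommon implementation; the function name is hypothetical, and the extra 16 bytes of scratch are an assumption to leave room for the 16-byte alignment fix-up that implementations such as GemmInterleaved (next file) apply internally:

    #include <cstdint>
    #include <vector>

    // Hypothetical driver: compute C = alpha * A * B + beta * C with a scratch
    // buffer sized by the implementation itself.
    void run_gemm(const GemmCommon<float, float> &gemm,
                  const float *A, int lda, const float *B, int ldb,
                  float *C, int ldc, float alpha, float beta)
    {
        std::vector<int8_t> workspace(gemm.get_working_size() + 16);
        gemm.execute(A, lda, B, ldb, C, ldc, alpha, beta, workspace.data());
    }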
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
new file mode 100644
index 0000000000..a186d88355
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "gemm_common.hpp"
+#include "profiler.hpp"
+#include "transform.hpp"
+#include "mergeresults.hpp"
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND 64
+#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation interleaves the source matrices in blocks - good for
+// larger matrices.
+template<typename strategy, typename To, typename Tr>
+class GemmInterleaved : public GemmCommon<To, Tr> {
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ const unsigned int M;
+ const unsigned int N;
+ const unsigned int K;
+
+ const bool trA;
+ const bool trB;
+
+ const strategy strat;
+
+ unsigned int k_block = 0;
+ unsigned int x_block = 0;
+ unsigned int Mround = 0;
+
+ size_t get_a_working_size() const {
+ return ROUND_UP(sizeof(Toi) * k_block * Mround);
+ }
+
+ size_t get_b_working_size() const {
+ return ROUND_UP(sizeof(Toi) * x_block * k_block);
+ }
+
+ size_t get_c_working_size() const {
+ return ROUND_UP(sizeof(Tri) * x_block * strat.out_height);
+ }
+
+public:
+ size_t get_working_size() const override {
+ return get_a_working_size() + get_b_working_size() + get_c_working_size();
+ }
+
+ GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB) : M(M), N(N), K(K), trA(trA), trB(trB), strat(ci) {
+ const unsigned int L1_size = ci->L1_size;
+ const unsigned int L2_size = ci->L2_size;
+
+ // Work out blocking parameters
+ // k_block: Each iteration will consume (out_width + out_height)
+ // operands - so how many iterations will fill the L1?
+ k_block = L1_size / (sizeof(Toi) * (strat.out_width + strat.out_height));
+
+ // Needs to be a multiple of the K unroll level.
+ k_block /= strat.k_unroll;
+ k_block *= strat.k_unroll;
+
+ // Now tune to the presented problem size; this is how many blocks we need.
+ int num_k_blocks = (K + (k_block - 1)) / k_block;
+
+ // So divide the space equally into that many blocks.
+ k_block = (K + num_k_blocks - 1) / num_k_blocks;
+
+ // And round UP to the K unroll level required.
+ k_block = (k_block + strat.k_unroll - 1) / strat.k_unroll;
+ k_block *= strat.k_unroll;
+
+ // x_block: Work out how many rows (of length k_block) will fit in the L2
+ x_block = L2_size / (sizeof(Toi) * k_block);
+
+ // Needs to be a multiple of the kernel output width.
+ x_block /= strat.out_width;
+ x_block *= strat.out_width;
+
+ // And tune to the presented problem size.
+ int num_x_blocks = (N + (x_block - 1)) / x_block;
+ x_block = (N + num_x_blocks - 1) / num_x_blocks;
+
+ x_block = (x_block + strat.out_width - 1) / strat.out_width;
+ x_block *= strat.out_width;
+
+ // Work out the rounded size of M - needed for some buffers.
+ Mround = (M + (strat.out_height - 1)) / strat.out_height;
+ Mround *= strat.out_height;
+ }
+
+ // Actually execute the GEMM.
+ void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
+ profiler prof;
+
+ int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
+ intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space_bytes);
+ size_t diff = 0;
+
+ if (working_space_int & 0xF) {
+ diff = 0x10 - (working_space_int & 0xF);
+ }
+
+ // TODO: Multithreaded implementations could share the burden of transforming these blocks.
+ Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + diff);
+ Toi * const b_panel = reinterpret_cast<Toi *>(working_space_bytes + get_a_working_size() + diff);
+ Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + get_a_working_size() + get_b_working_size() + diff);
+
+ for (unsigned int k0=0; k0<K; k0 += k_block) {
+ unsigned int kmax = k0 + k_block;
+ if (kmax > K) kmax = K;
+
+ // Figure out how many "K" the kernel will actually process.
+ int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll;
+ kern_k *= strat.k_unroll;
+
+ prof(PROFILE_PREPA, [&](void) {
+ if (trA ^ strategy::A_transpose) {
+ Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, A, lda, 0, M, k0, kmax);
+ } else {
+ Transform<strategy::A_interleave, strategy::A_block, false>(a_panel, A, lda, 0, M, k0, kmax);
+ }
+ });
+
+ for (unsigned int x0=0; x0<N; x0 += x_block) {
+ unsigned int xmax = x0 + x_block;
+ if (xmax > N) xmax = N;
+
+ int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width;
+
+ prof(PROFILE_PREPB, [&](void) {
+ if (trB ^ strategy::B_transpose) {
+ Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, B, ldb, x0, xmax, k0, kmax);
+ } else {
+ Transform<strategy::B_interleave, strategy::B_block, false>(b_panel, B, ldb, x0, xmax, k0, kmax);
+ }
+ });
+
+ for (unsigned int y=0; y<M; y+=strat.out_height) {
+ unsigned int ymax = y + strat.out_height;
+ if (ymax > M) ymax = M;
+
+ prof(PROFILE_KERNEL, [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); });
+ prof(PROFILE_MERGE, [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); });
+ }
+ }
+ }
+ }
+};
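To make the blocking arithmetic in the constructor concrete, here is one illustrative evaluation of its formulas, assuming the sgemm_12x8 strategy (out_width 12, out_height 8, k_unroll 1, 4-byte operands), a 32 KB L1, a 512 KB L2 and M = N = K = 1000; none of these numbers come from the patch itself:

    // k_block      = 32768 / (4 * (12 + 8))  = 409   operands that fit one L1 pass
    // num_k_blocks = ceil(1000 / 409)         = 3
    // k_block      = ceil(1000 / 3)           = 334   (already a multiple of k_unroll = 1)
    // x_block      = 524288 / (4 * 334)       = 392 -> 384 (rounded down to a multiple of 12)
    // num_x_blocks = ceil(1000 / 384)         = 3
    // x_block      = ceil(1000 / 3)           = 334 -> 336 (rounded up to a multiple of 12)
    // Mround       = ceil(1000 / 8) * 8       = 1000
    // get_working_size() = ROUND_UP(4*334*1000) + ROUND_UP(4*336*334) + ROUND_UP(4*336*8)
    //                    = 1336000 + 448896 + 10752 bytes (about 1.7 MB of scratch)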
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp
new file mode 100644
index 0000000000..e229e215ef
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+// Actual kernel implementations
+#include "a64_sgemm_12x8/generic.hpp"
+#include "a64_sgemm_12x8/a53.hpp"
+
+// 12x8 SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class sgemm_12x8 {
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 8;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
+
+ /* Same for B input */
+ static const int B_interleave = 12;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 12;
+ static const int out_height = 8;
+ static const int k_unroll = 1;
+
+ kern_type kernel{nullptr};
+
+ sgemm_12x8(const CPUInfo *ci) {
+ kernel = a64_sgemm_asimd_12x8;
+ if (ci->CPU == CPUTarget::A53) {
+ kernel = a64_sgemm_asimd_12x8_a53;
+ }
+ }
+};
+
+#endif // __aarch64__
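A sketch of how this strategy plugs into the GemmInterleaved wrapper from the previous file; the function, matrix arguments and cache sizes are illustrative assumptions, not code from the patch:

    #include <cstdint>
    #include <vector>

    // Hypothetical single-threaded SGEMM using the 12x8 strategy.
    void sgemm_example(const float *A, int lda, const float *B, int ldb, float *C, int ldc,
                       unsigned int M, unsigned int N, unsigned int K)
    {
        CPUInfo ci;
        ci.CPU     = CPUTarget::A53;  // selects a64_sgemm_asimd_12x8_a53 in the strategy ctor
        ci.L1_size = 32 * 1024;       // assumed cache sizes
        ci.L2_size = 512 * 1024;

        GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);

        // +16 leaves room for the internal 16-byte alignment fix-up in execute().
        std::vector<int8_t> workspace(gemm.get_working_size() + 16);
        gemm.execute(A, lda, B, ldb, C, ldc, 1.0f, 0.0f, workspace.data());
    }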
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
new file mode 100644
index 0000000000..e58ce66825
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K+1)/2) - 1;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
+ __asm __volatile (
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "nop\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "nop\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+ "ins %[a0].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+ "ins %[a1].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+
+ "nop\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+
+ "nop\n"
+ "nop\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+ "nop\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+
+ "nop\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+
+ "bne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+ "cbnz %[oddk], 2f\n"
+
+ // Detached final iteration. (even K)
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+ "nop\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "nop\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
+
+ // Detached final iteration. (odd K)
+ "2:\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+ );
+ }
+ }
+}
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
new file mode 100644
index 0000000000..082c200646
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <arm_neon.h>
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump=0, long int block_jump=0) {
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K+1)/2) - 1;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
+ __asm __volatile (
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Loop proper
+ "1:\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[block_jump]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+ );
+ }
+ }
+}
+
+inline void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0);
+}
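The layout comments at the top of this file imply the following bookkeeping for the panels the kernel walks; this is an illustrative summary, not code from the patch:

    // For 'ablocks' A blocks and 'bblocks' B blocks of depth K (in floats):
    //   Apanel: ablocks * 8  * K floats; each 8xK block is re-read for every B block
    //   Bpanel: bblocks * 12 * K floats; walked once per A block
    //   Cpanel: ablocks * bblocks * 96 floats, one row-major 12x8 tile per (yb, xb) pair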
diff --git a/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp b/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp
new file mode 100644
index 0000000000..6731480fca
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+template<unsigned int width, unsigned int height, typename Tin, typename Tout>
+void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta) {
+ int full_y_blocks = (ymax - y0) / height;
+ int y_remainder = (ymax - y0) % height;
+ int y_blocks = full_y_blocks + (y_remainder ? 1 : 0);
+
+ int full_x_blocks = (xmax - x0) / width;
+ int x_remainder = (xmax - x0) % width;
+ int x_blocks = full_x_blocks + (x_remainder ? 1 : 0);
+
+ for (int y_block = 0; y_block < y_blocks; y_block++) {
+ int ybase = y0 + (y_block * height);
+
+ int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
+
+ for (int x_block = 0; x_block < x_blocks; x_block++) {
+ int xbase = x0 + (x_block * width);
+
+ int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;
+
+ for (int row=0; row < fill_rows; row++) {
+ for (int col=0; col < fill_cols; col++) {
+ Tout &p = out[(ybase + row) * ldc + xbase + col];
+
+ p = (p * alpha) + (beta * in[row * width + col]);
+ }
+ }
+
+ in += (width * height);
+ }
+ }
+}
+
+#include "merges/list.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
new file mode 100644
index 0000000000..f2c5fd86b9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "../asmlib.hpp"
+
+template<>
+inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) {
+ const float *inptr = in;
+ prefetch_6x(inptr);
+ prefetch_6x(inptr + 96);
+
+ float32x4_t av = vdupq_n_f32(alpha);
+ float32x4_t bv = vdupq_n_f32(beta);
+
+ for (int y=y0; y<ymax; y+=8) {
+ float *outptr0 = out + (y * ldout) + x0;
+ float *outptr1 = outptr0 + ldout;
+ float *outptr2 = outptr1 + ldout;
+ float *outptr3 = outptr2 + ldout;
+ float *outptr4 = outptr3 + ldout;
+ float *outptr5 = outptr4 + ldout;
+ float *outptr6 = outptr5 + ldout;
+ float *outptr7 = outptr6 + ldout;
+
+ prefetch_2x(outptr0);
+ prefetch_2x(outptr1);
+ prefetch_2x(outptr2);
+ prefetch_2x(outptr3);
+ prefetch_2x(outptr4);
+ prefetch_2x(outptr5);
+ prefetch_2x(outptr6);
+ prefetch_2x(outptr7);
+
+ for (int i=x0; i<xmax; i+=12) {
+ float dummyres[12];
+
+ /* Make sure we throw away results if Y isn't a multiple of 8.
+ * We do this by pointing the result pointer at a dummy buffer
+ * we later discard. */
+ if ((y+7) >= ymax) {
+ switch ((y + 7) - ymax) {
+ case 6:
+ outptr1 = dummyres;
+ case 5:
+ outptr2 = dummyres;
+ case 4:
+ outptr3 = dummyres;
+ case 3:
+ outptr4 = dummyres;
+ case 2:
+ outptr5 = dummyres;
+ case 1:
+ outptr6 = dummyres;
+ case 0:
+ outptr7 = dummyres;
+ default:
+ break;
+ }
+ }
+
+ /* For ragged X, manually copy over the valid results. */
+ if ((i+11) >= xmax) {
+ for (int xi=0; xi<12; xi++) {
+ if ((i+xi) < xmax) {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+ outptr7++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ // Rows 0-1
+ "LDP q16, q17, [%[outptr0]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr0], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr1]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr1], #32]\n"
+ ASM_PREFETCH("[%[inptr], #768]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr]]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #32]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #64]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[inptr], #832]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr0]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr0]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[inptr], #896]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr1]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr1]], #16\n"
+
+ // Rows 2-3
+ "LDP q16, q17, [%[outptr2]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr2], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr3]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr3], #32]\n"
+ ASM_PREFETCH("[%[inptr], #960]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #96]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #128]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #160]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[inptr], #1024]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr2]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr2]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[inptr], #1088]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr3]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr3]], #16\n"
+
+ // Rows 4-5
+ ASM_PREFETCH("[%[outptr0], #80]")
+ "LDP q16, q17, [%[outptr4]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr4], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr5]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr5], #32]\n"
+ ASM_PREFETCH("[%[outptr1], #80]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #192]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #224]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #256]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[outptr2], #80]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr4]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr4]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[outptr3], #80]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr5]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr5]], #16\n"
+
+ // Rows 6-7
+ ASM_PREFETCH("[%[outptr4], #80]")
+ "LDP q16, q17, [%[outptr6]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr6], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr7]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr7], #32]\n"
+ ASM_PREFETCH("[%[outptr5], #80]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #288]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #320]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #352]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[outptr6], #128]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr6]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr6]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[outptr7], #128]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr7]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr7]], #16\n"
+ "ADD %[inptr], %[inptr], #384\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+ [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [av] "w" (av), [bv] "w" (bv)
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q16", "q17", "q18", "q19", "q20", "q21"
+ );
+ }
+ }
+ }
+}
+
+#endif // __aarch64__
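Editor's note: for readers not fluent in A64 assembly, the block above performs the same alpha/beta update as the ragged-edge scalar path, just over a full 12x8 tile at once. A minimal scalar sketch of that tile update (reference only, not part of the patch; the helper name is hypothetical):

// Scalar reference of the 12x8 merge: out[r][c] = alpha * in[r*12 + c] + beta * out[r][c],
// where 'in' holds one 12x8 tile in the row-major order produced by the GEMM kernel.
static inline void merge_tile_12x8_reference(float *out, int ldout, const float *in,
                                             float alpha, float beta) {
    for (int r = 0; r < 8; r++) {
        for (int c = 0; c < 12; c++) {
            out[r * ldout + c] = alpha * in[r * 12 + c] + beta * out[r * ldout + c];
        }
    }
}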
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp b/arm_compute/core/NEON/kernels/assembly/merges/list.hpp
new file mode 100644
index 0000000000..4f23333ef1
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/merges/list.hpp
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "a64_merge_float_12x8.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/profiler.hpp b/arm_compute/core/NEON/kernels/assembly/profiler.hpp
new file mode 100644
index 0000000000..d2f8ba923a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/profiler.hpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef CYCLE_PROFILING
+
+#include "../perf.h"
+
+class profiler {
+private:
+ static const int maxevents = 10000;
+ unsigned long times[maxevents];
+ int events[maxevents];
+ int currentevent;
+ int countfd;
+
+public:
+ profiler() {
+ currentevent=0;
+ countfd=open_cycle_counter();
+ }
+
+ ~profiler() {
+ close(countfd);
+ int tots[5];
+ unsigned long counts[5];
+ const char * descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
+
+ for (int i=1; i<5; i++) {
+ tots[i] = 0;
+ counts[i] = 0;
+ }
+
+ printf("Profiled events:\n");
+ for (int i=0; i<currentevent; i++) {
+ printf("%10s: %ld\n", descs[events[i]-1], times[i]);
+ tots[events[i]]++;
+ counts[events[i]] += times[i];
+ }
+
+ printf("%20s %9s %9s %9s\n", "", "Events", "Total", "Average");
+ for (int i=1; i<5; i++) {
+ printf("%20s: %9d %9ld %9ld\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i]);
+ }
+ }
+
+ template <typename T>
+ void operator() (int i, T func) {
+ if (currentevent==maxevents) {
+ func();
+ } else {
+ start_counter(countfd);
+ func();
+ long long cycs = stop_counter(countfd);
+ events[currentevent] = i;
+ times[currentevent++] = cycs;
+ }
+ }
+};
+
+#else
+
+class profiler {
+public:
+ template <typename T>
+ void operator() (int i, T func) {
+ func();
+ }
+};
+
+#endif
+
+#define PROFILE_PREPA 1
+#define PROFILE_PREPB 2
+#define PROFILE_KERNEL 3
+#define PROFILE_MERGE 4
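Editor's note: a short usage sketch (fragment, assumed usage) for the profiler class above. Each GEMM stage is passed as a callable tagged with one of the PROFILE_* IDs; the call operator only times it when CYCLE_PROFILING is defined and is a plain passthrough otherwise. The lambda bodies below are placeholders.

// Hypothetical fragment: wrap each GEMM stage in the profiler's call operator.
profiler prof;
prof(PROFILE_PREPA,  [&]() { /* interleave the A panel        */ });
prof(PROFILE_PREPB,  [&]() { /* transpose/transform the B panel */ });
prof(PROFILE_KERNEL, [&]() { /* run the 12x8 inner kernel      */ });
prof(PROFILE_MERGE,  [&]() { /* merge results back into C      */ });
// When CYCLE_PROFILING is defined, the destructor prints per-stage totals.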
diff --git a/arm_compute/core/NEON/kernels/assembly/transform.hpp b/arm_compute/core/NEON/kernels/assembly/transform.hpp
new file mode 100644
index 0000000000..717506f54c
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transform.hpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/*
+ * Generic transform.
+ *
+ * Assuming the untransposed case, this works by first reading <BlockBy>
+ * consecutive values from the first input row. The same number of values
+ * is then read from each of the next <IntBy-1> rows. The walk then returns
+ * to the first input row and repeats with the next <BlockBy> columns.
+ *
+ * The implementation needs to cope with the work requested in either
+ * dimension not actually being a multiple of the block sizes.
+ */
+template <unsigned IntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize>
+struct TransformImpl {
+ template <typename TOut, typename TIn>
+ static void Transform(TOut* out, const TIn* const in, const int stride,
+ const int y0, const int ymax, const int x0, const int xmax) {
+ const int n_whole_y_blocks = (ymax - y0) / IntBy;
+ const int y_remainders = (ymax - y0) % IntBy;
+ const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
+
+ const int n_whole_x_blocks = (xmax - x0) / BlockBy;
+ const int x_remainders = (xmax - x0) % BlockBy;
+ const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
+
+ // "Y" loop: advance down the rows of the source IntBy rows at a time.
+ // Set up fill_rows to show the number rows to copy from, and blank_rows
+ // for the number of blank rows to add.
+ for (int y_block=0 ; y_block < n_y_blocks; y_block++) {
+ int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
+ int blank_rows = IntBy - fill_rows;
+
+ int y_base = y0 + (y_block * IntBy);
+
+ // So now advance along this block of rows, BlockBy columns at a time.
+ for (int x_block=0 ; x_block < n_x_blocks; x_block++) {
+ int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
+ int blank_cols = BlockBy - fill_cols;
+
+ int x_base = x0 + (x_block * BlockBy);
+
+ for (int row = 0; row < fill_rows; row++) {
+ for (int col = 0; col < fill_cols; col++) {
+ // In-range copy. If it's transposed, we reverse the sense of rows and columns here.
+ if (Transposed) {
+ *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
+ } else {
+ *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
+ }
+ }
+ // "col" tail - row is in range but column is out of range.
+ for (int col=0; col < blank_cols; col++) {
+ *out++ = static_cast<TOut>(0);
+ }
+ }
+ // "row" tail - row is out of range so fill with zeros always.
+ for (int row = 0; row < blank_rows; row++) {
+ for (int col=0; col < (fill_cols + blank_cols); col++) {
+ *out++ = static_cast<TOut>(0);
+ }
+ }
+ }
+ }
+ }
+
+ template <typename T>
+ static inline void Transform(T* out, const T* const in, const int stride,
+ const int k0, const int kmax, const int x0, const int xmax) {
+ Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
+ }
+};
+
+/*****************************************************************************/
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, typename TOut, typename TIn>
+void Transform(
+ TOut* out, const TIn* const in, const int stride,
+ const int k0, const int kmax, const int x0, const int xmax
+) {
+ // Redirect to a specialised implementation predicated on argument size.
+ TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn)>::Transform(
+ out, in, stride, k0, kmax, x0, xmax
+ );
+}
+/*****************************************************************************/
+
+#include "transforms/list.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
new file mode 100644
index 0000000000..6317424598
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "../asmlib.hpp"
+
+#include <arm_neon.h>
+
+template<>
+template<typename T>
+void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+ uint32_t *outptr = (uint32_t *)out;
+ const uint32_t *inptr = (uint32_t *)in;
+
+    uint32_t zerobuff[8] = { 0 };
+
+ for (int y=y0; y<ymax; y+=8) {
+ const uint32_t *inptr0 = inptr + y * ldin + k0;
+ const uint32_t *inptr1 = inptr0 + ldin;
+ const uint32_t *inptr2 = inptr1 + ldin;
+ const uint32_t *inptr3 = inptr2 + ldin;
+ const uint32_t *inptr4 = inptr3 + ldin;
+ const uint32_t *inptr5 = inptr4 + ldin;
+ const uint32_t *inptr6 = inptr5 + ldin;
+ const uint32_t *inptr7 = inptr6 + ldin;
+
+ prefetch_2x(inptr0);
+ prefetch_2x(inptr1);
+ prefetch_2x(inptr2);
+ prefetch_2x(inptr3);
+ prefetch_2x(inptr4);
+ prefetch_2x(inptr5);
+ prefetch_2x(inptr6);
+ prefetch_2x(inptr7);
+
+ int x=(kmax-k0);
+ for (;x>7;x-=8) {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ if ((y + 7) >= ymax) {
+ switch ((y + 7) - ymax) {
+ /* Everything falls through in here */
+ case 6:
+ inptr1 = zerobuff;
+ case 5:
+ inptr2 = zerobuff;
+ case 4:
+ inptr3 = zerobuff;
+ case 3:
+ inptr4 = zerobuff;
+ case 2:
+ inptr5 = zerobuff;
+ case 1:
+ inptr6 = zerobuff;
+ case 0:
+ inptr7 = zerobuff;
+ default:
+ break;
+ }
+ }
+
+ __asm __volatile (
+ // Load up 8 elements (2 vectors) from each of 8 sources.
+ "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
+ "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
+ "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
+ "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
+ ASM_PREFETCH("[%[inptr0], #128]")
+ "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
+ "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
+ "LDP q8, q9, [%[inptr4]], #32\n"
+ "LDP q10, q11, [%[inptr5]], #32\n"
+ ASM_PREFETCH("[%[inptr1], #128]")
+ "LDP q12, q13, [%[inptr6]], #32\n"
+ "ZIP1 v18.4s, v8.4s, v12.4s\n"
+ "LDP q14, q15, [%[inptr7]], #32\n"
+ "ZIP1 v19.4s, v10.4s, v14.4s\n"
+
+ ASM_PREFETCH("[%[inptr2], #128]")
+ "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+ ASM_PREFETCH("[%[inptr3], #128]")
+
+ "ZIP2 v16.4s, v0.4s, v4.4s\n"
+ "ZIP2 v17.4s, v2.4s, v6.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
+
+ "ZIP2 v18.4s, v8.4s, v12.4s\n"
+ ASM_PREFETCH("[%[inptr4], #128]")
+ "ZIP2 v19.4s, v10.4s, v14.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ ASM_PREFETCH("[%[inptr5], #128]")
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+
+ "ZIP1 v16.4s, v1.4s, v5.4s\n"
+ "ZIP1 v17.4s, v3.4s, v7.4s\n"
+ ASM_PREFETCH("[%[inptr6], #128]")
+ "STP q20, q21, [%[outptr]], #32\n" // Third element
+
+ "ZIP1 v18.4s, v9.4s, v13.4s\n"
+ "ZIP1 v19.4s, v11.4s, v15.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Fourth element
+ ASM_PREFETCH("[%[inptr7], #128]")
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+
+ "ZIP2 v16.4s, v1.4s, v5.4s\n"
+ "ZIP2 v17.4s, v3.4s, v7.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Fifth element
+
+ "ZIP2 v18.4s, v9.4s, v13.4s\n"
+ "ZIP2 v19.4s, v11.4s, v15.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Sixth element
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Seventh element
+
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Eighth element
+ : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+ [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+ );
+ }
+
+ for (;x>0;x--) {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ *outptr++ = *inptr6++;
+ *outptr++ = *inptr7++;
+ }
+ }
+}
+
+#endif // __aarch64__
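Editor's note: stripping away the NEON details, the transform above is an 8-way row interleave: element k of rows 0..7 is emitted consecutively before moving on to element k+1. A scalar sketch for a single band of 8 full rows (reference only; the helper name is hypothetical):

#include <cstdint>

// Output order for one 8-row band: in[0][k], in[1][k], ..., in[7][k], then k+1, ...
static inline void interleave_8way_reference(uint32_t *out, const uint32_t *in,
                                             int ldin, int k0, int kmax) {
    for (int k = k0; k < kmax; k++) {
        for (int row = 0; row < 8; row++) {
            *out++ = in[row * ldin + k];
        }
    }
}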
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
new file mode 100644
index 0000000000..3cf6b41ffa
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+//#include "a32_interleave_6way_32bit.hpp"
+//#include "a32_transpose_interleave_8way_32bit.hpp"
+//#include "a64_interleave_8way_16bit.hpp"
+#include "a64_interleave_8way_32bit.hpp"
+//#include "a64_interleave_8way_half_to_float.hpp"
+//#include "a64_transpose_interleave_12way_16bit.hpp"
+//#include "a64_transpose_interleave_12way_half_to_float.hpp"
+//#include "a64_transpose_interleave_24way_16bit.hpp"
+#include "transpose_interleave_common.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp
new file mode 100644
index 0000000000..882da9c831
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+template <unsigned int IntBy, typename TIn, typename TOut>
+struct TransposeInterleaveCommon {
+ // Override the moveblock_1xY methods to improve performance
+ static inline void moveblock_1x1(const TIn *&in0, TOut *out) {
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in0++);
+ }
+ }
+
+ static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out) {
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in0++);
+ }
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in1++);
+ }
+ }
+
+ static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) {
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in0++);
+ }
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in1++);
+ }
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in2++);
+ }
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in3++);
+ }
+ }
+
+ static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
+ const auto ldin = stride;
+
+ TOut *outarray = out;
+ const TIn *inarray = in;
+ TOut *outptr_base = outarray;
+ const TIn *inptr_base = inarray + x0 + (k0 * ldin);
+ int ldout = (kmax - k0) * IntBy;
+
+ int k=(kmax-k0);
+ for ( ; k>3; k-=4) {
+ TOut *outptr = outptr_base;
+ const TIn *inptr = inptr_base;
+ const TIn *inptr1 = inptr + ldin;
+ const TIn *inptr2 = inptr1 + ldin;
+ const TIn *inptr3 = inptr2 + ldin;
+
+ prefetch_3x(inptr);
+ prefetch_3x(inptr1);
+ prefetch_3x(inptr2);
+ prefetch_3x(inptr3);
+
+ outptr_base += IntBy * 4;
+ inptr_base += ldin * 4;
+
+ for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
+ moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
+ outptr += ldout;
+ }
+ }
+
+ if (k) {
+ TOut *outptr = outptr_base;
+ const TIn *inptr = inptr_base;
+ const TIn *inptr1 = inptr + ldin;
+ const TIn *inptr2 = inptr1 + ldin;
+
+ prefetch_3x(inptr);
+ prefetch_3x(inptr1);
+ prefetch_3x(inptr2);
+
+ for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
+ switch(k) {
+ case 3:
+ moveblock_1x2(inptr, inptr1, outptr);
+ moveblock_1x1(inptr2, outptr + IntBy * 2);
+ break;
+
+ case 2:
+ moveblock_1x2(inptr, inptr1, outptr);
+ break;
+
+ case 1:
+ moveblock_1x1(inptr, outptr);
+ break;
+ default:
+ break;
+ }
+
+ outptr += ldout;
+ }
+ }
+
+ // Cope with ragged X cases
+ const unsigned int overflow = (xmax - x0) % IntBy;
+ if (overflow) {
+ const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
+ TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout;
+
+ for (int k=(kmax-k0); k>0; k--) {
+ const TIn *inptr = inptr_base;
+ inptr_base += ldin;
+
+ for (unsigned int x=0; x < IntBy; x++) {
+ TOut val = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
+ *outptr++ = val;
+ }
+ }
+ }
+}
+};
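Editor's note: in plain terms, the Transform above is a transpose at the granularity of IntBy-wide column blocks: each block of IntBy source columns becomes one contiguous run in the output, ordered by source row. A scalar sketch covering the whole-block case (reference only; the ragged-X path above additionally zero-pads the last partial block):

// For column block b, source row k and element i within the block:
//   out[(b*(kmax-k0) + (k-k0)) * IntBy + i] = in[k*stride + x0 + b*IntBy + i]
template <unsigned int IntBy, typename TIn, typename TOut>
void transpose_interleave_reference(TOut *out, const TIn *in, int stride,
                                    int x0, int xmax, int k0, int kmax) {
    const int n_blocks = (xmax - x0) / IntBy; // whole blocks only, for clarity
    for (int b = 0; b < n_blocks; b++) {
        for (int k = k0; k < kmax; k++) {
            for (unsigned int i = 0; i < IntBy; i++) {
                out[(b * (kmax - k0) + (k - k0)) * IntBy + i] =
                    static_cast<TOut>(in[k * stride + x0 + b * IntBy + i]);
            }
        }
    }
}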
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index 6078abd06b..8918843c98 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -35,23 +35,23 @@ class IScheduler
{
public:
/** Default constructor. */
- IScheduler()
- : _target(CPUTarget::INTRINSICS)
- {
- }
+ IScheduler();
/** Destructor. */
virtual ~IScheduler() = default;
+
/** Sets the number of threads the scheduler will use to run the kernels.
*
* @param[in] num_threads If set to 0, then one thread per CPU core available on the system will be used, otherwise the number of threads specified.
*/
virtual void set_num_threads(unsigned int num_threads) = 0;
+
     /** Returns the number of threads that the SingleThreadScheduler has in its pool.
*
* @return Number of threads available in SingleThreadScheduler.
*/
virtual unsigned int num_threads() const = 0;
+
/** Runs the kernel in the same thread as the caller synchronously.
*
* @param[in] kernel Kernel to execute.
@@ -65,24 +65,14 @@ public:
*/
void set_target(CPUTarget target);
- /** Return the current CPU target.
+ /** Get CPU info.
*
- * @return Target CPU.
+ * @return CPU info.
*/
- CPUTarget target() const;
+ CPUInfo cpu_info() const;
protected:
- CPUTarget _target;
+ CPUInfo _info{};
};
-
-inline void IScheduler::set_target(CPUTarget target)
-{
- _target = target;
-}
-
-inline CPUTarget IScheduler::target() const
-{
- return _target;
-}
}
#endif /* __ARM_COMPUTE_ISCHEDULER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index 8e040b3055..893dfa0f9d 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -28,6 +28,7 @@
#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
@@ -37,6 +38,8 @@
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
+#include <memory>
+
namespace arm_compute
{
class ITensor;
@@ -59,6 +62,7 @@ public:
* Data types supported: Same as @p weights.
*/
void configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW);
+
// Inherited methods overridden:
void run() override;
@@ -82,6 +86,7 @@ class NEConvolutionLayer : public IFunction
public:
/** Constructor */
NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
/** Set the input and output tensors.
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -96,23 +101,26 @@ public:
* tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
*/
void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo());
+
// Inherited methods overridden:
void run() override;
private:
- MemoryGroup _memory_group;
- NEIm2ColKernel _input_im2col_kernel;
- NEGEMMInterleave4x4Kernel _input_interleave_kernel;
- NEConvolutionLayerReshapeWeights _reshape_weights;
- NEGEMMMatrixMultiplyKernel _mm_kernel;
- NECol2ImKernel _output_col2im_kernel;
- Tensor _input_im2col_reshaped;
- Tensor _input_interleaved_reshaped;
- Tensor _weights_reshaped;
- Tensor _gemm_output;
- bool _has_bias;
- bool _is_fully_connected_convolution;
- bool _are_weights_reshaped;
+ MemoryGroup _memory_group;
+ NEIm2ColKernel _input_im2col_kernel;
+ NEGEMMInterleave4x4Kernel _input_interleave_kernel;
+ NEConvolutionLayerReshapeWeights _reshape_weights;
+ NEGEMMMatrixMultiplyKernel _mm_kernel;
+ std::unique_ptr<NEGEMMAssemblyBaseKernel> _mm_optimised_kernel;
+ NECol2ImKernel _output_col2im_kernel;
+ Tensor _input_im2col_reshaped;
+ Tensor _input_interleaved_reshaped;
+ Tensor _weights_reshaped;
+ Tensor _gemm_output;
+ Tensor _workspace;
+ bool _has_bias;
+ bool _is_fully_connected_convolution;
+ bool _are_weights_reshaped;
};
}
#endif /* __ARM_COMPUTE_NECONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index b4b9e8be01..068e7c5ce8 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -25,6 +25,7 @@
#define __ARM_COMPUTE_NEGEMM_H__
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
@@ -51,6 +52,7 @@ class NEGEMM : public IFunction
public:
/** Constructor */
NEGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
/** Initialise the kernel's inputs, output
*
* @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -69,15 +71,17 @@ public:
void run() override;
private:
- MemoryGroup _memory_group;
- NEGEMMInterleave4x4Kernel _interleave_kernel;
- NEGEMMTranspose1xWKernel _transpose_kernel;
- NEGEMMMatrixMultiplyKernel _mm_kernel;
- NEGEMMMatrixAdditionKernel _ma_kernel;
- Tensor _tmp_a;
- Tensor _tmp_b;
- bool _run_vector_matrix_multiplication;
- bool _run_addition;
+ MemoryGroup _memory_group;
+ NEGEMMInterleave4x4Kernel _interleave_kernel;
+ NEGEMMTranspose1xWKernel _transpose_kernel;
+ NEGEMMMatrixMultiplyKernel _mm_kernel;
+ std::unique_ptr<NEGEMMAssemblyBaseKernel> _mm_optimised_kernel;
+ NEGEMMMatrixAdditionKernel _ma_kernel;
+ Tensor _tmp_a;
+ Tensor _tmp_b;
+ Tensor _workspace;
+ bool _run_vector_matrix_multiplication;
+ bool _run_addition;
};
}
#endif /*__ARM_COMPUTE_NEGEMM_H__ */
diff --git a/scripts/add_copyright.py b/scripts/add_copyright.py
index 0c5b8f00cf..a9d4929db8 100755
--- a/scripts/add_copyright.py
+++ b/scripts/add_copyright.py
@@ -71,7 +71,7 @@ for top in ['./arm_compute', './tests','./src','./examples','./utils/','./framew
content = fd.read()
_, extension = os.path.splitext(f)
- if extension in ['.cpp', '.h', '.inl', '.cl']:
+ if extension in ['.cpp', '.h', '.hpp', '.inl', '.cl']:
if not content.startswith('/*'):
add_cpp_copyright(path, content)
elif extension == '.py' or f in ['SConstruct', 'SConscript']:
diff --git a/scripts/check_bad_style.sh b/scripts/check_bad_style.sh
index 386824015f..ab2b1a016d 100755
--- a/scripts/check_bad_style.sh
+++ b/scripts/check_bad_style.sh
@@ -5,7 +5,7 @@ set -e
DIRECTORIES="./arm_compute ./src ./examples ./tests ./utils ./support"
-grep -HrnP "/\*\*$" $DIRECTORIES | tee bad_style.log
+grep -HrnP --exclude-dir=assembly "/\*\*$" $DIRECTORIES | tee bad_style.log
if (( `cat bad_style.log | wc -l` > 0 ))
then
echo ""
@@ -13,7 +13,7 @@ then
exit -1
fi
-grep -Hnr --exclude=Doxyfile "@brief" $DIRECTORIES | tee bad_style.log
+grep -Hnr --exclude-dir=assembly --exclude=Doxyfile "@brief" $DIRECTORIES | tee bad_style.log
if (( `cat bad_style.log | wc -l` > 0 ))
then
echo ""
@@ -21,7 +21,7 @@ then
exit -1
fi
-grep -HnRE "\buint " --exclude-dir=cl_kernels $DIRECTORIES | tee bad_style.log
+grep -HnRE --exclude-dir=assembly "\buint " --exclude-dir=cl_kernels $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -29,7 +29,7 @@ then
exit -1
fi
-grep -HnR "float32_t" $DIRECTORIES | tee bad_style.log
+grep -HnR --exclude-dir=assembly "float32_t" $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -37,7 +37,7 @@ then
exit -1
fi
-grep -Hnir "arm[_ ]\?cv" $DIRECTORIES | tee bad_style.log
+grep -Hnir --exclude-dir=assembly "arm[_ ]\?cv" $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -45,7 +45,7 @@ then
exit -1
fi
-grep -Hnir "#.*defined[^(]" $DIRECTORIES | tee bad_style.log
+grep -Hnir --exclude-dir=assembly "#.*defined[^(]" $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -53,7 +53,7 @@ then
exit -1
fi
-grep -Hnir "#else$\|#endif$" $DIRECTORIES | tee bad_style.log
+grep -Hnir --exclude-dir=assembly "#else$\|#endif$" $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -61,7 +61,7 @@ then
exit -1
fi
-grep -Hnir "ARM_COMPUTE_ENABLE_FP16" ./tests/validation/CL | tee bad_style.log
+grep -Hnir --exclude-dir=assembly "ARM_COMPUTE_ENABLE_FP16" ./tests/validation/CL | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index c3faf8736c..900413c90b 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -42,12 +42,16 @@ def filter_clang_tidy_lines( lines ):
for i in range(0, len(lines)):
line = lines[i]
+ if "/assembly/" in line:
+ continue
+
if "error:" in line:
if (("Utils.cpp" in line and "'arm_compute_version.embed' file not found" in line) or
("cl2.hpp" in line and "cast from pointer to smaller type 'cl_context_properties' (aka 'int') loses information" in line) or
("arm_fp16.h" in line) or
("omp.h" in line) or
- ("memory" in line and "cast from pointer to smaller type 'uintptr_t' (aka 'unsigned int') loses information" in line) or
+ ("cast from pointer to smaller type 'uintptr_t' (aka 'unsigned int') loses information" in line) or
+ ("cast from pointer to smaller type 'std::uintptr_t' (aka 'unsigned int') loses information" in line) or
("NEMath.inl" in line and "statement expression not allowed at file scope" in line) or
("Utils.h" in line and "no member named 'unmap' in 'arm_compute::Tensor'" in line) or
("Utils.h" in line and "no member named 'map' in 'arm_compute::Tensor'" in line) or
diff --git a/scripts/fix_code_formatting.sh b/scripts/fix_code_formatting.sh
index ccda38abab..a07d2615af 100755
--- a/scripts/fix_code_formatting.sh
+++ b/scripts/fix_code_formatting.sh
@@ -28,6 +28,11 @@ else
fi
for f in $files
do
+ if [[ $f == *"/assembly/"* ]]
+ then
+ continue
+ fi
+
sed -i 's/\t/ /g' $f
clang-format -i -style=file $f
astyle -n -q $ASTYLE_PARAMETERS $f
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index c76c39aa4b..ae5d456141 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -85,10 +85,10 @@ void gemm_interleave_16bit_elements(const ITensor *input, ITensor *output, const
const uint16x4x4_t data =
{
{
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 0 * in_stride)),
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 1 * in_stride)),
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 2 * in_stride)),
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 3 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 0 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 1 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 2 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 3 * in_stride)),
}
};
vst4_u16(reinterpret_cast<uint16_t *>(out.ptr()), data);
@@ -113,10 +113,10 @@ void gemm_interleave_32bit_elements(const ITensor *input, ITensor *output, const
const uint32x4x4_t data =
{
{
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 0 * in_stride)),
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 1 * in_stride)),
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 2 * in_stride)),
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 3 * in_stride))
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 0 * in_stride)),
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 1 * in_stride)),
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 2 * in_stride)),
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 3 * in_stride))
}
};
vst4q_u32(reinterpret_cast<uint32_t *>(out.ptr()), data);
diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
new file mode 100644
index 0000000000..d70524b6b8
--- /dev/null
+++ b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace arm_compute
+{
+void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _transform_0 = transform_0;
+ _transform_1 = transform_1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info());
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
+
+ const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
+ const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
+
+ update_window_and_padding(win,
+ AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
+ AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
+ output_access);
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMAArch64Kernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
+
+ const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
+
+ const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+ const int N = _output->info()->tensor_shape().x();
+ const int K = _input0->info()->tensor_shape().x();
+
+ // Only iterate over batches
+ Window win(window);
+ win.set(0, Window::Dimension(0, 1, 1));
+ win.set(1, Window::Dimension(0, 1, 1));
+
+ Iterator in0(_input0, window);
+ Iterator out(_output, window);
+
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ constexpr size_t alignment = 4096;
+ const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+ void *workspace = _workspace->buffer() + offset;
+ size_t workspace_size = _workspace->info()->total_size();
+
+ if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+ {
+ ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
+ reinterpret_cast<const float *>(in1_ptr), ldb,
+ reinterpret_cast<float *>(out.ptr()), ldc,
+ _alpha, _beta, workspace);
+ },
+ in0, out);
+}
+} // namespace arm_compute
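Editor's note: the workspace handling in run() gives each thread its own 4096-byte-aligned slice of the shared workspace tensor; slice offsets are spaced (working_size + alignment - 1) bytes apart so there is always room to round up to the boundary. The kernel uses the library's support::cpp11::align wrapper; the sketch below uses std::align purely for illustration and its names are hypothetical.

#include <cstddef>
#include <memory>

// Returns an aligned pointer into 'base' for 'thread_id', or nullptr if the
// buffer (sized (working_size + 4095) * num_threads) cannot accommodate it.
void *thread_workspace(void *base, size_t total_size, size_t working_size, unsigned int thread_id) {
    constexpr size_t alignment = 4096;
    const size_t offset = (working_size + alignment - 1) * thread_id;
    if (offset >= total_size) {
        return nullptr;
    }
    void  *ptr   = static_cast<char *>(base) + offset;
    size_t space = total_size - offset;
    // std::align bumps 'ptr' up to the next 4096-byte boundary if it fits.
    return std::align(alignment, working_size, ptr, space);
}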
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 77aa044144..a83a0bc0d3 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -178,7 +178,7 @@ void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
/** [Scheduler example] */
ThreadInfo info;
- info.cpu = _target;
+ info.cpu_info = _info;
const Window &max_window = kernel->window();
const unsigned int num_iterations = max_window.num_iterations(split_dimension);
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index 4e46a59fd0..c8285b43a7 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -27,8 +27,8 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
SingleThreadScheduler &SingleThreadScheduler::get()
{
static SingleThreadScheduler scheduler;
@@ -45,7 +45,7 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dime
{
ARM_COMPUTE_UNUSED(split_dimension);
ThreadInfo info;
- info.cpu = _target;
+ info.cpu_info = cpu_info();
kernel->run(kernel->window(), info);
}
@@ -53,3 +53,4 @@ unsigned int SingleThreadScheduler::num_threads() const
{
return 1;
}
+} // namespace arm_compute
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
new file mode 100644
index 0000000000..1745764bbb
--- /dev/null
+++ b/src/runtime/IScheduler.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/IScheduler.h"
+
+#include <array>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+namespace
+{
+unsigned int get_cpu_impl()
+{
+#ifndef BARE_METAL
+ int fd = open("/proc/cpuinfo", 0); // NOLINT
+ std::array<char, 1200> buff{ {} };
+ char *pos = nullptr;
+ char *end = nullptr;
+ bool foundid = false;
+
+ int cpu = sched_getcpu();
+
+ if(fd == -1)
+ {
+ return 0;
+ }
+
+ int charsread = read(fd, buff.data(), 1200);
+ pos = buff.data();
+ end = buff.data() + charsread;
+
+ close(fd);
+
+ /* So, to date I've encountered two formats for /proc/cpuinfo.
+ *
+ * One of them just lists processor : n for each processor (with no
+ * other info), then at the end lists part information for the current
+ * CPU.
+ *
+ * The other has an entire clause (including part number info) for each
+ * CPU in the system, with "processor : n" headers.
+ *
+ * We can cope with either of these formats by waiting to see
+ * "processor: n" (where n = our CPU ID), and then looking for the next
+ * "CPU part" field.
+ */
+ while(pos < end)
+ {
+ if(foundid && strncmp(pos, "CPU part", 8) == 0)
+ {
+ /* Found part number */
+ pos += 11;
+
+ for(char *ch = pos; ch < end; ch++)
+ {
+ if(*ch == '\n')
+ {
+ *ch = '\0';
+ break;
+ }
+ }
+
+ return strtoul(pos, nullptr, 0);
+ }
+
+ if(strncmp(pos, "processor", 9) == 0)
+ {
+ /* Found processor ID, see if it's ours. */
+ pos += 11;
+
+ for(char *ch = pos; ch < end; ch++)
+ {
+ if(*ch == '\n')
+ {
+ *ch = '\0';
+ break;
+ }
+ }
+
+ int num = strtol(pos, nullptr, 0);
+
+ if(num == cpu)
+ {
+ foundid = true;
+ }
+ }
+
+ while(pos < end)
+ {
+ char ch = *pos++;
+ if(ch == '\n' || ch == '\0')
+ {
+ break;
+ }
+ }
+ }
+#endif /* BARE_METAL */
+
+ return 0;
+}
+} // namespace
+
+namespace arm_compute
+{
+IScheduler::IScheduler()
+{
+ switch(get_cpu_impl())
+ {
+ case 0xd03:
+ _info.CPU = CPUTarget::A53;
+ break;
+ default:
+#ifdef __aarch64__
+ _info.CPU = CPUTarget::ARMV8;
+#else /* __aarch64__ */
+ _info.CPU = CPUTarget::INTRINSICS;
+#endif /* __aarch64__ */
+ break;
+ }
+
+ _info.L1_size = 31000;
+ _info.L2_size = 500000;
+}
+
+void IScheduler::set_target(CPUTarget target)
+{
+ _info.CPU = target;
+}
+
+CPUInfo IScheduler::cpu_info() const
+{
+ return _info;
+}
+} // namespace arm_compute
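Editor's note: an illustrative /proc/cpuinfo excerpt of the per-CPU-clause format described in the comment above (all values hypothetical). The parser waits for the "processor : n" entry matching the current CPU, then reads the next "CPU part" value; 0xd03 is the Cortex-A53 part number that the constructor maps to CPUTarget::A53.

processor       : 1
BogoMIPS        : 38.40
Features        : fp asimd evtstrm aes pmull sha1 sha2 crc32
CPU implementer : 0x41
CPU architecture: 8
CPU variant     : 0x0
CPU part        : 0xd03
CPU revision    : 4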
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 0466a4a501..44bf2de70c 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -23,17 +23,25 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
#include <cmath>
#include <tuple>
-using namespace arm_compute;
-
+namespace arm_compute
+{
NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
{
@@ -69,8 +77,10 @@ void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const I
_weights_reshaped.allocator()->init(info_wr);
_memory_group.manage(&_weights_reshaped);
+
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_weights_transposed_kernel.configure(&_weights_reshaped, output);
+
_weights_reshaped.allocator()->allocate();
}
else
@@ -84,6 +94,7 @@ void NEConvolutionLayerReshapeWeights::run()
_memory_group.acquire();
NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+
if(_transpose1xW)
{
NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
@@ -93,8 +104,8 @@ void NEConvolutionLayerReshapeWeights::run()
}
NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
- _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+ : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _output_col2im_kernel(),
+ _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _workspace(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
{
}
@@ -137,45 +148,72 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
conv_info);
- // Check if its a "fully connected" convolution
+    // Check if it's a "fully connected" convolution, i.e. the output size is 1x1xnum_kernels
_is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+#if defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+ }
+#endif /* defined(__aarch64__) */
+
unsigned int mat_weights_cols = weights->info()->dimension(3);
unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
// Reshape weights if needed
- if(_are_weights_reshaped)
+ if(_mm_optimised_kernel != nullptr)
{
- mat_weights_cols = weights_info.num_kernels();
- const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
- mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+ if(_are_weights_reshaped)
+ {
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->info()->dimension(1);
+ }
+ else
+ {
+ TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+
+ // Create tensor to store the reshaped weights
+ _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ weights = &_weights_reshaped;
+ }
}
else
{
- if(_is_fully_connected_convolution)
+ if(_are_weights_reshaped)
{
- // Create tensor to store the reshaped weights
- TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
- _weights_reshaped.allocator()->init(info_wr);
- _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->info()->dimension(0) / 4 + (_has_bias ? 1 : 0);
}
else
{
- // Create tensor to store transposed weights
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _weights_reshaped.allocator()->init(info_wt);
- _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
+ TensorShape reshaped_weights_shape;
+
+ if(_is_fully_connected_convolution)
+ {
+ reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ const float transpose_width = 16.0f / input->info()->element_size();
+ reshaped_weights_shape = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
+ static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
+ }
+
+ // Create tensor to store the reshaped weights
+ _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, !_is_fully_connected_convolution /* 1xW transpose */);
+ weights = &_weights_reshaped;
}
- weights = &_weights_reshaped;
}
// Create tensor to store im2col reshaped inputs
const unsigned int mat_input_cols = mat_weights_rows;
const unsigned int mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->info()->tensor_shape();
+
+ TensorShape shape_im2col(input->info()->tensor_shape());
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
@@ -185,7 +223,7 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
// Create tensor (interleave) to prepare input tensor for GEMM
if(!_is_fully_connected_convolution)
{
- TensorShape shape_interleaved = shape_im2col;
+ TensorShape shape_interleaved(shape_im2col);
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
_input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
@@ -193,7 +231,7 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
}
// Create GEMM output tensor
- TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
@@ -201,16 +239,49 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
// Configure kernels
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
- if(_is_fully_connected_convolution)
+
+#if defined(__aarch64__)
+ if(_mm_optimised_kernel != nullptr)
{
- _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ struct CPUInfo ci = NEScheduler::get().cpu_info();
+
+ const int M = _gemm_output.info()->tensor_shape().y();
+ const int N = _gemm_output.info()->tensor_shape().x();
+ const int K = _input_im2col_reshaped.info()->tensor_shape().x();
+
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+
+ constexpr size_t alignment = 4096;
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ _memory_group.manage(&_workspace);
+
+ // Configure matrix multiplication kernel
+ if(_is_fully_connected_convolution)
+ {
+ _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f, false, false);
+ }
+ else
+ {
+ _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
+ }
+
+ _workspace.allocator()->allocate();
}
else
+#endif /* defined(__aarch64__) */
{
- _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
- _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
- _input_interleaved_reshaped.allocator()->allocate();
+ if(_is_fully_connected_convolution)
+ {
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ }
+ else
+ {
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
}
+
_input_im2col_reshaped.allocator()->allocate();
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
_gemm_output.allocator()->allocate();
@@ -237,17 +308,26 @@ void NEConvolutionLayer::run()
// Run input reshaping
NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
- if(!_is_fully_connected_convolution)
+
+ // Runs matrix multiply on reshaped matrices
+ if(_mm_optimised_kernel != nullptr)
{
- // Run interleave
- NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+ NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
}
+ else
+ {
+ if(!_is_fully_connected_convolution)
+ {
+ // Run interleave
+ NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+ }
- // Runs matrix multiply on reshaped matrices
- NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+ }
// Reshape output matrix
NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
_memory_group.release();
}
+} // namespace arm_compute
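Editor's note: the workspace tensor above is sized as (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads(), presumably so that each worker thread can carve out its own 4096-byte-aligned region from one shared buffer. A minimal standalone sketch of that partitioning follows; the names per_thread_size/num_threads are illustrative placeholders, not library API.

    // Sketch: split a shared workspace, sized (per_thread_size + alignment - 1)
    // per thread, into one aligned region per thread. Illustration only.
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <memory> // std::align
    #include <vector>

    int main()
    {
        const std::size_t per_thread_size = 12345; // stand-in for gemm.get_working_size()
        const std::size_t alignment       = 4096;
        const int         num_threads     = 4;

        // Worst-case padding per thread is (alignment - 1) bytes, hence the formula.
        const std::size_t total = (per_thread_size + alignment - 1) * num_threads;
        std::vector<std::uint8_t> workspace(total);

        std::uint8_t *base = workspace.data();
        for(int t = 0; t < num_threads; ++t)
        {
            void       *ptr     = base + t * (per_thread_size + alignment - 1);
            std::size_t space   = per_thread_size + alignment - 1;
            void       *aligned = std::align(alignment, per_thread_size, ptr, space);
            std::cout << "thread " << t << " workspace at " << aligned << '\n';
        }
        return 0;
    }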
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 85b283cd41..1d6aa65e37 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -26,18 +26,27 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
-#include <cmath>
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
-using namespace arm_compute;
+#include <cmath>
+namespace arm_compute
+{
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
+ _run_vector_matrix_multiplication(false), _run_addition(false)
{
}
@@ -57,57 +66,94 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
}
- // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
- if((a->info()->dimension(1) == 1))
+ _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+
+#if defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
{
- _run_vector_matrix_multiplication = true;
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+ }
+#endif /* defined(__aarch64__) */
+ // Check if the first input tensor is a vector.
+ // If so, all the kernels for reshaping the tensors can be skipped
+ if(_run_vector_matrix_multiplication)
+ {
// Configure the matrix multiply kernel
_mm_kernel.configure(a, b, d, alpha);
+
+ // Configure matrix addition kernel
+ if(beta != 0 && c != nullptr)
+ {
+ _ma_kernel.configure(c, d, beta);
+ _run_addition = true;
+ }
}
else
{
- _run_vector_matrix_multiplication = false;
+#if defined(__aarch64__)
+ if(_mm_optimised_kernel != nullptr)
+ {
+ struct CPUInfo ci = NEScheduler::get().cpu_info();
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- TensorShape shape_tmp_b = b->info()->tensor_shape();
+ const int M = d->info()->tensor_shape().y();
+ const int N = d->info()->tensor_shape().x();
+ const int K = a->info()->tensor_shape().x();
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
- const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type());
- shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
+ constexpr size_t alignment = 4096;
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ _memory_group.manage(&_workspace);
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
+ // Configure matrix multiplication kernel
+ _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f);
- _tmp_a.allocator()->init(info_a);
- _tmp_b.allocator()->init(info_b);
+ _workspace.allocator()->allocate();
+ }
+ else
+#endif /* defined(__aarch64__) */
+ {
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
- // Manage intermediate buffers
- _memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
- // Configure interleave kernel
- _interleave_kernel.configure(a, &_tmp_a);
+ const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type());
+ shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
- // Configure transpose kernel
- _transpose_kernel.configure(b, &_tmp_b);
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
- // Configure matrix multiplication kernel
- _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
- // Allocate once the all configure methods have been called
- _tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
- }
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
- // Configure matrix addition kernel
- if(beta != 0 && c != nullptr)
- {
- _ma_kernel.configure(c, d, beta);
- _run_addition = true;
+ // Configure interleave kernel
+ _interleave_kernel.configure(a, &_tmp_a);
+
+ // Configure transpose kernel
+ _transpose_kernel.configure(b, &_tmp_b);
+
+ // Configure matrix multiplication kernel
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
+
+ // Allocate once all the configure methods have been called

+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+
+ // Configure matrix addition kernel
+ if(beta != 0 && c != nullptr)
+ {
+ _ma_kernel.configure(c, d, beta);
+ _run_addition = true;
+ }
+ }
}
}
@@ -115,23 +161,31 @@ void NEGEMM::run()
{
_memory_group.acquire();
- if(!_run_vector_matrix_multiplication)
+ if(_mm_optimised_kernel != nullptr)
{
- // Run interleave kernel
- NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
-
- // Run transpose kernel
- NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+ _memory_group.release();
}
+ else
+ {
+ if(!_run_vector_matrix_multiplication)
+ {
+ // Run interleave kernel
+ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
- // Run matrix multiply kernel
- NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
+ // Run transpose kernel
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ }
- _memory_group.release();
+ NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
- // Run matrix addition kernel
- if(_run_addition)
- {
- NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+ _memory_group.release();
+
+ // Run matrix addition kernel
+ if(_run_addition)
+ {
+ NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+ }
}
}
+} // namespace arm_compute
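Editor's note: the optimised kernel is only selected when the accumulation input is absent or beta is 0, and M, N, K are read from the output and input shapes as shown above. A plain-C++ reference of the computation that path performs, D = alpha * A * B, is sketched below for orientation; it is not the assembly kernel itself.

    // Reference SGEMM with the same M/N/K convention as the diff:
    //   M = rows of D, N = columns of D, K = columns of A (== rows of B).
    #include <vector>

    void sgemm_reference(const std::vector<float> &a, const std::vector<float> &b,
                         std::vector<float> &d, int M, int N, int K, float alpha)
    {
        for(int m = 0; m < M; ++m)
        {
            for(int n = 0; n < N; ++n)
            {
                float acc = 0.f;
                for(int k = 0; k < K; ++k)
                {
                    acc += a[m * K + k] * b[k * N + n]; // row-major A and B
                }
                d[m * N + n] = alpha * acc; // beta == 0 on this path, so no C term
            }
        }
    }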
diff --git a/support/ToolchainSupport.h b/support/ToolchainSupport.h
index 87e9bd2bc8..b9d9103652 100644
--- a/support/ToolchainSupport.h
+++ b/support/ToolchainSupport.h
@@ -268,6 +268,23 @@ inline std::string to_string(bool value)
str << std::boolalpha << value;
return str.str();
}
+
+// std::align is missing in GCC 4.9
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57350
+inline void *align(std::size_t alignment, std::size_t size, void *&ptr, std::size_t &space)
+{
+ std::uintptr_t pn = reinterpret_cast<std::uintptr_t>(ptr);
+ std::uintptr_t aligned = (pn + alignment - 1) & -alignment;
+ std::size_t padding = aligned - pn;
+ if(space < size + padding)
+ {
+ return nullptr;
+ }
+
+ space -= padding;
+
+ return ptr = reinterpret_cast<void *>(aligned);
+}
} // namespace cpp11
namespace cpp14
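Editor's note: a short usage sketch of the std::align work-around introduced above (a standalone copy is included here so the snippet compiles on its own; in the header the function sits in the cpp11 namespace). It bumps a pointer inside a buffer up to the next boundary and shrinks the remaining space accordingly, mirroring std::align semantics.

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    void *align(std::size_t alignment, std::size_t size, void *&ptr, std::size_t &space)
    {
        std::uintptr_t pn      = reinterpret_cast<std::uintptr_t>(ptr);
        std::uintptr_t aligned = (pn + alignment - 1) & -alignment; // round up (power-of-two alignment)
        std::size_t    padding = aligned - pn;
        if(space < size + padding)
        {
            return nullptr; // not enough room once padded
        }
        space -= padding;
        return ptr = reinterpret_cast<void *>(aligned);
    }

    int main()
    {
        std::vector<std::uint8_t> buffer(256);
        void       *ptr   = buffer.data() + 1; // deliberately misaligned
        std::size_t space = buffer.size() - 1;
        if(align(64, 128, ptr, space) != nullptr)
        {
            std::cout << "aligned pointer: " << ptr << ", remaining space: " << space << '\n';
        }
        return 0;
    }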
diff --git a/tests/networks/AlexNetNetwork.h b/tests/networks/AlexNetNetwork.h
index 1e99503792..0c06c1860f 100644
--- a/tests/networks/AlexNetNetwork.h
+++ b/tests/networks/AlexNetNetwork.h
@@ -24,6 +24,7 @@
#ifndef __ARM_COMPUTE_TEST_MODEL_OBJECTS_ALEXNET_H__
#define __ARM_COMPUTE_TEST_MODEL_OBJECTS_ALEXNET_H__
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
#include "tests/AssetsLibrary.h"
@@ -80,55 +81,64 @@ public:
w[7].allocator()->init(TensorInfo(TensorShape(4096U, 1000U), 1, _data_type, _fixed_point_position));
b[7].allocator()->init(TensorInfo(TensorShape(1000U), 1, _data_type, _fixed_point_position));
- w21 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates()));
- w22 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
- b21 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates()));
- b22 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates(128)));
+ w11 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates()));
+ w12 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
+ b11 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates()));
+ b12 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates(128)));
- w41 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates()));
- w42 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
- b41 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates()));
- b42 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates(192)));
+ w31 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates()));
+ w32 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
+ b31 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates()));
+ b32 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates(192)));
- w51 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates()));
- w52 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
- b51 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates()));
- b52 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates(128)));
+ w41 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates()));
+ w42 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
+ b41 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates()));
+ b42 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates(128)));
}
else
{
- auto reshape = [&](unsigned int width, unsigned int height) -> TensorShape
+ auto reshape = [&](unsigned int width, unsigned int height, bool convolution_layer) -> TensorShape
{
- const int interleave_width = 16 / arm_compute::data_size_from_type(_data_type);
+ const bool is_optimised = std::is_same<ITensorType, ITensor>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && data_type == DataType::F32;
- return TensorShape{ width * interleave_width, static_cast<unsigned int>(std::ceil(static_cast<float>(height) / interleave_width)) };
+ if(convolution_layer && is_optimised)
+ {
+ return TensorShape{ height, width };
+ }
+ else
+ {
+ const int interleave_width = 16 / arm_compute::data_size_from_type(_data_type);
+
+ return TensorShape{ width * interleave_width, static_cast<unsigned int>(std::ceil(static_cast<float>(height) / interleave_width)) };
+ }
};
// Create tensor for the reshaped weights
- w[0].allocator()->init(TensorInfo(reshape(366U, 96U), 1, _data_type, _fixed_point_position));
+ w[0].allocator()->init(TensorInfo(reshape(366U, 96U, true), 1, _data_type, _fixed_point_position));
// Configure the direct convolution's weights. Direct convolution doesn't need reshape weights
if(!_is_direct_conv)
{
- auto w21_tensor = std::unique_ptr<TensorType>(new TensorType());
- auto w22_tensor = std::unique_ptr<TensorType>(new TensorType());
+ auto w11_tensor = std::unique_ptr<TensorType>(new TensorType());
+ auto w12_tensor = std::unique_ptr<TensorType>(new TensorType());
+ auto w31_tensor = std::unique_ptr<TensorType>(new TensorType());
+ auto w32_tensor = std::unique_ptr<TensorType>(new TensorType());
auto w41_tensor = std::unique_ptr<TensorType>(new TensorType());
auto w42_tensor = std::unique_ptr<TensorType>(new TensorType());
- auto w51_tensor = std::unique_ptr<TensorType>(new TensorType());
- auto w52_tensor = std::unique_ptr<TensorType>(new TensorType());
- w21_tensor->allocator()->init(TensorInfo(reshape(1248U, 128U), 1, _data_type, _fixed_point_position));
- w22_tensor->allocator()->init(TensorInfo(reshape(1248U, 128U), 1, _data_type, _fixed_point_position));
- w41_tensor->allocator()->init(TensorInfo(reshape(1920U, 192U), 1, _data_type, _fixed_point_position));
- w42_tensor->allocator()->init(TensorInfo(reshape(1920U, 192U), 1, _data_type, _fixed_point_position));
- w51_tensor->allocator()->init(TensorInfo(reshape(1920U, 128U), 1, _data_type, _fixed_point_position));
- w52_tensor->allocator()->init(TensorInfo(reshape(1920U, 128U), 1, _data_type, _fixed_point_position));
- w[2].allocator()->init(TensorInfo(reshape(2560U, 384U), 1, _data_type, _fixed_point_position));
- w21 = std::move(w21_tensor);
- w22 = std::move(w22_tensor);
+ w11_tensor->allocator()->init(TensorInfo(reshape(1248U, 128U, true), 1, _data_type, _fixed_point_position));
+ w12_tensor->allocator()->init(TensorInfo(reshape(1248U, 128U, true), 1, _data_type, _fixed_point_position));
+ w31_tensor->allocator()->init(TensorInfo(reshape(1920U, 192U, true), 1, _data_type, _fixed_point_position));
+ w32_tensor->allocator()->init(TensorInfo(reshape(1920U, 192U, true), 1, _data_type, _fixed_point_position));
+ w41_tensor->allocator()->init(TensorInfo(reshape(1920U, 128U, true), 1, _data_type, _fixed_point_position));
+ w42_tensor->allocator()->init(TensorInfo(reshape(1920U, 128U, true), 1, _data_type, _fixed_point_position));
+ w[2].allocator()->init(TensorInfo(reshape(2560U, 384U, true), 1, _data_type, _fixed_point_position));
+ w11 = std::move(w11_tensor);
+ w12 = std::move(w12_tensor);
+ w31 = std::move(w31_tensor);
+ w32 = std::move(w32_tensor);
w41 = std::move(w41_tensor);
w42 = std::move(w42_tensor);
- w51 = std::move(w51_tensor);
- w52 = std::move(w52_tensor);
}
else
{
@@ -140,20 +150,20 @@ public:
b[3].allocator()->init(TensorInfo(TensorShape(384U), 1, _data_type, _fixed_point_position));
w[4].allocator()->init(TensorInfo(TensorShape(3U, 3U, 192U, 256U), 1, _data_type, _fixed_point_position));
b[4].allocator()->init(TensorInfo(TensorShape(256U), 1, _data_type, _fixed_point_position));
- w21 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates()));
- w22 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
- b21 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates()));
- b22 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates(128)));
-
- w41 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates()));
- w42 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
- b41 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates()));
- b42 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates(192)));
-
- w51 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates()));
- w52 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
- b51 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates()));
- b52 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates(128)));
+ w11 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates()));
+ w12 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
+ b11 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates()));
+ b12 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates(128)));
+
+ w31 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates()));
+ w32 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
+ b31 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates()));
+ b32 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates(192)));
+
+ w41 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates()));
+ w42 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
+ b41 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates()));
+ b42 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates(128)));
}
b[5].allocator()->init(TensorInfo(TensorShape(4096U), 1, _data_type, _fixed_point_position));
@@ -162,9 +172,9 @@ public:
if(_batches > 1 && std::is_same<TensorType, Tensor>::value)
{
- w[5].allocator()->init(TensorInfo(reshape(9216U, 4096U), 1, _data_type, _fixed_point_position));
- w[6].allocator()->init(TensorInfo(reshape(4096U, 4096U), 1, _data_type, _fixed_point_position));
- w[7].allocator()->init(TensorInfo(reshape(4096U, 1000U), 1, _data_type, _fixed_point_position));
+ w[5].allocator()->init(TensorInfo(reshape(9216U, 4096U, false), 1, _data_type, _fixed_point_position));
+ w[6].allocator()->init(TensorInfo(reshape(4096U, 4096U, false), 1, _data_type, _fixed_point_position));
+ w[7].allocator()->init(TensorInfo(reshape(4096U, 1000U, false), 1, _data_type, _fixed_point_position));
}
else
{
@@ -230,8 +240,8 @@ public:
norm1.configure(&act1_out, &norm1_out, NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f));
pool1.configure(&norm1_out, &pool1_out, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
// Layer 2
- conv21.configure(pool11_out.get(), w21.get(), b21.get(), conv21_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U, 5U, 128U));
- conv22.configure(pool12_out.get(), w22.get(), b22.get(), conv22_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U, 5U, 128U));
+ conv21.configure(pool11_out.get(), w11.get(), b11.get(), conv21_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U, 5U, 128U));
+ conv22.configure(pool12_out.get(), w12.get(), b12.get(), conv22_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U, 5U, 128U));
act2.configure(&conv2_out, &act2_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
norm2.configure(&act2_out, &norm2_out, NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f));
pool2.configure(&norm2_out, &pool2_out, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
@@ -240,12 +250,12 @@ public:
conv3.configure(&pool2_out, &w[2], b2, &conv3_out, PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 384U));
act3.configure(&conv3_out, &act3_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
// Layer 4
- conv41.configure(act31_out.get(), w41.get(), b41.get(), conv41_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 192U));
- conv42.configure(act32_out.get(), w42.get(), b42.get(), conv42_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 192U));
+ conv41.configure(act31_out.get(), w31.get(), b31.get(), conv41_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 192U));
+ conv42.configure(act32_out.get(), w32.get(), b32.get(), conv42_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 192U));
act4.configure(&conv4_out, &act4_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
// Layer 5
- conv51.configure(act41_out.get(), w51.get(), b51.get(), conv51_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 128U));
- conv52.configure(act42_out.get(), w52.get(), b52.get(), conv52_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 128U));
+ conv51.configure(act41_out.get(), w41.get(), b41.get(), conv51_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 128U));
+ conv52.configure(act42_out.get(), w42.get(), b42.get(), conv52_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 128U));
act5.configure(&conv5_out, &act5_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
pool5.configure(&act5_out, &pool5_out, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
// Layer 6
@@ -291,12 +301,12 @@ public:
if(!_is_direct_conv)
{
- dynamic_cast<TensorType *>(w21.get())->allocator()->allocate();
- dynamic_cast<TensorType *>(w22.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w11.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w12.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w31.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w32.get())->allocator()->allocate();
dynamic_cast<TensorType *>(w41.get())->allocator()->allocate();
dynamic_cast<TensorType *>(w42.get())->allocator()->allocate();
- dynamic_cast<TensorType *>(w51.get())->allocator()->allocate();
- dynamic_cast<TensorType *>(w52.get())->allocator()->allocate();
}
else
{
@@ -359,12 +369,12 @@ public:
if(!_is_direct_conv)
{
- library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w21.get())), 9);
- library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w22.get())), 10);
- library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w41.get())), 11);
- library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w42.get())), 12);
- library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w51.get())), 13);
- library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w52.get())), 14);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w11.get())), 9);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w12.get())), 10);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w31.get())), 11);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w32.get())), 12);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w41.get())), 13);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w42.get())), 14);
}
else
{
@@ -481,18 +491,18 @@ public:
}
}
- w21.reset();
- w22.reset();
- b21.reset();
- b21.reset();
+ w11.reset();
+ w12.reset();
+ b11.reset();
+ b12.reset();

+ w31.reset();
+ w32.reset();
+ b31.reset();
+ b32.reset();
w41.reset();
w42.reset();
b41.reset();
b42.reset();
- w51.reset();
- w52.reset();
- b51.reset();
- b52.reset();
conv1_out.allocator()->free();
act1_out.allocator()->free();
@@ -595,9 +605,9 @@ private:
TensorType input{}, output{};
std::array<TensorType, 8> w{ {} }, b{ {} };
- std::unique_ptr<ITensorType> w21{ nullptr }, w22{ nullptr }, b21{ nullptr }, b22{ nullptr };
+ std::unique_ptr<ITensorType> w11{ nullptr }, w12{ nullptr }, b11{ nullptr }, b12{ nullptr };
+ std::unique_ptr<ITensorType> w31{ nullptr }, w32{ nullptr }, b31{ nullptr }, b32{ nullptr };
std::unique_ptr<ITensorType> w41{ nullptr }, w42{ nullptr }, b41{ nullptr }, b42{ nullptr };
- std::unique_ptr<ITensorType> w51{ nullptr }, w52{ nullptr }, b51{ nullptr }, b52{ nullptr };
TensorType conv1_out{}, act1_out{}, norm1_out{}, pool1_out{};
TensorType conv2_out{}, act2_out{}, pool2_out{}, norm2_out{};
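Editor's note: the reshape lambda changed above now produces two different weight layouts, a plain { height, width } transpose when the optimised aarch64 convolution path is taken and the usual interleave-by-(16 / element size) layout otherwise. A small mirror of that calculation, for illustration only (the helper name and parameters below are not part of the test network):

    #include <cmath>
    #include <cstdio>
    #include <utility>

    std::pair<unsigned int, unsigned int> reshape(unsigned int width, unsigned int height,
                                                  bool optimised, unsigned int element_size)
    {
        if(optimised)
        {
            return { height, width }; // plain transpose, no interleave
        }
        const unsigned int interleave_width = 16 / element_size;
        return { width * interleave_width,
                 static_cast<unsigned int>(std::ceil(static_cast<float>(height) / interleave_width)) };
    }

    int main()
    {
        const auto generic   = reshape(366U, 96U, false, sizeof(float)); // { 1464, 24 }
        const auto optimised = reshape(366U, 96U, true, sizeof(float));  // { 96, 366 }
        std::printf("generic: %ux%u, optimised: %ux%u\n",
                    generic.first, generic.second, optimised.first, optimised.second);
        return 0;
    }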
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index dd2df727e9..fcaf4ef42b 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "tests/AssetsLibrary.h"
#include "tests/Globals.h"
#include "tests/IAccessor.h"
@@ -39,6 +40,8 @@
namespace arm_compute
{
+class NEConvolutionLayer;
+
namespace test
{
namespace validation
@@ -85,6 +88,8 @@ protected:
{
// Check if its a "fully connected" convolution
const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
+ const bool is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && data_type == DataType::F32;
+
reshaped_weights_shape.collapse(3);
if(bias_shape.total_size() > 0)
@@ -92,7 +97,7 @@ protected:
reshaped_weights_shape.set(0, reshaped_weights_shape.x() + 1);
}
- if(is_fully_connected_convolution)
+ if(is_fully_connected_convolution || is_optimised)
{
const size_t shape_x = reshaped_weights_shape.x();
reshaped_weights_shape.set(0, reshaped_weights_shape.y());
@@ -138,6 +143,7 @@ protected:
if(!reshape_weights)
{
const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
+ const bool is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && data_type == DataType::F32;
TensorShape tmp_weights_shape(weights_shape);
SimpleTensor<T> tmp_weights(tmp_weights_shape, data_type, 1, fixed_point_position);
@@ -149,7 +155,7 @@ protected:
tmp_weights = linearise_weights(tmp_weights, &tmp_bias);
- if(!is_fully_connected_convolution)
+ if(!is_fully_connected_convolution && !is_optimised)
{
// Transpose with interleave
const int interleave_size = 16 / tmp_weights.element_size();
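Editor's note: the "transpose with interleave" mentioned at the end of this fixture is the B-matrix reshape used by the generic GEMM path (interleave width 16 / element size, i.e. 4 for F32). A standalone sketch of one plausible layout for that reshape, included only to illustrate the shape arithmetic and not taken from the kernel itself:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // B is K x N, row-major. The output has ceil(N / 4) row blocks; each block
    // stores, for every k, the 4 consecutive columns of B laid out contiguously.
    std::vector<float> transpose_1x4(const std::vector<float> &b, int K, int N)
    {
        const int interleave = 4; // 16 / sizeof(float)
        const int out_rows   = (N + interleave - 1) / interleave;
        std::vector<float> out(static_cast<std::size_t>(out_rows) * K * interleave, 0.f); // zero-pad ragged edge

        for(int n = 0; n < N; ++n)
        {
            for(int k = 0; k < K; ++k)
            {
                const int block = n / interleave; // which output row block
                const int lane  = n % interleave; // position inside the 1x4 group
                out[(static_cast<std::size_t>(block) * K + k) * interleave + lane] = b[k * N + n];
            }
        }
        return out;
    }

    int main()
    {
        const int K = 2, N = 5;
        std::vector<float> b(K * N);
        for(int i = 0; i < K * N; ++i) { b[i] = static_cast<float>(i); }
        const auto reshaped = transpose_1x4(b, K, N);
        std::printf("reshaped size: %zu\n", reshaped.size()); // 2 blocks * 2 * 4 = 16
        return 0;
    }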