Diffstat (limited to 'arm_compute/core/NEON/kernels/assembly/kernels')
30 files changed, 0 insertions, 7941 deletions
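All 30 deleted headers follow the same shape: one or more micro-kernel functions (often several CPU-specific variants) plus a small "strategy" class that records the interleave and blocking parameters the GEMM driver needs to pack operands, and whose constructor binds the best kernel variant for the detected core. A minimal sketch of how a driver consumes such a class follows; the run_gemm helper and its exact signature are illustrative and not part of the deleted code, while CPUInfo and the member names come from the sgemm_8x6 class defined below.

    // Hypothetical driver-side usage of a strategy class such as
    // sgemm_8x6; CPUInfo is assumed from the rest of the library.
    template <typename strategy>
    void run_gemm(const CPUInfo *ci,
                  const typename strategy::operand_type *a_panels,
                  const typename strategy::operand_type *b_panels,
                  typename strategy::result_type *c_panels,
                  int ablocks, int bblocks, int K) {
        strategy strat(ci);  // constructor picks the kernel for this core
        // A was packed strategy::A_interleave rows at a time, and B
        // (transposed) strategy::B_interleave columns at a time,
        // before this call.
        strat.kernel(a_panels, b_panels, c_panels, ablocks, bblocks, K);
    }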
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp deleted file mode 100644 index d78d33c647..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -// Actual kernel implementations -#include "a32_sgemm_8x6/a53.hpp" -#include "a32_sgemm_8x6/a55r1.hpp" -#include "a32_sgemm_8x6/generic.hpp" - -// 8x6 SGEMM "strategy" class. -// -// This describes the characteristics of a family of kernels, in terms of -// the required interleave properties and the output block size. -// -// All kernels in the family must share these characteristics. The actual -// kernel to be used can be chosen at runtime, based on the CPU_type -// structure. -class sgemm_8x6 { -public: - typedef float operand_type; - typedef float result_type; - - typedef void (*kern_type)(const float *, const float *, float *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 6; - static const int A_block = 1; - static const int A_transpose = 0; - - /* Same for B input */ - static const int B_interleave = 8; - static const int B_block = 1; - static const int B_transpose = 1; - - /* Kernel blocking parameters */ - static const int out_width = 8; - static const int out_height = 6; - static const int k_unroll = 1; - - kern_type kernel = nullptr; - - sgemm_8x6(const CPUInfo *ci) { - switch(ci->CPU) { - case CPUTarget::A53: - kernel = a32_sgemm_8x6_a53; - break; - - case CPUTarget::A55_DOT: - kernel = a32_sgemm_8x6_a55r1; - break; - - default: - kernel = a32_sgemm_8x6; - break; - } - } -}; - -#endif // __arm__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp deleted file mode 100644 index 6bfbfc8742..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include <arm_neon.h> - -#include "../../asmlib.hpp" - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 8x6), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -inline void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - printf("CIAO SONO IO, AMORE MIO!\n"); - - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - int tails = (K & 3); - if (tails == 0) { - tails = 4; - } - int k = ((K+3)/4) - 1; - - __asm __volatile ( - "vmov.i32 q4, #0\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]\n" - "vmov.i32 q5, #0\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]\n" - "vmov.i32 q6, #0\n" - "ldr r0, [%[a_ptr], #0x10]\n" - "vmov.i32 q7, #0\n" - "ldr r1, [%[a_ptr], #0x14]\n" - "vmov.i32 q8, #0\n" - ASM_PREFETCH("[%[a_ptr], #0x40]") - "vmov.i32 q9, #0\n" - ASM_PREFETCH("[%[b_ptr], #0x40]") - "vmov.i32 q10, #0\n" - ASM_PREFETCH("[%[a_ptr], #0x80]") - "vmov.i32 q11, #0\n" - ASM_PREFETCH("[%[b_ptr], #0x80]") - "vmov.i32 q12, #0\n" - "vmov.i32 q13, #0\n" - ASM_PREFETCH("[%[a_ptr], #0xC0]") - "vmov.i32 q14, #0\n" - ASM_PREFETCH("[%[b_ptr], #0XC0]") - "vmov.i32 q15, #0\n" - "cmp %[k], #0\n" - "beq 6f\n" - - "1:\n" - // Unroll 0 - "vldr d6, [%[b_ptr], #0x10]\n" - "vmov d2, r0, r1\n" - "vmla.f32 q4, q2, d0[0]\n" - "ldr r0, [%[b_ptr], #0x18]\n" - "vmla.f32 q5, q2, d0[1]\n" - "ldr r1, [%[b_ptr], #0x1C]\n" - "vmla.f32 q6, q2, d1[0]\n" - - "vldr d3, [%[a_ptr], #0x18]\n" - "vmov d7, r0, r1\n" - "vmla.f32 q7, q2, d1[1]\n" - ASM_PREFETCH("[%[a_ptr], #0x100]") - "vmla.f32 q8, q2, d2[0]\n" - "vmla.f32 q9, q2, d2[1]\n" - - "vldr d4, [%[b_ptr], #0x20]\n" - "vmla.f32 q10, q3, d0[0]\n" - "ldr r0, [%[b_ptr], #0x28]\n" - "vmla.f32 q11, q3, d0[1]\n" - "ldr r1, [%[b_ptr], #0x2C]\n" - "vmla.f32 q12, q3, d1[0]\n" - - "vldr d0, [%[a_ptr], #0x20]\n" - "vmov d5, r0, r1\n" - "vmla.f32 q13, q3, d1[1]\n" - 
"ldr r0, [%[a_ptr], #0x28]\n" - "vmla.f32 q14, q3, d2[0]\n" - "ldr r1, [%[a_ptr], #0x2C]\n" - "vmla.f32 q15, q3, d2[1]\n" - - // Unroll 1 - "vldr d6, [%[b_ptr], #0x30]\n" - "vmov d1, r0, r1\n" - "vmla.f32 q4, q2, d3[0]\n" - "ldr r0, [%[b_ptr], #0x38]\n" - "vmla.f32 q5, q2, d3[1]\n" - "ldr r1, [%[b_ptr], #0x3C]\n" - "vmla.f32 q6, q2, d0[0]\n" - - "vldr d2, [%[a_ptr], #0x30]\n" - "vmov d7, r0, r1\n" - "vmla.f32 q7, q2, d0[1]\n" - ASM_PREFETCH("[%[b_ptr], #0x100]") - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q9, q2, d1[1]\n" - - "vldr d4, [%[b_ptr], #0x40]\n" - "vmla.f32 q10, q3, d3[0]\n" - "ldr r0, [%[b_ptr], #0x48]\n" - "vmla.f32 q11, q3, d3[1]\n" - "ldr r1, [%[b_ptr], #0x4C]\n" - "vmla.f32 q12, q3, d0[0]\n" - - "vldr d3, [%[a_ptr], #0x38]\n" - "vmov d5, r0, r1\n" - "vmla.f32 q13, q3, d0[1]\n" - "ldr r0, [%[a_ptr], #0x40]\n" - "vmla.f32 q14, q3, d1[0]\n" - "ldr r1, [%[a_ptr], #0x44]\n" - "vmla.f32 q15, q3, d1[1]\n" - - // Unroll 2 - "vldr d6, [%[b_ptr], #0x50]\n" - "vmov d0, r0, r1\n" - "vmla.f32 q4, q2, d2[0]\n" - "ldr r0, [%[b_ptr], #0x58]\n" - "vmla.f32 q5, q2, d2[1]\n" - "ldr r1, [%[b_ptr], #0x5C]\n" - "vmla.f32 q6, q2, d3[0]\n" - - "vldr d1, [%[a_ptr], #0x48]\n" - "vmov d7, r0, r1\n" - "vmla.f32 q7, q2, d3[1]\n" - ASM_PREFETCH("[%[a_ptr], #0x140]") - "vmla.f32 q8, q2, d0[0]\n" - "vmla.f32 q9, q2, d0[1]\n" - - "vldr d4, [%[b_ptr], #0x60]\n" - "vmla.f32 q10, q3, d2[0]\n" - "ldr r0, [%[b_ptr], #0x68]\n" - "vmla.f32 q11, q3, d2[1]\n" - "ldr r1, [%[b_ptr], #0x6C]\n" - "vmla.f32 q12, q3, d3[0]\n" - - "vldr d2, [%[a_ptr], #0x50]\n" - "vmov d5, r0, r1\n" - "vmla.f32 q13, q3, d3[1]\n" - "ldr r0, [%[a_ptr], #0x58]\n" - "vmla.f32 q14, q3, d0[0]\n" - "ldr r1, [%[a_ptr], #0x5C]\n" - "vmla.f32 q15, q3, d0[1]\n" - "add %[a_ptr], %[a_ptr], #0x60\n" - - // Unroll 3 - "vldr d6, [%[b_ptr], #0x70]\n" - "vmov d3, r0, r1\n" - "vmla.f32 q4, q2, d1[0]\n" - "ldr r0, [%[b_ptr], #0x78]\n" - "vmla.f32 q5, q2, d1[1]\n" - "ldr r1, [%[b_ptr], #0x7C]\n" - "vmla.f32 q6, q2, d2[0]\n" - "add %[b_ptr], %[b_ptr], #0x80\n" - - "vldr d0, [%[a_ptr], #0x00]\n" - "vmov d7, r0, r1\n" - "vmla.f32 q7, q2, d2[1]\n" - ASM_PREFETCH("[%[b_ptr], #0xC0]") - "vmla.f32 q8, q2, d3[0]\n" - "vmla.f32 q9, q2, d3[1]\n" - - "vldr d4, [%[b_ptr], #0x00]\n" - "vmla.f32 q10, q3, d1[0]\n" - "ldr r0, [%[b_ptr], #0x08]\n" - "vmla.f32 q11, q3, d1[1]\n" - "ldr r1, [%[b_ptr], #0x0C]\n" - "vmla.f32 q12, q3, d2[0]\n" - "subs %[k], %[k], #1\n" - - "vldr d1, [%[a_ptr], #0x08]\n" - "vmov d5, r0, r1\n" - "vmla.f32 q13, q3, d2[1]\n" - "ldr r0, [%[a_ptr], #0x10]\n" - "vmla.f32 q14, q3, d3[0]\n" - "ldr r1, [%[a_ptr], #0x14]\n" - "vmla.f32 q15, q3, d3[1]\n" - "bne 1b\n" - - // "Tails" shows how many multiply blocks are needed at the - // end, must be 1-4 inclusive. Bail out to alternative tail - // immediately if it's 1. - "6:\n" - "subs %[tails], %[tails], #1\n" - "beq 3f\n" - - // Detached final iteration - for now adapt the generic - // tails rather than reimplementing for A53. 
- - // Unroll 0 - "vmov d2, r0, r1\n" - "add %[a_ptr], %[a_ptr], #0x18\n" - "vmla.f32 q4, q2, d0[0]\n" - "vld1.32 {d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q5, q2, d0[1]\n" - "add %[b_ptr], %[b_ptr], #0x10\n" - "vmla.f32 q6, q2, d1[0]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "vmla.f32 q7, q2, d1[1]\n" - "vmla.f32 q8, q2, d2[0]\n" - "subs %[tails], %[tails], #1\n" - "vmla.f32 q9, q2, d2[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d0[0]\n" - "vmla.f32 q11, q3, d0[1]\n" - "vmla.f32 q12, q3, d1[0]\n" - "vmla.f32 q13, q3, d1[1]\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmla.f32 q14, q3, d2[0]\n" - "vmla.f32 q15, q3, d2[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "beq 4f\n" - - // Unroll 1 - "vmla.f32 q4, q2, d3[0]\n" - "vmla.f32 q5, q2, d3[1]\n" - "subs %[tails], %[tails], #1\n" - "vmla.f32 q6, q2, d0[0]\n" - "vmla.f32 q7, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q9, q2, d1[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d3[0]\n" - "vmla.f32 q11, q3, d3[1]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q12, q3, d0[0]\n" - "vmla.f32 q13, q3, d0[1]\n" - "vmla.f32 q14, q3, d1[0]\n" - "vmla.f32 q15, q3, d1[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "beq 5f\n" - - // Unroll 2 - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmla.f32 q4, q2, d2[0]\n" - "vmla.f32 q5, q2, d2[1]\n" - "vmla.f32 q6, q2, d3[0]\n" - "vmla.f32 q7, q2, d3[1]\n" - "vmla.f32 q8, q2, d0[0]\n" - "vmla.f32 q9, q2, d0[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d2[0]\n" - "vmla.f32 q11, q3, d2[1]\n" - "vmla.f32 q12, q3, d3[0]\n" - "vmla.f32 q13, q3, d3[1]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q14, q3, d0[0]\n" - "vmla.f32 q15, q3, d0[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - - // Unroll 3 - "vmla.f32 q4, q2, d1[0]\n" - "vmla.f32 q10, q3, d1[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q5, q2, d1[1]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d1[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q6, q2, d2[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d2[0]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d2[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d2[1]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d3[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d3[0]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d3[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d3[1]\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "b 2f\n" - - // tails==1 final tail - "3:\n" - "vmov d2, r0, r1\n" - "add %[b_ptr], %[b_ptr], #0x10\n" - "vmla.f32 q4, q2, d0[0]\n" - "add %[a_ptr], %[a_ptr], #0x18\n" - "vmla.f32 q5, q2, d0[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "vmla.f32 q6, q2, d1[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q10, q3, d0[0]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d0[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d1[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d1[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d1[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d2[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d2[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d2[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d2[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] 
:128]!\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "b 2f\n" - - // tails==2 final tail - "4:\n" - "vmla.f32 q4, q2, d3[0]\n" - "vmla.f32 q10, q3, d3[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q5, q2, d3[1]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d3[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q6, q2, d0[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d0[0]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d0[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d0[1]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d1[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d1[0]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d1[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d1[1]\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "b 2f\n" - - // tails==3 final tail - "5:\n" - "vmla.f32 q4, q2, d2[0]\n" - "vld1.32 {d0}, [%[a_ptr] :64]!\n" - "vmla.f32 q5, q2, d2[1]\n" - "vmla.f32 q6, q2, d3[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q10, q3, d2[0]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d2[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d3[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d3[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d3[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d0[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d0[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d0[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d0[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - - "2:\n" - "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1" - ); - } - } -} - -#endif diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp deleted file mode 100644 index 4f0ef7cd21..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include <arm_neon.h> - -#include "../../asmlib.hpp" - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 8x6), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -inline void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - /* Work out starting values for "k" and "tails" in the inner loop. */ - int tails_initial = (K & 3); - if (tails_initial == 0) { - tails_initial = 4; - } - - int k_initial = ((K+3)/4) - 1; - - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - int tails = tails_initial; - int k = k_initial; - - a_ptr = a_ptr0; - - __asm __volatile ( - "vldr d0, [%[a_ptr]]\n" - "vmov.i32 q4, #0\n" - "vldr d1, [%[a_ptr], #0x08]\n" - "vmov.i32 q5, #0\n" - "vldr d4, [%[b_ptr]]\n" - "vmov.i32 q6, #0\n" - "vldr d5, [%[b_ptr], #0x08]\n" - "vmov.i32 q7, #0\n" - "vldr d2, [%[a_ptr], #0x10]\n" - "vmov.i32 q8, #0\n" - ASM_PREFETCH("[%[b_ptr], #0x40]") - "vmov.i32 q9, #0\n" - ASM_PREFETCH("[%[a_ptr], #0x40]") - "vmov.i32 q10, #0\n" - ASM_PREFETCH("[%[b_ptr], #0x80]") - "vmov.i32 q11, #0\n" - ASM_PREFETCH("[%[a_ptr], #0x80]") - "vmov.i32 q12, #0\n" - ASM_PREFETCH("[%[b_ptr], #0XC0]") - "vmov.i32 q13, #0\n" - ASM_PREFETCH("[%[a_ptr], #0xC0]") - "vmov.i32 q14, #0\n" - ASM_PREFETCH("[%[b_ptr], #0x100]") - "vmov.i32 q15, #0\n" - ASM_PREFETCH("[%[a_ptr], #0x100]") - "cmp %[k], #0\n" - ASM_PREFETCH("[%[b_ptr], #0x140]") - "beq 6f\n" - ASM_PREFETCH("[%[b_ptr], #0x180]") - - "1:\n" - // Unroll 0 - "vmla.f32 q4, q2, d0[0]\n" - "vldr d6, [%[b_ptr], #0x10]\n" - "vmla.f32 q5, q2, d0[1]\n" - "vldr d7, [%[b_ptr], #0x18]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vldr d3, [%[a_ptr], #0x18]\n" - "vmla.f32 q7, q2, d1[1]\n" - ASM_PREFETCH("[%[a_ptr], #0x140]") - "vmla.f32 q8, q2, d2[0]\n" - "subs %[k], %[k], #1\n" - "vmla.f32 q9, q2, d2[1]\n" - "vldr d4, [%[b_ptr], #0x20]\n" - "vmla.f32 q10, q3, d0[0]\n" - "vldr d5, [%[b_ptr], #0x28]\n" - "vmla.f32 q11, q3, d0[1]\n" - "vldr d0, [%[a_ptr], #0x20]\n" - "vmla.f32 q12, q3, d1[0]\n" - - "vmla.f32 q13, q3, d1[1]\n" - "vldr d1, [%[a_ptr], #0x28]\n" - "vmla.f32 q14, q3, d2[0]\n" - - "vmla.f32 q15, q3, d2[1]\n" - "vldr d6, [%[b_ptr], #0x30]\n" - - // Unroll 1 - "vmla.f32 q4, q2, d3[0]\n" - "vldr d7, [%[b_ptr], #0x38]\n" - "vmla.f32 q5, q2, d3[1]\n" - "vldr d2, [%[a_ptr], #0x30]\n" - "vmla.f32 q6, q2, d0[0]\n" - - "vmla.f32 q7, q2, d0[1]\n" - ASM_PREFETCH("[%[b_ptr], #0x1C0]") - "vmla.f32 q8, q2, d1[0]\n" - - "vmla.f32 q9, q2, d1[1]\n" - "vldr d4, [%[b_ptr], #0x40]\n" - "vmla.f32 q10, q3, d3[0]\n" - "vldr d5, [%[b_ptr], #0x48]\n" - "vmla.f32 q11, q3, d3[1]\n" - "vldr d3, [%[a_ptr], #0x38]\n" - "vmla.f32 q12, q3, d0[0]\n" - - "vmla.f32 q13, q3, d0[1]\n" - "vldr d0, [%[a_ptr], #0x40]\n" - 
"vmla.f32 q14, q3, d1[0]\n" - - "vmla.f32 q15, q3, d1[1]\n" - "vldr d6, [%[b_ptr], #0x50]\n" - - // Unroll 2 - "vmla.f32 q4, q2, d2[0]\n" - "vldr d7, [%[b_ptr], #0x58]\n" - "vmla.f32 q5, q2, d2[1]\n" - "vldr d1, [%[a_ptr], #0x48]\n" - "vmla.f32 q6, q2, d3[0]\n" - - "vmla.f32 q7, q2, d3[1]\n" - ASM_PREFETCH("[%[a_ptr], #0x180]") - "vmla.f32 q8, q2, d0[0]\n" - - "vmla.f32 q9, q2, d0[1]\n" - "vldr d4, [%[b_ptr], #0x60]\n" - "vmla.f32 q10, q3, d2[0]\n" - "vldr d5, [%[b_ptr], #0x68]\n" - "vmla.f32 q11, q3, d2[1]\n" - "vldr d2, [%[a_ptr], #0x50]\n" - "vmla.f32 q12, q3, d3[0]\n" - - "vmla.f32 q13, q3, d3[1]\n" - "vldr d3, [%[a_ptr], #0x58]\n" - "vmla.f32 q14, q3, d0[0]\n" - "add %[a_ptr], %[a_ptr], #0x60\n" - "vmla.f32 q15, q3, d0[1]\n" - "vldr d6, [%[b_ptr], #0x70]\n" - - // Unroll 3 - "vmla.f32 q4, q2, d1[0]\n" - "vldr d7, [%[b_ptr], #0x78]\n" - "vmla.f32 q5, q2, d1[1]\n" - "add %[b_ptr], %[b_ptr], #0x80\n" - "vmla.f32 q6, q2, d2[0]\n" - "vldr d0, [%[a_ptr], #0x00]\n" - "vmla.f32 q7, q2, d2[1]\n" - ASM_PREFETCH("[%[b_ptr], #0x180]") - "vmla.f32 q8, q2, d3[0]\n" - - "vmla.f32 q9, q2, d3[1]\n" - "vldr d4, [%[b_ptr], #0x00]\n" - "vmla.f32 q10, q3, d1[0]\n" - "vldr d5, [%[b_ptr], #0x08]\n" - "vmla.f32 q11, q3, d1[1]\n" - "vldr d1, [%[a_ptr], #0x08]\n" - "vmla.f32 q12, q3, d2[0]\n" - - "vmla.f32 q13, q3, d2[1]\n" - "vldr d2, [%[a_ptr], #0x10]\n" - "vmla.f32 q14, q3, d3[0]\n" - - "vmla.f32 q15, q3, d3[1]\n" - "bne 1b\n" - - // "Tails" shows how many multiply blocks are needed at the - // end, must be 1-4 inclusive. Bail out to alternative tail - // immediately if it's 1. - "6:\n" - "subs %[tails], %[tails], #1\n" - "beq 3f\n" - - // Detached final iteration - - // Unroll 0 - "vmla.f32 q4, q2, d0[0]\n" - "vldr d6, [%[b_ptr], #0x10]\n" - "vmla.f32 q5, q2, d0[1]\n" - "vldr d7, [%[b_ptr], #0x18]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vldr d3, [%[a_ptr], #0x18]\n" - "vmla.f32 q7, q2, d1[1]\n" - "subs %[tails], %[tails], #1\n" - "vmla.f32 q8, q2, d2[0]\n" - "vmla.f32 q9, q2, d2[1]\n" - "vldr d4, [%[b_ptr], #0x20]\n" - - "vmla.f32 q10, q3, d0[0]\n" - "vldr d5, [%[b_ptr], #0x28]\n" - "vmla.f32 q11, q3, d0[1]\n" - "vldr d0, [%[a_ptr], #0x20]\n" - "vmla.f32 q12, q3, d1[0]\n" - "add %[b_ptr], %[b_ptr], #0x30\n" - "vmla.f32 q13, q3, d1[1]\n" - "vldr d1, [%[a_ptr], #0x28]\n" - "vmla.f32 q14, q3, d2[0]\n" - "vmla.f32 q15, q3, d2[1]\n" - "beq 4f\n" - - // Unroll 1 - "vmla.f32 q4, q2, d3[0]\n" - "vldr d6, [%[b_ptr], #0x30]\n" - "vmla.f32 q5, q2, d3[1]\n" - "vldr d7, [%[b_ptr], #0x38]\n" - "vmla.f32 q6, q2, d0[0]\n" - "vldr d2, [%[a_ptr], #0x30]\n" - "vmla.f32 q7, q2, d0[1]\n" - "subs %[tails], %[tails], #1\n" - "vmla.f32 q8, q2, d1[0]\n" - - "vmla.f32 q9, q2, d1[1]\n" - - "vmla.f32 q10, q3, d3[0]\n" - "vldr d4, [%[b_ptr], #0x40]\n" - "vmla.f32 q11, q3, d3[1]\n" - "vldr d5, [%[b_ptr], #0x48]\n" - "vmla.f32 q12, q3, d0[0]\n" - "vldr d3, [%[a_ptr], #0x38]\n" - "vmla.f32 q13, q3, d0[1]\n" - "vldr d0, [%[a_ptr], #0x40]\n" - "vmla.f32 q14, q3, d1[0]\n" - "vmla.f32 q15, q3, d1[1]\n" - "beq 5f\n" - - // Unroll 2 - "vmla.f32 q4, q2, d2[0]\n" - "vldr d6, [%[b_ptr], #0x50]\n" - "vmla.f32 q5, q2, d2[1]\n" - "vldr d7, [%[b_ptr], #0x58]\n" - "vmla.f32 q6, q2, d3[0]\n" - "vldr d1, [%[a_ptr], #0x48]\n" - "vmla.f32 q7, q2, d3[1]\n" - "vmla.f32 q8, q2, d0[0]\n" - "vmla.f32 q9, q2, d0[1]\n" - - "vmla.f32 q10, q3, d2[0]\n" - "vldr d4, [%[b_ptr], #0x60]\n" - "vmla.f32 q11, q3, d2[1]\n" - "vldr d5, [%[b_ptr], #0x68]\n" - "vmla.f32 q12, q3, d3[0]\n" - "vldr d2, [%[a_ptr], #0x50]\n" - "vmla.f32 q13, q3, d3[1]\n" - "vldr d3, [%[a_ptr], 
#0x58]\n" - "vmla.f32 q14, q3, d0[0]\n" - "vmla.f32 q15, q3, d0[1]\n" - - // Unroll 3 - "vmla.f32 q4, q2, d1[0]\n" - "vldr d6, [%[b_ptr], #0x70]\n" - "vmla.f32 q5, q2, d1[1]\n" - "vldr d7, [%[b_ptr], #0x78]\n" - "vmla.f32 q10, q3, d1[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d1[1]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q6, q2, d2[0]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d2[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d2[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d2[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d3[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d3[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d3[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d3[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "add %[a_ptr], %[a_ptr], #0x60\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "add %[b_ptr], %[b_ptr], #0x80\n" - "b 2f\n" - - // tails==1 final tail - "3:\n" - "vmla.f32 q4, q2, d0[0]\n" - "vldr d6, [%[b_ptr], #0x10]\n" - "vmla.f32 q5, q2, d0[1]\n" - "vldr d7, [%[b_ptr], #0x18]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q10, q3, d0[0]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d0[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d1[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d1[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d1[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d2[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d2[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d2[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d2[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "add %[a_ptr], %[a_ptr], #0x18\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "add %[b_ptr], %[b_ptr], #0x20\n" - "b 2f\n" - - // tails==2 final tail - "4:\n" - "vmla.f32 q4, q2, d3[0]\n" - "vldr d6, [%[b_ptr], #0x30]\n" - "vmla.f32 q5, q2, d3[1]\n" - "vldr d7, [%[b_ptr], #0x38]\n" - "vmla.f32 q10, q3, d3[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d3[1]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q6, q2, d0[0]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d0[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d0[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d0[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d1[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d1[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d1[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d1[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "add %[b_ptr], %[b_ptr], #0x40\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "add %[a_ptr], %[a_ptr], #0x30\n" - "b 2f\n" - - // tails==3 final tail - "5:\n" - "vmla.f32 q4, q2, d2[0]\n" - "vldr d6, [%[b_ptr], #0x50]\n" - "vmla.f32 q5, q2, d2[1]\n" - "vldr d7, [%[b_ptr], #0x58]\n" - "vmla.f32 q6, q2, d3[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q10, q3, d2[0]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d2[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d3[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d3[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] 
:128]!\n" - "vmla.f32 q13, q3, d3[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d0[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d0[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d0[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d0[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "add %[a_ptr], %[a_ptr], #0x48\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "add %[b_ptr], %[b_ptr], #0x60\n" - - "2:\n" - "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1" - ); - } - } -} - -#endif /* __arm__ */ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp deleted file mode 100644 index 7a44fed5b2..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp +++ /dev/null @@ -1,350 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include "../../asmlib.hpp" - -#include <arm_neon.h> - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 8x6), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. 
- -inline void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - int tails = (K & 3); - if (tails == 0) { - tails = 4; - } - int k = ((K+3)/4) - 1; - - __asm __volatile ( - "vmov.i32 q4, #0\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmov.i32 q5, #0\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - "vmov.i32 q6, #0\n" - ASM_PREFETCH("[%[a_ptr], #48]") - "vmov.i32 q7, #0\n" - ASM_PREFETCH("[%[b_ptr], #48]") - "vmov.i32 q8, #0\n" - ASM_PREFETCH("[%[a_ptr], #112]") - "vmov.i32 q9, #0\n" - ASM_PREFETCH("[%[b_ptr], #112]") - "vmov.i32 q10, #0\n" - "vmov.i32 q11, #0\n" - "vmov.i32 q12, #0\n" - "vmov.i32 q13, #0\n" - ASM_PREFETCH("[%[a_ptr], #176]") - "vmov.i32 q14, #0\n" - ASM_PREFETCH("[%[b_ptr], #176]") - "vmov.i32 q15, #0\n" - - "cmp %[k], #0\n" - "beq 6f\n" - - "1:\n" - // Unroll 0 - "vmla.f32 q4, q2, d0[0]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q5, q2, d0[1]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "vmla.f32 q7, q2, d1[1]\n" - "vmla.f32 q8, q2, d2[0]\n" - "vmla.f32 q9, q2, d2[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d0[0]\n" - "vmla.f32 q11, q3, d0[1]\n" - "vmla.f32 q12, q3, d1[0]\n" - "vmla.f32 q13, q3, d1[1]\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmla.f32 q14, q3, d2[0]\n" - "vmla.f32 q15, q3, d2[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - - // Unroll 1 - "vmla.f32 q4, q2, d3[0]\n" - "subs %[k], %[k], #1\n" - "vmla.f32 q5, q2, d3[1]\n" - ASM_PREFETCH("[%[a_ptr], #208]") - "vmla.f32 q6, q2, d0[0]\n" - "vmla.f32 q7, q2, d0[1]\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q9, q2, d1[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d3[0]\n" - "vmla.f32 q11, q3, d3[1]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q12, q3, d0[0]\n" - "vmla.f32 q13, q3, d0[1]\n" - "vmla.f32 q14, q3, d1[0]\n" - "vmla.f32 q15, q3, d1[1]\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - - // Unroll 2 - "vmla.f32 q4, q2, d2[0]\n" - "vmla.f32 q5, q2, d2[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "vmla.f32 q6, q2, d3[0]\n" - "vmla.f32 q7, q2, d3[1]\n" - ASM_PREFETCH("[%[a_ptr], #240]") - "vmla.f32 q8, q2, d0[0]\n" - "vmla.f32 q9, q2, d0[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d2[0]\n" - "vmla.f32 q11, q3, d2[1]\n" - ASM_PREFETCH("[%[b_ptr], #208]") - "vmla.f32 q12, q3, d3[0]\n" - "vmla.f32 q13, q3, d3[1]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q14, q3, d0[0]\n" - "vmla.f32 q15, q3, d0[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - - // Unroll 3 - "vmla.f32 q4, q2, d1[0]\n" - "vmla.f32 q5, q2, d1[1]\n" - "vmla.f32 q6, q2, d2[0]\n" - "vmla.f32 q7, q2, d2[1]\n" - "vmla.f32 q8, q2, d3[0]\n" - "vmla.f32 q9, q2, d3[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d1[0]\n" - "vmla.f32 q11, q3, d1[1]\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmla.f32 q12, q3, d2[0]\n" - "vmla.f32 q13, q3, d2[1]\n" - "vmla.f32 q14, q3, d3[0]\n" - "vmla.f32 q15, q3, d3[1]\n" - "bne 1b\n" - - // Branch here if we never execute main loop. - "6:\n" - - // "Tails" shows how many multiply blocks are needed at the - // end, must be 1-4 inclusive. Bail out to alternative tail - // immediately if it's 1. 
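 - // Label map for the tail handling shared by the 8x6 kernels (a sketch
 - // of the control flow, not new code):
 - //   "6:"  if (--tails == 0) goto L3;                 // tails was 1
 - //         run unroll 0;  if (--tails == 0) goto L4;  // tails was 2
 - //         run unroll 1;  if (--tails == 0) goto L5;  // tails was 3
 - //         run unrolls 2+3 fused with the C stores;   // tails was 4
 - //   "3:" / "4:" / "5:"  run the last multiply block fused with stores
 - //   "2:"  store the final accumulator row (q15) and exit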
- "subs %[tails], %[tails], #1\n" - "beq 3f\n" - - // Detached final iteration - // Unroll 0 - "vmla.f32 q4, q2, d0[0]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q5, q2, d0[1]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "vmla.f32 q7, q2, d1[1]\n" - "vmla.f32 q8, q2, d2[0]\n" - "subs %[tails], %[tails], #1\n" - "vmla.f32 q9, q2, d2[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d0[0]\n" - "vmla.f32 q11, q3, d0[1]\n" - "vmla.f32 q12, q3, d1[0]\n" - "vmla.f32 q13, q3, d1[1]\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmla.f32 q14, q3, d2[0]\n" - "vmla.f32 q15, q3, d2[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "beq 4f\n" - - // Unroll 1 - "vmla.f32 q4, q2, d3[0]\n" - "vmla.f32 q5, q2, d3[1]\n" - "subs %[tails], %[tails], #1\n" - "vmla.f32 q6, q2, d0[0]\n" - "vmla.f32 q7, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q9, q2, d1[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d3[0]\n" - "vmla.f32 q11, q3, d3[1]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q12, q3, d0[0]\n" - "vmla.f32 q13, q3, d0[1]\n" - "vmla.f32 q14, q3, d1[0]\n" - "vmla.f32 q15, q3, d1[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "beq 5f\n" - - // Unroll 2 - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmla.f32 q4, q2, d2[0]\n" - "vmla.f32 q5, q2, d2[1]\n" - "vmla.f32 q6, q2, d3[0]\n" - "vmla.f32 q7, q2, d3[1]\n" - "vmla.f32 q8, q2, d0[0]\n" - "vmla.f32 q9, q2, d0[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d2[0]\n" - "vmla.f32 q11, q3, d2[1]\n" - "vmla.f32 q12, q3, d3[0]\n" - "vmla.f32 q13, q3, d3[1]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q14, q3, d0[0]\n" - "vmla.f32 q15, q3, d0[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - - // Unroll 3 - "vmla.f32 q4, q2, d1[0]\n" - "vmla.f32 q10, q3, d1[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q5, q2, d1[1]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d1[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q6, q2, d2[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d2[0]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d2[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d2[1]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d3[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d3[0]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d3[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d3[1]\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "b 2f\n" - - // tails==1 final tail - "3:\n" - "vmla.f32 q4, q2, d0[0]\n" - "vld1.32 {d2}, [%[a_ptr] :64]!\n" - "vmla.f32 q5, q2, d0[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "vmla.f32 q6, q2, d1[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q10, q3, d0[0]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d0[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d1[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d1[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d1[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d2[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d2[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d2[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d2[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "b 2f\n" - - // 
tails==2 final tail - "4:\n" - "vmla.f32 q4, q2, d3[0]\n" - "vmla.f32 q10, q3, d3[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q5, q2, d3[1]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d3[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q6, q2, d0[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d0[0]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d0[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d0[1]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d1[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d1[0]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d1[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d1[1]\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "b 2f\n" - - // tails==3 final tail - "5:\n" - "vmla.f32 q4, q2, d2[0]\n" - "vld1.32 {d0}, [%[a_ptr] :64]!\n" - "vmla.f32 q5, q2, d2[1]\n" - "vmla.f32 q6, q2, d3[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q10, q3, d2[0]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d2[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d3[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d3[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d3[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d0[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d0[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d0[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d0[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - - "2:\n" - "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp deleted file mode 100644 index f7659b9a67..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __aarch64__ - -// Actual kernel implementations -#include "a64_gemm_s16_12x8/generic.hpp" - -// 12x8 SGEMM "strategy" class. -// -// This describes the characteristics of a family of kernels, in terms of -// the required interleave properties and the output block size. -// -// All kernels in the family must share these characteristics. The actual -// kernel to be used can be chosen at runtime, based on the CPU_type -// structure. -class gemm_s16_12x8 { -public: - typedef int16_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 8; - static const int A_block = 1; - static const int A_transpose = 0; - - /* Same for B input */ - static const int B_interleave = 12; - static const int B_block = 1; - static const int B_transpose = 1; - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int out_height = 8; - static const int k_unroll = 1; - - kern_type kernel = nullptr; - - gemm_s16_12x8(const CPUInfo *ci) { - kernel = a64_gemm_s16_asimd_12x8; - } -}; - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp deleted file mode 100644 index 10259b2fdf..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once -#include <arm_neon.h> - -inline void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) -{ - const int16_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - for (int yb = 0; yb < ablocks; yb++) - { - const int16_t *a_ptr0 = a_ptr; - const int16_t *b_ptr = Bpanel; - - for (int xb = 0; xb < bblocks; xb++) - { - a_ptr = a_ptr0; - const bool odd_k = K & 0x1; - int k = (K+1)/2 - 1; - - register int16x8_t aa asm("v0"); - register int16x8_t ab asm("v1"); - register int16x8_t b0 asm("v2"); - register int16x8_t b1 asm("v3"); - register int16x8_t b2 asm("v4"); - - __asm __volatile ( - "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower - "movi v5.4s, #0\n" - "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper - "movi v6.4s, #0\n" - "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper - "movi v7.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v8.4s, #0\n" - "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper - "movi v9.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v10.4s, #0\n" - "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper - "movi v11.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #96]") - "movi v12.4s, #0\n" - "movi v13.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #96]") - "movi v14.4s, #0\n" - "movi v15.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0\n" - "movi v17.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v18.4s, #0\n" - "movi v19.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #160]") - "movi v20.4s, #0\n" - "movi v21.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #160]") - "movi v22.4s, #0\n" - "movi v23.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v24.4s, #0\n" - "add %x[a_ptr], %x[a_ptr], #0x10\n" - "movi v25.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v26.4s, #0\n" - "add %x[b_ptr], %x[b_ptr], #0x18\n" - "movi v27.4s, #0\n" - "movi v28.4s, #0\n" - - "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. 
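 - // Scalar semantics of one K step of the main loop below (a sketch, not
 - // the deleted code): each smlal/smlal2 widens int16 products into the
 - // int32 accumulators v5-v28. Per K step, aa supplies the 8 A values and
 - // b0/b1 the 12 B values; rows map to lanes aa.h[0..7], columns 0-3 and
 - // 4-7 to the low/high halves of b0, and columns 8-11 to b1.
 - static inline void s16_12x8_k_step(int32_t acc[8][12],
 -                                    const int16_t a_step[8],
 -                                    const int16_t b_step[12]) {
 -     for (int row = 0; row < 8; row++)
 -         for (int col = 0; col < 12; col++)
 -             acc[row][col] += (int32_t)a_step[row] * (int32_t)b_step[col];
 - }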
- - "1:\n" // Main loop - // First unroll - "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper - "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - // Second unroll - "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper - "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper - "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "add %x[a_ptr], %x[a_ptr], #0x20\n" - "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "subs %x[k], %x[k], #0x1\n" - "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper - "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "add %x[b_ptr], %x[b_ptr], #0x30\n" - "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "bne 1b\n" - - "2:\n" // Even tail - "cbnz %x[odd_k], 3f\n" - - "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "smlal v10.4s, 
%[b0].4h, %[aa].h[5]\n" - "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "add %[a_ptr], %[a_ptr], #0x10\n" - "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "add %[b_ptr], %[b_ptr], #0x18\n" - "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" - "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "str q15, [%x[c_ptr], #0x70]\n" - "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "str q19, [%x[c_ptr], #0x130]\n" - "b 4f\n" // Complete write out - - "3:\n" // Odd tail - "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" - 
"str q15, [%x[c_ptr], #0x70]\n" - "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - - "4:\n" // End of function - "str q19, [%x[c_ptr], #0x130]\n" - "str q27, [%x[c_ptr], #0x140]\n" - "str q12, [%x[c_ptr], #0x150]\n" - "str q20, [%x[c_ptr], #0x160]\n" - "str q28, [%x[c_ptr], #0x170]\n" - "add %x[c_ptr], %x[c_ptr], #0x180\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), - [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2) - : [odd_k] "r" (odd_k) - : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp deleted file mode 100644 index 88cbb361b3..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __aarch64__ - -// Load the actual kernel -#include "a64_gemm_s8_12x8/generic.hpp" - -class gemm_s8_12x8 { -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 8; - static const int A_block = 4; - static const bool A_transpose = false; - - /* Same for B input */ - static const int B_interleave = 12; - static const int B_block = 4; - static const bool B_transpose = true; - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int out_height = 8; - static const int k_unroll = 4; - - kern_type kernel = nullptr; - - gemm_s8_12x8(const CPUInfo *ci) { - kernel = a64_gemm_s8_12x8; - } -}; - -#endif // __aarch64__ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp deleted file mode 100644 index 4ac2ba4234..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> -#include "dot_toolchain_support.h" -#include <cassert> - -void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - assert(Apanel); - assert(Bpanel); - assert(Cpanel); - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - // We divide K by 4 because the sdot instruction processes 4 elements at a time. - const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make. - // sure we round up the iteration count. - const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb<ablocks; yb++) { - const int8_t *a_ptr0 = a_ptr; - const int8_t *b_ptr = Bpanel; - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - int k = init_value_k; - register int32x4_t a0 asm("v0"); - register int32x4_t a1 asm("v1"); - register int32x4_t b0 asm("v2"); - register int32x4_t b1 asm("v3"); - register int32x4_t b2 asm("v4"); - register int32x4_t a0a asm("v5"); - register int32x4_t a1a asm("v6"); - __asm __volatile ( - _DECLARE_SDOT - // Initialize result registers, load initial operands, prime prefetches. 
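 - // Scalar semantics of "sdot vd.4s, vn.16b, vm.4b[idx]" (the
 - // _DECLARE_SDOT macro above hand-encodes the instruction for
 - // assemblers without dot-product support). A sketch, not deleted code:
 - // lane l of the int32 accumulator gains the dot product of bytes
 - // 4l..4l+3 of vn with the idx-th 4-byte group of vm. Consuming K in
 - // groups of four int8 values is why the strategy class above sets
 - // A_block, B_block and k_unroll to 4.
 - static inline void sdot_lane(int32_t vd[4], const int8_t vn[16],
 -                              const int8_t vm[16], int idx) {
 -     for (int l = 0; l < 4; l++)
 -         for (int b = 0; b < 4; b++)
 -             vd[l] += (int32_t)vn[4 * l + b] * (int32_t)vm[4 * idx + b];
 - }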
- "movi v8.4s, #0x0\n" - "ldp %q[a0], %q[a1], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldp %q[b0], %q[b1], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. - "cbz %w[k], 4f\n" - - - // Loop proper - "1:\n" - "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %d[a0a], [%[a_ptr], #32]\n" - - "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "ins %[b2].d[1], x20\n" - "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %d[a1a], [%[a_ptr], #48]\n" - - - "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "ins %[a0a].d[1], x20\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "ins %[a1a].d[1], x20\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "ins %[b0].d[1], x20\n" - "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" - - "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" - - "ldr %d[b2], [%[b_ptr], #80]\n" - - "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - "ins %[b1].d[1], x20\n" - "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "ldr %d[a0], [%[a_ptr], #64]\n" - - "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "ins %[b2].d[1], x20\n" - "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "ldr x20, [%[a_ptr], #72]\n" - "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "ldr %d[a1], [%[a_ptr], #80]\n" - - "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "ins %[a0].d[1], x20\n" - ASM_PREFETCH("[%[b_ptr], #512]") - "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - "ldr x20, [%[a_ptr], #88]\n" - "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "ldr %d[b0], [%[b_ptr], #96]\n" - - "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "ins 
%[a1].d[1], x20\n" - "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "ldr x20, [%[b_ptr], #104]\n" - "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "ldr %d[b1], [%[b_ptr], #112]\n" - - "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "ins %[b0].d[1], x20\n" - "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "ldr x20, [%[b_ptr], #120]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - - "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "subs %w[k], %w[k], #1\n" - "ins %[b1].d[1], x20\n" - "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "bne 1b\n" - - // Target to use when K is 1 or 2 (i.e. zero iterations of main loop) - "4:\n" - - // Branch to alternative tail for odd K - "cbnz %w[oddk], 2f\n" - - // Detached final iteration (even K) - "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %d[a0a], [%[a_ptr], #32]\n" - - "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %d[a1a], [%[a_ptr], #48]\n" - - - "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "ins %[a0a].d[1], x20\n" - "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "ins %[a1a].d[1], x20\n" - "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" - - "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "ins %[b0].d[1], x20\n" - "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "ldr %d[b2], [%[b_ptr], #80]\n" - - "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - "ins %[b1].d[1], x20\n" - "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "ins %[b2].d[1], x20\n" - - "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "str q8, [%[c_ptr], #0]\n" - "str q16, [%[c_ptr], #16]\n" - "str q24, [%[c_ptr], #32]\n" - "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - - "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - "str q17, [%[c_ptr], #64]\n" - "str q25, [%[c_ptr], #80]\n" - "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - "str q18, [%[c_ptr], #112]\n" - "str q26, [%[c_ptr], #128]\n" - "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - "str q19, [%[c_ptr], #160]\n" - "str q27, [%[c_ptr], #176]\n" - "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "sdot v20.4s, 
%[b1].16b, %[a1a].4b[0]\n" - "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - "str q20, [%[c_ptr], #208]\n" - "str q28, [%[c_ptr], #224]\n" - "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - "str q21, [%[c_ptr], #256]\n" - "str q29, [%[c_ptr], #272]\n" - "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - "str q22, [%[c_ptr], #304]\n" - "str q30, [%[c_ptr], #320]\n" - "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - "b 3f\n" - - // Detached final iteration (odd K) - "2:\n" - "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "ldr x20, [%[b_ptr], #40]\n" - - "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "str q8, [%[c_ptr], #0]\n" - "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "str q16, [%[c_ptr], #16]\n" - "ins %[b2].d[1], x20\n" - "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "add %[a_ptr], %[a_ptr], #32\n" - "str q24, [%[c_ptr], #32]\n" - "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - - "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "str q17, [%[c_ptr], #64]\n" - "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "str q25, [%[c_ptr], #80]\n" - "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "str q18, [%[c_ptr], #112]\n" - "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "str q26, [%[c_ptr], #128]\n" - "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "str q19, [%[c_ptr], #160]\n" - "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "str q27, [%[c_ptr], #176]\n" - "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "str q20, [%[c_ptr], #208]\n" - "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "str q28, [%[c_ptr], #224]\n" - "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "str q21, [%[c_ptr], #256]\n" - "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "str q29, [%[c_ptr], #272]\n" - "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "str q22, [%[c_ptr], #304]\n" - "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "str q30, [%[c_ptr], #320]\n" - "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - - // Common tail - "3:\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - - - - ".purgem sdot\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - - - } - } -} - -#endif - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h deleted file mode 100644 index 
1d6fd1623e..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// Define a macro to assemble the SDOT instruction (in the absence of toolchain support) -#define _DECLARE_SDOT ".altmacro\n"\ - ".macro sdot opd:req, opn:req, opm:req\n"\ - "local vd, vn, vm, h, l\n"\ - ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\ - ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\ - ".set vd,\\reg\n"\ - ".endif\n"\ - ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\ - ".set vn,\\reg\n"\ - ".endif\n"\ - ".irp idx,0,1,2,3\n"\ - ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\ - ".set vm,\\reg\n"\ - ".set h,\\idx / 2\n"\ - ".set l,\\idx %% 2\n"\ - ".endif\n"\ - ".endr\n"\ - ".endr\n"\ - ".ifndef vd\n"\ - ".error \"Bad operand \\opd\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef vn\n"\ - ".error \"Bad operand \\opn\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef vm\n"\ - ".error \"Bad operand \\opm\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef h\n"\ - ".error \"Bad operand \\opm\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef l\n"\ - ".error \"Bad operand \\opm\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".int 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\ - ".endm\n"\ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp deleted file mode 100644 index bfad0373b2..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp +++ /dev/null @@ -1,363 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> -#include "dot_toolchain_support.h" -#include <cassert> - - -inline void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - assert(Apanel); - assert(Bpanel); - assert(Cpanel); - K/=4; - const long int row_jump=0; - const long int block_jump=0; - const int32_t *a_ptr = reinterpret_cast<const int32_t*>(Apanel); - int32_t *c_ptr = reinterpret_cast<int32_t*>(Cpanel); - for (int yb=0; yb<ablocks; yb++) { - const int32_t *a_ptr0 = a_ptr; - const int32_t *b_ptr = reinterpret_cast<const int32_t*>(Bpanel); - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k = ((K+1)/2) - 1; - register int32x4_t a0 asm("v0"); - register int32x4_t a1 asm("v1"); - register int32x4_t b0 asm("v2"); - register int32x4_t b1 asm("v3"); - register int32x4_t b2 asm("v4"); - register int32x4_t a0a asm("v5"); - register int32x4_t a1a asm("v6"); - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.4s, #0x0\n" - "ldr %q[a0], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "ldr %q[a1], [%[a_ptr], #16]\n" - "movi v11.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. 
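// A hedged C++ rendering of the bit layout that the _DECLARE_SDOT macro
// included above emits through ".int" (my reading of the macro, not code
// from this library): base 0x4f80e000 encodes "sdot Vd.4s, Vn.16b,
// Vm.4b[idx]", with Rd at bit 0, Rn at bit 5, Rm at bit 16 and the element
// index split into L (bit 21) and H (bit 11), where idx = 2*H + L.
#include <cstdint>
static inline uint32_t encode_sdot_by_element(uint32_t vd, uint32_t vn,
                                              uint32_t vm, uint32_t idx) {
    const uint32_t h = idx / 2, l = idx % 2;
    return 0x4f80e000u | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11);
}
// Scalar model of what the emitted instruction computes: each 32-bit lane
// of Vd accumulates a 4-way dot product of signed bytes.
static inline void sdot_model(int32_t d[4], const int8_t n[16],
                              const int8_t m[16], uint32_t idx) {
    for (int lane = 0; lane < 4; ++lane)
        for (int b = 0; b < 4; ++b)
            d[lane] += int32_t(n[4 * lane + b]) * int32_t(m[4 * idx + b]);
}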
- "cbz %w[k], 4f\n" - - _DECLARE_SDOT - - // Loop proper - "1:\n" - "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" - - "ldr %q[b2], [%[b_ptr], #32]\n" - "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %q[a0a], [%[a_ptr], #32]\n" - "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr %q[a1a], [%[a_ptr], #48]\n" - "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %q[b0], [%[b_ptr], #48]\n" - - "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "ldr %q[b1], [%[b_ptr], #64]\n" - - "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "ldr %q[b2], [%[b_ptr], #80]\n" - - "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "ldr %q[a0], [%[a_ptr], #64]\n" - "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "ldr %q[a1], [%[a_ptr], #80]\n" - "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "ldr %q[b0], [%[b_ptr], #96]\n" - - "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - ASM_PREFETCH("[%[b_ptr], #512]") - "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "ldr %q[b1], [%[b_ptr], #112]\n" - - "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "subs %w[k], %w[k], #1\n" - "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "bne 1b\n" - - // Target to use when K is 1 or 2 (i.e. 
zero iterations of main loop) - "4:\n" - - // Branch to alternative tail for odd K - "cbnz %w[oddk], 2f\n" - - // Detached final iteration (even K) - "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %q[a0a], [%[a_ptr], #32]\n" - "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr %q[a1a], [%[a_ptr], #48]\n" - "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %q[b0], [%[b_ptr], #48]\n" - - "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "ldr %q[b1], [%[b_ptr], #64]\n" - - "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "ldr %q[b2], [%[b_ptr], #80]\n" - - "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - - "add %[b_ptr], %[b_ptr], %[block_jump]\n" - "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "str q8, [%[c_ptr], #0]\n" - "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - "str q16, [%[c_ptr], #16]\n" - "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "str q24, [%[c_ptr], #32]\n" - - "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "str q17, [%[c_ptr], #64]\n" - "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "str q25, [%[c_ptr], #80]\n" - "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "str q18, [%[c_ptr], #112]\n" - "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "str q26, [%[c_ptr], #128]\n" - "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "str q19, [%[c_ptr], #160]\n" - "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "str q27, [%[c_ptr], #176]\n" - "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "str q20, [%[c_ptr], #208]\n" - "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "str q28, [%[c_ptr], #224]\n" - "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "str q21, [%[c_ptr], #256]\n" - "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "str q29, [%[c_ptr], #272]\n" - "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "str q22, [%[c_ptr], #304]\n" - "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "str q30, [%[c_ptr], #320]\n" - "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - "b 3f\n" - - // Detached final iteration (odd K) - "2:\n" - "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "sdot v9.4s , %[b0].16b, 
%[a0].4b[1]\n" - "str q8, [%[c_ptr], #0]\n" - "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "str q16, [%[c_ptr], #16]\n" - "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "add %[a_ptr], %[a_ptr], #32\n" - "str q24, [%[c_ptr], #32]\n" - "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - - "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "str q17, [%[c_ptr], #64]\n" - "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "str q25, [%[c_ptr], #80]\n" - "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "str q18, [%[c_ptr], #112]\n" - "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "str q26, [%[c_ptr], #128]\n" - "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "str q19, [%[c_ptr], #160]\n" - "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "str q27, [%[c_ptr], #176]\n" - "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "str q20, [%[c_ptr], #208]\n" - "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "str q28, [%[c_ptr], #224]\n" - "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "str q21, [%[c_ptr], #256]\n" - "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "str q29, [%[c_ptr], #272]\n" - "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "str q22, [%[c_ptr], #304]\n" - "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "str q30, [%[c_ptr], #320]\n" - "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - - // Common tail - "3:\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - - ".purgem sdot\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" - ); - } - } - - -} - - -#endif diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp deleted file mode 100644 index 1588f049f4..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -// Load the actual kernel -#include "a64_gemm_s8_4x4/generic.hpp" - -class gemm_s8_4x4 { -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 4; - static const int A_block = 16; - static const bool A_transpose = false; - - /* Same for B input */ - static const int B_interleave = 4; - static const int B_block = 16; - static const bool B_transpose = true; - - /* Kernel blocking parameters */ - static const int out_width = 4; - static const int out_height = 4; - static const int k_unroll = 16; - - kern_type kernel = nullptr; - - gemm_s8_4x4(const CPUInfo *ci) { - kernel = a64_gemm_s8_4x4; - } -}; - -#endif // __aarch64__ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp deleted file mode 100644 index 0ec435b33b..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp +++ /dev/null @@ -1,465 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> - -inline void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - K /= 16; - int oddk = (K & 1); - - for (int yb=0; yb<ablocks; yb++) { - const int8_t *a_ptr0 = a_ptr; - const int8_t *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - - int k = ((K+1)/2)-1; - - register int8x16_t b0 asm("v4"); - register int8x16_t b1 asm("v5"); - register int8x16_t b2 asm("v6"); - register int8x16_t b3 asm("v7"); - register int8x16_t b0a asm("v8"); - register int8x16_t b1a asm("v9"); - register int8x16_t b2a asm("v10"); - register int8x16_t b3a asm("v11"); - - __asm __volatile ( - "movi v16.4s, #0x0\n" - "ldr q0, [%[a_ptr]]\n" - "movi v17.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v18.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v19.4s, #0x0\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "movi v20.4s, #0x0\n" - "ldr %q[b3], [%[b_ptr], #48]\n" - "movi v21.4s, #0x0\n" - "ldr q1, [%[a_ptr], #16]\n" - "movi v22.4s, #0x0\n" - "ldr q2, [%[a_ptr], #32]\n" - "movi v23.4s, #0x0\n" - "ldr q3, [%[a_ptr], #48]\n" - "movi v24.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v25.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v26.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v27.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v28.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v29.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v30.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v31.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - - // Loop structure optimized for A57 (after r0). - - // Unavoidably, the multiply will "dribble" if - // dual issued with an add. - - // Minimize the effect of this by making sure - // there are 2 adds to run under the dribbled - // multiply. - - // Pipeline in blocks of 8 multiplies - combine - // this iteration's multiplies with adds from - // the previous iteration. - - // So the first block doesn't have any adds to - // do - but because all the adds are at the - // start of the block it's only the first couple - // of multiplies that need to be pulled out. - - // Start of unroll 0 (first iteration) - "smull v12.8h, v0.8b, %[b0].8b\n" - "smull v13.8h, v0.8b, %[b1].8b\n" - - // Skip loop if we are doing zero iterations of it. 
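// A scalar model, under my reading of this kernel (not the shipped code), of
// how one 32-bit output element is built below without a dot-product
// instruction: smull/smlal2 combine byte products pairwise in 16-bit lanes
// (modular 16-bit arithmetic, as in the NEON registers), sadalp then widens
// adjacent pairs into int32 accumulators, and the trailing addp instructions
// reduce the four lanes to a single result.
#include <cstdint>
static inline void accumulate_block16(int32_t &acc, const int8_t a[16],
                                      const int8_t b[16]) {
    for (int i = 0; i < 4; ++i) {
        // smull (low half) + smlal2 (high half) -> one 16-bit lane each
        int16_t p0 = int16_t(a[2 * i]     * b[2 * i]     + a[2 * i + 8] * b[2 * i + 8]);
        int16_t p1 = int16_t(a[2 * i + 1] * b[2 * i + 1] + a[2 * i + 9] * b[2 * i + 9]);
        acc += p0 + p1;   // sadalp: pairwise add-accumulate .8h into .4s
    }
}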
- "cbz %w[k], 4f\n" - - // Unroll 0 continuation (branch target) - "1:\n" - "smull v14.8h, v0.8b, %[b2].8b\n" - "subs %w[k], %w[k], #1\n" - "smull v15.8h, v0.8b, %[b3].8b\n" - "ldr %q[b0a], [%[b_ptr], #64]\n" - "smlal2 v12.8h, v0.16b, %[b0].16b\n" - "smlal2 v13.8h, v0.16b, %[b1].16b\n" - "ldr %q[b1a], [%[b_ptr], #80]\n" - "smlal2 v14.8h, v0.16b, %[b2].16b\n" - "smlal2 v15.8h, v0.16b, %[b3].16b\n" - "ldr q0, [%[a_ptr], #64]\n" - - "sadalp v16.4s, v12.8h\n" - "smull v12.8h, v1.8b, %[b0].8b\n" - "sadalp v17.4s, v13.8h\n" - "sadalp v18.4s, v14.8h\n" - "smull v13.8h, v1.8b, %[b1].8b\n" - "sadalp v19.4s, v15.8h\n" - "smull v14.8h, v1.8b, %[b2].8b\n" - "ldr %q[b2a], [%[b_ptr], #96]\n" - "smull v15.8h, v1.8b, %[b3].8b\n" - "smlal2 v12.8h, v1.16b, %[b0].16b\n" - "ldr %q[b3a], [%[b_ptr], #112]\n" - "smlal2 v13.8h, v1.16b, %[b1].16b\n" - "add %[b_ptr], %[b_ptr], #128\n" - "smlal2 v14.8h, v1.16b, %[b2].16b\n" - "smlal2 v15.8h, v1.16b, %[b3].16b\n" - "ldr q1, [%[a_ptr], #80]\n" - - "sadalp v20.4s, v12.8h\n" - "smull v12.8h, v2.8b, %[b0].8b\n" - "sadalp v21.4s, v13.8h\n" - "sadalp v22.4s, v14.8h\n" - "smull v13.8h, v2.8b, %[b1].8b\n" - "sadalp v23.4s, v15.8h\n" - "smull v14.8h, v2.8b, %[b2].8b\n" - "smull v15.8h, v2.8b, %[b3].8b\n" - "smlal2 v12.8h, v2.16b, %[b0].16b\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "smlal2 v13.8h, v2.16b, %[b1].16b\n" - "smlal2 v14.8h, v2.16b, %[b2].16b\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "smlal2 v15.8h, v2.16b, %[b3].16b\n" - "ldr q2, [%[a_ptr], #96]\n" - - "sadalp v24.4s, v12.8h\n" - "smull v12.8h, v3.8b, %[b0].8b\n" - "sadalp v25.4s, v13.8h\n" - "sadalp v26.4s, v14.8h\n" - "smull v13.8h, v3.8b, %[b1].8b\n" - "sadalp v27.4s, v15.8h\n" - "smull v14.8h, v3.8b, %[b2].8b\n" - "smull v15.8h, v3.8b, %[b3].8b\n" - "smlal2 v12.8h, v3.16b, %[b0].16b\n" - "ldr %q[b0], [%[b_ptr], #0]\n" - "smlal2 v13.8h, v3.16b, %[b1].16b\n" - "smlal2 v14.8h, v3.16b, %[b2].16b\n" - "smlal2 v15.8h, v3.16b, %[b3].16b\n" - "ldr q3, [%[a_ptr], #112]\n" - - // Unroll 1 - "sadalp v28.4s, v12.8h\n" - "smull v12.8h, v0.8b, %[b0a].8b\n" - "sadalp v29.4s, v13.8h\n" - "sadalp v30.4s, v14.8h\n" - "smull v13.8h, v0.8b, %[b1a].8b\n" - "sadalp v31.4s, v15.8h\n" - "smull v14.8h, v0.8b, %[b2a].8b\n" - "smull v15.8h, v0.8b, %[b3a].8b\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "smlal2 v12.8h, v0.16b, %[b0a].16b\n" - "smlal2 v13.8h, v0.16b, %[b1a].16b\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "smlal2 v14.8h, v0.16b, %[b2a].16b\n" - "smlal2 v15.8h, v0.16b, %[b3a].16b\n" - "ldr q0, [%[a_ptr], #128]\n" - - "sadalp v16.4s, v12.8h\n" - "smull v12.8h, v1.8b, %[b0a].8b\n" - "sadalp v17.4s, v13.8h\n" - "sadalp v18.4s, v14.8h\n" - "smull v13.8h, v1.8b, %[b1a].8b\n" - "sadalp v19.4s, v15.8h\n" - "add %[a_ptr], %[a_ptr], #128\n" - "smull v14.8h, v1.8b, %[b2a].8b\n" - "smull v15.8h, v1.8b, %[b3a].8b\n" - "ldr %q[b3], [%[b_ptr], #48]\n" - "smlal2 v12.8h, v1.16b, %[b0a].16b\n" - "smlal2 v13.8h, v1.16b, %[b1a].16b\n" - "smlal2 v14.8h, v1.16b, %[b2a].16b\n" - "smlal2 v15.8h, v1.16b, %[b3a].16b\n" - "ldr q1, [%[a_ptr], #16]\n" - - "sadalp v20.4s, v12.8h\n" - "smull v12.8h, v2.8b, %[b0a].8b\n" - "sadalp v21.4s, v13.8h\n" - "sadalp v22.4s, v14.8h\n" - "smull v13.8h, v2.8b, %[b1a].8b\n" - "sadalp v23.4s, v15.8h\n" - "smull v14.8h, v2.8b, %[b2a].8b\n" - "smull v15.8h, v2.8b, %[b3a].8b\n" - "smlal2 v12.8h, v2.16b, %[b0a].16b\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "smlal2 v13.8h, v2.16b, %[b1a].16b\n" - "smlal2 v14.8h, v2.16b, %[b2a].16b\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "smlal2 v15.8h, v2.16b, %[b3a].16b\n" - "ldr q2, [%[a_ptr], #32]\n" - - 
"sadalp v24.4s, v12.8h\n" - "smull v12.8h, v3.8b, %[b0a].8b\n" - "sadalp v25.4s, v13.8h\n" - "sadalp v26.4s, v14.8h\n" - "smull v13.8h, v3.8b, %[b1a].8b\n" - "sadalp v27.4s, v15.8h\n" - "smull v14.8h, v3.8b, %[b2a].8b\n" - "smull v15.8h, v3.8b, %[b3a].8b\n" - "smlal2 v12.8h, v3.16b, %[b0a].16b\n" - "smlal2 v13.8h, v3.16b, %[b1a].16b\n" - "smlal2 v14.8h, v3.16b, %[b2a].16b\n" - "smlal2 v15.8h, v3.16b, %[b3a].16b\n" - "ldr q3, [%[a_ptr], #48]\n" - - // Start of unroll 0 for next iteration. - "sadalp v28.4s, v12.8h\n" - "smull v12.8h, v0.8b, %[b0].8b\n" - "sadalp v29.4s, v13.8h\n" - "sadalp v30.4s, v14.8h\n" - "smull v13.8h, v0.8b, %[b1].8b\n" - "sadalp v31.4s, v15.8h\n" - "bne 1b\n" - - // Target to use when K=1 or 2 (i.e. zero iterations of main loop) - "4:\n" - - // Branch to alternative tail for odd K - "cbnz %w[oddk], 2f\n" - - // Detached final iteration (even K) - "smull v14.8h, v0.8b, %[b2].8b\n" - "smull v15.8h, v0.8b, %[b3].8b\n" - "ldr %q[b0a], [%[b_ptr], #64]\n" - "smlal2 v12.8h, v0.16b, %[b0].16b\n" - "smlal2 v13.8h, v0.16b, %[b1].16b\n" - "ldr %q[b1a], [%[b_ptr], #80]\n" - "smlal2 v14.8h, v0.16b, %[b2].16b\n" - "smlal2 v15.8h, v0.16b, %[b3].16b\n" - "ldr q0, [%[a_ptr], #64]\n" - - "sadalp v16.4s, v12.8h\n" - "smull v12.8h, v1.8b, %[b0].8b\n" - "sadalp v17.4s, v13.8h\n" - "sadalp v18.4s, v14.8h\n" - "smull v13.8h, v1.8b, %[b1].8b\n" - "sadalp v19.4s, v15.8h\n" - "smull v14.8h, v1.8b, %[b2].8b\n" - "ldr %q[b2a], [%[b_ptr], #96]\n" - "smull v15.8h, v1.8b, %[b3].8b\n" - "smlal2 v12.8h, v1.16b, %[b0].16b\n" - "ldr %q[b3a], [%[b_ptr], #112]\n" - "smlal2 v13.8h, v1.16b, %[b1].16b\n" - "add %[b_ptr], %[b_ptr], #128\n" - "smlal2 v14.8h, v1.16b, %[b2].16b\n" - "smlal2 v15.8h, v1.16b, %[b3].16b\n" - "ldr q1, [%[a_ptr], #80]\n" - - "sadalp v20.4s, v12.8h\n" - "smull v12.8h, v2.8b, %[b0].8b\n" - "sadalp v21.4s, v13.8h\n" - "sadalp v22.4s, v14.8h\n" - "smull v13.8h, v2.8b, %[b1].8b\n" - "sadalp v23.4s, v15.8h\n" - "smull v14.8h, v2.8b, %[b2].8b\n" - "smull v15.8h, v2.8b, %[b3].8b\n" - "smlal2 v12.8h, v2.16b, %[b0].16b\n" - "smlal2 v13.8h, v2.16b, %[b1].16b\n" - "smlal2 v14.8h, v2.16b, %[b2].16b\n" - "smlal2 v15.8h, v2.16b, %[b3].16b\n" - "ldr q2, [%[a_ptr], #96]\n" - - "sadalp v24.4s, v12.8h\n" - "smull v12.8h, v3.8b, %[b0].8b\n" - "sadalp v25.4s, v13.8h\n" - "sadalp v26.4s, v14.8h\n" - "smull v13.8h, v3.8b, %[b1].8b\n" - "sadalp v27.4s, v15.8h\n" - "smull v14.8h, v3.8b, %[b2].8b\n" - "smull v15.8h, v3.8b, %[b3].8b\n" - "smlal2 v12.8h, v3.16b, %[b0].16b\n" - "smlal2 v13.8h, v3.16b, %[b1].16b\n" - "smlal2 v14.8h, v3.16b, %[b2].16b\n" - "smlal2 v15.8h, v3.16b, %[b3].16b\n" - "ldr q3, [%[a_ptr], #112]\n" - - // Unroll 1 - "sadalp v28.4s, v12.8h\n" - "smull v12.8h, v0.8b, %[b0a].8b\n" - "sadalp v29.4s, v13.8h\n" - "sadalp v30.4s, v14.8h\n" - "smull v13.8h, v0.8b, %[b1a].8b\n" - "sadalp v31.4s, v15.8h\n" - "smull v14.8h, v0.8b, %[b2a].8b\n" - "add %[a_ptr], %[a_ptr], #128\n" - "smull v15.8h, v0.8b, %[b3a].8b\n" - "smlal2 v12.8h, v0.16b, %[b0a].16b\n" - "smlal2 v13.8h, v0.16b, %[b1a].16b\n" - "smlal2 v14.8h, v0.16b, %[b2a].16b\n" - "smlal2 v15.8h, v0.16b, %[b3a].16b\n" - - "sadalp v16.4s, v12.8h\n" - "smull v12.8h, v1.8b, %[b0a].8b\n" - "sadalp v17.4s, v13.8h\n" - "sadalp v18.4s, v14.8h\n" - "smull v13.8h, v1.8b, %[b1a].8b\n" - "sadalp v19.4s, v15.8h\n" - "smull v14.8h, v1.8b, %[b2a].8b\n" - "smull v15.8h, v1.8b, %[b3a].8b\n" - "smlal2 v12.8h, v1.16b, %[b0a].16b\n" - "addp v16.4s, v16.4s, v17.4s\n" - "smlal2 v13.8h, v1.16b, %[b1a].16b\n" - "addp v17.4s, v18.4s, v19.4s\n" - "smlal2 v14.8h, v1.16b, 
%[b2a].16b\n" - "smlal2 v15.8h, v1.16b, %[b3a].16b\n" - - "sadalp v20.4s, v12.8h\n" - "smull v12.8h, v2.8b, %[b0a].8b\n" - "sadalp v21.4s, v13.8h\n" - "sadalp v22.4s, v14.8h\n" - "smull v13.8h, v2.8b, %[b1a].8b\n" - "sadalp v23.4s, v15.8h\n" - "addp v16.4s, v16.4s, v17.4s\n" - "smull v14.8h, v2.8b, %[b2a].8b\n" - "addp v18.4s, v20.4s, v21.4s\n" - "addp v19.4s, v22.4s, v23.4s\n" - "smull v15.8h, v2.8b, %[b3a].8b\n" - "smlal2 v12.8h, v2.16b, %[b0a].16b\n" - "str q16, [%[c_ptr]]\n" - "smlal2 v13.8h, v2.16b, %[b1a].16b\n" - "smlal2 v14.8h, v2.16b, %[b2a].16b\n" - "smlal2 v15.8h, v2.16b, %[b3a].16b\n" - - "sadalp v24.4s, v12.8h\n" - "smull v12.8h, v3.8b, %[b0a].8b\n" - "sadalp v25.4s, v13.8h\n" - "sadalp v26.4s, v14.8h\n" - "smull v13.8h, v3.8b, %[b1a].8b\n" - "sadalp v27.4s, v15.8h\n" - "addp v17.4s, v18.4s, v19.4s\n" - "smull v14.8h, v3.8b, %[b2a].8b\n" - "addp v20.4s, v24.4s, v25.4s\n" - "addp v21.4s, v26.4s, v27.4s\n" - "smull v15.8h, v3.8b, %[b3a].8b\n" - "smlal2 v12.8h, v3.16b, %[b0a].16b\n" - "str q17, [%[c_ptr], #16]\n" - "smlal2 v13.8h, v3.16b, %[b1a].16b\n" - "smlal2 v14.8h, v3.16b, %[b2a].16b\n" - "addp v18.4s, v20.4s, v21.4s\n" - "smlal2 v15.8h, v3.16b, %[b3a].16b\n" - "b 3f\n" - - // Detached final iteration (odd K) - "2:\n" - "smull v14.8h, v0.8b, %[b2].8b\n" - "add %[a_ptr], %[a_ptr], #64\n" - "smull v15.8h, v0.8b, %[b3].8b\n" - "add %[b_ptr], %[b_ptr], #64\n" - "smlal2 v12.8h, v0.16b, %[b0].16b\n" - "smlal2 v13.8h, v0.16b, %[b1].16b\n" - "smlal2 v14.8h, v0.16b, %[b2].16b\n" - "smlal2 v15.8h, v0.16b, %[b3].16b\n" - - "sadalp v16.4s, v12.8h\n" - "smull v12.8h, v1.8b, %[b0].8b\n" - "sadalp v17.4s, v13.8h\n" - "sadalp v18.4s, v14.8h\n" - "smull v13.8h, v1.8b, %[b1].8b\n" - "sadalp v19.4s, v15.8h\n" - "smull v14.8h, v1.8b, %[b2].8b\n" - "smull v15.8h, v1.8b, %[b3].8b\n" - "smlal2 v12.8h, v1.16b, %[b0].16b\n" - "addp v16.4s, v16.4s, v17.4s\n" - "smlal2 v13.8h, v1.16b, %[b1].16b\n" - "addp v17.4s, v18.4s, v19.4s\n" - "smlal2 v14.8h, v1.16b, %[b2].16b\n" - "smlal2 v15.8h, v1.16b, %[b3].16b\n" - - "sadalp v20.4s, v12.8h\n" - "smull v12.8h, v2.8b, %[b0].8b\n" - "sadalp v21.4s, v13.8h\n" - "sadalp v22.4s, v14.8h\n" - "smull v13.8h, v2.8b, %[b1].8b\n" - "sadalp v23.4s, v15.8h\n" - "addp v16.4s, v16.4s, v17.4s\n" - "smull v14.8h, v2.8b, %[b2].8b\n" - "addp v18.4s, v20.4s, v21.4s\n" - "addp v19.4s, v22.4s, v23.4s\n" - "smull v15.8h, v2.8b, %[b3].8b\n" - "smlal2 v12.8h, v2.16b, %[b0].16b\n" - "str q16, [%[c_ptr]]\n" - "smlal2 v13.8h, v2.16b, %[b1].16b\n" - "smlal2 v14.8h, v2.16b, %[b2].16b\n" - "smlal2 v15.8h, v2.16b, %[b3].16b\n" - - "sadalp v24.4s, v12.8h\n" - "smull v12.8h, v3.8b, %[b0].8b\n" - "sadalp v25.4s, v13.8h\n" - "sadalp v26.4s, v14.8h\n" - "smull v13.8h, v3.8b, %[b1].8b\n" - "sadalp v27.4s, v15.8h\n" - "addp v17.4s, v18.4s, v19.4s\n" - "smull v14.8h, v3.8b, %[b2].8b\n" - "addp v20.4s, v24.4s, v25.4s\n" - "addp v21.4s, v26.4s, v27.4s\n" - "smull v15.8h, v3.8b, %[b3].8b\n" - "smlal2 v12.8h, v3.16b, %[b0].16b\n" - "str q17, [%[c_ptr], #16]\n" - "smlal2 v13.8h, v3.16b, %[b1].16b\n" - "smlal2 v14.8h, v3.16b, %[b2].16b\n" - "addp v18.4s, v20.4s, v21.4s\n" - "smlal2 v15.8h, v3.16b, %[b3].16b\n" - - "3:\n" - - // Final additions - "sadalp v28.4s, v12.8h\n" - "str q18, [%[c_ptr], #32]\n" - "sadalp v29.4s, v13.8h\n" - "sadalp v30.4s, v14.8h\n" - "sadalp v31.4s, v15.8h\n" - - // Horizontal reduction, phase 1 - "addp v22.4s, v28.4s, v29.4s\n" - "addp v23.4s, v30.4s, v31.4s\n" - - // Horizontal reduction, phase 2 - "addp v19.4s, v22.4s, v23.4s\n" - "str q19, [%[c_ptr], #48]\n" - "add %[c_ptr], 
%[c_ptr], #64\n" - - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3), - [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a), [b3a] "+w" (b3a), - [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19", - "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc"); - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp deleted file mode 100644 index 7eb8b2dacf..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -// Actual kernel implementations -#include "a64_gemm_u16_12x8/generic.hpp" - -// 12x8 GEMM "strategy" class. -// -// This describes the characteristics of a family of kernels, in terms of -// the required interleave properties and the output block size. -// -// All kernels in the family must share these characteristics. The actual -// kernel to be used can be chosen at runtime, based on the CPU_type -// structure. -class gemm_u16_12x8 { -public: - typedef uint16_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 8; - static const int A_block = 1; - static const int A_transpose = 0; - - /* Same for B input */ - static const int B_interleave = 12; - static const int B_block = 1; - static const int B_transpose = 1; - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int out_height = 8; - static const int k_unroll = 1; - - kern_type kernel = nullptr; - - gemm_u16_12x8(const CPUInfo *ci) { - kernel = a64_gemm_u16_asimd_12x8; - } -}; - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp deleted file mode 100644 index b3f310ce62..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once -#include <arm_neon.h> - -inline void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) -{ - const uint16_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - - for (int yb = 0; yb < ablocks; yb++) - { - const uint16_t *a_ptr0 = a_ptr; - const uint16_t *b_ptr = Bpanel; - - for (int xb = 0; xb < bblocks; xb++) - { - a_ptr = a_ptr0; - const bool odd_k = K & 0x1; - int k = (K+1)/2 - 1; - - register uint16x8_t aa asm("v0"); - register uint16x8_t ab asm("v1"); - register uint16x8_t b0 asm("v2"); - register uint16x8_t b1 asm("v3"); - register uint16x8_t b2 asm("v4"); - - __asm __volatile ( - "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower - "movi v5.4s, #0\n" - "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper - "movi v6.4s, #0\n" - "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper - "movi v7.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v8.4s, #0\n" - "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper - "movi v9.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v10.4s, #0\n" - "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper - "movi v11.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #96]") - "movi v12.4s, #0\n" - "movi v13.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #96]") - "movi v14.4s, #0\n" - "movi v15.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0\n" - "movi v17.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v18.4s, #0\n" - "movi v19.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #160]") - "movi v20.4s, #0\n" - "movi v21.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #160]") - "movi v22.4s, #0\n" - "movi v23.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v24.4s, #0\n" - "add %x[a_ptr], %x[a_ptr], #0x10\n" - "movi v25.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v26.4s, #0\n" - "add %x[b_ptr], %x[b_ptr], #0x18\n" - "movi v27.4s, #0\n" - "movi v28.4s, #0\n" - - "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. 
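// Scalar model (an illustration, not this file's code) of the umlal/umlal2
// pattern in the main loop below: each accumulator lane computes
// c[i] += b[i] * a, widening uint16 products into uint32; covering the 12x8
// output tile this way is what consumes the 24 accumulators v5..v28
// initialised above.
#include <cstdint>
static inline void umlal_lane4(uint32_t c[4], const uint16_t b[4], uint16_t a) {
    for (int i = 0; i < 4; ++i)
        c[i] += uint32_t(b[i]) * uint32_t(a);
}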
- - "1:\n" // Main loop - // First unroll - "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper - "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - // Second unroll - "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper - "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper - "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "add %x[a_ptr], %x[a_ptr], #0x20\n" - "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "subs %x[k], %x[k], #0x1\n" - "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper - "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "add %x[b_ptr], %x[b_ptr], #0x30\n" - "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "bne 1b\n" - - "2:\n" // Even tail - "cbnz %x[odd_k], 3f\n" - - "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "umlal v10.4s, 
%[b0].4h, %[aa].h[5]\n" - "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "add %[a_ptr], %[a_ptr], #0x10\n" - "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "add %[b_ptr], %[b_ptr], #0x18\n" - "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" - "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "str q15, [%x[c_ptr], #0x70]\n" - "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "str q19, [%x[c_ptr], #0x130]\n" - "b 4f\n" // Complete write out - - "3:\n" // Odd tail - "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" - 
"str q15, [%x[c_ptr], #0x70]\n" - "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - - "4:\n" // End of function - "str q19, [%x[c_ptr], #0x130]\n" - "str q27, [%x[c_ptr], #0x140]\n" - "str q12, [%x[c_ptr], #0x150]\n" - "str q20, [%x[c_ptr], #0x160]\n" - "str q28, [%x[c_ptr], #0x170]\n" - "add %x[c_ptr], %x[c_ptr], #0x180\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), - [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2) - : [odd_k] "r" (odd_k) - : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp deleted file mode 100644 index 62cd747d7c..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __aarch64__ - -// Load the actual kernel -#include "a64_gemm_u8_12x8/generic.hpp" -#include "a64_gemm_u8_12x8/a55r1.hpp" - -class gemm_u8_12x8 { -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 8; - static const int A_block = 4; - static const bool A_transpose = false; - - /* Same for B input */ - static const int B_interleave = 12; - static const int B_block = 4; - static const bool B_transpose = true; - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int out_height = 8; - static const int k_unroll = 4; - - kern_type kernel = nullptr; - - gemm_u8_12x8(const CPUInfo *ci) { - kernel = a64_gemm_u8_12x8; - if (ci->CPU == CPUTarget::A55_DOT) { - kernel = a64_gemm_u8_12x8_a55r1; - } - } -}; - -#endif // __aarch64__ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp deleted file mode 100644 index c7c2acbb49..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp +++ /dev/null @@ -1,396 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> -#include "dot_toolchain_support.h" -#include <cassert> - -inline void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - assert(Apanel); - assert(Bpanel); - assert(Cpanel); - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - // We divide K by 4 because the udot instruction processes 4 elements at a time. - const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. 
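The two constants computed next are easy to misread, so here is the same arithmetic restated standalone (a plain C++ sketch; LoopCounts and dot_kernel_counts are invented names for illustration, not part of this codebase). Since udot consumes four 8-bit values per lane, K is first divided by 4, and the main loop is unrolled twice with the final pair of steps peeled off as a tail:

    #include <cassert>

    struct LoopCounts {
        int iters; // full iterations of the two-way unrolled main loop
        int oddk;  // 1 if a single, unpaired udot step remains at the end
    };

    inline LoopCounts dot_kernel_counts(int K) {
        assert(K % 4 == 0);         // panels are padded to k_unroll = 4
        const int W = K / 4;        // number of udot steps
        return { ((W + 1) / 2) - 1, // rounded-up pairs, minus the peeled tail
                 W & 1 };           // flag for the odd final step
    }

For K = 20, for example, W = 5, giving two full iterations of the main loop plus the detached odd-K step.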
- const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb<ablocks; yb++) { - const uint8_t *a_ptr0 = a_ptr; - const uint8_t *b_ptr = Bpanel; - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - int k = init_value_k; - register int32x4_t a0 asm("v0"); - register int32x4_t a1 asm("v1"); - register int32x4_t b0 asm("v2"); - register int32x4_t b1 asm("v3"); - register int32x4_t b2 asm("v4"); - register int32x4_t a0a asm("v5"); - register int32x4_t a1a asm("v6"); - __asm __volatile ( - _DECLARE_UDOT - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.4s, #0x0\n" - "ldp %q[a0], %q[a1], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldp %q[b0], %q[b1], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. - "cbz %w[k], 4f\n" - - - // Loop proper - "1:\n" - "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %d[a0a], [%[a_ptr], #32]\n" - - "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "ins %[b2].d[1], x20\n" - "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %d[a1a], [%[a_ptr], #48]\n" - - - "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "ins %[a0a].d[1], x20\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "ins %[a1a].d[1], x20\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "ins %[b0].d[1], x20\n" - "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" - - "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" - - "ldr %d[b2], [%[b_ptr], #80]\n" - - "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - "ins %[b1].d[1], x20\n" - "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "ldr %d[a0], [%[a_ptr], #64]\n" - - "udot 
v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "ins %[b2].d[1], x20\n" - "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "ldr x20, [%[a_ptr], #72]\n" - "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "ldr %d[a1], [%[a_ptr], #80]\n" - - "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "ins %[a0].d[1], x20\n" - ASM_PREFETCH("[%[b_ptr], #512]") - "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - "ldr x20, [%[a_ptr], #88]\n" - "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "ldr %d[b0], [%[b_ptr], #96]\n" - - "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "ins %[a1].d[1], x20\n" - "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "ldr x20, [%[b_ptr], #104]\n" - "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "ldr %d[b1], [%[b_ptr], #112]\n" - - "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "ins %[b0].d[1], x20\n" - "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "ldr x20, [%[b_ptr], #120]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - - "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "subs %w[k], %w[k], #1\n" - "ins %[b1].d[1], x20\n" - "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "bne 1b\n" - - // Target to use when K is 1 or 2 (i.e. zero iterations of main loop) - "4:\n" - - // Branch to alternative tail for odd K - "cbnz %w[oddk], 2f\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - // Detached final iteration (even K) - "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %d[a0a], [%[a_ptr], #32]\n" - - "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "ins %[b2].d[1], x20\n" - - "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %d[a1a], [%[a_ptr], #48]\n" - - "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "ins %[a0a].d[1], x20\n" - "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "ins %[a1a].d[1], x20\n" - "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" - - "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "ins %[b0].d[1], x20\n" - "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "ldr %d[b2], [%[b_ptr], #80]\n" - - "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - "ins %[b1].d[1], x20\n" - "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "ins %[b2].d[1], x20\n" - - "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "str q8, [%[c_ptr], #0]\n" - "str q16, [%[c_ptr], #16]\n" - "str q24, [%[c_ptr], #32]\n" - "udot v17.4s, 
%[b1].16b, %[a0a].4b[1]\n" - - "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - "str q17, [%[c_ptr], #64]\n" - "str q25, [%[c_ptr], #80]\n" - "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - "str q18, [%[c_ptr], #112]\n" - "str q26, [%[c_ptr], #128]\n" - "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - "str q19, [%[c_ptr], #160]\n" - "str q27, [%[c_ptr], #176]\n" - "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - "str q20, [%[c_ptr], #208]\n" - "str q28, [%[c_ptr], #224]\n" - "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - "str q21, [%[c_ptr], #256]\n" - "str q29, [%[c_ptr], #272]\n" - "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - "str q22, [%[c_ptr], #304]\n" - "str q30, [%[c_ptr], #320]\n" - "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - "b 3f\n" - - // Detached final iteration (odd K) - "2:\n" - "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "ldr x20, [%[b_ptr], #40]\n" - - "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "str q8, [%[c_ptr], #0]\n" - "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "str q16, [%[c_ptr], #16]\n" - "ins %[b2].d[1], x20\n" - "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "add %[a_ptr], %[a_ptr], #32\n" - "str q24, [%[c_ptr], #32]\n" - "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - - "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "str q17, [%[c_ptr], #64]\n" - "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "str q25, [%[c_ptr], #80]\n" - "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "str q18, [%[c_ptr], #112]\n" - "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "str q26, [%[c_ptr], #128]\n" - "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "str q19, [%[c_ptr], #160]\n" - "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "str q27, [%[c_ptr], #176]\n" - "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "str q20, [%[c_ptr], #208]\n" - "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "str q28, [%[c_ptr], #224]\n" - "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "str q21, [%[c_ptr], #256]\n" - "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "str q29, [%[c_ptr], #272]\n" - "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "str q22, [%[c_ptr], #304]\n" - "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "str q30, [%[c_ptr], #320]\n" - "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - - // Common tail - "3:\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - - - - ".purgem udot\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] 
"+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - } - } -} -#endif - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h deleted file mode 100644 index 718232fb05..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// Define a macro to assemble the UDOT instruction (in the absence of toolchain support) -#define _DECLARE_UDOT ".altmacro\n"\ - ".macro udot opd:req, opn:req, opm:req\n"\ - "local vd, vn, vm, h, l\n"\ - ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\ - ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\ - ".set vd,\\reg\n"\ - ".endif\n"\ - ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\ - ".set vn,\\reg\n"\ - ".endif\n"\ - ".irp idx,0,1,2,3\n"\ - ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\ - ".set vm,\\reg\n"\ - ".set h,\\idx / 2\n"\ - ".set l,\\idx %% 2\n"\ - ".endif\n"\ - ".endr\n"\ - ".endr\n"\ - ".ifndef vd\n"\ - ".error \"Bad operand \\opd\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef vn\n"\ - ".error \"Bad operand \\opn\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef vm\n"\ - ".error \"Bad operand \\opm\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef h\n"\ - ".error \"Bad operand \\opm\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef l\n"\ - ".error \"Bad operand \\opm\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".int 0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\ - ".endm\n"\ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp deleted file mode 100644 index 3531eb6d25..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> -#include "dot_toolchain_support.h" -#include <cassert> - -inline void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - assert(Apanel); - assert(Bpanel); - assert(Cpanel); - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - // We divide K by 4 because the udot instruction processes 4 elements at a time. - const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb<ablocks; yb++) { - const uint8_t *a_ptr0 = a_ptr; - const uint8_t *b_ptr = Bpanel; - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - int k = init_value_k; - register uint32x4_t a0 asm("v0"); - register uint32x4_t a1 asm("v1"); - register uint32x4_t b0 asm("v2"); - register uint32x4_t b1 asm("v3"); - register uint32x4_t b2 asm("v4"); - register uint32x4_t a0a asm("v5"); - register uint32x4_t a1a asm("v6"); - __asm __volatile ( - _DECLARE_UDOT - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.4s, #0x0\n" - "ldr %q[a0], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "ldr %q[a1], [%[a_ptr], #16]\n" - "movi v11.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. 
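A word on udot before the loop proper: the _DECLARE_UDOT macro included above assembles the instruction's raw encoding because toolchains of the time lacked the mnemonic, and the by-element form used below, udot Vd.4s, Vn.16b, Vm.4b[lane], adds a four-way unsigned byte dot product into each 32-bit lane. In scalar terms (an illustrative sketch, not code from this library):

    #include <cstdint>

    inline void udot_lane_model(uint32_t d[4], const uint8_t n[16],
                                const uint8_t m[16], int lane) {
        for (int i = 0; i < 4; i++) {       // four 32-bit result lanes
            uint32_t acc = 0;
            for (int j = 0; j < 4; j++) {   // 4-way u8 dot product
                acc += uint32_t(n[4 * i + j]) * uint32_t(m[4 * lane + j]);
            }
            d[i] += acc;                    // accumulate into the existing lane
        }
    }

This is also why the packing uses A_block = B_block = 4 and k_unroll = 4: operands are laid out in groups of four bytes so that each udot step consumes one whole group.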
- "cbz %w[k], 4f\n" - - // Loop proper - "1:\n" - "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" - - "ldr %q[b2], [%[b_ptr], #32]\n" - "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %q[a0a], [%[a_ptr], #32]\n" - "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr %q[a1a], [%[a_ptr], #48]\n" - "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %q[b0], [%[b_ptr], #48]\n" - - "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "ldr %q[b1], [%[b_ptr], #64]\n" - - "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "ldr %q[b2], [%[b_ptr], #80]\n" - - "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "ldr %q[a0], [%[a_ptr], #64]\n" - "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "ldr %q[a1], [%[a_ptr], #80]\n" - "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "ldr %q[b0], [%[b_ptr], #96]\n" - - "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - ASM_PREFETCH("[%[b_ptr], #512]") - "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "ldr %q[b1], [%[b_ptr], #112]\n" - - "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "subs %w[k], %w[k], #1\n" - "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "bne 1b\n" - - // Target to use when K is 1 or 2 (i.e. 
zero iterations of main loop) - "4:\n" - - // Branch to alternative tail for odd K - "cbnz %w[oddk], 2f\n" - - // Detached final iteration (even K) - "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %q[a0a], [%[a_ptr], #32]\n" - "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr %q[a1a], [%[a_ptr], #48]\n" - "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %q[b0], [%[b_ptr], #48]\n" - - "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "ldr %q[b1], [%[b_ptr], #64]\n" - - "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "ldr %q[b2], [%[b_ptr], #80]\n" - - "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - - "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "str q8, [%[c_ptr], #0]\n" - "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - "str q16, [%[c_ptr], #16]\n" - "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "str q24, [%[c_ptr], #32]\n" - - "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "str q17, [%[c_ptr], #64]\n" - "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "str q25, [%[c_ptr], #80]\n" - "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "str q18, [%[c_ptr], #112]\n" - "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "str q26, [%[c_ptr], #128]\n" - "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "str q19, [%[c_ptr], #160]\n" - "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "str q27, [%[c_ptr], #176]\n" - "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "str q20, [%[c_ptr], #208]\n" - "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "str q28, [%[c_ptr], #224]\n" - "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "str q21, [%[c_ptr], #256]\n" - "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "str q29, [%[c_ptr], #272]\n" - "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "str q22, [%[c_ptr], #304]\n" - "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "str q30, [%[c_ptr], #320]\n" - "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - "b 3f\n" - - // Detached final iteration (odd K) - "2:\n" - "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "str q8, [%[c_ptr], #0]\n" - "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "str q16, [%[c_ptr], #16]\n" - "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "add 
%[b_ptr], %[b_ptr], #48\n" - "add %[a_ptr], %[a_ptr], #32\n" - "str q24, [%[c_ptr], #32]\n" - "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - - "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "str q17, [%[c_ptr], #64]\n" - "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "str q25, [%[c_ptr], #80]\n" - "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "str q18, [%[c_ptr], #112]\n" - "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "str q26, [%[c_ptr], #128]\n" - "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "str q19, [%[c_ptr], #160]\n" - "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "str q27, [%[c_ptr], #176]\n" - "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "str q20, [%[c_ptr], #208]\n" - "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "str q28, [%[c_ptr], #224]\n" - "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "str q21, [%[c_ptr], #256]\n" - "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "str q29, [%[c_ptr], #272]\n" - "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "str q22, [%[c_ptr], #304]\n" - "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "str q30, [%[c_ptr], #320]\n" - "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - - // Common tail - "3:\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - - ".purgem udot\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" - ); - - } - } - - -} -#endif diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp deleted file mode 100644 index 3561bfec96..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -// Load the actual kernel -#include "a64_gemm_u8_4x4/generic.hpp" - -class gemm_u8_4x4 { -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 4; - static const int A_block = 16; - static const bool A_transpose = false; - - /* Same for B input */ - static const int B_interleave = 4; - static const int B_block = 16; - static const bool B_transpose = true; - - /* Kernel blocking parameters */ - static const int out_width = 4; - static const int out_height = 4; - static const int k_unroll = 16; - - kern_type kernel = nullptr; - - gemm_u8_4x4(const CPUInfo *ci) { - kernel = a64_gemm_u8_4x4; - } -}; - -#endif // __aarch64__ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp deleted file mode 100644 index aff3faf666..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> - -inline void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - K /= 16; - - for (int yb=0; yb<ablocks; yb++) { - const uint8_t *a_ptr0 = a_ptr; - const uint8_t *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - - int k = K-1; - - register uint8x16_t b0 asm("v4"); - register uint8x16_t b1 asm("v5"); - register uint8x16_t b2 asm("v6"); - register uint8x16_t b3 asm("v7"); - - __asm __volatile ( - "movi v16.4s, #0x0\n" - "ldr q0, [%[a_ptr]]\n" - "movi v17.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v18.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v19.4s, #0x0\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "movi v20.4s, #0x0\n" - "ldr %q[b3], [%[b_ptr], #48]\n" - "movi v21.4s, #0x0\n" - "ldr q1, [%[a_ptr], #16]\n" - "movi v22.4s, #0x0\n" - "ldr q2, [%[a_ptr], #32]\n" - "movi v23.4s, #0x0\n" - "ldr q3, [%[a_ptr], #48]\n" - "movi v24.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v25.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v26.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v27.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v28.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v29.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v30.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v31.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - - "umull v12.8h, v0.8b, %[b0].8b\n" - "add %[a_ptr], %[a_ptr], #64\n" - "umull v13.8h, v0.8b, %[b1].8b\n" - "umull v14.8h, v0.8b, %[b2].8b\n" - "add %[b_ptr], %[b_ptr], #64\n" - "umull v15.8h, v0.8b, %[b3].8b\n" - - // Skip loop if we are doing zero iterations of it. 
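The loop body that follows leans on a widening pair: umull/umull2 multiply unsigned bytes into 16-bit products, and uadalp adds adjacent product pairs into 32-bit accumulators, so no intermediate can overflow (255 * 255 still fits in 16 bits). A scalar model of one 8-lane step (an illustrative sketch, not code from this library):

    #include <cstdint>

    inline void umull_uadalp_model(uint32_t acc[4], const uint8_t a[8],
                                   const uint8_t b[8]) {
        uint16_t prod[8];
        for (int i = 0; i < 8; i++) {   // umull: widening u8*u8 -> u16
            prod[i] = uint16_t(a[i]) * uint16_t(b[i]);
        }
        for (int i = 0; i < 4; i++) {   // uadalp: pairwise add into u32
            acc[i] += uint32_t(prod[2 * i]) + uint32_t(prod[2 * i + 1]);
        }
    }

The kernel applies this across 16-byte vectors against four B vectors at once, then collapses the v16-v31 accumulators into the 4x4 output tile with the addp tree just before the stores.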
- "cbz %w[k], 2f\n" - - "1:\n" - "uadalp v16.4s, v12.8h\n" - "umull2 v12.8h, v0.16b, %[b0].16b\n" - "uadalp v17.4s, v13.8h\n" - "umull2 v13.8h, v0.16b, %[b1].16b\n" - "uadalp v18.4s, v14.8h\n" - "umull2 v14.8h, v0.16b, %[b2].16b\n" - "uadalp v19.4s, v15.8h\n" - "umull2 v15.8h, v0.16b, %[b3].16b\n" - "ldr q0, [%[a_ptr]]\n" - - "uadalp v16.4s, v12.8h\n" - "umull v12.8h, v1.8b, %[b0].8b\n" - "uadalp v17.4s, v13.8h\n" - "umull v13.8h, v1.8b, %[b1].8b\n" - "subs %w[k], %w[k], #1\n" - "uadalp v18.4s, v14.8h\n" - "umull v14.8h, v1.8b, %[b2].8b\n" - "uadalp v19.4s, v15.8h\n" - "umull v15.8h, v1.8b, %[b3].8b\n" - - "uadalp v20.4s, v12.8h\n" - "umull2 v12.8h, v1.16b, %[b0].16b\n" - "uadalp v21.4s, v13.8h\n" - "umull2 v13.8h, v1.16b, %[b1].16b\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "uadalp v22.4s, v14.8h\n" - "umull2 v14.8h, v1.16b, %[b2].16b\n" - "uadalp v23.4s, v15.8h\n" - "umull2 v15.8h, v1.16b, %[b3].16b\n" - "ldr q1, [%[a_ptr], #16]\n" - - "uadalp v20.4s, v12.8h\n" - "umull v12.8h, v2.8b, %[b0].8b\n" - "uadalp v21.4s, v13.8h\n" - "umull v13.8h, v2.8b, %[b1].8b\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "uadalp v22.4s, v14.8h\n" - "umull v14.8h, v2.8b, %[b2].8b\n" - "uadalp v23.4s, v15.8h\n" - "umull v15.8h, v2.8b, %[b3].8b\n" - - "uadalp v24.4s, v12.8h\n" - "umull2 v12.8h, v2.16b, %[b0].16b\n" - "uadalp v25.4s, v13.8h\n" - "umull2 v13.8h, v2.16b, %[b1].16b\n" - "uadalp v26.4s, v14.8h\n" - "umull2 v14.8h, v2.16b, %[b2].16b\n" - "uadalp v27.4s, v15.8h\n" - "umull2 v15.8h, v2.16b, %[b3].16b\n" - "ldr q2, [%[a_ptr], #32]\n" - - "uadalp v24.4s, v12.8h\n" - "umull v12.8h, v3.8b, %[b0].8b\n" - "uadalp v25.4s, v13.8h\n" - "umull v13.8h, v3.8b, %[b1].8b\n" - "uadalp v26.4s, v14.8h\n" - "umull v14.8h, v3.8b, %[b2].8b\n" - "uadalp v27.4s, v15.8h\n" - "umull v15.8h, v3.8b, %[b3].8b\n" - - "uadalp v28.4s, v12.8h\n" - "umull2 v12.8h, v3.16b, %[b0].16b\n" - "ldr %q[b0], [%[b_ptr]]\n" - "uadalp v29.4s, v13.8h\n" - "umull2 v13.8h, v3.16b, %[b1].16b\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "uadalp v30.4s, v14.8h\n" - "umull2 v14.8h, v3.16b, %[b2].16b\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "uadalp v31.4s, v15.8h\n" - "umull2 v15.8h, v3.16b, %[b3].16b\n" - "ldr %q[b3], [%[b_ptr], #48]\n" - - "uadalp v28.4s, v12.8h\n" - "umull v12.8h, v0.8b, %[b0].8b\n" - "add %[b_ptr], %[b_ptr], #64\n" - "uadalp v29.4s, v13.8h\n" - "umull v13.8h, v0.8b, %[b1].8b\n" - "ldr q3, [%[a_ptr], #48]\n" - "uadalp v30.4s, v14.8h\n" - "umull v14.8h, v0.8b, %[b2].8b\n" - "add %[a_ptr], %[a_ptr], #64\n" - "uadalp v31.4s, v15.8h\n" - "umull v15.8h, v0.8b, %[b3].8b\n" - "bne 1b\n" - - // Branch target - "2:\n" - "uadalp v16.4s, v12.8h\n" - "umull2 v12.8h, v0.16b, %[b0].16b\n" - "uadalp v17.4s, v13.8h\n" - "umull2 v13.8h, v0.16b, %[b1].16b\n" - "uadalp v18.4s, v14.8h\n" - "umull2 v14.8h, v0.16b, %[b2].16b\n" - "uadalp v19.4s, v15.8h\n" - "umull2 v15.8h, v0.16b, %[b3].16b\n" - - "uadalp v16.4s, v12.8h\n" - "umull v12.8h, v1.8b, %[b0].8b\n" - "uadalp v17.4s, v13.8h\n" - "umull v13.8h, v1.8b, %[b1].8b\n" - "uadalp v18.4s, v14.8h\n" - "umull v14.8h, v1.8b, %[b2].8b\n" - "uadalp v19.4s, v15.8h\n" - "umull v15.8h, v1.8b, %[b3].8b\n" - - "uadalp v20.4s, v12.8h\n" - "umull2 v12.8h, v1.16b, %[b0].16b\n" - "uadalp v21.4s, v13.8h\n" - "umull2 v13.8h, v1.16b, %[b1].16b\n" - "uadalp v22.4s, v14.8h\n" - "umull2 v14.8h, v1.16b, %[b2].16b\n" - "uadalp v23.4s, v15.8h\n" - "umull2 v15.8h, v1.16b, %[b3].16b\n" - - "uadalp v20.4s, v12.8h\n" - "umull v12.8h, v2.8b, %[b0].8b\n" - "uadalp v21.4s, v13.8h\n" - "umull v13.8h, v2.8b, %[b1].8b\n" - "uadalp v22.4s, v14.8h\n" - 
"umull v14.8h, v2.8b, %[b2].8b\n" - "uadalp v23.4s, v15.8h\n" - "umull v15.8h, v2.8b, %[b3].8b\n" - - "uadalp v24.4s, v12.8h\n" - "umull2 v12.8h, v2.16b, %[b0].16b\n" - "uadalp v25.4s, v13.8h\n" - "umull2 v13.8h, v2.16b, %[b1].16b\n" - "uadalp v26.4s, v14.8h\n" - "umull2 v14.8h, v2.16b, %[b2].16b\n" - "uadalp v27.4s, v15.8h\n" - "umull2 v15.8h, v2.16b, %[b3].16b\n" - - "uadalp v24.4s, v12.8h\n" - "umull v12.8h, v3.8b, %[b0].8b\n" - "uadalp v25.4s, v13.8h\n" - "umull v13.8h, v3.8b, %[b1].8b\n" - "uadalp v26.4s, v14.8h\n" - "umull v14.8h, v3.8b, %[b2].8b\n" - "uadalp v27.4s, v15.8h\n" - "umull v15.8h, v3.8b, %[b3].8b\n" - - "uadalp v28.4s, v12.8h\n" - "umull2 v12.8h, v3.16b, %[b0].16b\n" - "uadalp v29.4s, v13.8h\n" - "umull2 v13.8h, v3.16b, %[b1].16b\n" - "uadalp v30.4s, v14.8h\n" - "umull2 v14.8h, v3.16b, %[b2].16b\n" - "uadalp v31.4s, v15.8h\n" - "umull2 v15.8h, v3.16b, %[b3].16b\n" - - "uadalp v28.4s, v12.8h\n" - "uadalp v29.4s, v13.8h\n" - "uadalp v30.4s, v14.8h\n" - "uadalp v31.4s, v15.8h\n" - - "addp v16.4s, v16.4s, v17.4s\n" - "addp v17.4s, v18.4s, v19.4s\n" - "addp v18.4s, v20.4s, v21.4s\n" - "addp v19.4s, v22.4s, v23.4s\n" - "addp v20.4s, v24.4s, v25.4s\n" - "addp v21.4s, v26.4s, v27.4s\n" - "addp v22.4s, v28.4s, v29.4s\n" - "addp v23.4s, v30.4s, v31.4s\n" - - "addp v16.4s, v16.4s, v17.4s\n" - "addp v17.4s, v18.4s, v19.4s\n" - "addp v18.4s, v20.4s, v21.4s\n" - "addp v19.4s, v22.4s, v23.4s\n" - - "str q16, [%[c_ptr]]\n" - "str q17, [%[c_ptr], #16]\n" - "str q18, [%[c_ptr], #32]\n" - "str q19, [%[c_ptr], #48]\n" - "add %[c_ptr], %[c_ptr], #64\n" - - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3), - [k] "+r" (k) - : - : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19", - "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc"); - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp deleted file mode 100644 index 5e7684f692..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
 - */ -#pragma once - -#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - -// Get the components we need to implement HGEMM. -// Can select appropriate components dependent on AArch32 vs. AArch64 etc. at build time. -#include "a64_hgemm_24x8/generic.hpp" -#include "a64_hgemm_24x8/a55r1.hpp" - -// 24x8 HGEMM "strategy" class. Describes the kernel properties. -// -// The generic "gemm_opt" function will instantiate one of these (allowing -// the constructor to pick a kernel implementation). -class hgemm_24x8 { -public: - typedef __fp16 operand_type; - typedef __fp16 result_type; - - typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); - - static const int A_block = 1; - static const int A_interleave = 8; - static const bool A_transpose = false; - - static const int B_block = 1; - static const int B_interleave = 24; - static const bool B_transpose = true; - - static const int out_width = 24; - static const int out_height = 8; - static const int k_unroll = 1; - - kern_type kernel = nullptr; - - hgemm_24x8(const struct CPUInfo *ci) { - kernel = a64_hgemm_asimd_24x8; - if (ci->CPU == CPUTarget::A55_DOT) { - kernel = a64_hgemm_asimd_24x8_a55r1; - } - } - -}; - -#endif // __aarch64__ and FP16_VECTOR_ARITHMETIC diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp deleted file mode 100644 index 1789abb046..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp +++ /dev/null @@ -1,384 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include <arm_neon.h> - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. 
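Read alongside the assembly, a reference (unpipelined) version of the loop structure being implemented may help. This is an illustrative sketch only, using float instead of __fp16 for portability, with tile sizes taken from the strategy class above (8-deep A rows, 24-wide output rows):

    inline void gemm_panel_reference(const float *Apanel, const float *Bpanel,
                                     float *Cpanel, int ablocks, int bblocks,
                                     int K) {
        const float *a_ptr = Apanel;
        float *c_ptr = Cpanel;
        for (int yb = 0; yb < ablocks; yb++) {
            const float *a_ptr0 = a_ptr;      // rewind point for each B block
            const float *b_ptr = Bpanel;
            for (int xb = 0; xb < bblocks; xb++) {
                a_ptr = a_ptr0;
                float acc[8][24] = {};        // one 24x8 output tile
                for (int k = 0; k < K; k++) { // walk both blocks in read-order
                    for (int r = 0; r < 8; r++)
                        for (int c = 0; c < 24; c++)
                            acc[r][c] += a_ptr[r] * b_ptr[c];
                    a_ptr += 8;
                    b_ptr += 24;
                }
                for (int r = 0; r < 8; r++)   // tiles are stored row-major
                    for (int c = 0; c < 24; c++)
                        *c_ptr++ = acc[r][c];
            }
        }
    }

The assembly below computes the same tile but keeps it entirely in v8-v31 and overlaps the loads, multiplies, and stores.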
- -inline void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { - const __fp16 *a_ptr = Apanel; - __fp16 *c_ptr = Cpanel; - - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k_iters = ((K+1)/2) - 1; - - for (int yb=0; yb<ablocks; yb++) { - const __fp16 *a_ptr0 = a_ptr; - const __fp16 *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - int k = k_iters; - a_ptr = a_ptr0; - - // As A55 requires 64-bit loads anyway, just use 64 bits of the - // "A" operands to save on "ins" instructions. Since A55 is - // in-order, two sets of "A" operands and one set of "B" is - // sufficient. - register float16x8_t a0 asm("v0"); - register float16x8_t a1 asm("v1"); - register float16x8_t a0a asm("v2"); - register float16x8_t a1a asm("v3"); - register float16x8_t b0 asm("v4"); - register float16x8_t b1 asm("v5"); - register float16x8_t b2 asm("v6"); - - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.8h, #0x0\n" - "ldr %d[a0], [%[a_ptr]]\n" - "movi v9.8h, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.8h, #0x0\n" - "ldr %d[a1], [%[a_ptr], #8]\n" - "movi v11.8h, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v12.8h, #0x0\n" - "movi v13.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v14.8h, #0x0\n" - "movi v15.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v16.8h, #0x0\n" - "movi v17.8h, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v18.8h, #0x0\n" - "movi v19.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v20.8h, #0x0\n" - "movi v21.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v22.8h, #0x0\n" - "movi v23.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v24.8h, #0x0\n" - "movi v25.8h, #0x0\n" - "movi v26.8h, #0x0\n" - "movi v27.8h, #0x0\n" - "movi v28.8h, #0x0\n" - "movi v29.8h, #0x0\n" - "movi v30.8h, #0x0\n" - "movi v31.8h, #0x0\n" - - // The loop is offset by these two instructions which must - // always be executed. - "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - // Skip loop if we are doing zero iterations of it. 
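That "ldr %d[b2]" is the first of the split loads this kernel uses throughout: a 128-bit operand is built from two 64-bit loads, the low half straight into the vector register and the high half via a general-purpose register that a later "ins" merges in, matching the A55's preference for 64-bit loads noted in the comment above. The same idiom in intrinsics (an illustrative sketch using a u32 vector; the kernel itself does this with __fp16 data):

    #include <arm_neon.h>
    #include <cstdint>
    #include <cstring>

    inline uint32x4_t split_load_128(const void *p) {
        uint64_t hi;
        uint32x2_t lo = vld1_u32(static_cast<const uint32_t *>(p)); // ldr d-reg (low half)
        std::memcpy(&hi, static_cast<const char *>(p) + 8, 8);      // ldr x20 (high half)
        return vcombine_u32(lo, vcreate_u32(hi));                   // ins v.d[1], x20
    }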
- "cbz %w[k], 4f\n" - - "1:\n" - "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" - "subs %w[k], %w[k], #1\n" - "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" - "ldr %d[a0a], [%[a_ptr], #16]\n" - - "fmla v12.8h, %[b0].8h, %[a1].h[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" - "fmla v14.8h, %[b0].8h, %[a1].h[2]\n" - "fmla v15.8h, %[b0].8h, %[a1].h[3]\n" - "ldr %d[a1a], [%[a_ptr], #24]\n" - - "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" - "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" - "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "fmla v20.8h, %[b1].8h, %[a1].h[0]\n" - "fmla v21.8h, %[b1].8h, %[a1].h[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v22.8h, %[b1].8h, %[a1].h[2]\n" - "fmla v23.8h, %[b1].8h, %[a1].h[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" - "ins %[b0].d[1], x20\n" - "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" - ASM_PREFETCH("[%[a_ptr], #128]") - - "fmla v28.8h, %[b2].8h, %[a1].h[0]\n" - "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "fmla v30.8h, %[b2].8h, %[a1].h[2]\n" - "fmla v31.8h, %[b2].8h, %[a1].h[3]\n" - "ldr %d[b2], [%[b_ptr], #80]\n" - - // Unroll 1 - "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n" - "ins %[b1].d[1], x20\n" - "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n" - "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n" - "ldr %d[a0], [%[a_ptr], #32]\n" - - "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n" - "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n" - "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n" - "ldr %d[a1], [%[a_ptr], #40]\n" - - "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n" - "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n" - "ldr %d[b0], [%[b_ptr], #96]\n" - - "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n" - "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n" - "ldr x20, [%[b_ptr], #104]\n" - "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n" - "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n" - "ldr %d[b1], [%[b_ptr], #112]\n" - - "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n" - "ins %[b0].d[1], x20\n" - "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n" - "ldr x20, [%[b_ptr], #120]\n" - "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n" - - "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n" - "ins %[b1].d[1], x20\n" - "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" - "bne 1b\n" - - "4:\n" - - // Start final iteration - branch off to "odd" code before we load a0a - "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" - "cbnz %w[oddk], 2f\n" - - // Even K continuation - "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" - "ldr %d[a0a], [%[a_ptr], #16]\n" - - "fmla v12.8h, %[b0].8h, %[a1].h[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" - ASM_PREFETCHW("[%[c_ptr]]") - "fmla v14.8h, %[b0].8h, %[a1].h[2]\n" - "fmla v15.8h, %[b0].8h, %[a1].h[3]\n" - "ldr %d[a1a], [%[a_ptr], #24]\n" - - "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" - "fmla v17.8h, 
%[b1].8h, %[a0].h[1]\n" - ASM_PREFETCHW("[%[c_ptr], #64]") - "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "fmla v20.8h, %[b1].8h, %[a1].h[0]\n" - "fmla v21.8h, %[b1].8h, %[a1].h[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v22.8h, %[b1].8h, %[a1].h[2]\n" - "fmla v23.8h, %[b1].8h, %[a1].h[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" - "ins %[b0].d[1], x20\n" - "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" - ASM_PREFETCHW("[%[c_ptr], #128]") - - "fmla v28.8h, %[b2].8h, %[a1].h[0]\n" - "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" - ASM_PREFETCHW("[%[c_ptr], #192]") - "fmla v30.8h, %[b2].8h, %[a1].h[2]\n" - "fmla v31.8h, %[b2].8h, %[a1].h[3]\n" - "ldr %d[b2], [%[b_ptr], #80]\n" - - "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n" - "ins %[b1].d[1], x20\n" - "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n" - "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n" - ASM_PREFETCHW("[%[c_ptr], #256]") - - "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n" - ASM_PREFETCHW("[%[c_ptr], #320]") - "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n" - "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n" - "ldr %d[a1], [%[a_ptr], #40]\n" - - "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n" - ASM_PREFETCHWL2("[%[c_ptr], #384]") - "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n" - ASM_PREFETCHWL2("[%[c_ptr], #448]") - - "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n" - "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n" - ASM_PREFETCHWL2("[%[c_ptr], #512]") - "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n" - "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n" - ASM_PREFETCHWL2("[%[c_ptr], #576]") - - "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n" - "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n" - ASM_PREFETCHWL2("[%[c_ptr], #640]") - "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n" - ASM_PREFETCHWL2("[%[c_ptr], #704]") - - "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n" - "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n" - "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n" - "b 3f\n" - - "2:\n" - - // Odd tail - "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" - ASM_PREFETCHW("[%[c_ptr]]") - - "fmla v12.8h, %[b0].8h, %[a1].h[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" - ASM_PREFETCHW("[%[c_ptr], #64]") - "fmla v14.8h, %[b0].8h, %[a1].h[2]\n" - "add %[a_ptr], %[a_ptr], #16\n" - "fmla v15.8h, %[b0].8h, %[a1].h[3]\n" - ASM_PREFETCHW("[%[c_ptr], #128]") - - "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" - ASM_PREFETCHW("[%[c_ptr], #192]") - "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" - ASM_PREFETCHW("[%[c_ptr], #256]") - - "fmla v20.8h, %[b1].8h, %[a1].h[0]\n" - "fmla v21.8h, %[b1].8h, %[a1].h[1]\n" - ASM_PREFETCHW("[%[c_ptr], #320]") - "fmla v22.8h, %[b1].8h, %[a1].h[2]\n" - "fmla v23.8h, %[b1].8h, %[a1].h[3]\n" - ASM_PREFETCHWL2("[%[c_ptr], #384]") - - "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" - "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" - ASM_PREFETCHWL2("[%[c_ptr], #384]") - "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" - ASM_PREFETCHWL2("[%[c_ptr], #448]") - - "fmla v28.8h, %[b2].8h, %[a1].h[0]\n" - 
ASM_PREFETCHWL2("[%[c_ptr], #512]") - "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" - ASM_PREFETCHWL2("[%[c_ptr], #576]") - "fmla v30.8h, %[b2].8h, %[a1].h[2]\n" - ASM_PREFETCHWL2("[%[c_ptr], #640]") - "fmla v31.8h, %[b2].8h, %[a1].h[3]\n" - ASM_PREFETCHWL2("[%[c_ptr], #704]") - - // Common tail - // A55 won't dual issue these stores with anything else, so - // simplest to do them all in this common code. - "3:\n" - "str q8, [%[c_ptr]]\n" - "str q16, [%[c_ptr], #16]\n" - "str q24, [%[c_ptr], #32]\n" - "str q9, [%[c_ptr], #48]\n" - "str q17, [%[c_ptr], #64]\n" - "str q25, [%[c_ptr], #80]\n" - "str q10, [%[c_ptr], #96]\n" - "str q18, [%[c_ptr], #112]\n" - "str q26, [%[c_ptr], #128]\n" - "str q11, [%[c_ptr], #144]\n" - "str q19, [%[c_ptr], #160]\n" - "str q27, [%[c_ptr], #176]\n" - "str q12, [%[c_ptr], #192]\n" - "str q20, [%[c_ptr], #208]\n" - "str q28, [%[c_ptr], #224]\n" - "str q13, [%[c_ptr], #240]\n" - "str q21, [%[c_ptr], #256]\n" - "str q29, [%[c_ptr], #272]\n" - "str q14, [%[c_ptr], #288]\n" - "str q22, [%[c_ptr], #304]\n" - "str q30, [%[c_ptr], #320]\n" - "str q15, [%[c_ptr], #336]\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "5:\n" - "add %[c_ptr], %[c_ptr], #384\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "=w" (a0), [a0a] "=w" (a0a), [a1] "=w" (a1), [a1a] "=w" (a1a), - [b0] "=w" (b0), [b1] "=w" (b1), [b2] "=w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp deleted file mode 100644 index 03e2bb95a3..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp +++ /dev/null @@ -1,337 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include <arm_neon.h> - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. 
-// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -inline void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { - const __fp16 *a_ptr = Apanel; - __fp16 *c_ptr = Cpanel; - for (int yb=0; yb<ablocks; yb++) { - const __fp16 *a_ptr0 = a_ptr; - const __fp16 *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k = ((K+1)/2) - 1; - register float16x8_t a0 asm("v0"); - register float16x8_t a0a asm("v1"); - register float16x8_t b0 asm("v2"); - register float16x8_t b1 asm("v3"); - register float16x8_t b2 asm("v4"); - register float16x8_t b0a asm("v5"); - register float16x8_t b1a asm("v6"); - register float16x8_t b2a asm("v7"); - - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.8h, #0x0\n" - "ldr %q[a0], [%[a_ptr]]\n" - "movi v9.8h, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.8h, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v11.8h, #0x0\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "movi v12.8h, #0x0\n" - "ldr %q[b0a], [%[b_ptr], #48]\n" - "movi v13.8h, #0x0\n" - "ldr %q[b1a], [%[b_ptr], #64]\n" - "movi v14.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v15.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v16.8h, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v17.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v18.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v19.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.8h, #0x0\n" - "movi v21.8h, #0x0\n" - "movi v22.8h, #0x0\n" - "movi v23.8h, #0x0\n" - "movi v24.8h, #0x0\n" - "movi v25.8h, #0x0\n" - "movi v26.8h, #0x0\n" - "movi v27.8h, #0x0\n" - "movi v28.8h, #0x0\n" - "movi v29.8h, #0x0\n" - "movi v30.8h, #0x0\n" - "movi v31.8h, #0x0\n" - - // Skip loop if we are doing zero iterations of it. 
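A note on the odd-length fix-up above: the comment "set a flag if K is odd, but make sure we round up the iteration count" hides a small invariant that is easy to misread. A minimal scalar sketch of the same control flow (the helper below is illustrative, not code from the deleted file) shows why k = ((K+1)/2) - 1 double-width passes plus one detached tail always consume exactly K columns of the panels:

    #include <cassert>

    // Illustrative model of the kernel's loop structure.
    void loop_count_model(int K) {
        int oddk = (K & 1);           // is a lone K-step left at the end?
        int k    = ((K + 1) / 2) - 1; // rounded-up pair count, minus the detached tail

        int consumed = 0;
        for (int i = 0; i < k; i++) {
            consumed += 2;            // main loop ("1:"): two K-steps per pass
        }
        consumed += oddk ? 1 : 2;     // detached tail: odd ("2:") or even path

        assert(consumed == K);        // holds for every K >= 1
    }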
- "cbz %w[k], 4f\n" - - "1:\n" - "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" - "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" - "ldr %q[a0a], [%[a_ptr], #16]\n" - "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" - "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" - "ldr %q[b2a], [%[b_ptr], #80]\n" - "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" - "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" - "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" - "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" - "ldr %q[b0], [%[b_ptr], #96]\n" - - "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" - "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" - "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" - "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" - "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - - "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" - "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" - ASM_PREFETCH("[%[b_ptr], #288]") - "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" - "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" - "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" - "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" - "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" - "ldr %q[a0], [%[a_ptr], #32]\n" - - "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n" - "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n" - "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n" - "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n" - "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n" - "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n" - "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n" - "ldr %q[b0a], [%[b_ptr], #48]\n" - - "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n" - "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n" - ASM_PREFETCH("[%[b_ptr], #352]") - "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n" - "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n" - "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n" - "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n" - "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n" - "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n" - "ldr %q[b1a], [%[b_ptr], #64]\n" - - "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n" - "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n" - "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n" - "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n" - "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n" - "subs %w[k], %w[k], #1\n" - "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n" - "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n" - - "bne 1b\n" - "4:\n" - - // Jump to odd tail if necessary. - "cbnz %w[oddk], 2f\n" - - // Even tail. 
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" - "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" - "ldr %q[a0a], [%[a_ptr], #16]\n" - "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" - "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" - "ldr %q[b2a], [%[b_ptr], #80]\n" - "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" - "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" - "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" - "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" - - "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" - "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" - "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" - "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" - "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" - - "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" - "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" - "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" - "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" - "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" - "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" - "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" - - "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n" - "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n" - "str q8, [%[c_ptr]]\n" - "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n" - "str q16, [%[c_ptr], #16]\n" - - "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n" - "str q24, [%[c_ptr], #32]\n" - "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n" - "str q9, [%[c_ptr], #48]\n" - "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n" - "str q17, [%[c_ptr], #64]\n" - - "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n" - "str q25, [%[c_ptr], #80]\n" - "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n" - "str q10, [%[c_ptr], #96]\n" - "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n" - "str q18, [%[c_ptr], #112]\n" - - "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n" - "str q26, [%[c_ptr], #128]\n" - "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n" - "str q11, [%[c_ptr], #144]\n" - "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n" - "str q19, [%[c_ptr], #160]\n" - - "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n" - "str q27, [%[c_ptr], #176]\n" - "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n" - "str q12, [%[c_ptr], #192]\n" - "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n" - "str q20, [%[c_ptr], #208]\n" - - "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n" - "str q28, [%[c_ptr], #224]\n" - "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n" - "str q13, [%[c_ptr], #240]\n" - "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n" - "str q21, [%[c_ptr], #256]\n" - - "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n" - "str q29, [%[c_ptr], #272]\n" - "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n" - "str q14, [%[c_ptr], #288]\n" - "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n" - "str q22, [%[c_ptr], #304]\n" - - "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n" - "str q30, [%[c_ptr], #320]\n" - "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n" - "str q15, [%[c_ptr], #336]\n" - "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n" - "b 3f\n" - - // Odd tail - "2:\n" - "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" - "add %[a_ptr], %[a_ptr], #16\n" - "str q8, [%[c_ptr]]\n" - "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" - "str q16, [%[c_ptr], #16]\n" - - "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" - "str q24, [%[c_ptr], #32]\n" - "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" - "str q9, [%[c_ptr], #48]\n" - "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" - "str q17, [%[c_ptr], #64]\n" - - "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" - "str q25, [%[c_ptr], #80]\n" - "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" - "str q10, [%[c_ptr], #96]\n" - "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" - "str q18, [%[c_ptr], #112]\n" - - "fmla v11.8h, %[b0].8h, 
%[a0].h[3]\n" - "str q26, [%[c_ptr], #128]\n" - "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" - "str q11, [%[c_ptr], #144]\n" - "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" - "str q19, [%[c_ptr], #160]\n" - - "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" - "str q27, [%[c_ptr], #176]\n" - "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" - "str q12, [%[c_ptr], #192]\n" - "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" - "str q20, [%[c_ptr], #208]\n" - - "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" - "str q28, [%[c_ptr], #224]\n" - "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" - "str q13, [%[c_ptr], #240]\n" - "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" - "str q21, [%[c_ptr], #256]\n" - - "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" - "str q29, [%[c_ptr], #272]\n" - "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" - "str q14, [%[c_ptr], #288]\n" - "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" - "str q22, [%[c_ptr], #304]\n" - - "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" - "str q30, [%[c_ptr], #320]\n" - "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" - "str q15, [%[c_ptr], #336]\n" - "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" - - "3:\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a0a] "+w" (a0a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k), - [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp deleted file mode 100644 index 603ad8dc0a..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -// Actual kernel implementations -#include "a64_sgemm_12x8/generic.hpp" -#include "a64_sgemm_12x8/a53.hpp" -#include "a64_sgemm_12x8/a55.hpp" -#include "a64_sgemm_12x8/a55r1.hpp" - - -// 12x8 SGEMM "strategy" class. -// -// This describes the characteristics of a family of kernels, in terms of -// the required interleave properties and the output block size. -// -// All kernels in the family must share these characteristics. 
The actual -// kernel to be used can be chosen at runtime, based on the CPU_type -// structure. -class sgemm_12x8 { -public: - typedef float operand_type; - typedef float result_type; - - typedef void (*kern_type)(const float *, const float *, float *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 8; - static const int A_block = 1; - static const int A_transpose = 0; - - /* Same for B input */ - static const int B_interleave = 12; - static const int B_block = 1; - static const int B_transpose = 1; - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int out_height = 8; - static const int k_unroll = 1; - - kern_type kernel{nullptr}; - - sgemm_12x8(const CPUInfo *ci) { - kernel = a64_sgemm_asimd_12x8; - if (ci->CPU == CPUTarget::A53) { - kernel = a64_sgemm_asimd_12x8_a53; - } - else if (ci->CPU == CPUTarget::A55) { - kernel = a64_sgemm_asimd_12x8_a55; - } - else if (ci->CPU == CPUTarget::A55_DOT) { - kernel = a64_sgemm_asimd_12x8_a55r1; - } - } -}; - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp deleted file mode 100644 index 1c9b4b38fc..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k = ((K+1)/2) - 1; - - register float32x4_t a0 asm("v0"); - register float32x4_t a1 asm("v1"); - register float32x4_t b0 asm("v2"); - register float32x4_t b1 asm("v3"); - register float32x4_t b2 asm("v4"); - register float32x4_t a0a asm("v5"); - register float32x4_t a1a asm("v6"); - - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. 
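Before the per-core listings continue, a note on the strategy classes: sgemm_12x8 above is nothing more than a compile-time descriptor plus a runtime-selected function pointer. A hedged sketch of how a caller is meant to consume it follows; the driver function here is illustrative only (inside the library, the GemmInterleaved wrapper plays roughly this role):

    // Illustrative caller, not part of the deleted sources.
    void run_sgemm_tile(const CPUInfo *ci,
                        const float *Apanel, const float *Bpanel, float *Cpanel,
                        int ablocks, int bblocks, int K) {
        sgemm_12x8 strat(ci);  // constructor picks generic/A53/A55/A55r1
        // The outer code shapes the panels using sgemm_12x8::A_interleave,
        // B_interleave, out_width and out_height, then calls the chosen kernel:
        strat.kernel(Apanel, Bpanel, Cpanel, ablocks, bblocks, K);
    }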
- "movi v8.4s, #0x0\n" - "ldr %q[a0], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "ldr %q[a1], [%[a_ptr], #16]\n" - "movi v11.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. - "cbz %w[k], 4f\n" - - "1:\n" - // Unroll 0 - "ldr %d[b2], [%[b_ptr], #32]\n" - "nop\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "subs %w[k], %w[k], #1\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - - "ldr %d[a0a], [%[a_ptr], #32]\n" - "ins %[b2].d[1], x20\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ldr x20, [%[a_ptr], #40]\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - - "ldr %d[a1a], [%[a_ptr], #48]\n" - "ins %[a0a].d[1], x20\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "ldr x20, [%[a_ptr], #56]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - - "ldr %d[b0], [%[b_ptr], #48]\n" - "ins %[a1a].d[1], x20\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - - ASM_PREFETCH("[%[a_ptr], #320]") - "ins %[b0].d[1], x20\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - - ASM_PREFETCH("[%[b_ptr], #448]") - "nop\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - - "ldr %d[b1], [%[b_ptr], #64]\n" - "nop\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - - ASM_PREFETCH("[%[b_ptr], #512]") - "ins %[b1].d[1], x20\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - // Unroll 1 - "ldr %d[b2], [%[b_ptr], #80]\n" - "nop\n" - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - - "ldr %d[a0], [%[a_ptr], #64]\n" - "ins %[b2].d[1], x20\n" - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "ldr x20, [%[a_ptr], #72]\n" - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - - "ldr %d[a1], [%[a_ptr], #80]\n" - "ins %[a0].d[1], x20\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "ldr x20, [%[a_ptr], #88]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - - "ldr %d[b0], [%[b_ptr], #96]\n" - "ins %[a1].d[1], x20\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #104]\n" - "fmla v18.4s, %[b1].4s, 
%[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - - "nop\n" - "ins %[b0].d[1], x20\n" - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - - "nop\n" - "nop\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - - "ldr %d[b1], [%[b_ptr], #112]\n" - "nop\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "ldr x20, [%[b_ptr], #120]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "add %[b_ptr], %[b_ptr], #96\n" - - "nop\n" - "ins %[b1].d[1], x20\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - - "bne 1b\n" - - // Branch here if K=1 or 2. Do the right thing for odd/even at the end. - "4:\n" - "cbnz %w[oddk], 2f\n" - - // Detached final iteration. (even K) - "ldr %d[b2], [%[b_ptr], #32]\n" - "nop\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "subs %w[k], %w[k], #1\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - - "ldr %d[a0a], [%[a_ptr], #32]\n" - "ins %[b2].d[1], x20\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ldr x20, [%[a_ptr], #40]\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - - "ldr %d[a1a], [%[a_ptr], #48]\n" - "ins %[a0a].d[1], x20\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "ldr x20, [%[a_ptr], #56]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - - "ldr %d[b0], [%[b_ptr], #48]\n" - "ins %[a1a].d[1], x20\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - - "ins %[b0].d[1], x20\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - - "nop\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - - "ldr %d[b1], [%[b_ptr], #64]\n" - "nop\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - - "ins %[b1].d[1], x20\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - "ldr %d[b2], [%[b_ptr], #80]\n" - "nop\n" - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - - "ins %[b2].d[1], x20\n" - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "add %[b_ptr], 
%[b_ptr], #96\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - "b 3f\n" - - // Detached final iteration. (odd K) - "2:\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "nop\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - - "ins %[b2].d[1], x20\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - // Common tail - "3:\n" - "str q8, [%[c_ptr]]\n" - "str q16, [%[c_ptr], #16]\n" - "str q24, [%[c_ptr], #32]\n" - "str q9, [%[c_ptr], #48]\n" - "str q17, [%[c_ptr], #64]\n" - "str q25, [%[c_ptr], #80]\n" - "str q10, [%[c_ptr], #96]\n" - "str q18, [%[c_ptr], #112]\n" - "str q26, [%[c_ptr], #128]\n" - "str q11, [%[c_ptr], #144]\n" - "str q19, [%[c_ptr], #160]\n" - "str q27, [%[c_ptr], #176]\n" - "str q12, [%[c_ptr], #192]\n" - "str q20, [%[c_ptr], #208]\n" - "str q28, [%[c_ptr], #224]\n" - "str q13, [%[c_ptr], #240]\n" - "str q21, [%[c_ptr], #256]\n" - "str q29, [%[c_ptr], #272]\n" - "str q14, [%[c_ptr], #288]\n" - "str q22, [%[c_ptr], #304]\n" - "str q30, [%[c_ptr], #320]\n" - "str q15, [%[c_ptr], #336]\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" - ); - } - } -} - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp deleted file mode 100644 index 85d8a502f8..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -inline void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k = ((K+1)/2) - 1; - - register float32x4_t a0 asm("v0"); - register float32x4_t a1 asm("v1"); - register float32x4_t b0 asm("v2"); - register float32x4_t b1 asm("v3"); - register float32x4_t b2 asm("v4"); - register float32x4_t a0a asm("v5"); - register float32x4_t a1a asm("v6"); - - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.4s, #0x0\n" - "ldr %q[a0], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "ldr %q[a1], [%[a_ptr], #16]\n" - "movi v11.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. 
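A note on the load idiom used below: the A53 and A55 listings replace single 128-bit loads with a three-instruction sequence (ldr of the D register, ldr into x20, ins into lane d[1]) so that each memory operation stays 64 bits wide and can dual-issue alongside an fmla on these in-order cores. The two forms produce identical register contents; a self-contained sketch of the equivalence, assuming AArch64 and not taken from the deleted files:

    #include <arm_neon.h>

    // One 128-bit load: what an out-of-order core would be given.
    float32x4_t load_q_whole(const float *p) {
        float32x4_t v;
        __asm __volatile("ldr %q[v], [%[p]]\n" : [v] "=w"(v) : [p] "r"(p) : "memory");
        return v;
    }

    // Split 64+64-bit load: same result, friendlier issue pattern on A53/A55.
    float32x4_t load_q_split(const float *p) {
        float32x4_t v;
        unsigned long tmp;
        __asm __volatile(
            "ldr %d[v], [%[p]]\n"       // low 64 bits into the D register
            "ldr %[tmp], [%[p], #8]\n"  // high 64 bits via a general register
            "ins %[v].d[1], %[tmp]\n"   // merge into the upper half
            : [v] "=w"(v), [tmp] "=r"(tmp)
            : [p] "r"(p)
            : "memory");
        return v;
    }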
- "cbz %w[k], 4f\n" - - "1:\n" - // Unroll 0 - "ldr %d[b2], [%[b_ptr], #32]\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "subs %w[k], %w[k], #1\n" - - - "ldr %d[a0a], [%[a_ptr], #32]\n" - "ins %[b2].d[1], x20\n" - - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - - "ldr %d[a1a], [%[a_ptr], #48]\n" - "ins %[a0a].d[1], x20\n" - - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - - "ldr %d[b0], [%[b_ptr], #48]\n" - "ins %[a1a].d[1], x20\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - - "ldr %d[b1], [%[b_ptr], #64]\n" - "ins %[b0].d[1], x20\n" - - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - - - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - ASM_PREFETCH("[%[b_ptr], #512]") - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - // Unroll 1 - "ldr %d[b2], [%[b_ptr], #80]\n" - "ins %[b1].d[1], x20\n" - - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - - "ldr %d[a0], [%[a_ptr], #64]\n" - "ins %[b2].d[1], x20\n" - - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "ldr x20, [%[a_ptr], #72]\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - - "ldr %d[a1], [%[a_ptr], #80]\n" - "ins %[a0].d[1], x20\n" - - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "ldr x20, [%[a_ptr], #88]\n" - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - - - "ldr %d[b0], [%[b_ptr], #96]\n" - "ins %[a1].d[1], x20\n" - - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "ldr x20, [%[b_ptr], #104]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - - "ldr %d[b1], [%[b_ptr], #112]\n" - "ins %[b0].d[1], x20\n" - - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #120]\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "add %[a_ptr], %[a_ptr], #64\n" - - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - - - "ldr %d[b2], [%[b_ptr], #32]\n" - "ins %[b1].d[1], x20\n" - - - "bne 1b\n" - - // Branch here if K=1 or 2. Do the right thing for odd/even at the end. - "4:\n" - "cbnz %w[oddk], 2f\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - - // Detached final iteration. 
(even K) - "ldr x20, [%[b_ptr], #40]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "subs %w[k], %w[k], #1\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - - "ldr %d[a0a], [%[a_ptr], #32]\n" - "ins %[b2].d[1], x20\n" - - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - - "ldr %d[a1a], [%[a_ptr], #48]\n" - "ins %[a0a].d[1], x20\n" - - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - - "ldr %d[b0], [%[b_ptr], #48]\n" - "ins %[a1a].d[1], x20\n" - - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - - "ldr %d[b1], [%[b_ptr], #64]\n" - "ins %[b0].d[1], x20\n" - - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - "ldr %d[b2], [%[b_ptr], #80]\n" - "ins %[b1].d[1], x20\n" - - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - - "ins %[b2].d[1], x20\n" - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - "b 3f\n" - - // Detached final iteration. 
(odd K) - "2:\n" - - "ldr %d[b2], [%[b_ptr], #32]\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ins %[b2].d[1], x20\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - // Common tail - "3:\n" - "str q8, [%[c_ptr]]\n" - "str q16, [%[c_ptr], #16]\n" - "str q24, [%[c_ptr], #32]\n" - "str q9, [%[c_ptr], #48]\n" - "str q17, [%[c_ptr], #64]\n" - "str q25, [%[c_ptr], #80]\n" - "str q10, [%[c_ptr], #96]\n" - "str q18, [%[c_ptr], #112]\n" - "str q26, [%[c_ptr], #128]\n" - "str q11, [%[c_ptr], #144]\n" - "str q19, [%[c_ptr], #160]\n" - "str q27, [%[c_ptr], #176]\n" - "str q12, [%[c_ptr], #192]\n" - "str q20, [%[c_ptr], #208]\n" - "str q28, [%[c_ptr], #224]\n" - "str q13, [%[c_ptr], #240]\n" - "str q21, [%[c_ptr], #256]\n" - "str q29, [%[c_ptr], #272]\n" - "str q14, [%[c_ptr], #288]\n" - "str q22, [%[c_ptr], #304]\n" - "str q30, [%[c_ptr], #320]\n" - "str q15, [%[c_ptr], #336]\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp deleted file mode 100644 index 295308053f..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -inline void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k = ((K+1)/2) - 1; - - register float32x4_t a0 asm("v0"); - register float32x4_t a1 asm("v1"); - register float32x4_t b0 asm("v2"); - register float32x4_t b1 asm("v3"); - register float32x4_t b2 asm("v4"); - register float32x4_t a0a asm("v5"); - register float32x4_t a1a asm("v6"); - - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. - "ldp %q[a0], %q[a1], [%[a_ptr]]\n" - ASM_PREFETCH("[%[a_ptr], #64]") - - ASM_PREFETCH("[%[a_ptr], #128]") - ASM_PREFETCH("[%[a_ptr], #192]") - "ldp %q[b0], %q[b1], [%[b_ptr]]\n" - ASM_PREFETCH("[%[b_ptr], #64]") - - ASM_PREFETCH("[%[b_ptr], #128]") - ASM_PREFETCH("[%[b_ptr], #192]") - ASM_PREFETCH("[%[b_ptr], #256]") - - ASM_PREFETCH("[%[a_ptr], #256]") - ASM_PREFETCH("[%[a_ptr], #320]") - ASM_PREFETCH("[%[a_ptr], #384]") - - ASM_PREFETCH("[%[b_ptr], #320]") - ASM_PREFETCH("[%[b_ptr], #384]") - ASM_PREFETCH("[%[b_ptr], #448]") - ASM_PREFETCH("[%[b_ptr], #512]") - - "movi v8.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "movi v10.4s, #0x0\n" - "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" - "movi v13.4s, #0x0\n" - "movi v14.4s, #0x0\n" - "movi v15.4s, #0x0\n" - "movi v16.4s, #0x0\n" - "movi v17.4s, #0x0\n" - - "movi v18.4s, #0x0\n" - "movi v19.4s, #0x0\n" - "movi v20.4s, #0x0\n" - "movi v21.4s, #0x0\n" - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. 
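A note on the prefetch schedule above: unlike the A53 and A55 variants, which interleave their prefetch hints with the accumulator zeroing, this A55r1 variant issues the whole schedule up front so the loop body keeps its issue slots for loads and fmla. The macros come from the asmlib.hpp this kernel family includes; to the best of my reading they are thin wrappers over PRFM hints, approximately as below (verify against asmlib.hpp before relying on the exact spellings):

    // Approximate expansions of the prefetch macros used throughout.
    #define ASM_PREFETCH(address)    "PRFM PLDL1KEEP, " address "\n"  // load hint, keep in L1
    #define ASM_PREFETCHL2(address)  "PRFM PLDL2KEEP, " address "\n"  // load hint, keep in L2
    #define ASM_PREFETCHW(address)   "PRFM PSTL1KEEP, " address "\n"  // store hint, L1
    #define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"  // store hint, L2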
- "cbz %w[k], 4f\n" - - "1:\n" - // Unroll 0 - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "subs %w[k], %w[k], #1\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ldr %d[a0a], [%[a_ptr], #32]\n" - - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "ldr %d[a1a], [%[a_ptr], #48]\n" - - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "ins %[a0a].d[1], x20\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "ins %[a1a].d[1], x20\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "ins %[b0].d[1], x20\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - ASM_PREFETCH("[%[a_ptr], #448]") - - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - ASM_PREFETCH("[%[b_ptr], #576]") - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - // Unroll 1 - "ldr %d[b2], [%[b_ptr], #80]\n" - - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "ins %[b1].d[1], x20\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "ldr %d[a0], [%[a_ptr], #64]\n" - - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "ldr x20, [%[a_ptr], #72]\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "ldr %d[a1], [%[a_ptr], #80]\n" - - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "ins %[a0].d[1], x20\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "ldr x20, [%[a_ptr], #88]\n" - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - "ldr %d[b0], [%[b_ptr], #96]\n" - - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "ins %[a1].d[1], x20\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "ldr x20, [%[b_ptr], #104]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "ldr %d[b1], [%[b_ptr], #112]\n" - - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "ins %[b0].d[1], x20\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #120]\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "add %[a_ptr], %[a_ptr], #64\n" - - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - ASM_PREFETCH("[%[b_ptr], #640]") - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "ins %[b1].d[1], x20\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - - "bne 1b\n" - - // Branch here if K=1 or 2. Do the right thing for odd/even at the end. - "4:\n" - "cbnz %w[oddk], 2f\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - - // Detached final iteration. 
(even K) - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "subs %w[k], %w[k], #1\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ldr %d[a0a], [%[a_ptr], #32]\n" - - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "ldr %d[a1a], [%[a_ptr], #48]\n" - - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "ins %[a0a].d[1], x20\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "ins %[a1a].d[1], x20\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "ins %[b0].d[1], x20\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - "ldr %d[b2], [%[b_ptr], #80]\n" - - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "ins %[b1].d[1], x20\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - "ins %[b2].d[1], x20\n" - - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - "b 3f\n" - - // Detached final iteration. 
(odd K) - "2:\n" - - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - // Common tail - "3:\n" - "str q8, [%[c_ptr]]\n" - "str q16, [%[c_ptr], #16]\n" - "str q24, [%[c_ptr], #32]\n" - "str q9, [%[c_ptr], #48]\n" - "str q17, [%[c_ptr], #64]\n" - "str q25, [%[c_ptr], #80]\n" - "str q10, [%[c_ptr], #96]\n" - "str q18, [%[c_ptr], #112]\n" - "str q26, [%[c_ptr], #128]\n" - "str q11, [%[c_ptr], #144]\n" - "str q19, [%[c_ptr], #160]\n" - "str q27, [%[c_ptr], #176]\n" - "str q12, [%[c_ptr], #192]\n" - "str q20, [%[c_ptr], #208]\n" - "str q28, [%[c_ptr], #224]\n" - "str q13, [%[c_ptr], #240]\n" - "str q21, [%[c_ptr], #256]\n" - "str q29, [%[c_ptr], #272]\n" - "str q14, [%[c_ptr], #288]\n" - "str q22, [%[c_ptr], #304]\n" - "str q30, [%[c_ptr], #320]\n" - "str q15, [%[c_ptr], #336]\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp deleted file mode 100644 index c4a5875a31..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp +++ /dev/null @@ -1,358 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include <arm_neon.h> - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump=0, long int block_jump=0) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k = ((K+1)/2) - 1; - - register float32x4_t a0 asm("v0"); - register float32x4_t a1 asm("v1"); - register float32x4_t b0 asm("v2"); - register float32x4_t b1 asm("v3"); - register float32x4_t b2 asm("v4"); - register float32x4_t a0a asm("v5"); - register float32x4_t a1a asm("v6"); - - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.4s, #0x0\n" - "ldr %q[a0], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "ldr %q[a1], [%[a_ptr], #16]\n" - "movi v11.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. 
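A note on the two extra parameters: row_jump and block_jump are what distinguish this _jumps entry point from the plain a64_sgemm_asimd_12x8 wrapper at the end of the file, which passes zero for both. They are extra byte strides folded into the B-panel address arithmetic so the same inner loop can walk B panels that contain gaps. Reading the add instructions out of the listing below, the net B-pointer movement per main-loop pass works out as in this illustrative helper:

    // Derived by inspecting the listing below; illustrative only.
    long b_ptr_advance_per_pass(long row_jump) {
        // 96 bytes covers the two unrolled K-steps (2 x 12 floats x 4 bytes),
        // and row_jump is applied once per unroll on top of that.
        return 96 + 2 * row_jump;
    }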
- "cbz %w[k], 4f\n" - - // Loop proper - "1:\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ldr %q[a0a], [%[a_ptr], #32]\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "ldr %q[a1a], [%[a_ptr], #48]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "ldr %q[b0], [%[b_ptr], #48]\n" - - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "ldr %q[b1], [%[b_ptr], #64]\n" - - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - "ldr %q[b2], [%[b_ptr], #80]\n" - - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "ldr %q[a0], [%[a_ptr], #64]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "ldr %q[a1], [%[a_ptr], #80]\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "ldr %q[b0], [%[b_ptr], #96]\n" - - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - ASM_PREFETCH("[%[b_ptr], #512]") - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "ldr %q[b1], [%[b_ptr], #112]\n" - - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "subs %w[k], %w[k], #1\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - "bne 1b\n" - - // Target to use when K is 1 or 2 (i.e. 
zero iterations of main loop) - "4:\n" - - // Branch to alternative tail for odd K - "cbnz %w[oddk], 2f\n" - - // Detached final iteration (even K) - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ldr %q[a0a], [%[a_ptr], #32]\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "ldr %q[a1a], [%[a_ptr], #48]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "ldr %q[b0], [%[b_ptr], #48]\n" - - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "ldr %q[b1], [%[b_ptr], #64]\n" - - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - "ldr %q[b2], [%[b_ptr], #80]\n" - - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "add %[b_ptr], %[b_ptr], %[block_jump]\n" - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "str q8, [%[c_ptr], #0]\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "str q16, [%[c_ptr], #16]\n" - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "str q24, [%[c_ptr], #32]\n" - - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "str q9, [%[c_ptr], #48]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - "str q17, [%[c_ptr], #64]\n" - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "str q25, [%[c_ptr], #80]\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "str q18, [%[c_ptr], #112]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - "str q26, [%[c_ptr], #128]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "str q19, [%[c_ptr], #160]\n" - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "str q27, [%[c_ptr], #176]\n" - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "str q20, [%[c_ptr], #208]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "str q28, [%[c_ptr], #224]\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "str q21, [%[c_ptr], #256]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "str q29, [%[c_ptr], #272]\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "str q22, [%[c_ptr], #304]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "str q30, [%[c_ptr], #320]\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - "str q15, [%[c_ptr], #336]\n" - - "b 3f\n" - - // Detached final iteration (odd K) - "2:\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "str q8, [%[c_ptr], #0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "str q16, 
[%[c_ptr], #16]\n" - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "add %[a_ptr], %[a_ptr], #32\n" - "str q24, [%[c_ptr], #32]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "str q9, [%[c_ptr], #48]\n" - - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "str q17, [%[c_ptr], #64]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "str q25, [%[c_ptr], #80]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "str q18, [%[c_ptr], #112]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "str q26, [%[c_ptr], #128]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "str q19, [%[c_ptr], #160]\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "str q27, [%[c_ptr], #176]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "str q20, [%[c_ptr], #208]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "str q28, [%[c_ptr], #224]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "str q21, [%[c_ptr], #256]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "str q29, [%[c_ptr], #272]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "str q22, [%[c_ptr], #304]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "str q30, [%[c_ptr], #320]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - "str q15, [%[c_ptr], #336]\n" - - // Common tail - "3:\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" - ); - } - } -} - -inline void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0); -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp deleted file mode 100644 index 2a39ca1f07..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -// Actual kernel implementations -#include "generic.hpp" - -// Transposed SGEMV strategy class. -class sgemv_trans { -public: - typedef float operand_type; - typedef float result_type; - - typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int); - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int k_unroll = 1; - - kern_type kernel; - - sgemv_trans(const CPUInfo *ci) { - kernel = a64_sgemv_trans; - } -}; - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp deleted file mode 100644 index 33f2b701cf..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp +++ /dev/null @@ -1,913 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include <arm_neon.h> -#include "asmlib.hpp" - -// Kernel implementation - transposed GEMV -// -// The kernel will process "M" rows of A (= steps of dot product) and "N" -// columns (= dot products total) -// -// General plan is to do as many columns simultaneously as possible - a -// reasonable limit is half the NEON regfile = 64 total accumulators. -// -// It's possible that messing around with sub-blocking M and N can yield -// higher performance, but that's left to the outer loop. In this kernel we -// process all of M at the same time. - - -// How far ahead to prefetch for the first and subsequent prefetches. -// These values work for A72 on JunoR2... 
-
-#define FIRST_PFD 9
-#define PFD 6
-
-inline void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float alpha, int lda, int M, int N) {
-    const float *a_ptr_base = Astart;
-    float *y_ptr = Ystart;
-
-    register const float32x4_t va asm("v1") = vdupq_n_f32(alpha);
-
-    int firstpfd=FIRST_PFD;
-    if (firstpfd > M) {
-        firstpfd = (M-1);
-    }
-
-    int pfd = PFD;
-    if (pfd > M) {
-        pfd = (M-1);
-    }
-
-    ptrdiff_t jump = lda * sizeof(float);
-
-    for (;N>=96;N-=96) {
-        int k = M-1;
-
-        const float *a_ptr = a_ptr_base;
-        const float *x_ptr = Xstart;
-        const float *pf_ptr = a_ptr;
-        const float *firstpf_ptr = a_ptr;
-        const float *pf_limit = a_ptr + (M * lda);
-
-        for (int i=0; i<firstpfd; i++) {
-            prefetch_1x(firstpf_ptr);
-            firstpf_ptr += lda;
-        }
-
-        for (int i=0; i<pfd; i++) {
-            prefetch_5x(pf_ptr + 16);
-            pf_ptr += lda;
-        }
-
-        a_ptr_base += 96;
-
-        __asm __volatile (
-            "movi v8.4s,#0x0\n"
-            "ldr w0, [%[x_ptr]]\n"
-            "movi v9.4s,#0x0\n"
-            "ldr q2, [%[a_ptr], #0]\n"
-            "movi v10.4s,#0x0\n"
-            "ldr q3, [%[a_ptr], #0x10]\n"
-            "movi v11.4s,#0x0\n"
-            "ldr q4, [%[a_ptr], #0x20]\n"
-            "movi v12.4s,#0x0\n"
-            "ldr q5, [%[a_ptr], #0x30]\n"
-            "movi v13.4s,#0x0\n"
-            "ldr q6, [%[a_ptr], #0x40]\n"
-            "movi v14.4s,#0x0\n"
-            "ldr q7, [%[a_ptr], #0x50]\n"
-            "movi v15.4s,#0x0\n"
-            ASM_PREFETCH("[%[firstpf_ptr]]")
-            "movi v16.4s, #0x0\n"
-            "movi v17.4s, #0x0\n"
-            ASM_PREFETCH("[%[pf_ptr], #64]")
-            "movi v18.4s, #0x0\n"
-            "movi v19.4s, #0x0\n"
-            ASM_PREFETCH("[%[pf_ptr], #128]")
-            "movi v20.4s, #0x0\n"
-            "movi v21.4s, #0x0\n"
-            ASM_PREFETCH("[%[pf_ptr], #192]")
-            "movi v22.4s, #0x0\n"
-            "movi v23.4s, #0x0\n"
-            ASM_PREFETCH("[%[pf_ptr], #256]")
-            "movi v24.4s, #0x0\n"
-            "movi v25.4s, #0x0\n"
-            ASM_PREFETCH("[%[pf_ptr], #320]")
-            "movi v26.4s, #0x0\n"
-            "movi v27.4s, #0x0\n"
-            "add %[pf_ptr], %[pf_ptr], %[jump]\n"
-            "movi v28.4s, #0x0\n"
-            "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
-            "movi v29.4s, #0x0\n"
-            "movi v30.4s, #0x0\n"
-            "movi v31.4s, #0x0\n"
-
-            // Skip everything if there are no iterations of the main loop to do.
-            "cbz %w[k], 10f\n"
-
-            // Loop with all prefetches.  Exit this loop when firstpf_ptr
-            // hits pf_limit.
- "1:\n" - "dup v0.4s, w0\n" - "ldr w0, [%[x_ptr], #4]\n" - "add %[x_ptr], %[x_ptr], #0x4\n" - "fmla v8.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x60]\n" - "fmla v9.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x70]\n" - ASM_PREFETCH("[%[firstpf_ptr]]") - "fmla v10.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x80]\n" - "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n" - "fmla v11.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x90]\n" - "sub %w[k], %w[k], #1\n" - ASM_PREFETCH("[%[x_ptr], #128]") - "fmla v12.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0xa0]\n" - "fmla v13.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0xb0]\n" - ASM_PREFETCH("[%[pf_ptr], #0x40]") - "fmla v14.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0xc0]\n" - "fmla v15.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0xd0]\n" - "fmla v16.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0xe0]\n" - "fmla v17.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0xf0]\n" - ASM_PREFETCH("[%[pf_ptr], #0x80]") - "fmla v18.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x100]\n" - "fmla v19.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x110]\n" - "fmla v20.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x120]\n" - "fmla v21.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x130]\n" - ASM_PREFETCH("[%[pf_ptr], #0xc0]") - "fmla v22.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x140]\n" - "fmla v23.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x150]\n" - "fmla v24.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x160]\n" - "fmla v25.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x170]\n" - ASM_PREFETCH("[%[pf_ptr], #0x100]") - "add %[a_ptr], %[a_ptr], %[jump]\n" - "fmla v26.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x00]\n" - "fmla v27.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x10]\n" - "fmla v28.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x20]\n" - "fmla v29.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x30]\n" - ASM_PREFETCH("[%[pf_ptr], #0x140]") - "fmla v30.4s, v6.4s, v0.4s\n" - "add %[pf_ptr], %[pf_ptr], %[jump]\n" - "ldr q6, [%[a_ptr], #0x40]\n" - "fmla v31.4s, v7.4s, v0.4s\n" - "cmp %[firstpf_ptr], %[pf_limit]\n" - "ldr q7, [%[a_ptr], #0x50]\n" - "blt 1b\n" - - // Check that there are still "main" prefetches to do. - "cmp %[pf_ptr], %[pf_limit]\n" - "bge 9f\n" - - // Just the main prefetches, exit this loop when pf_ptr hits pf_limit. 
- "8:\n" - "dup v0.4s, w0\n" - "ldr w0, [%[x_ptr], #4]\n" - "add %[x_ptr], %[x_ptr], #0x4\n" - "fmla v8.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x60]\n" - "fmla v9.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x70]\n" - "fmla v10.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x80]\n" - "fmla v11.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x90]\n" - "sub %w[k], %w[k], #1\n" - ASM_PREFETCH("[%[x_ptr], #128]") - "fmla v12.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0xa0]\n" - "fmla v13.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0xb0]\n" - ASM_PREFETCH("[%[pf_ptr], #0x40]") - "fmla v14.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0xc0]\n" - "fmla v15.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0xd0]\n" - "fmla v16.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0xe0]\n" - "fmla v17.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0xf0]\n" - ASM_PREFETCH("[%[pf_ptr], #0x80]") - "fmla v18.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x100]\n" - "fmla v19.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x110]\n" - "fmla v20.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x120]\n" - "fmla v21.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x130]\n" - ASM_PREFETCH("[%[pf_ptr], #0xc0]") - "fmla v22.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x140]\n" - "fmla v23.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x150]\n" - "fmla v24.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x160]\n" - "fmla v25.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x170]\n" - ASM_PREFETCH("[%[pf_ptr], #0x100]") - "add %[a_ptr], %[a_ptr], %[jump]\n" - "fmla v26.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x00]\n" - "fmla v27.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x10]\n" - "fmla v28.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x20]\n" - "fmla v29.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x30]\n" - ASM_PREFETCH("[%[pf_ptr], #0x140]") - "fmla v30.4s, v6.4s, v0.4s\n" - "add %[pf_ptr], %[pf_ptr], %[jump]\n" - "ldr q6, [%[a_ptr], #0x40]\n" - "fmla v31.4s, v7.4s, v0.4s\n" - "cmp %[pf_ptr], %[pf_limit]\n" - "ldr q7, [%[a_ptr], #0x50]\n" - "blt 8b\n" - - // Check that there is still work to do. - "9:\n" - "cmp %w[k], #0\n" - "beq 10f\n" - - // Loop without prefetches, exit when k hits 0. 
- "2:\n" - "dup v0.4s, w0\n" - "ldr w0, [%[x_ptr], #4]\n" - "add %[x_ptr], %[x_ptr], #0x4\n" - "fmla v8.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x60]\n" - "fmla v9.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x70]\n" - "fmla v10.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x80]\n" - "fmla v11.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x90]\n" - "subs %w[k], %w[k], #1\n" - "fmla v12.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0xa0]\n" - "fmla v13.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0xb0]\n" - "fmla v14.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0xc0]\n" - "fmla v15.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0xd0]\n" - "fmla v16.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0xe0]\n" - "fmla v17.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0xf0]\n" - "fmla v18.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x100]\n" - "fmla v19.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x110]\n" - "fmla v20.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x120]\n" - "fmla v21.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x130]\n" - "fmla v22.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x140]\n" - "fmla v23.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x150]\n" - "fmla v24.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x160]\n" - "fmla v25.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x170]\n" - "add %[a_ptr], %[a_ptr], %[jump]\n" - "fmla v26.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x00]\n" - "fmla v27.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x10]\n" - "fmla v28.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x20]\n" - "fmla v29.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x30]\n" - "fmla v30.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x40]\n" - "fmla v31.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x50]\n" - "bne 2b\n" - - "10:\n" - - // Final iteration - "dup v0.4s, w0\n" - "fmla v8.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x60]\n" - "fmla v9.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x70]\n" - "fmla v10.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x80]\n" - "fmla v11.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x90]\n" - "fmla v12.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0xa0]\n" - "fmla v13.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0xb0]\n" - "fmla v14.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0xc0]\n" - "fmla v15.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0xd0]\n" - "fmla v16.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0xe0]\n" - "fmla v17.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0xf0]\n" - "fmla v18.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x100]\n" - "fmla v19.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x110]\n" - "fmla v20.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x120]\n" - "fmla v21.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x130]\n" - "fmla v22.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x140]\n" - "fmla v23.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x150]\n" - "fmla v24.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x160]\n" - "fmla v25.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x170]\n" - "fmla v26.4s, v2.4s, v0.4s\n" - "ldr q2, [%[y_ptr]]\n" - "fmla v27.4s, v3.4s, v0.4s\n" - "ldr q3, [%[y_ptr], #0x10]\n" - "fmla v28.4s, v4.4s, v0.4s\n" - "ldr q4, [%[y_ptr], #0x20]\n" - "fmla v29.4s, v5.4s, v0.4s\n" - "ldr q5, [%[y_ptr], #0x30]\n" - "fmla v30.4s, v6.4s, v0.4s\n" - "ldr q6, [%[y_ptr], #0x40]\n" - "fmla v31.4s, v7.4s, v0.4s\n" - "ldr q7, [%[y_ptr], #0x50]\n" - - "fmla v2.4s, v8.4s, %[va].4s\n" - "ldr q8, [%[y_ptr], #0x60]\n" - "fmla v3.4s, v9.4s, %[va].4s\n" - "ldr q9, [%[y_ptr], #0x70]\n" - "fmla v4.4s, v10.4s, %[va].4s\n" - "ldr q10, [%[y_ptr], #0x80]\n" - "fmla v5.4s, v11.4s, %[va].4s\n" - "ldr q11, 
[%[y_ptr], #0x90]\n"
-            "fmla v6.4s, v12.4s, %[va].4s\n"
-            "ldr q12, [%[y_ptr], #0xa0]\n"
-            "str q2, [%[y_ptr], #0x00]\n"
-            "fmla v7.4s, v13.4s, %[va].4s\n"
-            "ldr q13, [%[y_ptr], #0xb0]\n"
-            "str q3, [%[y_ptr], #0x10]\n"
-            "fmla v8.4s, v14.4s, %[va].4s\n"
-            "ldr q14, [%[y_ptr], #0xc0]\n"
-            "str q4, [%[y_ptr], #0x20]\n"
-            "fmla v9.4s, v15.4s, %[va].4s\n"
-            "ldr q15, [%[y_ptr], #0xd0]\n"
-            "str q5, [%[y_ptr], #0x30]\n"
-            "fmla v10.4s, v16.4s, %[va].4s\n"
-            "ldr q16, [%[y_ptr], #0xe0]\n"
-            "str q6, [%[y_ptr], #0x40]\n"
-            "fmla v11.4s, v17.4s, %[va].4s\n"
-            "ldr q17, [%[y_ptr], #0xf0]\n"
-            "str q7, [%[y_ptr], #0x50]\n"
-            "fmla v12.4s, v18.4s, %[va].4s\n"
-            "ldr q18, [%[y_ptr], #0x100]\n"
-            "str q8, [%[y_ptr], #0x60]\n"
-            "fmla v13.4s, v19.4s, %[va].4s\n"
-            "ldr q19, [%[y_ptr], #0x110]\n"
-            "str q9, [%[y_ptr], #0x70]\n"
-            "fmla v14.4s, v20.4s, %[va].4s\n"
-            "ldr q20, [%[y_ptr], #0x120]\n"
-            "str q10, [%[y_ptr], #0x80]\n"
-            "fmla v15.4s, v21.4s, %[va].4s\n"
-            "ldr q21, [%[y_ptr], #0x130]\n"
-            "str q11, [%[y_ptr], #0x90]\n"
-            "fmla v16.4s, v22.4s, %[va].4s\n"
-            "ldr q22, [%[y_ptr], #0x140]\n"
-            "str q12, [%[y_ptr], #0xa0]\n"
-            "fmla v17.4s, v23.4s, %[va].4s\n"
-            "ldr q23, [%[y_ptr], #0x150]\n"
-            "str q13, [%[y_ptr], #0xb0]\n"
-            "fmla v18.4s, v24.4s, %[va].4s\n"
-            "ldr q24, [%[y_ptr], #0x160]\n"
-            "str q14, [%[y_ptr], #0xc0]\n"
-            "fmla v19.4s, v25.4s, %[va].4s\n"
-            "ldr q25, [%[y_ptr], #0x170]\n"
-            "str q15, [%[y_ptr], #0xd0]\n"
-            "fmla v20.4s, v26.4s, %[va].4s\n"
-            "str q16, [%[y_ptr], #0xe0]\n"
-            "fmla v21.4s, v27.4s, %[va].4s\n"
-            "str q17, [%[y_ptr], #0xf0]\n"
-            "fmla v22.4s, v28.4s, %[va].4s\n"
-            "str q18, [%[y_ptr], #0x100]\n"
-            "fmla v23.4s, v29.4s, %[va].4s\n"
-            "str q19, [%[y_ptr], #0x110]\n"
-            "fmla v24.4s, v30.4s, %[va].4s\n"
-            "str q20, [%[y_ptr], #0x120]\n"
-            "fmla v25.4s, v31.4s, %[va].4s\n"
-            "str q21, [%[y_ptr], #0x130]\n"
-
-            "stp q22, q23, [%[y_ptr], #0x140]\n"
-            "stp q24, q25, [%[y_ptr], #0x160]\n"
-            "add %[y_ptr], %[y_ptr], #0x180\n"
-
-        : [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr), [y_ptr] "+r" (y_ptr), [k] "+r" (k), [pf_ptr] "+r" (pf_ptr), [firstpf_ptr] "+r" (firstpf_ptr)
-        : [jump] "r" (jump), [va] "w" (va), [pf_limit] "r" (pf_limit)
-        : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
-          "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
-          "v27", "v28", "v29", "v30", "v31", "cc"
-        );
-    }
-
-    if (N>0) {
-        // Handle N tail - up to 95 stragglers.
-        // This is 0-23 vectors, plus optionally a 64-bit vector and/or a
-        // single value for the remainder.
-
-        // Independent pointers into the matrix for the odd 2 and odd 1.
-        // These double up as flags to indicate whether they are needed.
-        const float *odd2_aptr=NULL;
-        const float *odd1_aptr=NULL;
-
-        // Figure out how much work we need to do.
-        int numvecs = N/4;
-        int rem = N%4;
-        int k=M;
-
-        // Set up pointers for the odd 2/1 if needed.
-        if (rem >= 2) {
-            odd2_aptr = a_ptr_base + (numvecs * 4);
-        }
-
-        if (rem & 1) {
-            odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr==NULL ? 0 : 2);
-        }
-
-        const float *a_ptr = a_ptr_base;
-        const float *firstpf_ptr = a_ptr_base;
-        const float *pf_ptr = a_ptr_base;
-        const float *pf_limit = a_ptr + (M * lda);
-
-        const float *x_ptr = Xstart;
-        int vecs=0;    // Working variable to count how many vectors to work on.
-        int dopf=1;    // Track whether we are doing prefetches.
-
-        // Figure out how many cache lines we need to prefetch each time.
- int numpfs = (N + 15) / 16; - - // Do initial prefetches - for (int i=0; i<firstpfd+1; i++) { - prefetch_1x(firstpf_ptr); - firstpf_ptr += lda; - } - - // Do "main" prefetches - adapt number to the number we actually need. - if (numpfs > 1) { - for (int i=0; i<pfd+1; i++) { - switch (numpfs) { - case 2: - prefetch_1x(pf_ptr + 16); - break; - - case 3: - prefetch_2x(pf_ptr + 16); - break; - - case 4: - prefetch_3x(pf_ptr + 16); - break; - - case 5: - prefetch_4x(pf_ptr + 16); - break; - - case 6: - prefetch_5x(pf_ptr + 16); - break; - } - pf_ptr += lda; - } - } else { - // Just disable additional prefetches - dopf=0; - } - - // Do the real work - __asm __volatile ( - // Initialize all the vectors - not worth skipping this if only - // some are needed. - "movi v8.4s,#0x0\n" - "ldr w0, [%[x_ptr]]\n" - "movi v9.4s,#0x0\n" - "movi v10.4s,#0x0\n" - "movi v11.4s,#0x0\n" - "movi v12.4s,#0x0\n" - "movi v13.4s,#0x0\n" - "movi v14.4s,#0x0\n" - "movi v15.4s,#0x0\n" - "movi v16.4s, #0x0\n" - "movi v17.4s, #0x0\n" - "movi v18.4s, #0x0\n" - "movi v19.4s, #0x0\n" - "movi v20.4s, #0x0\n" - "movi v21.4s, #0x0\n" - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v6.2s, #0x0\n" - "movi v5.2s, #0x0\n" - - "1:\n" - ASM_PREFETCH("[%[firstpf_ptr]]\n") - "11:\n" - "dup v0.4s, w0\n" - "ldr w0, [%[x_ptr], #4]\n" - "add %[x_ptr], %[x_ptr], #4\n" - - "cbz %w[numvecs], 2f\n" - "mov %w[vecs], %w[numvecs]\n" - - // Vector 0 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x00]\n" - "fmla v8.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 1 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x10]\n" - "fmla v9.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 2 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x20]\n" - "fmla v10.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 3 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x30]\n" - "fmla v11.4s, v7.4s, v0.4s\n" - // Prefetch - "cbz %w[dopf], 3f\n" - ASM_PREFETCH("[%[pf_ptr], #0x40]") - "3:\n" - "beq 2f\n" - - // Vector 4 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x40]\n" - "fmla v12.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 5 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x50]\n" - "fmla v13.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 6 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x60]\n" - "fmla v14.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 7 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x70]\n" - "fmla v15.4s, v7.4s, v0.4s\n" - // Prefetch - "cbz %w[dopf], 4f\n" - ASM_PREFETCH("[%[pf_ptr], #0x80]") - "4:\n" - "beq 2f\n" - - // Vector 8 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x80]\n" - "fmla v16.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 9 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x90]\n" - "fmla v17.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 10 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0xa0]\n" - "fmla v18.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 11 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0xb0]\n" - "fmla v19.4s, v7.4s, v0.4s\n" - // Prefetch - "cbz %w[dopf], 5f\n" - ASM_PREFETCH("[%[pf_ptr], #0xc0]") - "5:\n" - "beq 2f\n" - - // Vector 12 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0xc0]\n" - "fmla v20.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 13 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0xd0]\n" - "fmla v21.4s, v7.4s, 
v0.4s\n" - "beq 2f\n" - // Vector 14 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0xe0]\n" - "fmla v22.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 15 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0xf0]\n" - "fmla v23.4s, v7.4s, v0.4s\n" - // Prefetch - "cbz %w[dopf], 6f\n" - ASM_PREFETCH("[%[pf_ptr], #0x100]") - "6:\n" - "beq 2f\n" - - // Vector 16 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x100]\n" - "fmla v24.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 17 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x110]\n" - "fmla v25.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 18 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x120]\n" - "fmla v26.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 19 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x130]\n" - "fmla v27.4s, v7.4s, v0.4s\n" - // Prefetch - "cbz %w[dopf], 7f\n" - ASM_PREFETCH("[%[pf_ptr], #0x140]") - "7:\n" - "beq 2f\n" - - // Vector 20 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x140]\n" - "fmla v28.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 21 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x150]\n" - "fmla v29.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 22 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x160]\n" - "fmla v30.4s, v7.4s, v0.4s\n" - - "2:\n" - "add %[a_ptr], %[a_ptr], %[jump]\n" - - // Do the odd 2-vector, if needed - "cbz %[odd2_aptr], 8f\n" - "ldr d7, [%[odd2_aptr]]\n" - "fmla v6.2s, v7.2s, v0.2s\n" - "add %[odd2_aptr], %[odd2_aptr], %[jump]\n" - - "8:\n" - // Do the odd 1-vector, if needed - "cbz %[odd1_aptr], 9f\n" - "ldr s7, [%[odd1_aptr]]\n" - "fmla v5.2s, v7.2s, v0.2s\n" - "add %[odd1_aptr], %[odd1_aptr], %[jump]\n" - - // Get out if needed. - "9:\n" - "subs %w[k], %w[k], #1\n" - "beq 10f\n" - - // Update the "main" prefetch pointer, if it strays beyond the limit turn off "dopf" - "add %[pf_ptr], %[pf_ptr], %[jump]\n" - "cmp %[pf_ptr], %[pf_limit]\n" - "csel %w[dopf], %w[dopf], WZR, LT\n" - - // Update the "leading" prefetch pointer, don't do the first - // instruction of the loop if it's over the limit. 
- "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n" - "cmp %[firstpf_ptr], %[pf_limit]\n" - "blt 1b\n" - "b 11b\n" - - // Now write out the outputs - "10:\n" - "cbz %w[numvecs], 12f\n" - "mov %w[vecs], %w[numvecs]\n" - - // Vector 0 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v8.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 1 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v9.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 2 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v10.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 3 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v11.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 4 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v12.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 5 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v13.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 6 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v14.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 7 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v15.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 8 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v16.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 9 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v17.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 10 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v18.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 11 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v19.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 12 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v20.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 13 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v21.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 14 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v22.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 15 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v23.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 16 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v24.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 17 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v25.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 18 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v26.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 19 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v27.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 20 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v28.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 21 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, 
v29.4s, %[va].4s\n"
-            "str q7, [%[y_ptr]], #0x10\n"
-            "beq 12f\n"
-            // Vector 22
-            "subs %w[vecs], %w[vecs], #1\n"
-            "ldr q7, [%[y_ptr]]\n"
-            "fmla v7.4s, v30.4s, %[va].4s\n"
-            "str q7, [%[y_ptr]], #0x10\n"
-
-            // Odd 2
-            "12:\n"
-            "cbz %[odd2_aptr], 13f\n"
-            "ldr d7, [%[y_ptr]]\n"
-            "fmla v7.2s, v6.2s, %[va].2s\n"
-            "str d7, [%[y_ptr]], #0x8\n"
-
-            // Odd 1
-            "13:\n"
-            "cbz %[odd1_aptr], 14f\n"
-            "ldr s7, [%[y_ptr]]\n"
-            "fmla v7.2s, v5.2s, %[va].2s\n"
-            "str s7, [%[y_ptr]]\n"
-
-            "14:\n"
-        : [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr), [y_ptr] "+r" (y_ptr), [k] "+r" (k),
-          [pf_ptr] "+r" (pf_ptr), [firstpf_ptr] "+r" (firstpf_ptr),
-          [odd1_aptr] "+r" (odd1_aptr), [odd2_aptr] "+r" (odd2_aptr),
-          [dopf] "+r" (dopf), [vecs] "+r" (vecs)
-        : [jump] "r" (jump), [va] "w" (va), [pf_limit] "r" (pf_limit), [numvecs] "r" (numvecs)
-        : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
-          "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
-          "v27", "v28", "v29", "v30", "v31", "cc"
-        );
-    }
-}
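
For readers reconstructing what the deleted a64_sgemm_asimd_12x8 kernel did: each block pass accumulates one 8-row by 12-column tile of C in registers v8-v31, consuming 8 values of A and 12 values of B per K step. The scalar sketch below models that tile update only; it ignores the optional row_jump/block_jump adjustments, and the helper name and plain-array panel layout (A panels as K rows of 8 floats, B panels as K rows of 12) are ours, inferred from the load/store offsets in the assembly above.

    // Scalar model of one 12x8 block pass - a sketch, not the real kernel.
    static void sgemm_12x8_block_ref(const float *Apanel, const float *Bpanel,
                                     float *Cpanel, int K) {
        float acc[8][12] = {};                // plays the role of v8..v31
        for (int k = 0; k < K; k++) {
            const float *a = Apanel + k * 8;  // one interleaved "row" of A
            const float *b = Bpanel + k * 12; // one interleaved "row" of B
            for (int r = 0; r < 8; r++) {
                for (int c = 0; c < 12; c++) {
                    acc[r][c] += a[r] * b[c]; // one fmla lane
                }
            }
        }
        for (int r = 0; r < 8; r++) {         // row-major write-out, 384 bytes
            for (int c = 0; c < 12; c++) {
                Cpanel[r * 12 + c] = acc[r][c];
            }
        }
    }

The unrolled loop in the assembly is this update performed two k steps at a time, with the ldr/str traffic interleaved between fmla instructions to hide load latency.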
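
The transposed SGEMV kernel is easier to follow against a plain statement of its contract. A scalar sketch of what a64_sgemv_trans computes, assuming (as the pointer arithmetic above does) that lda is the element stride between consecutive rows of A; the function name is ours:

    #include <cstddef>

    // y[j] += alpha * dot(column j of A, x), for N columns and M rows.
    // Semantics only: the real kernel blocks 96 columns at a time into
    // v8..v31 and runs two software-prefetch streams ahead of the loads.
    static void sgemv_trans_ref(const float *A, const float *x, float *y,
                                float alpha, std::ptrdiff_t lda, int M, int N) {
        for (int j = 0; j < N; j++) {
            float acc = 0.0f;
            for (int i = 0; i < M; i++) {   // walk down the rows of A
                acc += A[i * lda + j] * x[i];
            }
            y[j] += alpha * acc;            // the final "fmla ... %[va]" pass
        }
    }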
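
The tail bookkeeping in the N>0 block (numvecs, odd2_aptr, odd1_aptr) is clearest with a concrete value. A small worked example for the largest possible tail; the variable names mirror the code above and the assert is ours:

    #include <cassert>

    int main() {
        int N = 95;              // largest possible tail (0..95 stragglers)
        int numvecs = N / 4;     // 23 full 4-float vectors -> v8..v30
        int rem     = N % 4;     // 3 -> both odd paths are taken
        assert(numvecs == 23 && rem == 3);
        // odd2_aptr covers columns 92..93 (one 64-bit d-register access),
        // odd1_aptr covers column 94 (one 32-bit s-register access).
        return 0;
    }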