author    Pablo Tello <pablo.tello@arm.com>  2018-02-23 13:43:50 +0000
committer Anthony Barbier <anthony.barbier@arm.com>  2018-11-02 16:49:16 +0000
commit    eb82fd2aa786715c3b6a941dc6d6deac4ce8e2a0 (patch)
tree      42cca378eed97c07348f28e1ec708d9c7ed531ce /arm_compute
parent    8df6c452820719d201ee79596cde8445c2071db5 (diff)
COMPMID-881: RSH new arm_gemm interface.
Change-Id: I1e2a1a77097d8017c274af3f97eba6964f80f5fa
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/122592
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'arm_compute')
-rw-r--r--  arm_compute/core/NEON/NEKernels.h | 8
-rw-r--r--  arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h | 48
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h | 48
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h | 48
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h | 60
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h | 61
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h | 62
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h | 48
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h | 48
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h | 91
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp (renamed from arm_compute/core/NEON/kernels/assembly/transforms/list.hpp) | 28
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp (renamed from arm_compute/core/NEON/kernels/assembly/merges/list.hpp) | 13
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/asmlib.hpp | 121
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemm_common.hpp | 79
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp | 177
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp | 101
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp | 82
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp | 410
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp | 413
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp | 350
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp | 68
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp | 313
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp | 61
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp | 398
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h | 66
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp | 363
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp | 61
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp | 465
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp | 68
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp | 314
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp | 65
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp | 396
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h | 66
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp | 354
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp | 61
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp | 281
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp | 67
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp | 384
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp | 337
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp | 81
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp | 368
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp | 368
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp | 360
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp | 358
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp | 50
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp | 913
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/mergeresults.hpp | 59
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp | 170
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp | 236
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp | 410
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/profiler.hpp | 103
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transform.hpp | 110
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp | 152
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp | 127
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp | 120
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp | 162
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp | 173
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp | 189
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp | 145
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp | 120
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp | 130
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp | 139
-rw-r--r--  arm_compute/runtime/NEON/AssemblyHelper.h | 173
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMM.h | 29
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h | 11
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h | 5
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 6
67 files changed, 798 insertions, 10953 deletions
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 5c15e5ecc4..7ec74eaccd 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -113,13 +113,5 @@
#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
#endif /* __ARM_COMPUTE_NEKERNELS_H__ */
diff --git a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h b/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h
deleted file mode 100644
index 4868f83d74..0000000000
--- a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__
-#define __ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** AArch32/armv7a NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMAArch32Kernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMAArch32Kernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h
deleted file mode 100644
index 5252378db7..0000000000
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__
-#define __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMAArch64Kernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMAArch64Kernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h
deleted file mode 100644
index ba78aae9f4..0000000000
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMAARCH64NATIVEKERNEL_H__
-#define __ARM_COMPUTE_NEGEMMAARCH64NATIVEKERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Native AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMAArch64NativeKernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMAArch64NativeKernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMMAARCH64NATIVEKERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h
deleted file mode 100644
index 83c209d48f..0000000000
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__
-#define __ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-
-// Enable only if compiled for AArch64-V8A targets
-#ifdef ARM_COMPUTE_AARCH64_V8A
-
-namespace arm_compute
-{
-class ITensor;
-
-/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMLowpAArch64A53Kernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpAArch64A53Kernel";
- }
- /** Default constructor */
- NEGEMMLowpAArch64A53Kernel();
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-
-private:
- using NEGEMMLowpAArch64A53 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
- const Window &window,
- const ThreadInfo &info);
- NEGEMMLowpAArch64A53 *_func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8A */
-#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h
deleted file mode 100644
index f813242fc9..0000000000
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__
-#define __ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-
-// Enable only if compiled for AArch64-V8A targets
-#ifdef ARM_COMPUTE_AARCH64_V8A
-
-namespace arm_compute
-{
-class ITensor;
-
-/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMLowpAArch64Kernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpAArch64Kernel";
- }
- /** Default constructor */
- NEGEMMLowpAArch64Kernel();
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-
-private:
- using NEGEMMLowpAArch64 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
- bool is_transposed_1, const Window &window,
- const ThreadInfo &info);
- NEGEMMLowpAArch64 *_func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8A */
-#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
deleted file mode 100644
index b854d3a9aa..0000000000
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__
-#define __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-
-// Enable only if compiled for AArch64-V8.2-A targets
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-
-namespace arm_compute
-{
-class ITensor;
-
-/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMLowpAArch64V8P4Kernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpAArch64V8P4Kernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMAssemblyBaseKernel
- *
- * The computed function is C = a * AxB + b * C.
- *
- * @param[in] input0 Input tensor info containing the Matrix A. Data types supported: QASYMM8
- * @param[in] input1 Input tensor info containing the Matrix B. Data types supported: same as @p input0
- * @param[in] output Output tensor info to store the result of matrix multiplication.
- * If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: S32
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
-#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h
deleted file mode 100644
index 9fb3ce415a..0000000000
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__
-#define __ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** AArch64 NEON kernel to multiply an input vector "A" and a matrix "B". */
-class NEGEMVAArch64Kernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMVAArch64Kernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h
deleted file mode 100644
index 75c4dbdaa4..0000000000
--- a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__
-#define __ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEHGEMMAArch64FP16Kernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEHGEMMAArch64FP16Kernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h
new file mode 100644
index 0000000000..646cc7861a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_H__
+#define __ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Utils.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** This class is a wrapper for the assembly kernels.
+ *
+ * Some kernels were written in assembly and highly optimised for specific CPUs like A53 or A55.
+ * This class works as a wrapper for these assembly kernels. The Compute Library creates an instance
+ * of NEGEMMAssemblyWrapper and other auxiliary data structures to execute a single assembly kernel
+ * in the context of an NEFunction.
+ *
+ * The type T is the type of the actual kernel implemented in assembly, which is of type
+ * template<typename To, typename Tr> class GemmCommon.
+ *
+ *
+ */
+template<typename T>
+class NEGEMMAssemblyWrapper final : public INEKernel
+{
+public:
+ /** Constructor
+ */
+ NEGEMMAssemblyWrapper() : _kernel(nullptr) {}
+
+ NEGEMMAssemblyWrapper(NEGEMMAssemblyWrapper &) = delete;
+ NEGEMMAssemblyWrapper(NEGEMMAssemblyWrapper &&) = default;
+ NEGEMMAssemblyWrapper & operator=(NEGEMMAssemblyWrapper &) = delete;
+
+ const char *name() const override
+ {
+ return "NEGEMMAssemblyWrapper";
+ }
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void*>(_kernel)));
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ auto first = window.x().start();
+ auto last = window.x().end();
+ _kernel->execute(first, last, info.thread_id);
+ }
+ /** Initialise the kernel with the assembly kernel to be run.
+ *
+ * @param[in] kernel Pointer to an assembly kernel implementation.
+ *                   The execution window is set to cover the kernel's full window size.
+ */
+ void configure(T *kernel)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void*>(kernel)));
+ _kernel = kernel;
+ auto win_last = _kernel->get_window_size();
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+ INEKernel::configure(win);
+ }
+private:
+ T* _kernel;
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_H__ */
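
Note (editorial, not part of the patch): a minimal sketch of how the new wrapper is intended to be driven, assuming an arm_gemm kernel object already exists. The function name run_assembly_gemm and the raw pointers/strides are placeholders; inside the library this wiring is done by the new AssemblyHelper.h listed further up in the diffstat.

// Hypothetical usage sketch, assuming a ready arm_gemm::GemmCommon<float, float> object.
#include "arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h"
#include "arm_compute/core/NEON/kernels/assembly/gemm_common.hpp"
#include "arm_compute/core/Window.h"

using FloatGemm = arm_gemm::GemmCommon<float, float>;

void run_assembly_gemm(FloatGemm *gemm_kernel,
                       const float *A, int lda, const float *B, int ldb,
                       float *C, int ldc)
{
    // Tell the arm_gemm object where its operands live.
    gemm_kernel->set_arrays(A, lda, B, ldb, C, ldc);

    // Wrap it so it can be run like any other NEON kernel.
    arm_compute::NEGEMMAssemblyWrapper<FloatGemm> wrapper;
    wrapper.configure(gemm_kernel); // window becomes [0, gemm_kernel->get_window_size())

    // Single-threaded execution over the full window; in the library this
    // would normally be dispatched through the scheduler instead.
    arm_compute::Window win;
    win.set(arm_compute::Window::DimX,
            arm_compute::Window::Dimension(0, gemm_kernel->get_window_size(), 1));
    arm_compute::ThreadInfo info{};
    wrapper.run(win, info);
}
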
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
index 8ad5b857fb..d6c9931a21 100644
--- a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,13 +21,19 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "a32_interleave_6way_32bit.hpp"
-#include "a32_transpose_interleave_8way_32bit.hpp"
-#include "a64_block16_interleave4_8bit.hpp"
-#include "a64_interleave_8way_16bit.hpp"
-#include "a64_interleave_8way_32bit.hpp"
-#include "a64_interleave_8way_half_to_float.hpp"
-#include "a64_transpose_interleave_12way_16bit.hpp"
-#include "a64_transpose_interleave_12way_half_to_float.hpp"
-#include "a64_transpose_interleave_24way_16bit.hpp"
-#include "transpose_interleave_common.hpp"
+#pragma once
+
+#include <memory>
+
+#include "arm_gemm_local.hpp"
+#include "gemm_common.hpp"
+
+namespace arm_gemm {
+
+template<typename Top, typename Tret>
+using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >;
+
+template<typename Top, typename Tret>
+UniqueGemmCommon<Top, Tret> gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB, const Tret alpha, const Tret beta, const int maxthreads, const bool pretransposed_hint);
+
+} // namespace arm_gemm
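
Note (editorial, not part of the patch): a hedged sketch of a call into the factory declared above. The wrapper function make_sgemm and its argument values are assumptions for illustration; CPUInfo is the type pulled in through arm_gemm_local.hpp / newgemm_lib.hpp, and the concrete GemmCommon subclass returned is an internal detail of arm_gemm.

// Hypothetical call into the arm_gemm factory declared above.
#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"

arm_gemm::UniqueGemmCommon<float, float> make_sgemm(const CPUInfo &ci,
                                                    unsigned int M, unsigned int N, unsigned int K)
{
    // trA/trB: operand transposition; alpha/beta: C = alpha*A*B + beta*C;
    // maxthreads: upper bound fixed at creation time;
    // pretransposed_hint: caller promises to pretranspose B up front.
    return arm_gemm::gemm<float, float>(ci, M, N, K,
                                        /* trA */ false, /* trB */ false,
                                        /* alpha */ 1.0f, /* beta */ 0.0f,
                                        /* maxthreads */ 4, /* pretransposed_hint */ false);
}
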
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
index 29b915a75d..a608566634 100644
--- a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "a32_merge_float_8x6.hpp"
-#include "a64_merge_float_12x8.hpp"
-//#include "a64_merge_float_to_half_12x8.hpp"
-//#include "a64_merge_half_24x8.hpp"
-//#include "a64_merge_int32_12x8.hpp"
+#pragma once
+
+/* This file is used to configure integration-specific aspects of arm_gemm; this is the gemm-linux version */
+
+/* Our CPUInfo is defined in newgemm_lib.hpp */
+#include "newgemm_lib.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/asmlib.hpp b/arm_compute/core/NEON/kernels/assembly/asmlib.hpp
deleted file mode 100644
index fa1d6e37a9..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/asmlib.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-// Macro to use in assembler to get a preload. Needed because of various
-// workarounds needed to get working preload behaviour.
-//
-// Code using these macros needs to clobber x20 and x21 as they might be
-// used by the workaround.
-
-#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n"
-#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n"
-#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
-#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
-
-#else
-
-#define ASM_PREFETCH(address) "PLD " address "\n"
-#define ASM_PREFETCHW(address) "PLDW " address "\n"
-
-#endif
-
-/*
- * Do some prefetches.
- */
-template <typename T>
-static inline void prefetch_6x(const T *pfp) {
- __asm __volatile (
- ASM_PREFETCH("[%[pfp]]")
- ASM_PREFETCH("[%[pfp], #64]")
- ASM_PREFETCH("[%[pfp], #128]")
- ASM_PREFETCH("[%[pfp], #192]")
- ASM_PREFETCH("[%[pfp], #256]")
- ASM_PREFETCH("[%[pfp], #320]")
- :
- : [pfp] "r" (pfp)
- : "memory"
- );
-}
-
-template <typename T>
-static inline void prefetch_5x(const T *pfp) {
- __asm __volatile (
- ASM_PREFETCH("[%[pfp]]")
- ASM_PREFETCH("[%[pfp], #64]")
- ASM_PREFETCH("[%[pfp], #128]")
- ASM_PREFETCH("[%[pfp], #192]")
- ASM_PREFETCH("[%[pfp], #256]")
- :
- : [pfp] "r" (pfp)
- : "memory"
- );
-}
-
-template <typename T>
-static inline void prefetch_4x(const T *pfp) {
- __asm __volatile (
- ASM_PREFETCH("[%[pfp]]")
- ASM_PREFETCH("[%[pfp], #64]")
- ASM_PREFETCH("[%[pfp], #128]")
- ASM_PREFETCH("[%[pfp], #192]")
- :
- : [pfp] "r" (pfp)
- : "memory"
- );
-}
-
-template <typename T>
-static inline void prefetch_3x(const T *pfp) {
- __asm __volatile (
- ASM_PREFETCH("[%[pfp]]")
- ASM_PREFETCH("[%[pfp], #64]")
- ASM_PREFETCH("[%[pfp], #128]")
- :
- : [pfp] "r" (pfp)
- : "memory"
- );
-}
-
-template <typename T>
-static inline void prefetch_2x(const T *pfp) {
- __asm __volatile (
- ASM_PREFETCH("[%[pfp]]")
- ASM_PREFETCH("[%[pfp], #64]")
- :
- : [pfp] "r" (pfp)
- : "memory"
- );
-}
-
-template <typename T>
-static inline void prefetch_1x(const T *pfp) {
- __asm __volatile (
- ASM_PREFETCH("[%[pfp]]")
- :
- : [pfp] "r" (pfp)
- : "memory"
- );
-}
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
index ef89e3aac3..7f47abcbb9 100644
--- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,11 +23,82 @@
*/
#pragma once
-// Abstract class for a GEMM function
+#include <cstddef>
+
+namespace arm_gemm {
+
+// Abstract class for the GEMM/GEMV functions.
+//
+// GEMM implementations may be "native" (never require any input
+// permutation), "pretransposed" (require permutation up-front) or require
+// working space (permute as they go along). This interface should support
+// all of them.
+
template<typename To, typename Tr>
class GemmCommon {
+protected:
+ const To *_Aptr=nullptr;
+ int _lda=0;
+ const To *_Bptr=nullptr;
+ int _ldb=0;
+ Tr *_Cptr=nullptr;
+ int _ldc=0;
+
public:
- virtual size_t get_working_size() const = 0;
- virtual void execute(const To *, const int, const To *, const int, Tr *, const int, const Tr, const Tr, void *working_space) const = 0;
+ /* Pass in the pointers to the arrays to be operated on and their
+ * strides. This has a default implementation that just captures them
+ * all in protected members. If B is pretransposed (see below) then the
+ * settings for B here are ignored. */
+ virtual void set_arrays(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc) {
+ _Aptr = A;
+ _lda = lda;
+ _Bptr = B;
+ _ldb = ldb;
+ _Cptr = C;
+ _ldc = ldc;
+ }
+
+ /* For threading, we divide the work into some number of units and work
+ * out internally what unit corresponds to what work. This returns the
+ * total number of units. */
+ virtual unsigned int get_window_size() const = 0;
+
+ /* The maximum thread count is specified when the GEMM is created. Some
+ * implementations need to know how many threads will actually run in
+ * order to work properly.
+ *
+ * In some cases, after creating the GEMM the number of threads needs to
+ * be reduced (e.g. not enough work to split across threads). This
+ * method allows the number of actual threads to be run to be set (must
+ * be equal or lower).
+ *
+ * This has an empty default implementation, as GEMMs which don't care
+ * about thread count can safely ignore this.
+ */
+ virtual void set_nthreads(int nthreads) { };
+
+ /* Actually do the work. Provide a threadid to index any per-thread
+ * buffers, and a start/end range to indicate which work to do. */
+ virtual void execute(unsigned int start, unsigned int end, int threadid) = 0;
+
+ /*** Working space interface (optional) ***/
+ /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */
+ virtual size_t get_working_size() const { return 0; }
+ /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
+ virtual void set_working_space(void *) { };
+
+ /*** "Pretransposed" interface (optional) ***/
+ /* Is this object set up for pretranspose? If so, pretranspose_B_array() needs to be called before execute(). */
+ virtual bool B_is_pretransposed() const { return false; }
+ /* Does pretranspose still need to be done? */
+ virtual bool B_pretranspose_required() const { return false; }
+ /* Total number of bytes of space needed for pretransposed arrays. */
+ virtual size_t get_B_pretransposed_array_size() const { return 0; }
+ /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
+ virtual void pretranspose_B_array(void *buffer, const To *, const int) { };
+
+ // Destructor
virtual ~GemmCommon() { }
};
+
+} // namespace arm_gemm
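
Note (editorial, not part of the patch): the comments above describe three optional facilities of the new interface (thread window, working space, pretranspose). A minimal sketch of the call sequence a caller might follow is shown below; backing the buffers with std::vector is an assumption for illustration, and set_arrays() is presumed to have been called already.

// Hypothetical driver for a GemmCommon object, following the interface above.
#include "arm_compute/core/NEON/kernels/assembly/gemm_common.hpp"
#include <cstdint>
#include <vector>

template <typename To, typename Tr>
void drive_gemm(arm_gemm::GemmCommon<To, Tr> &gemm, const To *B, int ldb)
{
    // set_arrays(A, lda, B, ldb, C, ldc) is assumed to have been called already.
    std::vector<uint8_t> workspace;
    std::vector<uint8_t> pretransposed_B;

    // Optional working space.
    if (gemm.get_working_size() > 0) {
        workspace.resize(gemm.get_working_size());
        gemm.set_working_space(workspace.data());
    }

    // Optional up-front transposition of B.
    if (gemm.B_is_pretransposed() && gemm.B_pretranspose_required()) {
        pretransposed_B.resize(gemm.get_B_pretransposed_array_size());
        gemm.pretranspose_B_array(pretransposed_B.data(), B, ldb);
    }

    // Single-threaded: process the whole window in one call.
    gemm.execute(0, gemm.get_window_size(), /* threadid */ 0);
}
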
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
deleted file mode 100644
index 659ef837f5..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <stdio.h>
-#include <cassert>
-
-#include "gemm_common.hpp"
-#include "profiler.hpp"
-#include "transform.hpp"
-#include "mergeresults.hpp"
-
-// Some macros used to decide how much working space to allocate.
-// Round allocations up to the next cache line.
-#define ALLOC_ROUND 64
-#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
-
-// Implementation of the GemmCommon abstract class.
-//
-// This implementation interleaves the source matrices in blocks - good for
-// larger matrices.
-template<typename strategy, typename To, typename Tr>
-class GemmInterleaved : public GemmCommon<To, Tr> {
- typedef typename strategy::operand_type Toi;
- typedef typename strategy::result_type Tri;
-
- const unsigned int M;
- const unsigned int N;
- const unsigned int K;
-
- const bool trA;
- const bool trB;
-
- const strategy strat;
-
- unsigned int k_block = 0;
- unsigned int x_block = 0;
- unsigned int Mround = 0;
-
- size_t get_a_working_size() const {
- return ROUND_UP(sizeof(Toi) * k_block * Mround);
- }
-
- size_t get_b_working_size() const {
- return ROUND_UP(sizeof(Toi) * x_block * k_block);
- }
-
- size_t get_c_working_size() const {
- return ROUND_UP(sizeof(Tri) * x_block * strat.out_height);
- }
-
-public:
- size_t get_working_size() const override {
- return get_a_working_size() + get_b_working_size() + get_c_working_size();
- }
-
- GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB) : M(M), N(N), K(K), trA(trA), trB(trB), strat(ci) {
- const unsigned int L1_size = ci->L1_size;
- const unsigned int L2_size = ci->L2_size;
-
- // Work out blocking parameters
- // k_block: Each iteration will consume (out_width + out_height)
- // operands - so how many iterations will fill the L1?
- k_block = L1_size / (sizeof(Toi) * (strat.out_width + strat.out_height));
-
- // Needs to be a multiple of the K unroll level.
- k_block /= strat.k_unroll;
- k_block *= strat.k_unroll;
-
- // Now tune to presented problem size; this is how many blocks we need.
- int num_k_blocks = (K + (k_block - 1)) / k_block;
-
- // So divide the space equally into that many blocks.
- k_block = (K + num_k_blocks - 1) / num_k_blocks;
-
- // And round UP to the K unroll level required.
- k_block = (k_block + strat.k_unroll - 1) / strat.k_unroll;
- k_block *= strat.k_unroll;
-
- // x_block: Work out how many rows (of length k_block) will fit in the L2
- x_block = L2_size / (sizeof(Toi) * k_block);
-
- // Needs to be a multiple of the kernel output width.
- x_block /= strat.out_width;
- x_block *= strat.out_width;
-
- // And tune to the presented problem size.
- int num_x_blocks = (N + (x_block - 1)) / x_block;
- x_block = (N + num_x_blocks - 1) / num_x_blocks;
-
- x_block = (x_block + strat.out_width - 1) / strat.out_width;
- x_block *= strat.out_width;
-
- // Work out the rounded size of M - needed for some buffers.
- Mround = (M + (strat.out_height - 1)) / strat.out_height;
- Mround *= strat.out_height;
-
- }
-
- // Actually execute the GEMM.
- void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
- assert(working_space);
- profiler prof;
- int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
- intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space_bytes);
- size_t diff = 0;
-
- if (working_space_int & 0xF) {
- diff = 0x10 - (working_space_int & 0xF);
- }
-
- Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + diff);
- Toi * const b_panel = reinterpret_cast<Toi *>(working_space_bytes + get_a_working_size() + diff);
- Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + get_a_working_size() + get_b_working_size() + diff);
-
- for (unsigned int k0=0; k0<K; k0 += k_block) {
- unsigned int kmax = k0 + k_block;
- if (kmax > K) kmax = K;
-
- // Figure out how many "K" the kernel will actually process.
- int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll;
- kern_k *= strat.k_unroll;
-
- prof(PROFILE_PREPA, (M * (kmax-k0) * sizeof(Toi)), [&](void) {
- if (trA ^ strategy::A_transpose) {
- Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, A, lda, 0, M, k0, kmax);
- } else {
- Transform<strategy::A_interleave, strategy::A_block, false>(a_panel, A, lda, 0, M, k0, kmax);
- }
- });
-
- for (unsigned int x0=0; x0<N; x0 += x_block) {
- unsigned int xmax = x0 + x_block;
- if (xmax > N) xmax = N;
-
- int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width;
-
- prof(PROFILE_PREPB, (xmax-x0) * (kmax-k0) * sizeof(Toi), [&](void) {
- if (trB ^ strategy::B_transpose) {
- Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, B, ldb, x0, xmax, k0, kmax);
- } else {
- Transform<strategy::B_interleave, strategy::B_block, false>(b_panel, B, ldb, x0, xmax, k0, kmax);
- }
- });
-
- for (unsigned int y=0; y<M; y+=strat.out_height) {
- unsigned int ymax = y + strat.out_height;
- if (ymax > M) ymax = M;
-
- prof(PROFILE_KERNEL, (strat.out_height * bblocks * strat.out_width * kern_k), [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); });
- prof(PROFILE_MERGE, (strat.out_height * bblocks * strat.out_width * sizeof(Tr)), [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); });
- }
- }
- }
- }
-};
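
Note (editorial, not part of the patch): a worked example of the blocking arithmetic in the constructor of the GemmInterleaved class deleted above. The cache sizes and the 12x8 kernel shape are assumed values for illustration, not figures taken from this commit.

// Worked example (assumed figures) of the deleted GemmInterleaved blocking logic.
#include <cstdio>

int main()
{
    // Assumed: 32 KB L1, 512 KB L2, float operands, a 12x8 kernel with k_unroll = 1,
    // and a 1000x1000x1000 problem.
    const unsigned int L1_size = 32 * 1024, L2_size = 512 * 1024;
    const unsigned int out_width = 12, out_height = 8, k_unroll = 1;
    const unsigned int M = 1000, N = 1000, K = 1000;
    const unsigned int sz = sizeof(float);

    // k_block: how deep a K-slice fits in L1 for one (A panel + B panel) pass.
    unsigned int k_block = L1_size / (sz * (out_width + out_height)); // 409
    k_block = (k_block / k_unroll) * k_unroll;
    unsigned int num_k_blocks = (K + k_block - 1) / k_block;          // 3
    k_block = (K + num_k_blocks - 1) / num_k_blocks;                  // 334
    k_block = ((k_block + k_unroll - 1) / k_unroll) * k_unroll;       // 334

    // x_block: how many columns of that K-slice fit in L2.
    unsigned int x_block = L2_size / (sz * k_block);                  // 392
    x_block = (x_block / out_width) * out_width;                      // 384
    unsigned int num_x_blocks = (N + x_block - 1) / x_block;          // 3
    x_block = (N + num_x_blocks - 1) / num_x_blocks;                  // 334
    x_block = ((x_block + out_width - 1) / out_width) * out_width;    // 336

    // M rounded up to a whole number of output tiles.
    unsigned int Mround = ((M + out_height - 1) / out_height) * out_height; // 1000

    printf("k_block=%u x_block=%u Mround=%u\n", k_block, x_block, Mround);
    return 0;
}
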
diff --git a/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp b/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp
deleted file mode 100644
index 098fdaa7ac..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <stdio.h>
-
-#include "gemm_common.hpp"
-
-#include "profiler.hpp"
-#include "transform.hpp"
-#include "mergeresults.hpp"
-
-// Some macros used to decide how much working space to allocate.
-// Round allocations up to the next cache line.
-#define ALLOC_ROUND 64
-#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
-
-// Implementation of the GemmCommon abstract class.
-//
-// This is implementation is for GEMV with a transposed matrix.
-//
-// By default the source data is used in-place, but if type conversion is
-// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED).
-
-template<typename strategy, typename To, typename Tr>
-class GemvTransposed : public GemmCommon<To, Tr> {
- typedef typename strategy::operand_type Toi;
- typedef typename strategy::result_type Tri;
-
- const unsigned int N;
- const unsigned int K;
-
- const strategy strat;
-
- unsigned int m_block;
- unsigned int n_block;
-
- size_t get_a_working_size() const {
- return ROUND_UP(sizeof(Toi) * m_block);
- }
-
- size_t get_b_working_size() const {
- return ROUND_UP(sizeof(Toi) * m_block * n_block);
- }
-
- size_t get_c_working_size() const {
- return ROUND_UP(sizeof(Tri) * n_block);
- }
-
-public:
- size_t get_working_size() const override {
- return get_a_working_size() + get_b_working_size() + get_c_working_size();
- }
-
- GemvTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K) : N(N), K(K), strat(ci) {
- /* For now don't do any blocking. TODO: figure out if we should. */
- m_block = K;
- n_block = N;
- }
-
- // Actually execute the GEMV.
- void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
- profiler prof;
-
- static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
- static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
-
- for (unsigned int m0=0; m0<K; m0+=m_block) {
- unsigned int mmax = m0 + m_block;
- if (mmax > K) mmax = K;
-
- for (unsigned int n0=0; n0<N; n0+=n_block) {
- unsigned int nmax = n0 + n_block;
- if (nmax > N) nmax = N;
-
- prof(PROFILE_KERNEL, ((mmax-m0) * (nmax-n0)), [&](void) { strat.kernel(B + (m0 * ldb) + n0, A + m0, C + n0, alpha, ldb, (mmax-m0), (nmax-n0)); });
- }
- }
- }
-};
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp
deleted file mode 100644
index d78d33c647..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __arm__
-
-// Actual kernel implementations
-#include "a32_sgemm_8x6/a53.hpp"
-#include "a32_sgemm_8x6/a55r1.hpp"
-#include "a32_sgemm_8x6/generic.hpp"
-
-// 8x6 SGEMM "strategy" class.
-//
-// This describes the characteristics of a family of kernels, in terms of
-// the required interleave properties and the output block size.
-//
-// All kernels in the family must share these characteristics. The actual
-// kernel to be used can be chosen at runtime, based on the CPU_type
-// structure.
-class sgemm_8x6 {
-public:
- typedef float operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
-
- /* Describes the data layout for A input */
- static const int A_interleave = 6;
- static const int A_block = 1;
- static const int A_transpose = 0;
-
- /* Same for B input */
- static const int B_interleave = 8;
- static const int B_block = 1;
- static const int B_transpose = 1;
-
- /* Kernel blocking parameters */
- static const int out_width = 8;
- static const int out_height = 6;
- static const int k_unroll = 1;
-
- kern_type kernel = nullptr;
-
- sgemm_8x6(const CPUInfo *ci) {
- switch(ci->CPU) {
- case CPUTarget::A53:
- kernel = a32_sgemm_8x6_a53;
- break;
-
- case CPUTarget::A55_DOT:
- kernel = a32_sgemm_8x6_a55r1;
- break;
-
- default:
- kernel = a32_sgemm_8x6;
- break;
- }
- }
-};
-
-#endif // __arm__
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp
deleted file mode 100644
index 6bfbfc8742..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __arm__
-
-#include <arm_neon.h>
-
-#include "../../asmlib.hpp"
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 8x6), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
-
-inline void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- int tails = (K & 3);
- if (tails == 0) {
- tails = 4;
- }
- int k = ((K+3)/4) - 1;
-
- __asm __volatile (
- "vmov.i32 q4, #0\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]\n"
- "vmov.i32 q5, #0\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]\n"
- "vmov.i32 q6, #0\n"
- "ldr r0, [%[a_ptr], #0x10]\n"
- "vmov.i32 q7, #0\n"
- "ldr r1, [%[a_ptr], #0x14]\n"
- "vmov.i32 q8, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0x40]")
- "vmov.i32 q9, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0x40]")
- "vmov.i32 q10, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0x80]")
- "vmov.i32 q11, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0x80]")
- "vmov.i32 q12, #0\n"
- "vmov.i32 q13, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0xC0]")
- "vmov.i32 q14, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0XC0]")
- "vmov.i32 q15, #0\n"
- "cmp %[k], #0\n"
- "beq 6f\n"
-
- "1:\n"
- // Unroll 0
- "vldr d6, [%[b_ptr], #0x10]\n"
- "vmov d2, r0, r1\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "ldr r0, [%[b_ptr], #0x18]\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "ldr r1, [%[b_ptr], #0x1C]\n"
- "vmla.f32 q6, q2, d1[0]\n"
-
- "vldr d3, [%[a_ptr], #0x18]\n"
- "vmov d7, r0, r1\n"
- "vmla.f32 q7, q2, d1[1]\n"
- ASM_PREFETCH("[%[a_ptr], #0x100]")
- "vmla.f32 q8, q2, d2[0]\n"
- "vmla.f32 q9, q2, d2[1]\n"
-
- "vldr d4, [%[b_ptr], #0x20]\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "ldr r0, [%[b_ptr], #0x28]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "ldr r1, [%[b_ptr], #0x2C]\n"
- "vmla.f32 q12, q3, d1[0]\n"
-
- "vldr d0, [%[a_ptr], #0x20]\n"
- "vmov d5, r0, r1\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "ldr r0, [%[a_ptr], #0x28]\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "ldr r1, [%[a_ptr], #0x2C]\n"
- "vmla.f32 q15, q3, d2[1]\n"
-
- // Unroll 1
- "vldr d6, [%[b_ptr], #0x30]\n"
- "vmov d1, r0, r1\n"
- "vmla.f32 q4, q2, d3[0]\n"
- "ldr r0, [%[b_ptr], #0x38]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "ldr r1, [%[b_ptr], #0x3C]\n"
- "vmla.f32 q6, q2, d0[0]\n"
-
- "vldr d2, [%[a_ptr], #0x30]\n"
- "vmov d7, r0, r1\n"
- "vmla.f32 q7, q2, d0[1]\n"
- ASM_PREFETCH("[%[b_ptr], #0x100]")
- "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
-
- "vldr d4, [%[b_ptr], #0x40]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "ldr r0, [%[b_ptr], #0x48]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "ldr r1, [%[b_ptr], #0x4C]\n"
- "vmla.f32 q12, q3, d0[0]\n"
-
- "vldr d3, [%[a_ptr], #0x38]\n"
- "vmov d5, r0, r1\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "ldr r0, [%[a_ptr], #0x40]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "ldr r1, [%[a_ptr], #0x44]\n"
- "vmla.f32 q15, q3, d1[1]\n"
-
- // Unroll 2
- "vldr d6, [%[b_ptr], #0x50]\n"
- "vmov d0, r0, r1\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "ldr r0, [%[b_ptr], #0x58]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "ldr r1, [%[b_ptr], #0x5C]\n"
- "vmla.f32 q6, q2, d3[0]\n"
-
- "vldr d1, [%[a_ptr], #0x48]\n"
- "vmov d7, r0, r1\n"
- "vmla.f32 q7, q2, d3[1]\n"
- ASM_PREFETCH("[%[a_ptr], #0x140]")
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
-
- "vldr d4, [%[b_ptr], #0x60]\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "ldr r0, [%[b_ptr], #0x68]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "ldr r1, [%[b_ptr], #0x6C]\n"
- "vmla.f32 q12, q3, d3[0]\n"
-
- "vldr d2, [%[a_ptr], #0x50]\n"
- "vmov d5, r0, r1\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "ldr r0, [%[a_ptr], #0x58]\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "ldr r1, [%[a_ptr], #0x5C]\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "add %[a_ptr], %[a_ptr], #0x60\n"
-
- // Unroll 3
- "vldr d6, [%[b_ptr], #0x70]\n"
- "vmov d3, r0, r1\n"
- "vmla.f32 q4, q2, d1[0]\n"
- "ldr r0, [%[b_ptr], #0x78]\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "ldr r1, [%[b_ptr], #0x7C]\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
-
- "vldr d0, [%[a_ptr], #0x00]\n"
- "vmov d7, r0, r1\n"
- "vmla.f32 q7, q2, d2[1]\n"
- ASM_PREFETCH("[%[b_ptr], #0xC0]")
- "vmla.f32 q8, q2, d3[0]\n"
- "vmla.f32 q9, q2, d3[1]\n"
-
- "vldr d4, [%[b_ptr], #0x00]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "ldr r0, [%[b_ptr], #0x08]\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "ldr r1, [%[b_ptr], #0x0C]\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "subs %[k], %[k], #1\n"
-
- "vldr d1, [%[a_ptr], #0x08]\n"
- "vmov d5, r0, r1\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "ldr r0, [%[a_ptr], #0x10]\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "ldr r1, [%[a_ptr], #0x14]\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "bne 1b\n"
-
- // "Tails" shows how many multiply blocks are needed at the
- // end, must be 1-4 inclusive. Bail out to alternative tail
- // immediately if it's 1.
- "6:\n"
- "subs %[tails], %[tails], #1\n"
- "beq 3f\n"
-
- // Detached final iteration - for now adapt the generic
- // tails rather than reimplementing for A53.
-
- // Unroll 0
- "vmov d2, r0, r1\n"
- "add %[a_ptr], %[a_ptr], #0x18\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "vld1.32 {d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "add %[b_ptr], %[b_ptr], #0x10\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d0[0]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "beq 4f\n"
-
- // Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d3[0]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "beq 5f\n"
-
- // Unroll 2
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d2[0]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
-
- // Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d2[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d3[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
-
- // tails==1 final tail
- "3:\n"
- "vmov d2, r0, r1\n"
- "add %[b_ptr], %[b_ptr], #0x10\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "add %[a_ptr], %[a_ptr], #0x18\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
-
- // tails==2 final tail
- "4:\n"
- "vmla.f32 q4, q2, d3[0]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
-
- // tails==3 final tail
- "5:\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vld1.32 {d0}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
-
- "2:\n"
- "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
- :
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1"
- );
- }
- }
-}
-
-#endif
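
The layout comments repeated in these a32 sgemm_8x6 kernels can be condensed into a plain scalar model. The function below is an illustrative reference only, written under the packing assumptions implied by the strategy header (6 A values and 8 B values per k step, each C micro-block stored as 6 rows of 8 floats); it is not part of the library.

inline void a32_sgemm_8x6_reference(const float *Apanel, const float *Bpanel, float *Cpanel,
                                    int ablocks, int bblocks, int K) {
    const float *a_ptr = Apanel;
    float *c_ptr = Cpanel;

    for (int yb = 0; yb < ablocks; yb++) {
        const float *a_ptr0 = a_ptr;
        const float *b_ptr = Bpanel;

        for (int xb = 0; xb < bblocks; xb++) {
            a_ptr = a_ptr0;
            float acc[6][8] = {};  // one 6x8 output micro-block

            for (int k = 0; k < K; k++) {
                // Per k: 6 A values (one per output row), 8 B values (one per column).
                for (int r = 0; r < 6; r++) {
                    for (int c = 0; c < 8; c++) {
                        acc[r][c] += a_ptr[r] * b_ptr[c];
                    }
                }
                a_ptr += 6;
                b_ptr += 8;
            }

            // Write the block out row by row, matching the store order in the kernels.
            for (int r = 0; r < 6; r++) {
                for (int c = 0; c < 8; c++) {
                    *c_ptr++ = acc[r][c];
                }
            }
        }
    }
}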
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp
deleted file mode 100644
index 4f0ef7cd21..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __arm__
-
-#include <arm_neon.h>
-
-#include "../../asmlib.hpp"
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 8x6), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
-
-inline void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- /* Work out starting values for "k" and "tails" in the inner loop. */
- int tails_initial = (K & 3);
- if (tails_initial == 0) {
- tails_initial = 4;
- }
-
- int k_initial = ((K+3)/4) - 1;
-
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- int tails = tails_initial;
- int k = k_initial;
-
- a_ptr = a_ptr0;
-
- __asm __volatile (
- "vldr d0, [%[a_ptr]]\n"
- "vmov.i32 q4, #0\n"
- "vldr d1, [%[a_ptr], #0x08]\n"
- "vmov.i32 q5, #0\n"
- "vldr d4, [%[b_ptr]]\n"
- "vmov.i32 q6, #0\n"
- "vldr d5, [%[b_ptr], #0x08]\n"
- "vmov.i32 q7, #0\n"
- "vldr d2, [%[a_ptr], #0x10]\n"
- "vmov.i32 q8, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0x40]")
- "vmov.i32 q9, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0x40]")
- "vmov.i32 q10, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0x80]")
- "vmov.i32 q11, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0x80]")
- "vmov.i32 q12, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0XC0]")
- "vmov.i32 q13, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0xC0]")
- "vmov.i32 q14, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0x100]")
- "vmov.i32 q15, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0x100]")
- "cmp %[k], #0\n"
- ASM_PREFETCH("[%[b_ptr], #0x140]")
- "beq 6f\n"
- ASM_PREFETCH("[%[b_ptr], #0x180]")
-
- "1:\n"
- // Unroll 0
- "vmla.f32 q4, q2, d0[0]\n"
- "vldr d6, [%[b_ptr], #0x10]\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vldr d7, [%[b_ptr], #0x18]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vldr d3, [%[a_ptr], #0x18]\n"
- "vmla.f32 q7, q2, d1[1]\n"
- ASM_PREFETCH("[%[a_ptr], #0x140]")
- "vmla.f32 q8, q2, d2[0]\n"
- "subs %[k], %[k], #1\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vldr d4, [%[b_ptr], #0x20]\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "vldr d5, [%[b_ptr], #0x28]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vldr d0, [%[a_ptr], #0x20]\n"
- "vmla.f32 q12, q3, d1[0]\n"
-
- "vmla.f32 q13, q3, d1[1]\n"
- "vldr d1, [%[a_ptr], #0x28]\n"
- "vmla.f32 q14, q3, d2[0]\n"
-
- "vmla.f32 q15, q3, d2[1]\n"
- "vldr d6, [%[b_ptr], #0x30]\n"
-
- // Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "vldr d7, [%[b_ptr], #0x38]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vldr d2, [%[a_ptr], #0x30]\n"
- "vmla.f32 q6, q2, d0[0]\n"
-
- "vmla.f32 q7, q2, d0[1]\n"
- ASM_PREFETCH("[%[b_ptr], #0x1C0]")
- "vmla.f32 q8, q2, d1[0]\n"
-
- "vmla.f32 q9, q2, d1[1]\n"
- "vldr d4, [%[b_ptr], #0x40]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "vldr d5, [%[b_ptr], #0x48]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vldr d3, [%[a_ptr], #0x38]\n"
- "vmla.f32 q12, q3, d0[0]\n"
-
- "vmla.f32 q13, q3, d0[1]\n"
- "vldr d0, [%[a_ptr], #0x40]\n"
- "vmla.f32 q14, q3, d1[0]\n"
-
- "vmla.f32 q15, q3, d1[1]\n"
- "vldr d6, [%[b_ptr], #0x50]\n"
-
- // Unroll 2
- "vmla.f32 q4, q2, d2[0]\n"
- "vldr d7, [%[b_ptr], #0x58]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vldr d1, [%[a_ptr], #0x48]\n"
- "vmla.f32 q6, q2, d3[0]\n"
-
- "vmla.f32 q7, q2, d3[1]\n"
- ASM_PREFETCH("[%[a_ptr], #0x180]")
- "vmla.f32 q8, q2, d0[0]\n"
-
- "vmla.f32 q9, q2, d0[1]\n"
- "vldr d4, [%[b_ptr], #0x60]\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vldr d5, [%[b_ptr], #0x68]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vldr d2, [%[a_ptr], #0x50]\n"
- "vmla.f32 q12, q3, d3[0]\n"
-
- "vmla.f32 q13, q3, d3[1]\n"
- "vldr d3, [%[a_ptr], #0x58]\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "add %[a_ptr], %[a_ptr], #0x60\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vldr d6, [%[b_ptr], #0x70]\n"
-
- // Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vldr d7, [%[b_ptr], #0x78]\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vldr d0, [%[a_ptr], #0x00]\n"
- "vmla.f32 q7, q2, d2[1]\n"
- ASM_PREFETCH("[%[b_ptr], #0x180]")
- "vmla.f32 q8, q2, d3[0]\n"
-
- "vmla.f32 q9, q2, d3[1]\n"
- "vldr d4, [%[b_ptr], #0x00]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "vldr d5, [%[b_ptr], #0x08]\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vldr d1, [%[a_ptr], #0x08]\n"
- "vmla.f32 q12, q3, d2[0]\n"
-
- "vmla.f32 q13, q3, d2[1]\n"
- "vldr d2, [%[a_ptr], #0x10]\n"
- "vmla.f32 q14, q3, d3[0]\n"
-
- "vmla.f32 q15, q3, d3[1]\n"
- "bne 1b\n"
-
- // "Tails" shows how many multiply blocks are needed at the
- // end, must be 1-4 inclusive. Bail out to alternative tail
- // immediately if it's 1.
- "6:\n"
- "subs %[tails], %[tails], #1\n"
- "beq 3f\n"
-
- // Detached final iteration
-
- // Unroll 0
- "vmla.f32 q4, q2, d0[0]\n"
- "vldr d6, [%[b_ptr], #0x10]\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vldr d7, [%[b_ptr], #0x18]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vldr d3, [%[a_ptr], #0x18]\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vldr d4, [%[b_ptr], #0x20]\n"
-
- "vmla.f32 q10, q3, d0[0]\n"
- "vldr d5, [%[b_ptr], #0x28]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vldr d0, [%[a_ptr], #0x20]\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "add %[b_ptr], %[b_ptr], #0x30\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vldr d1, [%[a_ptr], #0x28]\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "beq 4f\n"
-
- // Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "vldr d6, [%[b_ptr], #0x30]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vldr d7, [%[b_ptr], #0x38]\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vldr d2, [%[a_ptr], #0x30]\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q8, q2, d1[0]\n"
-
- "vmla.f32 q9, q2, d1[1]\n"
-
- "vmla.f32 q10, q3, d3[0]\n"
- "vldr d4, [%[b_ptr], #0x40]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vldr d5, [%[b_ptr], #0x48]\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vldr d3, [%[a_ptr], #0x38]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vldr d0, [%[a_ptr], #0x40]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "beq 5f\n"
-
- // Unroll 2
- "vmla.f32 q4, q2, d2[0]\n"
- "vldr d6, [%[b_ptr], #0x50]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vldr d7, [%[b_ptr], #0x58]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vldr d1, [%[a_ptr], #0x48]\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
-
- "vmla.f32 q10, q3, d2[0]\n"
- "vldr d4, [%[b_ptr], #0x60]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vldr d5, [%[b_ptr], #0x68]\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vldr d2, [%[a_ptr], #0x50]\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vldr d3, [%[a_ptr], #0x58]\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vmla.f32 q15, q3, d0[1]\n"
-
- // Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vldr d6, [%[b_ptr], #0x70]\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "vldr d7, [%[b_ptr], #0x78]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d2[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d3[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "add %[a_ptr], %[a_ptr], #0x60\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
- "b 2f\n"
-
- // tails==1 final tail
- "3:\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "vldr d6, [%[b_ptr], #0x10]\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vldr d7, [%[b_ptr], #0x18]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "add %[a_ptr], %[a_ptr], #0x18\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "add %[b_ptr], %[b_ptr], #0x20\n"
- "b 2f\n"
-
- // tails==2 final tail
- "4:\n"
- "vmla.f32 q4, q2, d3[0]\n"
- "vldr d6, [%[b_ptr], #0x30]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vldr d7, [%[b_ptr], #0x38]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "add %[a_ptr], %[a_ptr], #0x30\n"
- "b 2f\n"
-
- // tails==3 final tail
- "5:\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vldr d6, [%[b_ptr], #0x50]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vldr d7, [%[b_ptr], #0x58]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "add %[a_ptr], %[a_ptr], #0x48\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "add %[b_ptr], %[b_ptr], #0x60\n"
-
- "2:\n"
- "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
- :
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1"
- );
- }
- }
-}
-
-#endif /* __arm__ */
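
The k/tails bookkeeping at the top of the sgemm_8x6 kernels is worth spelling out: K is consumed four values per main-loop iteration, the last 1-4 values go through a detached tail, and the main loop therefore runs ceil(K/4) - 1 times. A hypothetical restatement of that arithmetic, with worked values:

struct sgemm_8x6_counts { int k; int tails; };

inline sgemm_8x6_counts sgemm_8x6_loop_counts(int K) {
    int tails = K & 3;          // leftover k values, 0..3
    if (tails == 0) {
        tails = 4;              // a multiple of 4 still leaves one full tail block
    }
    int k = ((K + 3) / 4) - 1;  // full 4-way unrolled main-loop iterations
    return { k, tails };
}

// e.g. K = 10 -> k = 2, tails = 2 (2*4 + 2 = 10)
//      K = 8  -> k = 1, tails = 4 (1*4 + 4 = 8)
//      K = 3  -> k = 0, tails = 3 (main loop skipped entirely)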
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp
deleted file mode 100644
index 7a44fed5b2..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp
+++ /dev/null
@@ -1,350 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "../../asmlib.hpp"
-
-#include <arm_neon.h>
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 8x6), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
-
-inline void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- int tails = (K & 3);
- if (tails == 0) {
- tails = 4;
- }
- int k = ((K+3)/4) - 1;
-
- __asm __volatile (
- "vmov.i32 q4, #0\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmov.i32 q5, #0\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
- "vmov.i32 q6, #0\n"
- ASM_PREFETCH("[%[a_ptr], #48]")
- "vmov.i32 q7, #0\n"
- ASM_PREFETCH("[%[b_ptr], #48]")
- "vmov.i32 q8, #0\n"
- ASM_PREFETCH("[%[a_ptr], #112]")
- "vmov.i32 q9, #0\n"
- ASM_PREFETCH("[%[b_ptr], #112]")
- "vmov.i32 q10, #0\n"
- "vmov.i32 q11, #0\n"
- "vmov.i32 q12, #0\n"
- "vmov.i32 q13, #0\n"
- ASM_PREFETCH("[%[a_ptr], #176]")
- "vmov.i32 q14, #0\n"
- ASM_PREFETCH("[%[b_ptr], #176]")
- "vmov.i32 q15, #0\n"
-
- "cmp %[k], #0\n"
- "beq 6f\n"
-
- "1:\n"
- // Unroll 0
- "vmla.f32 q4, q2, d0[0]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d0[0]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
-
- // Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "subs %[k], %[k], #1\n"
- "vmla.f32 q5, q2, d3[1]\n"
- ASM_PREFETCH("[%[a_ptr], #208]")
- "vmla.f32 q6, q2, d0[0]\n"
- "vmla.f32 q7, q2, d0[1]\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d3[0]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
-
- // Unroll 2
- "vmla.f32 q4, q2, d2[0]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vmla.f32 q7, q2, d3[1]\n"
- ASM_PREFETCH("[%[a_ptr], #240]")
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d2[0]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- ASM_PREFETCH("[%[b_ptr], #208]")
- "vmla.f32 q12, q3, d3[0]\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
-
- // Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vmla.f32 q7, q2, d2[1]\n"
- "vmla.f32 q8, q2, d3[0]\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d1[0]\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "bne 1b\n"
-
- // Branch here if we never execute main loop.
- "6:\n"
-
- // "Tails" shows how many multiply blocks are needed at the
- // end, must be 1-4 inclusive. Bail out to alternative tail
- // immediately if it's 1.
- "subs %[tails], %[tails], #1\n"
- "beq 3f\n"
-
- // Detached final iteration
- // Unroll 0
- "vmla.f32 q4, q2, d0[0]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d0[0]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "beq 4f\n"
-
- // Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d3[0]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "beq 5f\n"
-
- // Unroll 2
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d2[0]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
-
- // Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d2[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d3[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
-
- // tails==1 final tail
- "3:\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "vld1.32 {d2}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
-
- // tails==2 final tail
- "4:\n"
- "vmla.f32 q4, q2, d3[0]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
-
- // tails==3 final tail
- "5:\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vld1.32 {d0}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
-
- "2:\n"
- "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
- :
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc"
- );
- }
- }
-}
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp
deleted file mode 100644
index f7659b9a67..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-// Actual kernel implementations
-#include "a64_gemm_s16_12x8/generic.hpp"
-
-// 12x8 GEMM (int16) "strategy" class.
-//
-// This describes the characteristics of a family of kernels, in terms of
-// the required interleave properties and the output block size.
-//
-// All kernels in the family must share these characteristics. The actual
-// kernel to be used can be chosen at runtime, based on the CPUInfo
-// structure passed to the constructor.
-class gemm_s16_12x8 {
-public:
- typedef int16_t operand_type;
- typedef int32_t result_type;
-
- typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int);
-
- /* Describes the data layout for A input */
- static const int A_interleave = 8;
- static const int A_block = 1;
- static const int A_transpose = 0;
-
- /* Same for B input */
- static const int B_interleave = 12;
- static const int B_block = 1;
- static const int B_transpose = 1;
-
- /* Kernel blocking parameters */
- static const int out_width = 12;
- static const int out_height = 8;
- static const int k_unroll = 1;
-
- kern_type kernel = nullptr;
-
- gemm_s16_12x8(const CPUInfo *ci) {
- kernel = a64_gemm_s16_asimd_12x8;
- }
-};
-
-#endif // __aarch64__
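
To make the numbers in this strategy concrete: per k step, 8 int16 A values meet 12 int16 B values, and the products are widened into int32 accumulators. The scalar model below illustrates one 8-row by 12-column micro-block under those layout assumptions; it is an exposition aid, not the library's reference path.

#include <cstdint>

inline void s16_12x8_block_reference(const int16_t *a, const int16_t *b, int32_t *c, int K) {
    int32_t acc[8][12] = {};  // out_height x out_width accumulators

    for (int k = 0; k < K; k++) {
        for (int r = 0; r < 8; r++) {            // A_interleave rows
            for (int col = 0; col < 12; col++) { // B_interleave columns
                acc[r][col] += int32_t(a[r]) * int32_t(b[col]);  // widening multiply-accumulate
            }
        }
        a += 8;
        b += 12;
    }

    for (int r = 0; r < 8; r++) {
        for (int col = 0; col < 12; col++) {
            *c++ = acc[r][col];  // row-major write out
        }
    }
}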
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp
deleted file mode 100644
index 10259b2fdf..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#include <arm_neon.h>
-
-inline void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
-{
- const int16_t *a_ptr = Apanel;
- int32_t *c_ptr = Cpanel;
- for (int yb = 0; yb < ablocks; yb++)
- {
- const int16_t *a_ptr0 = a_ptr;
- const int16_t *b_ptr = Bpanel;
-
- for (int xb = 0; xb < bblocks; xb++)
- {
- a_ptr = a_ptr0;
- const bool odd_k = K & 0x1;
- int k = (K+1)/2 - 1;
-
- register int16x8_t aa asm("v0");
- register int16x8_t ab asm("v1");
- register int16x8_t b0 asm("v2");
- register int16x8_t b1 asm("v3");
- register int16x8_t b2 asm("v4");
-
- __asm __volatile (
- "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
- "movi v5.4s, #0\n"
- "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
- "movi v6.4s, #0\n"
- "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
- "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper
- "movi v7.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v8.4s, #0\n"
- "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
- "movi v9.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v10.4s, #0\n"
- "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
- "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper
- "movi v11.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #96]")
- "movi v12.4s, #0\n"
- "movi v13.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #96]")
- "movi v14.4s, #0\n"
- "movi v15.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0\n"
- "movi v17.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v18.4s, #0\n"
- "movi v19.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #160]")
- "movi v20.4s, #0\n"
- "movi v21.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #160]")
- "movi v22.4s, #0\n"
- "movi v23.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v24.4s, #0\n"
- "add %x[a_ptr], %x[a_ptr], #0x10\n"
- "movi v25.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v26.4s, #0\n"
- "add %x[b_ptr], %x[b_ptr], #0x18\n"
- "movi v27.4s, #0\n"
- "movi v28.4s, #0\n"
-
- "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
-
- "1:\n" // Main loop
- // First unroll
- "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
- "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
- "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
- "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
- "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
- "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
- "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
- "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
- "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
- "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
- "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
- "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
- "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
- "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
- "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
- "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
- "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
- "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
- "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
- "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
- "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
- "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
- "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
- "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
- "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
- "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
- "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
- "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
- "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
- "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
- "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
- "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
-
- // Second unroll
- "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
- "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
- "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper
- "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
- "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
- "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
- "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
- "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
- "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
- "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
- "add %x[a_ptr], %x[a_ptr], #0x20\n"
- "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
- "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
- "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
- "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
- "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
- "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
- "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
- "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
- "subs %x[k], %x[k], #0x1\n"
- "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
- "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
- "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
- "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper
- "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
- "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
- "add %x[b_ptr], %x[b_ptr], #0x30\n"
- "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
- "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
- "bne 1b\n"
-
- "2:\n" // Even tail
- "cbnz %x[odd_k], 3f\n"
-
- "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
- "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
- "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
- "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
- "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
- "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
- "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
- "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
- "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
- "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
- "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
- "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
- "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
- "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
- "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
- "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
- "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
- "add %[a_ptr], %[a_ptr], #0x10\n"
- "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
- "add %[b_ptr], %[b_ptr], #0x18\n"
- "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
- "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
- "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
- "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
- "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
- "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
- "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
- "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
- "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
- "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
- "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
- "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
-
- "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
- "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
- "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
- "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
- "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
- "str q5, [%x[c_ptr]]\n"
- "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
- "str q13, [%x[c_ptr], #0x10]\n"
- "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
- "str q21, [%x[c_ptr], #0x20]\n"
- "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
- "str q6, [%x[c_ptr], #0x30]\n"
- "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
- "str q14, [%x[c_ptr], #0x40]\n"
- "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
- "str q22, [%x[c_ptr], #0x50]\n"
- "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
- "str q7, [%x[c_ptr], #0x60]\n"
- "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
- "str q15, [%x[c_ptr], #0x70]\n"
- "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
- "str q23, [%x[c_ptr], #0x80]\n"
- "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
- "str q8, [%x[c_ptr], #0x90]\n"
- "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
- "str q16, [%x[c_ptr], #0xa0]\n"
- "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
- "str q24, [%x[c_ptr], #0xb0]\n"
- "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
- "str q9, [%x[c_ptr], #0xc0]\n"
- "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
- "str q17, [%x[c_ptr], #0xd0]\n"
- "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
- "str q25, [%x[c_ptr], #0xe0]\n"
- "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
- "str q10, [%x[c_ptr], #0xf0]\n"
- "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
- "str q18, [%x[c_ptr], #0x100]\n"
- "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
- "str q26, [%x[c_ptr], #0x110]\n"
- "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
- "str q11, [%x[c_ptr], #0x120]\n"
- "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
- "str q19, [%x[c_ptr], #0x130]\n"
- "b 4f\n" // Complete write out
-
- "3:\n" // Odd tail
- "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
- "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
- "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
- "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
- "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
- "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
- "str q5, [%x[c_ptr]]\n"
- "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
- "str q13, [%x[c_ptr], #0x10]\n"
- "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
- "str q21, [%x[c_ptr], #0x20]\n"
- "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
- "str q6, [%x[c_ptr], #0x30]\n"
- "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
- "str q14, [%x[c_ptr], #0x40]\n"
- "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
- "str q22, [%x[c_ptr], #0x50]\n"
- "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
- "str q7, [%x[c_ptr], #0x60]\n"
- "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
- "str q15, [%x[c_ptr], #0x70]\n"
- "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
- "str q23, [%x[c_ptr], #0x80]\n"
- "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
- "str q8, [%x[c_ptr], #0x90]\n"
- "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
- "str q16, [%x[c_ptr], #0xa0]\n"
- "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
- "str q24, [%x[c_ptr], #0xb0]\n"
- "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
- "str q9, [%x[c_ptr], #0xc0]\n"
- "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
- "str q17, [%x[c_ptr], #0xd0]\n"
- "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
- "str q25, [%x[c_ptr], #0xe0]\n"
- "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
- "str q10, [%x[c_ptr], #0xf0]\n"
- "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
- "str q18, [%x[c_ptr], #0x100]\n"
- "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
- "str q26, [%x[c_ptr], #0x110]\n"
- "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
- "str q11, [%x[c_ptr], #0x120]\n"
-
- "4:\n" // End of function
- "str q19, [%x[c_ptr], #0x130]\n"
- "str q27, [%x[c_ptr], #0x140]\n"
- "str q12, [%x[c_ptr], #0x150]\n"
- "str q20, [%x[c_ptr], #0x160]\n"
- "str q28, [%x[c_ptr], #0x170]\n"
- "add %x[c_ptr], %x[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k),
- [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2)
- : [odd_k] "r" (odd_k)
- : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc"
- );
- }
- }
-}
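
The even/odd K handling in the kernel above follows the same shape as the float kernels, but paired: two k values per main-loop iteration, then an even tail (two leftover k values) or an odd tail (one). A hypothetical restatement of the counts:

inline void s16_12x8_loop_counts(int K, int &k, bool &odd_k) {
    odd_k = (K & 1) != 0;   // one leftover k handled by the odd tail
    k = (K + 1) / 2 - 1;    // paired main-loop iterations before the tail
}

// e.g. K = 7 -> k = 3, odd_k = true  (3*2 + 1 = 7)
//      K = 8 -> k = 3, odd_k = false (3*2 + 2 = 8)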
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp
deleted file mode 100644
index 88cbb361b3..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-// Load the actual kernel
-#include "a64_gemm_s8_12x8/generic.hpp"
-
-class gemm_s8_12x8 {
-public:
- typedef int8_t operand_type;
- typedef int32_t result_type;
-
- typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
-
- /* Describes the data layout for A input */
- static const int A_interleave = 8;
- static const int A_block = 4;
- static const bool A_transpose = false;
-
- /* Same for B input */
- static const int B_interleave = 12;
- static const int B_block = 4;
- static const bool B_transpose = true;
-
- /* Kernel blocking parameters */
- static const int out_width = 12;
- static const int out_height = 8;
- static const int k_unroll = 4;
-
- kern_type kernel = nullptr;
-
- gemm_s8_12x8(const CPUInfo *ci) {
- kernel = a64_gemm_s8_12x8;
- }
-};
-
-#endif // __aarch64__
-
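
The s8 strategy above sets A_block, B_block and k_unroll to 4 because the kernels that follow are built on the sdot instruction, which accumulates a four-element signed 8-bit dot product into each 32-bit lane. A scalar model of that per-lane step, given for illustration rather than as ACLE intrinsics code:

#include <cstdint>

inline int32_t sdot_lane(const int8_t a[4], const int8_t b[4], int32_t acc) {
    for (int i = 0; i < 4; i++) {
        acc += int32_t(a[i]) * int32_t(b[i]);  // widen to 32 bits, then accumulate
    }
    return acc;
}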
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp
deleted file mode 100644
index 4ac2ba4234..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-#include "dot_toolchain_support.h"
-#include <cassert>
-
-void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
- assert(Apanel);
- assert(Bpanel);
- assert(Cpanel);
- const int8_t *a_ptr = Apanel;
- int32_t *c_ptr = Cpanel;
- // We divide K by 4 because the sdot instruction processes 4 elements at a time.
- const int W = K/4;
-    // Fix up for odd lengths - set a flag if K is odd, but make sure
-    // we round up the iteration count.
- const int oddk = (W & 1);
- const int init_value_k = ((W+1)/2) - 1;
- for (int yb=0; yb<ablocks; yb++) {
- const int8_t *a_ptr0 = a_ptr;
- const int8_t *b_ptr = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- int k = init_value_k;
- register int32x4_t a0 asm("v0");
- register int32x4_t a1 asm("v1");
- register int32x4_t b0 asm("v2");
- register int32x4_t b1 asm("v3");
- register int32x4_t b2 asm("v4");
- register int32x4_t a0a asm("v5");
- register int32x4_t a1a asm("v6");
- __asm __volatile (
- _DECLARE_SDOT
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldp %q[a0], %q[a1], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldp %q[b0], %q[b1], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
-
- // Loop proper
- "1:\n"
- "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
- "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "ins %[b2].d[1], x20\n"
- "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
-
- "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "ins %[a0a].d[1], x20\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "ins %[a1a].d[1], x20\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "ins %[b0].d[1], x20\n"
- "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
-
- "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
-
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
- "ins %[b1].d[1], x20\n"
- "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "ldr %d[a0], [%[a_ptr], #64]\n"
-
- "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "ins %[b2].d[1], x20\n"
- "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "ldr %d[a1], [%[a_ptr], #80]\n"
-
- "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "ins %[a0].d[1], x20\n"
- ASM_PREFETCH("[%[b_ptr], #512]")
- "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
-
- "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "ins %[a1].d[1], x20\n"
- "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
-
- "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "ins %[b0].d[1], x20\n"
- "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
-
- "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "subs %w[k], %w[k], #1\n"
- "ins %[b1].d[1], x20\n"
- "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "bne 1b\n"
-
- // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
- "4:\n"
-
- // Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
-
- // Detached final iteration (even K)
- "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
-
- "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "ins %[a0a].d[1], x20\n"
- "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "ins %[a1a].d[1], x20\n"
- "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
-
- "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "ins %[b0].d[1], x20\n"
- "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
- "ins %[b1].d[1], x20\n"
- "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "ins %[b2].d[1], x20\n"
-
- "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "str q8, [%[c_ptr], #0]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
-
- "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- "b 3f\n"
-
- // Detached final iteration (odd K)
- "2:\n"
- "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "ldr x20, [%[b_ptr], #40]\n"
-
- "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "ins %[b2].d[1], x20\n"
- "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
- "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
-
- "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
-
- // Common tail
- "3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
-
-
-
- ".purgem sdot\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
-
-
- }
- }
-}
-
-#endif
-
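The iteration-count setup in the kernel above divides K into 4-byte dot-product units (W = K/4), runs the main loop over pairs of units, and handles the last one or two units in the detached tails. A minimal sketch of that split, assuming K is a positive multiple of 4; the helper name is illustrative and not part of the library:

#include <cassert>

// Illustrative check of the main-loop/tail split used by the 12x8 dot-product
// kernels: the main loop consumes two 4-byte units per iteration, the detached
// tail consumes two (even W) or one (odd W).
inline void check_loop_split(int K) {
    const int W    = K / 4;               // sdot units per row
    const int oddk = (W & 1);             // odd unit left over for the tail?
    const int k    = ((W + 1) / 2) - 1;   // main-loop iteration count
    assert(2 * k + (oddk ? 1 : 2) == W);
}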
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
deleted file mode 100644
index 1d6fd1623e..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-// Define a macro to assemble the SDOT instruction (in the absence of toolchain support)
-#define _DECLARE_SDOT ".altmacro\n"\
- ".macro sdot opd:req, opn:req, opm:req\n"\
- "local vd, vn, vm, h, l\n"\
- ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\
- ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\
- ".set vd,\\reg\n"\
- ".endif\n"\
- ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\
- ".set vn,\\reg\n"\
- ".endif\n"\
- ".irp idx,0,1,2,3\n"\
- ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\
- ".set vm,\\reg\n"\
- ".set h,\\idx / 2\n"\
- ".set l,\\idx %% 2\n"\
- ".endif\n"\
- ".endr\n"\
- ".endr\n"\
- ".ifndef vd\n"\
- ".error \"Bad operand \\opd\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef vn\n"\
- ".error \"Bad operand \\opn\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef vm\n"\
- ".error \"Bad operand \\opm\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef h\n"\
- ".error \"Bad operand \\opm\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef l\n"\
- ".error \"Bad operand \\opm\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".int 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\
- ".endm\n"\
-
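The macro above hand-assembles the instruction word via ".int" for toolchains without dot-product support. What the encoded instruction computes, as a scalar model of "sdot vd.4s, vn.16b, vm.4b[idx]" (the function name is illustrative, not part of the library):

#include <cstdint>

// Scalar model of "sdot vd.4s, vn.16b, vm.4b[idx]": each 32-bit lane of vd
// accumulates a 4-way signed-byte dot product; the second operand is the
// group of four bytes selected from vm by the lane index idx.
inline void sdot_by_element_ref(int32_t vd[4], const int8_t vn[16],
                                const int8_t vm[16], int idx) {
    for (int lane = 0; lane < 4; ++lane) {
        int32_t acc = 0;
        for (int b = 0; b < 4; ++b) {
            acc += int32_t(vn[4 * lane + b]) * int32_t(vm[4 * idx + b]);
        }
        vd[lane] += acc;
    }
}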
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp
deleted file mode 100644
index bfad0373b2..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp
+++ /dev/null
@@ -1,363 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-#include "dot_toolchain_support.h"
-#include <cassert>
-
-
-inline void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
- assert(Apanel);
- assert(Bpanel);
- assert(Cpanel);
- K/=4;
- const long int row_jump=0;
- const long int block_jump=0;
- const int32_t *a_ptr = reinterpret_cast<const int32_t*>(Apanel);
- int32_t *c_ptr = reinterpret_cast<int32_t*>(Cpanel);
- for (int yb=0; yb<ablocks; yb++) {
- const int32_t *a_ptr0 = a_ptr;
- const int32_t *b_ptr = reinterpret_cast<const int32_t*>(Bpanel);
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k = ((K+1)/2) - 1;
- register int32x4_t a0 asm("v0");
- register int32x4_t a1 asm("v1");
- register int32x4_t b0 asm("v2");
- register int32x4_t b1 asm("v3");
- register int32x4_t b2 asm("v4");
- register int32x4_t a0a asm("v5");
- register int32x4_t a1a asm("v6");
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- _DECLARE_SDOT
-
- // Loop proper
- "1:\n"
- "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
-
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
- "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr %q[a0], [%[a_ptr], #64]\n"
- "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "ldr %q[a1], [%[a_ptr], #80]\n"
- "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
-
- "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- ASM_PREFETCH("[%[b_ptr], #512]")
- "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #112]\n"
-
- "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "subs %w[k], %w[k], #1\n"
- "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "bne 1b\n"
-
- // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
- "4:\n"
-
- // Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
-
- // Detached final iteration (even K)
- "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
-
- "add %[b_ptr], %[b_ptr], %[block_jump]\n"
- "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "str q8, [%[c_ptr], #0]\n"
- "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "str q24, [%[c_ptr], #32]\n"
-
- "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- "b 3f\n"
-
- // Detached final iteration (odd K)
- "2:\n"
- "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
- "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
-
- "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
-
- // Common tail
- "3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
-
- ".purgem sdot\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
- }
-
-
-}
-
-
-#endif
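Both 12x8 variants above finish a block by advancing the C pointer by 384 bytes, which is exactly one 12x8 tile of 32-bit accumulators (the q8..q31 stores). A small check of that arithmetic; the constant names mirror the strategy class and are illustrative here:

#include <cstddef>
#include <cstdint>

// One output tile of the 12x8 kernels: out_width x out_height int32 values,
// matching the final "add %[c_ptr], %[c_ptr], #384".
constexpr std::size_t out_width     = 12;
constexpr std::size_t out_height    = 8;
constexpr std::size_t c_block_bytes = out_width * out_height * sizeof(int32_t);
static_assert(c_block_bytes == 384, "a 12x8 tile of int32 is 384 bytes");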
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp
deleted file mode 100644
index 1588f049f4..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-// Load the actual kernel
-#include "a64_gemm_s8_4x4/generic.hpp"
-
-class gemm_s8_4x4 {
-public:
- typedef int8_t operand_type;
- typedef int32_t result_type;
-
- typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
-
- /* Describes the data layout for A input */
- static const int A_interleave = 4;
- static const int A_block = 16;
- static const bool A_transpose = false;
-
- /* Same for B input */
- static const int B_interleave = 4;
- static const int B_block = 16;
- static const bool B_transpose = true;
-
- /* Kernel blocking parameters */
- static const int out_width = 4;
- static const int out_height = 4;
- static const int k_unroll = 16;
-
- kern_type kernel = nullptr;
-
- gemm_s8_4x4(const CPUInfo *ci) {
- kernel = a64_gemm_s8_4x4;
- }
-};
-
-#endif // __aarch64__
-
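The strategy class above is only a descriptor: a driver reads the interleave and blocking constants, packs the operands accordingly, and invokes the selected kernel through kern_type. A minimal call-site sketch, assuming the panels have already been packed with the stated A_block/B_block of 16 and that a CPUInfo object is available; the wrapper name is illustrative:

// Hypothetical call site for the 4x4 int8 strategy; packing is done elsewhere.
void run_gemm_s8_4x4(const CPUInfo *ci,
                     const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel,
                     int ablocks, int bblocks, int K) {
    gemm_s8_4x4 strat(ci);                                   // selects a64_gemm_s8_4x4
    strat.kernel(Apanel, Bpanel, Cpanel, ablocks, bblocks, K);
}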
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp
deleted file mode 100644
index 0ec435b33b..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp
+++ /dev/null
@@ -1,465 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-
-inline void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
- const int8_t *a_ptr = Apanel;
- int32_t *c_ptr = Cpanel;
- K /= 16;
- int oddk = (K & 1);
-
- for (int yb=0; yb<ablocks; yb++) {
- const int8_t *a_ptr0 = a_ptr;
- const int8_t *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
-
- int k = ((K+1)/2)-1;
-
- register int8x16_t b0 asm("v4");
- register int8x16_t b1 asm("v5");
- register int8x16_t b2 asm("v6");
- register int8x16_t b3 asm("v7");
- register int8x16_t b0a asm("v8");
- register int8x16_t b1a asm("v9");
- register int8x16_t b2a asm("v10");
- register int8x16_t b3a asm("v11");
-
- __asm __volatile (
- "movi v16.4s, #0x0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v17.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v18.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v19.4s, #0x0\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "movi v20.4s, #0x0\n"
- "ldr %q[b3], [%[b_ptr], #48]\n"
- "movi v21.4s, #0x0\n"
- "ldr q1, [%[a_ptr], #16]\n"
- "movi v22.4s, #0x0\n"
- "ldr q2, [%[a_ptr], #32]\n"
- "movi v23.4s, #0x0\n"
- "ldr q3, [%[a_ptr], #48]\n"
- "movi v24.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v25.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v26.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v27.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v28.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v29.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v30.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v31.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
-
- // Loop structure optimized for A57 (after r0).
-
- // Unavoidably, the multiply will "dribble" if
- // dual issued with an add.
-
- // Minimize the effect of this by making sure
- // there are 2 adds to run under the dribbled
- // multiply.
-
- // Pipeline in blocks of 8 multiplies - combine
- // this iteration's multiplies with adds from
- // the previous iteration.
-
- // So the first block doesn't have any adds to
- // do - but because all the adds are at the
- // start of the block it's only the first couple
- // of multiplies that need to be pulled out.
-
- // Start of unroll 0 (first iteration)
- "smull v12.8h, v0.8b, %[b0].8b\n"
- "smull v13.8h, v0.8b, %[b1].8b\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- // Unroll 0 continuation (branch target)
- "1:\n"
- "smull v14.8h, v0.8b, %[b2].8b\n"
- "subs %w[k], %w[k], #1\n"
- "smull v15.8h, v0.8b, %[b3].8b\n"
- "ldr %q[b0a], [%[b_ptr], #64]\n"
- "smlal2 v12.8h, v0.16b, %[b0].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1].16b\n"
- "ldr %q[b1a], [%[b_ptr], #80]\n"
- "smlal2 v14.8h, v0.16b, %[b2].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3].16b\n"
- "ldr q0, [%[a_ptr], #64]\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "smull v14.8h, v1.8b, %[b2].8b\n"
- "ldr %q[b2a], [%[b_ptr], #96]\n"
- "smull v15.8h, v1.8b, %[b3].8b\n"
- "smlal2 v12.8h, v1.16b, %[b0].16b\n"
- "ldr %q[b3a], [%[b_ptr], #112]\n"
- "smlal2 v13.8h, v1.16b, %[b1].16b\n"
- "add %[b_ptr], %[b_ptr], #128\n"
- "smlal2 v14.8h, v1.16b, %[b2].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3].16b\n"
- "ldr q1, [%[a_ptr], #80]\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "smull v14.8h, v2.8b, %[b2].8b\n"
- "smull v15.8h, v2.8b, %[b3].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0].16b\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "smlal2 v13.8h, v2.16b, %[b1].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2].16b\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "smlal2 v15.8h, v2.16b, %[b3].16b\n"
- "ldr q2, [%[a_ptr], #96]\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "smull v14.8h, v3.8b, %[b2].8b\n"
- "smull v15.8h, v3.8b, %[b3].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0].16b\n"
- "ldr %q[b0], [%[b_ptr], #0]\n"
- "smlal2 v13.8h, v3.16b, %[b1].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2].16b\n"
- "smlal2 v15.8h, v3.16b, %[b3].16b\n"
- "ldr q3, [%[a_ptr], #112]\n"
-
- // Unroll 1
- "sadalp v28.4s, v12.8h\n"
- "smull v12.8h, v0.8b, %[b0a].8b\n"
- "sadalp v29.4s, v13.8h\n"
- "sadalp v30.4s, v14.8h\n"
- "smull v13.8h, v0.8b, %[b1a].8b\n"
- "sadalp v31.4s, v15.8h\n"
- "smull v14.8h, v0.8b, %[b2a].8b\n"
- "smull v15.8h, v0.8b, %[b3a].8b\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
- "ldr q0, [%[a_ptr], #128]\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0a].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1a].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "add %[a_ptr], %[a_ptr], #128\n"
- "smull v14.8h, v1.8b, %[b2a].8b\n"
- "smull v15.8h, v1.8b, %[b3a].8b\n"
- "ldr %q[b3], [%[b_ptr], #48]\n"
- "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
- "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
- "ldr q1, [%[a_ptr], #16]\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0a].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1a].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "smull v14.8h, v2.8b, %[b2a].8b\n"
- "smull v15.8h, v2.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
- "ldr q2, [%[a_ptr], #32]\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0a].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1a].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "smull v14.8h, v3.8b, %[b2a].8b\n"
- "smull v15.8h, v3.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
- "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
- "ldr q3, [%[a_ptr], #48]\n"
-
- // Start of unroll 0 for next iteration.
- "sadalp v28.4s, v12.8h\n"
- "smull v12.8h, v0.8b, %[b0].8b\n"
- "sadalp v29.4s, v13.8h\n"
- "sadalp v30.4s, v14.8h\n"
- "smull v13.8h, v0.8b, %[b1].8b\n"
- "sadalp v31.4s, v15.8h\n"
- "bne 1b\n"
-
- // Target to use when K=1 or 2 (i.e. zero iterations of main loop)
- "4:\n"
-
- // Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
-
- // Detached final iteration (even K)
- "smull v14.8h, v0.8b, %[b2].8b\n"
- "smull v15.8h, v0.8b, %[b3].8b\n"
- "ldr %q[b0a], [%[b_ptr], #64]\n"
- "smlal2 v12.8h, v0.16b, %[b0].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1].16b\n"
- "ldr %q[b1a], [%[b_ptr], #80]\n"
- "smlal2 v14.8h, v0.16b, %[b2].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3].16b\n"
- "ldr q0, [%[a_ptr], #64]\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "smull v14.8h, v1.8b, %[b2].8b\n"
- "ldr %q[b2a], [%[b_ptr], #96]\n"
- "smull v15.8h, v1.8b, %[b3].8b\n"
- "smlal2 v12.8h, v1.16b, %[b0].16b\n"
- "ldr %q[b3a], [%[b_ptr], #112]\n"
- "smlal2 v13.8h, v1.16b, %[b1].16b\n"
- "add %[b_ptr], %[b_ptr], #128\n"
- "smlal2 v14.8h, v1.16b, %[b2].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3].16b\n"
- "ldr q1, [%[a_ptr], #80]\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "smull v14.8h, v2.8b, %[b2].8b\n"
- "smull v15.8h, v2.8b, %[b3].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0].16b\n"
- "smlal2 v13.8h, v2.16b, %[b1].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2].16b\n"
- "smlal2 v15.8h, v2.16b, %[b3].16b\n"
- "ldr q2, [%[a_ptr], #96]\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "smull v14.8h, v3.8b, %[b2].8b\n"
- "smull v15.8h, v3.8b, %[b3].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0].16b\n"
- "smlal2 v13.8h, v3.16b, %[b1].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2].16b\n"
- "smlal2 v15.8h, v3.16b, %[b3].16b\n"
- "ldr q3, [%[a_ptr], #112]\n"
-
- // Unroll 1
- "sadalp v28.4s, v12.8h\n"
- "smull v12.8h, v0.8b, %[b0a].8b\n"
- "sadalp v29.4s, v13.8h\n"
- "sadalp v30.4s, v14.8h\n"
- "smull v13.8h, v0.8b, %[b1a].8b\n"
- "sadalp v31.4s, v15.8h\n"
- "smull v14.8h, v0.8b, %[b2a].8b\n"
- "add %[a_ptr], %[a_ptr], #128\n"
- "smull v15.8h, v0.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0a].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1a].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "smull v14.8h, v1.8b, %[b2a].8b\n"
- "smull v15.8h, v1.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
- "addp v16.4s, v16.4s, v17.4s\n"
- "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0a].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1a].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "addp v16.4s, v16.4s, v17.4s\n"
- "smull v14.8h, v2.8b, %[b2a].8b\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "addp v19.4s, v22.4s, v23.4s\n"
- "smull v15.8h, v2.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
- "str q16, [%[c_ptr]]\n"
- "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0a].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1a].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "smull v14.8h, v3.8b, %[b2a].8b\n"
- "addp v20.4s, v24.4s, v25.4s\n"
- "addp v21.4s, v26.4s, v27.4s\n"
- "smull v15.8h, v3.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
- "str q17, [%[c_ptr], #16]\n"
- "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
- "b 3f\n"
-
- // Detached final iteration (odd K)
- "2:\n"
- "smull v14.8h, v0.8b, %[b2].8b\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "smull v15.8h, v0.8b, %[b3].8b\n"
- "add %[b_ptr], %[b_ptr], #64\n"
- "smlal2 v12.8h, v0.16b, %[b0].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1].16b\n"
- "smlal2 v14.8h, v0.16b, %[b2].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3].16b\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "smull v14.8h, v1.8b, %[b2].8b\n"
- "smull v15.8h, v1.8b, %[b3].8b\n"
- "smlal2 v12.8h, v1.16b, %[b0].16b\n"
- "addp v16.4s, v16.4s, v17.4s\n"
- "smlal2 v13.8h, v1.16b, %[b1].16b\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "smlal2 v14.8h, v1.16b, %[b2].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3].16b\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "addp v16.4s, v16.4s, v17.4s\n"
- "smull v14.8h, v2.8b, %[b2].8b\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "addp v19.4s, v22.4s, v23.4s\n"
- "smull v15.8h, v2.8b, %[b3].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0].16b\n"
- "str q16, [%[c_ptr]]\n"
- "smlal2 v13.8h, v2.16b, %[b1].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2].16b\n"
- "smlal2 v15.8h, v2.16b, %[b3].16b\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "smull v14.8h, v3.8b, %[b2].8b\n"
- "addp v20.4s, v24.4s, v25.4s\n"
- "addp v21.4s, v26.4s, v27.4s\n"
- "smull v15.8h, v3.8b, %[b3].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0].16b\n"
- "str q17, [%[c_ptr], #16]\n"
- "smlal2 v13.8h, v3.16b, %[b1].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2].16b\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "smlal2 v15.8h, v3.16b, %[b3].16b\n"
-
- "3:\n"
-
- // Final additions
- "sadalp v28.4s, v12.8h\n"
- "str q18, [%[c_ptr], #32]\n"
- "sadalp v29.4s, v13.8h\n"
- "sadalp v30.4s, v14.8h\n"
- "sadalp v31.4s, v15.8h\n"
-
- // Horizontal reduction, phase 1
- "addp v22.4s, v28.4s, v29.4s\n"
- "addp v23.4s, v30.4s, v31.4s\n"
-
- // Horizontal reduction, phase 2
- "addp v19.4s, v22.4s, v23.4s\n"
- "str q19, [%[c_ptr], #48]\n"
- "add %[c_ptr], %[c_ptr], #64\n"
-
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3),
- [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a), [b3a] "+w" (b3a),
- [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19",
- "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc");
- }
- }
-}
-
-#endif // __aarch64__
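The pipelined body above computes each 4x4 accumulator with a smull/smlal2 pair followed by sadalp, so one pass over a 16-byte A fragment and a 16-byte B fragment adds partial products into four 32-bit lanes; the addp stages at the end of the kernel then reduce each accumulator's four lanes to a single output. A scalar model of one such pass, with an illustrative function name and 16-bit intermediate accumulation matching the vector ops:

#include <cstdint>

// Scalar model of one smull/smlal2 + sadalp pass of the 4x4 kernel.
inline void s8_4x4_pass_ref(int32_t acc[4], const int8_t a[16], const int8_t b[16]) {
    int16_t prod[8];
    for (int i = 0; i < 8; ++i) {        // smull: products of the lower 8 bytes
        prod[i] = int16_t(int16_t(a[i]) * int16_t(b[i]));
    }
    for (int i = 0; i < 8; ++i) {        // smlal2: add upper-byte products (wraps in 16 bits)
        prod[i] = int16_t(prod[i] + int16_t(a[i + 8]) * int16_t(b[i + 8]));
    }
    for (int j = 0; j < 4; ++j) {        // sadalp: pairwise widen-and-accumulate into int32
        acc[j] += int32_t(prod[2 * j]) + int32_t(prod[2 * j + 1]);
    }
}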
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp
deleted file mode 100644
index 7eb8b2dacf..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-// Actual kernel implementations
-#include "a64_gemm_u16_12x8/generic.hpp"
-
-// 12x8 GEMM "strategy" class.
-//
-// This describes the characteristics of a family of kernels, in terms of
-// the required interleave properties and the output block size.
-//
-// All kernels in the family must share these characteristics. The actual
-// kernel to be used can be chosen at runtime, based on the CPU_type
-// structure.
-class gemm_u16_12x8 {
-public:
- typedef uint16_t operand_type;
- typedef uint32_t result_type;
-
- typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
-
- /* Describes the data layout for A input */
- static const int A_interleave = 8;
- static const int A_block = 1;
- static const int A_transpose = 0;
-
- /* Same for B input */
- static const int B_interleave = 12;
- static const int B_block = 1;
- static const int B_transpose = 1;
-
- /* Kernel blocking parameters */
- static const int out_width = 12;
- static const int out_height = 8;
- static const int k_unroll = 1;
-
- kern_type kernel = nullptr;
-
- gemm_u16_12x8(const CPUInfo *ci) {
- kernel = a64_gemm_u16_asimd_12x8;
- }
-};
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp
deleted file mode 100644
index b3f310ce62..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#include <arm_neon.h>
-
-inline void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
-{
- const uint16_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
-
- for (int yb = 0; yb < ablocks; yb++)
- {
- const uint16_t *a_ptr0 = a_ptr;
- const uint16_t *b_ptr = Bpanel;
-
- for (int xb = 0; xb < bblocks; xb++)
- {
- a_ptr = a_ptr0;
- const bool odd_k = K & 0x1;
- int k = (K+1)/2 - 1;
-
- register uint16x8_t aa asm("v0");
- register uint16x8_t ab asm("v1");
- register uint16x8_t b0 asm("v2");
- register uint16x8_t b1 asm("v3");
- register uint16x8_t b2 asm("v4");
-
- __asm __volatile (
- "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
- "movi v5.4s, #0\n"
- "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
- "movi v6.4s, #0\n"
- "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
- "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper
- "movi v7.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v8.4s, #0\n"
- "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
- "movi v9.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v10.4s, #0\n"
- "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
- "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper
- "movi v11.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #96]")
- "movi v12.4s, #0\n"
- "movi v13.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #96]")
- "movi v14.4s, #0\n"
- "movi v15.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0\n"
- "movi v17.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v18.4s, #0\n"
- "movi v19.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #160]")
- "movi v20.4s, #0\n"
- "movi v21.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #160]")
- "movi v22.4s, #0\n"
- "movi v23.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v24.4s, #0\n"
- "add %x[a_ptr], %x[a_ptr], #0x10\n"
- "movi v25.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v26.4s, #0\n"
- "add %x[b_ptr], %x[b_ptr], #0x18\n"
- "movi v27.4s, #0\n"
- "movi v28.4s, #0\n"
-
- "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
-
- "1:\n" // Main loop
- // First unroll
- "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
- "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
- "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
- "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
- "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
- "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
- "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
- "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
- "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
- "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
- "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
- "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
- "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
- "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
- "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
- "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
- "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
- "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
- "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
- "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
- "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
- "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
- "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
- "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
- "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
- "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
- "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
- "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
- "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
- "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
- "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
- "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
-
- // Second unroll
- "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
- "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
- "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper
- "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
- "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
- "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
- "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
- "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
- "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
- "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
- "add %x[a_ptr], %x[a_ptr], #0x20\n"
- "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
- "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
- "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
- "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
- "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
- "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
- "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
- "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
- "subs %x[k], %x[k], #0x1\n"
- "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
- "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
- "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
- "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper
- "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
- "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
- "add %x[b_ptr], %x[b_ptr], #0x30\n"
- "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
- "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
- "bne 1b\n"
-
- "2:\n" // Even tail
- "cbnz %x[odd_k], 3f\n"
-
- "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
- "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
- "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
- "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
- "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
- "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
- "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
- "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
- "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
- "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
- "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
- "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
- "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
- "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
- "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
- "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
- "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
- "add %[a_ptr], %[a_ptr], #0x10\n"
- "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
- "add %[b_ptr], %[b_ptr], #0x18\n"
- "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
- "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
- "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
- "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
- "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
- "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
- "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
- "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
- "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
- "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
- "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
- "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
-
- "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
- "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
- "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
- "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
- "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
- "str q5, [%x[c_ptr]]\n"
- "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
- "str q13, [%x[c_ptr], #0x10]\n"
- "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
- "str q21, [%x[c_ptr], #0x20]\n"
- "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
- "str q6, [%x[c_ptr], #0x30]\n"
- "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
- "str q14, [%x[c_ptr], #0x40]\n"
- "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
- "str q22, [%x[c_ptr], #0x50]\n"
- "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
- "str q7, [%x[c_ptr], #0x60]\n"
- "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
- "str q15, [%x[c_ptr], #0x70]\n"
- "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
- "str q23, [%x[c_ptr], #0x80]\n"
- "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
- "str q8, [%x[c_ptr], #0x90]\n"
- "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
- "str q16, [%x[c_ptr], #0xa0]\n"
- "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
- "str q24, [%x[c_ptr], #0xb0]\n"
- "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
- "str q9, [%x[c_ptr], #0xc0]\n"
- "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
- "str q17, [%x[c_ptr], #0xd0]\n"
- "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
- "str q25, [%x[c_ptr], #0xe0]\n"
- "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
- "str q10, [%x[c_ptr], #0xf0]\n"
- "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
- "str q18, [%x[c_ptr], #0x100]\n"
- "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
- "str q26, [%x[c_ptr], #0x110]\n"
- "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
- "str q11, [%x[c_ptr], #0x120]\n"
- "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
- "str q19, [%x[c_ptr], #0x130]\n"
- "b 4f\n" // Complete write out
-
- "3:\n" // Odd tail
- "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
- "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
- "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
- "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
- "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
- "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
- "str q5, [%x[c_ptr]]\n"
- "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
- "str q13, [%x[c_ptr], #0x10]\n"
- "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
- "str q21, [%x[c_ptr], #0x20]\n"
- "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
- "str q6, [%x[c_ptr], #0x30]\n"
- "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
- "str q14, [%x[c_ptr], #0x40]\n"
- "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
- "str q22, [%x[c_ptr], #0x50]\n"
- "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
- "str q7, [%x[c_ptr], #0x60]\n"
- "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
- "str q15, [%x[c_ptr], #0x70]\n"
- "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
- "str q23, [%x[c_ptr], #0x80]\n"
- "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
- "str q8, [%x[c_ptr], #0x90]\n"
- "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
- "str q16, [%x[c_ptr], #0xa0]\n"
- "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
- "str q24, [%x[c_ptr], #0xb0]\n"
- "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
- "str q9, [%x[c_ptr], #0xc0]\n"
- "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
- "str q17, [%x[c_ptr], #0xd0]\n"
- "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
- "str q25, [%x[c_ptr], #0xe0]\n"
- "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
- "str q10, [%x[c_ptr], #0xf0]\n"
- "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
- "str q18, [%x[c_ptr], #0x100]\n"
- "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
- "str q26, [%x[c_ptr], #0x110]\n"
- "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
- "str q11, [%x[c_ptr], #0x120]\n"
-
- "4:\n" // End of function
- "str q19, [%x[c_ptr], #0x130]\n"
- "str q27, [%x[c_ptr], #0x140]\n"
- "str q12, [%x[c_ptr], #0x150]\n"
- "str q20, [%x[c_ptr], #0x160]\n"
- "str q28, [%x[c_ptr], #0x170]\n"
- "add %x[c_ptr], %x[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k),
- [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2)
- : [odd_k] "r" (odd_k)
- : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc"
- );
- }
- }
-}
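The uint16 kernel above relies on the by-element form of umlal/umlal2: each instruction multiplies four (or the upper four) unsigned 16-bit B elements by a single broadcast A element and accumulates into 32-bit lanes. A scalar model of the base form, with an illustrative name:

#include <cstdint>

// Scalar model of "umlal vd.4s, vn.4h, vm.h[lane]" as used by the u16 kernel:
// four 32-bit accumulators each gain one u16 x u16 product against a broadcast
// A element (umlal2 does the same with the upper four halfwords of vn).
inline void umlal_by_element_ref(uint32_t vd[4], const uint16_t vn[4], uint16_t a_elem) {
    for (int i = 0; i < 4; ++i) {
        vd[i] += uint32_t(vn[i]) * uint32_t(a_elem);
    }
}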
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp
deleted file mode 100644
index 62cd747d7c..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-// Load the actual kernel
-#include "a64_gemm_u8_12x8/generic.hpp"
-#include "a64_gemm_u8_12x8/a55r1.hpp"
-
-class gemm_u8_12x8 {
-public:
- typedef uint8_t operand_type;
- typedef uint32_t result_type;
-
- typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-
- /* Describes the data layout for A input */
- static const int A_interleave = 8;
- static const int A_block = 4;
- static const bool A_transpose = false;
-
- /* Same for B input */
- static const int B_interleave = 12;
- static const int B_block = 4;
- static const bool B_transpose = true;
-
- /* Kernel blocking parameters */
- static const int out_width = 12;
- static const int out_height = 8;
- static const int k_unroll = 4;
-
- kern_type kernel = nullptr;
-
- gemm_u8_12x8(const CPUInfo *ci) {
- kernel = a64_gemm_u8_12x8;
- if (ci->CPU == CPUTarget::A55_DOT) {
- kernel = a64_gemm_u8_12x8_a55r1;
- }
- }
-};
-
-#endif // __aarch64__
-
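
The constants in the strategy class above are what the generic interleaved driver consumes when sizing its packed buffers. A rough sketch of that arithmetic, assuming only that each dimension is rounded up to the blocking factors (the exact padding rules live in gemm_interleaved.hpp and the transform headers, so the numbers below are illustrative, not the library's):

    #include <cstddef>
    #include <cstdio>

    static size_t round_up(size_t v, size_t m) { return ((v + m - 1) / m) * m; }

    int main() {
        // Blocking constants of the removed gemm_u8_12x8 strategy.
        const size_t out_width = 12, out_height = 8, k_unroll = 4;
        const size_t M = 100, N = 70, K = 30;                 // example problem size

        const size_t Kr = round_up(K, k_unroll);              // udot consumes 4 bytes per lane
        const size_t a_bytes = round_up(M, out_height) * Kr;  // packed A panel (uint8_t)
        const size_t b_bytes = round_up(N, out_width) * Kr;   // packed B panel (uint8_t)
        const size_t c_words = round_up(M, out_height) * round_up(N, out_width); // uint32_t results

        std::printf("A: %zu B, B: %zu B, C: %zu words\n", a_bytes, b_bytes, c_words);
        return 0;
    }
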
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp
deleted file mode 100644
index c7c2acbb49..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp
+++ /dev/null
@@ -1,396 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-#include "dot_toolchain_support.h"
-#include <cassert>
-
-inline void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- assert(Apanel);
- assert(Bpanel);
- assert(Cpanel);
- const uint8_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
- // We divide K by 4 because the udot instruction processes 4 elements at a time.
- const int W = K/4;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- const int oddk = (W & 1);
- const int init_value_k = ((W+1)/2) - 1;
- for (int yb=0; yb<ablocks; yb++) {
- const uint8_t *a_ptr0 = a_ptr;
- const uint8_t *b_ptr = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- int k = init_value_k;
- register int32x4_t a0 asm("v0");
- register int32x4_t a1 asm("v1");
- register int32x4_t b0 asm("v2");
- register int32x4_t b1 asm("v3");
- register int32x4_t b2 asm("v4");
- register int32x4_t a0a asm("v5");
- register int32x4_t a1a asm("v6");
- __asm __volatile (
- _DECLARE_UDOT
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldp %q[a0], %q[a1], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldp %q[b0], %q[b1], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
-
- // Loop proper
- "1:\n"
- "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
- "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "ins %[b2].d[1], x20\n"
- "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
-
- "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "ins %[a0a].d[1], x20\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "ins %[a1a].d[1], x20\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "ins %[b0].d[1], x20\n"
- "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
-
- "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
-
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
- "ins %[b1].d[1], x20\n"
- "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "ldr %d[a0], [%[a_ptr], #64]\n"
-
- "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "ins %[b2].d[1], x20\n"
- "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "ldr %d[a1], [%[a_ptr], #80]\n"
-
- "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "ins %[a0].d[1], x20\n"
- ASM_PREFETCH("[%[b_ptr], #512]")
- "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
-
- "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "ins %[a1].d[1], x20\n"
- "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
-
- "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "ins %[b0].d[1], x20\n"
- "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
-
- "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "subs %w[k], %w[k], #1\n"
- "ins %[b1].d[1], x20\n"
- "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "bne 1b\n"
-
- // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
- "4:\n"
-
- // Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
- // Detached final iteration (even K)
- "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "ins %[b2].d[1], x20\n"
-
- "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
- "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "ins %[a0a].d[1], x20\n"
- "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "ins %[a1a].d[1], x20\n"
- "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
-
- "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "ins %[b0].d[1], x20\n"
- "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
- "ins %[b1].d[1], x20\n"
- "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "ins %[b2].d[1], x20\n"
-
- "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "str q8, [%[c_ptr], #0]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
-
- "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- "b 3f\n"
-
- // Detached final iteration (odd K)
- "2:\n"
- "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "ldr x20, [%[b_ptr], #40]\n"
-
- "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "ins %[b2].d[1], x20\n"
- "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
- "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
-
- "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
-
- // Common tail
- "3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
-
-
-
- ".purgem udot\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
-}
-#endif
-
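
The K bookkeeping at the top of the kernel above is easy to misread in the assembly: K is consumed four bytes at a time by udot, the main loop handles two of those steps per iteration, and the detached tails handle the final one or two steps depending on oddk. A small self-check of that accounting, assuming K has already been padded to a multiple of k_unroll = 4 by the packing code:

    #include <cassert>

    int main() {
        for (int K = 4; K <= 256; K += 4) {
            const int W = K / 4;                                // udot steps of 4 bytes each
            const int oddk = (W & 1);
            const int main_iters = ((W + 1) / 2) - 1;           // "init_value_k" in the kernel
            const int steps = 2 * main_iters + (oddk ? 1 : 2);  // loop plus detached tail
            assert(steps == W);                                 // every step covered exactly once
        }
        return 0;
    }
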
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
deleted file mode 100644
index 718232fb05..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-// Define a macro to assemble the UDOT instruction (in the absence of toolchain support)
-#define _DECLARE_UDOT ".altmacro\n"\
- ".macro udot opd:req, opn:req, opm:req\n"\
- "local vd, vn, vm, h, l\n"\
- ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\
- ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\
- ".set vd,\\reg\n"\
- ".endif\n"\
- ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\
- ".set vn,\\reg\n"\
- ".endif\n"\
- ".irp idx,0,1,2,3\n"\
- ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\
- ".set vm,\\reg\n"\
- ".set h,\\idx / 2\n"\
- ".set l,\\idx %% 2\n"\
- ".endif\n"\
- ".endr\n"\
- ".endr\n"\
- ".ifndef vd\n"\
- ".error \"Bad operand \\opd\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef vn\n"\
- ".error \"Bad operand \\opn\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef vm\n"\
- ".error \"Bad operand \\opm\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef h\n"\
- ".error \"Bad operand \\opm\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef l\n"\
- ".error \"Bad operand \\opm\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".int 0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\
- ".endm\n"\
-
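
Because older assemblers do not know the udot mnemonic, the macro above re-derives the register numbers and lane index from the operand strings and emits the instruction word directly. The sketch below simply mirrors that bit-packing in C++ so the encoding can be inspected; it is an illustration of the macro's formula, not an independent reference for the ISA encoding:

    #include <cstdint>
    #include <cstdio>

    // Encode "udot v<d>.4s, v<n>.16b, v<m>.4b[idx]" exactly as _DECLARE_UDOT does.
    static uint32_t encode_udot(unsigned vd, unsigned vn, unsigned vm, unsigned idx) {
        const unsigned h = idx / 2;
        const unsigned l = idx % 2;
        return 0x6f80e000u | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11);
    }

    int main() {
        // First multiply of the kernels above: udot v8.4s, v2.16b, v0.4b[0]
        std::printf("0x%08x\n", encode_udot(8, 2, 0, 0));
        return 0;
    }
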
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp
deleted file mode 100644
index 3531eb6d25..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp
+++ /dev/null
@@ -1,354 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-#include "dot_toolchain_support.h"
-#include <cassert>
-
-inline void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- assert(Apanel);
- assert(Bpanel);
- assert(Cpanel);
- const uint8_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
- // We divide K by 4 because the udot instruction processes 4 elements at a time.
- const int W = K/4;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- const int oddk = (W & 1);
- const int init_value_k = ((W+1)/2) - 1;
- for (int yb=0; yb<ablocks; yb++) {
- const uint8_t *a_ptr0 = a_ptr;
- const uint8_t *b_ptr = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- int k = init_value_k;
- register uint32x4_t a0 asm("v0");
- register uint32x4_t a1 asm("v1");
- register uint32x4_t b0 asm("v2");
- register uint32x4_t b1 asm("v3");
- register uint32x4_t b2 asm("v4");
- register uint32x4_t a0a asm("v5");
- register uint32x4_t a1a asm("v6");
- __asm __volatile (
- _DECLARE_UDOT
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- // Loop proper
- "1:\n"
- "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
-
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
- "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr %q[a0], [%[a_ptr], #64]\n"
- "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "ldr %q[a1], [%[a_ptr], #80]\n"
- "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
-
- "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- ASM_PREFETCH("[%[b_ptr], #512]")
- "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #112]\n"
-
- "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "subs %w[k], %w[k], #1\n"
- "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "bne 1b\n"
-
- // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
- "4:\n"
-
- // Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
-
- // Detached final iteration (even K)
- "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
-
- "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "str q24, [%[c_ptr], #32]\n"
-
- "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- "b 3f\n"
-
- // Detached final iteration (odd K)
- "2:\n"
- "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
- "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
-
- "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
-
- // Common tail
- "3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
-
- ".purgem udot\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
- );
-
- }
- }
-
-
-}
-#endif
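
The store sequence at the end of both u8 12x8 kernels writes q8/q16/q24 first, then q9/q17/q25, and so on, which amounts to one 8-row by 12-column block of uint32_t accumulators laid out row-major before c_ptr advances by 384 bytes. A minimal sketch of that block layout; the indexing helper is hypothetical, read off the store offsets rather than taken from the library:

    #include <cstdint>
    #include <cstdio>

    static inline uint32_t c_block_at(const uint32_t *block, int row, int col) {
        return block[row * 12 + col];   // out_height = 8 rows, out_width = 12 columns
    }

    int main() {
        uint32_t block[8 * 12] = {};    // one 384-byte block, as the kernel writes it
        block[1 * 12 + 4] = 42;         // row 1, column 4: the first lane of q17
        std::printf("%u\n", c_block_at(block, 1, 4));
        return 0;
    }
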
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp
deleted file mode 100644
index 3561bfec96..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-// Load the actual kernel
-#include "a64_gemm_u8_4x4/generic.hpp"
-
-class gemm_u8_4x4 {
-public:
- typedef uint8_t operand_type;
- typedef uint32_t result_type;
-
- typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-
- /* Describes the data layout for A input */
- static const int A_interleave = 4;
- static const int A_block = 16;
- static const bool A_transpose = false;
-
- /* Same for B input */
- static const int B_interleave = 4;
- static const int B_block = 16;
- static const bool B_transpose = true;
-
- /* Kernel blocking parameters */
- static const int out_width = 4;
- static const int out_height = 4;
- static const int k_unroll = 16;
-
- kern_type kernel = nullptr;
-
- gemm_u8_4x4(const CPUInfo *ci) {
- kernel = a64_gemm_u8_4x4;
- }
-};
-
-#endif // __aarch64__
-
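
For reference, the arithmetic behind each 4x4 output block of this strategy is an ordinary unsigned 8-bit dot product accumulated in 32 bits. A scalar sketch of that computation, leaving the interleaved panel layout aside (the plain row/column addressing below is a simplification, not the packed format the kernel actually reads):

    #include <cstdint>
    #include <cstddef>

    static void gemm_u8_4x4_reference(const uint8_t *A, const uint8_t *B,
                                      uint32_t *C, size_t K) {
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                uint32_t acc = 0;
                for (size_t k = 0; k < K; k++) {
                    acc += uint32_t(A[i * K + k]) * uint32_t(B[j * K + k]);
                }
                C[i * 4 + j] = acc;
            }
        }
    }

    int main() {
        uint8_t A[4 * 16], B[4 * 16];
        uint32_t C[16];
        for (int i = 0; i < 64; i++) { A[i] = 1; B[i] = 2; }
        gemm_u8_4x4_reference(A, B, C, 16);
        return C[0] == 32 ? 0 : 1;      // sixteen products of 1 * 2
    }
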
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp
deleted file mode 100644
index aff3faf666..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-
-inline void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- const uint8_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
- K /= 16;
-
- for (int yb=0; yb<ablocks; yb++) {
- const uint8_t *a_ptr0 = a_ptr;
- const uint8_t *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
-
- int k = K-1;
-
- register uint8x16_t b0 asm("v4");
- register uint8x16_t b1 asm("v5");
- register uint8x16_t b2 asm("v6");
- register uint8x16_t b3 asm("v7");
-
- __asm __volatile (
- "movi v16.4s, #0x0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v17.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v18.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v19.4s, #0x0\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "movi v20.4s, #0x0\n"
- "ldr %q[b3], [%[b_ptr], #48]\n"
- "movi v21.4s, #0x0\n"
- "ldr q1, [%[a_ptr], #16]\n"
- "movi v22.4s, #0x0\n"
- "ldr q2, [%[a_ptr], #32]\n"
- "movi v23.4s, #0x0\n"
- "ldr q3, [%[a_ptr], #48]\n"
- "movi v24.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v25.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v26.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v27.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v28.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v29.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v30.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v31.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
-
- "umull v12.8h, v0.8b, %[b0].8b\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "umull v13.8h, v0.8b, %[b1].8b\n"
- "umull v14.8h, v0.8b, %[b2].8b\n"
- "add %[b_ptr], %[b_ptr], #64\n"
- "umull v15.8h, v0.8b, %[b3].8b\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 2f\n"
-
- "1:\n"
- "uadalp v16.4s, v12.8h\n"
- "umull2 v12.8h, v0.16b, %[b0].16b\n"
- "uadalp v17.4s, v13.8h\n"
- "umull2 v13.8h, v0.16b, %[b1].16b\n"
- "uadalp v18.4s, v14.8h\n"
- "umull2 v14.8h, v0.16b, %[b2].16b\n"
- "uadalp v19.4s, v15.8h\n"
- "umull2 v15.8h, v0.16b, %[b3].16b\n"
- "ldr q0, [%[a_ptr]]\n"
-
- "uadalp v16.4s, v12.8h\n"
- "umull v12.8h, v1.8b, %[b0].8b\n"
- "uadalp v17.4s, v13.8h\n"
- "umull v13.8h, v1.8b, %[b1].8b\n"
- "subs %w[k], %w[k], #1\n"
- "uadalp v18.4s, v14.8h\n"
- "umull v14.8h, v1.8b, %[b2].8b\n"
- "uadalp v19.4s, v15.8h\n"
- "umull v15.8h, v1.8b, %[b3].8b\n"
-
- "uadalp v20.4s, v12.8h\n"
- "umull2 v12.8h, v1.16b, %[b0].16b\n"
- "uadalp v21.4s, v13.8h\n"
- "umull2 v13.8h, v1.16b, %[b1].16b\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "uadalp v22.4s, v14.8h\n"
- "umull2 v14.8h, v1.16b, %[b2].16b\n"
- "uadalp v23.4s, v15.8h\n"
- "umull2 v15.8h, v1.16b, %[b3].16b\n"
- "ldr q1, [%[a_ptr], #16]\n"
-
- "uadalp v20.4s, v12.8h\n"
- "umull v12.8h, v2.8b, %[b0].8b\n"
- "uadalp v21.4s, v13.8h\n"
- "umull v13.8h, v2.8b, %[b1].8b\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "uadalp v22.4s, v14.8h\n"
- "umull v14.8h, v2.8b, %[b2].8b\n"
- "uadalp v23.4s, v15.8h\n"
- "umull v15.8h, v2.8b, %[b3].8b\n"
-
- "uadalp v24.4s, v12.8h\n"
- "umull2 v12.8h, v2.16b, %[b0].16b\n"
- "uadalp v25.4s, v13.8h\n"
- "umull2 v13.8h, v2.16b, %[b1].16b\n"
- "uadalp v26.4s, v14.8h\n"
- "umull2 v14.8h, v2.16b, %[b2].16b\n"
- "uadalp v27.4s, v15.8h\n"
- "umull2 v15.8h, v2.16b, %[b3].16b\n"
- "ldr q2, [%[a_ptr], #32]\n"
-
- "uadalp v24.4s, v12.8h\n"
- "umull v12.8h, v3.8b, %[b0].8b\n"
- "uadalp v25.4s, v13.8h\n"
- "umull v13.8h, v3.8b, %[b1].8b\n"
- "uadalp v26.4s, v14.8h\n"
- "umull v14.8h, v3.8b, %[b2].8b\n"
- "uadalp v27.4s, v15.8h\n"
- "umull v15.8h, v3.8b, %[b3].8b\n"
-
- "uadalp v28.4s, v12.8h\n"
- "umull2 v12.8h, v3.16b, %[b0].16b\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "uadalp v29.4s, v13.8h\n"
- "umull2 v13.8h, v3.16b, %[b1].16b\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "uadalp v30.4s, v14.8h\n"
- "umull2 v14.8h, v3.16b, %[b2].16b\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "uadalp v31.4s, v15.8h\n"
- "umull2 v15.8h, v3.16b, %[b3].16b\n"
- "ldr %q[b3], [%[b_ptr], #48]\n"
-
- "uadalp v28.4s, v12.8h\n"
- "umull v12.8h, v0.8b, %[b0].8b\n"
- "add %[b_ptr], %[b_ptr], #64\n"
- "uadalp v29.4s, v13.8h\n"
- "umull v13.8h, v0.8b, %[b1].8b\n"
- "ldr q3, [%[a_ptr], #48]\n"
- "uadalp v30.4s, v14.8h\n"
- "umull v14.8h, v0.8b, %[b2].8b\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "uadalp v31.4s, v15.8h\n"
- "umull v15.8h, v0.8b, %[b3].8b\n"
- "bne 1b\n"
-
- // Branch target
- "2:\n"
- "uadalp v16.4s, v12.8h\n"
- "umull2 v12.8h, v0.16b, %[b0].16b\n"
- "uadalp v17.4s, v13.8h\n"
- "umull2 v13.8h, v0.16b, %[b1].16b\n"
- "uadalp v18.4s, v14.8h\n"
- "umull2 v14.8h, v0.16b, %[b2].16b\n"
- "uadalp v19.4s, v15.8h\n"
- "umull2 v15.8h, v0.16b, %[b3].16b\n"
-
- "uadalp v16.4s, v12.8h\n"
- "umull v12.8h, v1.8b, %[b0].8b\n"
- "uadalp v17.4s, v13.8h\n"
- "umull v13.8h, v1.8b, %[b1].8b\n"
- "uadalp v18.4s, v14.8h\n"
- "umull v14.8h, v1.8b, %[b2].8b\n"
- "uadalp v19.4s, v15.8h\n"
- "umull v15.8h, v1.8b, %[b3].8b\n"
-
- "uadalp v20.4s, v12.8h\n"
- "umull2 v12.8h, v1.16b, %[b0].16b\n"
- "uadalp v21.4s, v13.8h\n"
- "umull2 v13.8h, v1.16b, %[b1].16b\n"
- "uadalp v22.4s, v14.8h\n"
- "umull2 v14.8h, v1.16b, %[b2].16b\n"
- "uadalp v23.4s, v15.8h\n"
- "umull2 v15.8h, v1.16b, %[b3].16b\n"
-
- "uadalp v20.4s, v12.8h\n"
- "umull v12.8h, v2.8b, %[b0].8b\n"
- "uadalp v21.4s, v13.8h\n"
- "umull v13.8h, v2.8b, %[b1].8b\n"
- "uadalp v22.4s, v14.8h\n"
- "umull v14.8h, v2.8b, %[b2].8b\n"
- "uadalp v23.4s, v15.8h\n"
- "umull v15.8h, v2.8b, %[b3].8b\n"
-
- "uadalp v24.4s, v12.8h\n"
- "umull2 v12.8h, v2.16b, %[b0].16b\n"
- "uadalp v25.4s, v13.8h\n"
- "umull2 v13.8h, v2.16b, %[b1].16b\n"
- "uadalp v26.4s, v14.8h\n"
- "umull2 v14.8h, v2.16b, %[b2].16b\n"
- "uadalp v27.4s, v15.8h\n"
- "umull2 v15.8h, v2.16b, %[b3].16b\n"
-
- "uadalp v24.4s, v12.8h\n"
- "umull v12.8h, v3.8b, %[b0].8b\n"
- "uadalp v25.4s, v13.8h\n"
- "umull v13.8h, v3.8b, %[b1].8b\n"
- "uadalp v26.4s, v14.8h\n"
- "umull v14.8h, v3.8b, %[b2].8b\n"
- "uadalp v27.4s, v15.8h\n"
- "umull v15.8h, v3.8b, %[b3].8b\n"
-
- "uadalp v28.4s, v12.8h\n"
- "umull2 v12.8h, v3.16b, %[b0].16b\n"
- "uadalp v29.4s, v13.8h\n"
- "umull2 v13.8h, v3.16b, %[b1].16b\n"
- "uadalp v30.4s, v14.8h\n"
- "umull2 v14.8h, v3.16b, %[b2].16b\n"
- "uadalp v31.4s, v15.8h\n"
- "umull2 v15.8h, v3.16b, %[b3].16b\n"
-
- "uadalp v28.4s, v12.8h\n"
- "uadalp v29.4s, v13.8h\n"
- "uadalp v30.4s, v14.8h\n"
- "uadalp v31.4s, v15.8h\n"
-
- "addp v16.4s, v16.4s, v17.4s\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "addp v19.4s, v22.4s, v23.4s\n"
- "addp v20.4s, v24.4s, v25.4s\n"
- "addp v21.4s, v26.4s, v27.4s\n"
- "addp v22.4s, v28.4s, v29.4s\n"
- "addp v23.4s, v30.4s, v31.4s\n"
-
- "addp v16.4s, v16.4s, v17.4s\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "addp v19.4s, v22.4s, v23.4s\n"
-
- "str q16, [%[c_ptr]]\n"
- "str q17, [%[c_ptr], #16]\n"
- "str q18, [%[c_ptr], #32]\n"
- "str q19, [%[c_ptr], #48]\n"
- "add %[c_ptr], %[c_ptr], #64\n"
-
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3),
- [k] "+r" (k)
- :
- : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19",
- "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc");
- }
- }
-}
-
-#endif // __aarch64__
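
The kernel above predates the dot-product instruction, so each 16-byte group is handled with a widening multiply (umull/umull2) followed by a pairwise accumulate (uadalp) and a final addp reduction. An intrinsics sketch of the same idea for a single dot product; the real kernel keeps four accumulators per output row and combines them with addp, whereas a plain horizontal add is enough here:

    #include <arm_neon.h>
    #include <cstdint>

    static uint32_t dot16_u8(const uint8_t *a, const uint8_t *b) {
        uint8x16_t va = vld1q_u8(a);
        uint8x16_t vb = vld1q_u8(b);

        uint32x4_t acc = vdupq_n_u32(0);
        acc = vpadalq_u16(acc, vmull_u8(vget_low_u8(va), vget_low_u8(vb)));   // umull,  low 8 bytes
        acc = vpadalq_u16(acc, vmull_u8(vget_high_u8(va), vget_high_u8(vb))); // umull2, high 8 bytes

        return vaddvq_u32(acc);          // horizontal add of the four partial sums
    }

    int main() {
        uint8_t a[16], b[16];
        for (int i = 0; i < 16; i++) { a[i] = 1; b[i] = 3; }
        return dot16_u8(a, b) == 48 ? 0 : 1;   // sixteen products of 1 * 3
    }
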
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp
deleted file mode 100644
index 5e7684f692..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-
-// Get the components we need to implement HGEMM.
-// Can select appropriate components dependent on AArch32 vs. AArch64 etc. at build time.
-#include "a64_hgemm_24x8/generic.hpp"
-#include "a64_hgemm_24x8/a55r1.hpp"
-
-// 24x8 HGEMM "strategy" class. Describes the kernel properties.
-//
-// The generic "gemm_opt" function will instantiate one of these (allowing
-// the constructor to pick a kernel implementation).
-class hgemm_24x8 {
-public:
- typedef __fp16 operand_type;
- typedef __fp16 result_type;
-
- typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
-
- static const int A_block = 1;
- static const int A_interleave = 8;
- static const bool A_transpose = false;
-
- static const int B_block = 1;
- static const int B_interleave = 24;
- static const bool B_transpose = true;
-
- static const int out_width = 24;
- static const int out_height = 8;
- static const int k_unroll = 1;
-
- kern_type kernel = nullptr;
-
- hgemm_24x8(const struct CPUInfo *ci) {
- kernel = a64_hgemm_asimd_24x8;
- if (ci->CPU == CPUTarget::A55_DOT) {
- kernel = a64_hgemm_asimd_24x8_a55r1;
- }
- }
-
-};
-
-#endif // __aarch64__ and FP16_VECTOR_ARITHMETIC
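
Every kernel in this tree finishes a block with a fixed pointer bump because the strategy constants fully determine the packed C block size; for hgemm_24x8 that is 24 * 8 * sizeof(__fp16) = 384 bytes. A hedged sketch of that relationship follows; the stand-in struct only mirrors the constants, with uint16_t in place of __fp16 to keep the sketch portable:

    #include <cstddef>
    #include <cstdint>

    template <typename Strategy>
    constexpr std::size_t c_block_bytes() {
        return std::size_t(Strategy::out_width) * Strategy::out_height *
               sizeof(typename Strategy::result_type);
    }

    // Stand-in with the same blocking constants as hgemm_24x8 above.
    struct hgemm_24x8_shape {
        typedef uint16_t result_type;   // __fp16 is also two bytes
        static const int out_width = 24;
        static const int out_height = 8;
    };

    static_assert(c_block_bytes<hgemm_24x8_shape>() == 384,
                  "matches the final 'add %[c_ptr], %[c_ptr], #384' in the kernels");

    int main() { return 0; }
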
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp
deleted file mode 100644
index 1789abb046..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <arm_neon.h>
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 12x8), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
-
-inline void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
- const __fp16 *a_ptr = Apanel;
- __fp16 *c_ptr = Cpanel;
-
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k_iters = ((K+1)/2) - 1;
-
- for (int yb=0; yb<ablocks; yb++) {
- const __fp16 *a_ptr0 = a_ptr;
- const __fp16 *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- int k = k_iters;
- a_ptr = a_ptr0;
-
- // As A55 requires 64-bit loads anyway, just use 64 bits of the
- // "A" operands to save on "ins" instructions. Since A55 is
- // in-order, two sets of "A" operands and one set of "B" is
- // sufficient.
- register float16x8_t a0 asm("v0");
- register float16x8_t a1 asm("v1");
- register float16x8_t a0a asm("v2");
- register float16x8_t a1a asm("v3");
- register float16x8_t b0 asm("v4");
- register float16x8_t b1 asm("v5");
- register float16x8_t b2 asm("v6");
-
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.8h, #0x0\n"
- "ldr %d[a0], [%[a_ptr]]\n"
- "movi v9.8h, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.8h, #0x0\n"
- "ldr %d[a1], [%[a_ptr], #8]\n"
- "movi v11.8h, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.8h, #0x0\n"
- "movi v13.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v14.8h, #0x0\n"
- "movi v15.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v16.8h, #0x0\n"
- "movi v17.8h, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v18.8h, #0x0\n"
- "movi v19.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v20.8h, #0x0\n"
- "movi v21.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v22.8h, #0x0\n"
- "movi v23.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v24.8h, #0x0\n"
- "movi v25.8h, #0x0\n"
- "movi v26.8h, #0x0\n"
- "movi v27.8h, #0x0\n"
- "movi v28.8h, #0x0\n"
- "movi v29.8h, #0x0\n"
- "movi v30.8h, #0x0\n"
- "movi v31.8h, #0x0\n"
-
- // The loop is offset by these two instructions which must
- // always be executed.
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- "1:\n"
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %d[a0a], [%[a_ptr], #16]\n"
-
- "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
- "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
- "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
- "ldr %d[a1a], [%[a_ptr], #24]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
-
- "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
- "fmla v29.8h, %[b2].8h, %[a1].h[1]\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
- "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- // Unroll 1
- "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
- "ldr %d[a0], [%[a_ptr], #32]\n"
-
- "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n"
- "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n"
- "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n"
- "ldr %d[a1], [%[a_ptr], #40]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
- "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
-
- "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
-
- "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "bne 1b\n"
-
- "4:\n"
-
- // Start final iteration - branch off to "odd" code before we load a0a
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "cbnz %w[oddk], 2f\n"
-
- // Even K continuation
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %d[a0a], [%[a_ptr], #16]\n"
-
- "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr]]")
- "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
- "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
- "ldr %d[a1a], [%[a_ptr], #24]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr], #64]")
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- ASM_PREFETCHW("[%[c_ptr], #128]")
-
- "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
- "fmla v29.8h, %[b2].8h, %[a1].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr], #192]")
- "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
- "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
- ASM_PREFETCHW("[%[c_ptr], #256]")
-
- "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr], #320]")
- "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n"
- "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n"
- "ldr %d[a1], [%[a_ptr], #40]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #384]")
- "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #448]")
-
- "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #512]")
- "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #576]")
-
- "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #640]")
- "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #704]")
-
- "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n"
- "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n"
- "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n"
- "b 3f\n"
-
- "2:\n"
-
- // Odd tail
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- ASM_PREFETCHW("[%[c_ptr]]")
-
- "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr], #64]")
- "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
- "add %[a_ptr], %[a_ptr], #16\n"
- "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
- ASM_PREFETCHW("[%[c_ptr], #128]")
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr], #192]")
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- ASM_PREFETCHW("[%[c_ptr], #256]")
-
- "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr], #320]")
- "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #384]")
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #384]")
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #448]")
-
- "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #512]")
- "fmla v29.8h, %[b2].8h, %[a1].h[1]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #576]")
- "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #640]")
- "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #704]")
-
- // Common tail
- // A55 won't dual issue these stores with anything else, so
- // simplest to do them all in this common code.
- "3:\n"
- "str q8, [%[c_ptr]]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "str q15, [%[c_ptr], #336]\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "5:\n"
- "add %[c_ptr], %[c_ptr], #384\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "=w" (a0), [a0a] "=w" (a0a), [a1] "=w" (a1), [a1a] "=w" (a1a),
- [b0] "=w" (b0), [b1] "=w" (b1), [b2] "=w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
-}
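
Inside the main loop above the 128-bit B operands are reloaded as an ldr d for the low half, an ldr of x20 for the high half, and an ins to stitch them together, a shape that lets the in-order A55 keep issuing fmla alongside the loads (an inference from the comments in this file). A minimal standalone sketch of that idiom using float data, illustrative only and not a kernel fragment:

    #include <arm_neon.h>
    #include <cstdio>

    static float32x4_t load_split(const float *p) {
        float32x4_t v;
        __asm__ volatile(
            "ldr %d[v], [%[p]]\n"      // low 64 bits straight into the vector register
            "ldr x20, [%[p], #8]\n"    // high 64 bits via a general-purpose register
            "ins %[v].d[1], x20\n"     // merge into the upper half
            : [v] "=w"(v)
            : [p] "r"(p)
            : "x20", "memory");
        return v;
    }

    int main() {
        float data[4] = {1.f, 2.f, 3.f, 4.f};
        float32x4_t v = load_split(data);
        std::printf("%f %f %f %f\n", vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1),
                    vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));
        return 0;
    }
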
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp
deleted file mode 100644
index 03e2bb95a3..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp
+++ /dev/null
@@ -1,337 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <arm_neon.h>
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 12x8), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
-
-inline void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
- const __fp16 *a_ptr = Apanel;
- __fp16 *c_ptr = Cpanel;
- for (int yb=0; yb<ablocks; yb++) {
- const __fp16 *a_ptr0 = a_ptr;
- const __fp16 *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k = ((K+1)/2) - 1;
- register float16x8_t a0 asm("v0");
- register float16x8_t a0a asm("v1");
- register float16x8_t b0 asm("v2");
- register float16x8_t b1 asm("v3");
- register float16x8_t b2 asm("v4");
- register float16x8_t b0a asm("v5");
- register float16x8_t b1a asm("v6");
- register float16x8_t b2a asm("v7");
-
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.8h, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.8h, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.8h, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v11.8h, #0x0\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "movi v12.8h, #0x0\n"
- "ldr %q[b0a], [%[b_ptr], #48]\n"
- "movi v13.8h, #0x0\n"
- "ldr %q[b1a], [%[b_ptr], #64]\n"
- "movi v14.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v15.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v16.8h, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v17.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v18.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v19.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.8h, #0x0\n"
- "movi v21.8h, #0x0\n"
- "movi v22.8h, #0x0\n"
- "movi v23.8h, #0x0\n"
- "movi v24.8h, #0x0\n"
- "movi v25.8h, #0x0\n"
- "movi v26.8h, #0x0\n"
- "movi v27.8h, #0x0\n"
- "movi v28.8h, #0x0\n"
- "movi v29.8h, #0x0\n"
- "movi v30.8h, #0x0\n"
- "movi v31.8h, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- "1:\n"
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "ldr %q[a0a], [%[a_ptr], #16]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %q[b2a], [%[b_ptr], #80]\n"
- "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
- "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
- "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
- "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
- "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
- "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
- "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- ASM_PREFETCH("[%[b_ptr], #288]")
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
- "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
- "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
- "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
- "ldr %q[a0], [%[a_ptr], #32]\n"
-
- "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
- "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
- "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
- "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
- "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
- "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
- "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
- "ldr %q[b0a], [%[b_ptr], #48]\n"
-
- "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
- "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
- ASM_PREFETCH("[%[b_ptr], #352]")
- "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
- "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
- "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
- "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
- "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
- "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
- "ldr %q[b1a], [%[b_ptr], #64]\n"
-
- "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
- "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
- "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
- "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
- "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
- "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
-
- "bne 1b\n"
- "4:\n"
-
- // Jump to odd tail if necessary.
- "cbnz %w[oddk], 2f\n"
-
- // Even tail.
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "ldr %q[a0a], [%[a_ptr], #16]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %q[b2a], [%[b_ptr], #80]\n"
- "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
- "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
- "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
- "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
- "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
- "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
- "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
- "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
- "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
-
- "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
- "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
- "str q8, [%[c_ptr]]\n"
- "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
- "str q16, [%[c_ptr], #16]\n"
-
- "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
- "str q24, [%[c_ptr], #32]\n"
- "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
- "str q17, [%[c_ptr], #64]\n"
-
- "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
- "str q10, [%[c_ptr], #96]\n"
- "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
- "str q18, [%[c_ptr], #112]\n"
-
- "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
- "str q11, [%[c_ptr], #144]\n"
- "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
- "str q19, [%[c_ptr], #160]\n"
-
- "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
- "str q12, [%[c_ptr], #192]\n"
- "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
- "str q20, [%[c_ptr], #208]\n"
-
- "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
- "str q13, [%[c_ptr], #240]\n"
- "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
- "str q21, [%[c_ptr], #256]\n"
-
- "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
- "str q14, [%[c_ptr], #288]\n"
- "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
- "str q22, [%[c_ptr], #304]\n"
-
- "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
- "str q15, [%[c_ptr], #336]\n"
- "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
- "b 3f\n"
-
- // Odd tail
- "2:\n"
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "add %[a_ptr], %[a_ptr], #16\n"
- "str q8, [%[c_ptr]]\n"
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "str q16, [%[c_ptr], #16]\n"
-
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "str q24, [%[c_ptr], #32]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "str q17, [%[c_ptr], #64]\n"
-
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "str q10, [%[c_ptr], #96]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "str q18, [%[c_ptr], #112]\n"
-
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "str q11, [%[c_ptr], #144]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- "str q19, [%[c_ptr], #160]\n"
-
- "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
- "str q12, [%[c_ptr], #192]\n"
- "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
- "str q20, [%[c_ptr], #208]\n"
-
- "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
- "str q13, [%[c_ptr], #240]\n"
- "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
- "str q21, [%[c_ptr], #256]\n"
-
- "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
- "str q14, [%[c_ptr], #288]\n"
- "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
- "str q22, [%[c_ptr], #304]\n"
-
- "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
- "str q15, [%[c_ptr], #336]\n"
- "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
-
- "3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a0a] "+w" (a0a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k),
- [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
-}
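A plain-C++ reading of what the 24x8 FP16 kernel above computes may help when stepping through the assembly. Note that the header comment keeps the 12-wide wording of the sgemm kernels, but the accumulator layout (three .8h columns across v8-v31) reads 24 B values per K step and emits one 8-row by 24-column C block, stored row-major in the q8/q16/q24, q9/q17/q25, ... order. The sketch below is an illustrative scalar reference under those assumptions, not library code; like the assembly, it accumulates in half precision.

#include <arm_neon.h>   // same include as the original file; __fp16 itself is a compiler-provided type

// Scalar reference for one output block of a64_hgemm_asimd_24x8:
// C[row][col] = sum over k of A[k][row] * B[k][col], for an 8x24 block.
static void hgemm_24x8_block_reference(const __fp16 *Apanel, const __fp16 *Bpanel,
                                       __fp16 *Cpanel, int K) {
    __fp16 acc[8][24] = {};
    for (int k = 0; k < K; k++) {
        const __fp16 *a = Apanel + k * 8;   // 8 A values per K step, one per output row
        const __fp16 *b = Bpanel + k * 24;  // 24 B values per K step, one per output column
        for (int row = 0; row < 8; row++) {
            for (int col = 0; col < 24; col++) {
                acc[row][col] += a[row] * b[col];
            }
        }
    }
    for (int row = 0; row < 8; row++) {
        for (int col = 0; col < 24; col++) {
            Cpanel[row * 24 + col] = acc[row][col];  // matches the store order of the common tail
        }
    }
}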
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp
deleted file mode 100644
index 603ad8dc0a..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-// Actual kernel implementations
-#include "a64_sgemm_12x8/generic.hpp"
-#include "a64_sgemm_12x8/a53.hpp"
-#include "a64_sgemm_12x8/a55.hpp"
-#include "a64_sgemm_12x8/a55r1.hpp"
-
-
-// 12x8 SGEMM "strategy" class.
-//
-// This describes the characteristics of a family of kernels, in terms of
-// the required interleave properties and the output block size.
-//
-// All kernels in the family must share these characteristics. The actual
-// kernel to be used can be chosen at runtime, based on the CPU_type
-// structure.
-class sgemm_12x8 {
-public:
- typedef float operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
-
- /* Describes the data layout for A input */
- static const int A_interleave = 8;
- static const int A_block = 1;
- static const int A_transpose = 0;
-
- /* Same for B input */
- static const int B_interleave = 12;
- static const int B_block = 1;
- static const int B_transpose = 1;
-
- /* Kernel blocking parameters */
- static const int out_width = 12;
- static const int out_height = 8;
- static const int k_unroll = 1;
-
- kern_type kernel{nullptr};
-
- sgemm_12x8(const CPUInfo *ci) {
- kernel = a64_sgemm_asimd_12x8;
- if (ci->CPU == CPUTarget::A53) {
- kernel = a64_sgemm_asimd_12x8_a53;
- }
- else if (ci->CPU == CPUTarget::A55) {
- kernel = a64_sgemm_asimd_12x8_a55;
- }
- else if (ci->CPU == CPUTarget::A55_DOT) {
- kernel = a64_sgemm_asimd_12x8_a55r1;
- }
- }
-};
-
-#endif // __aarch64__
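The strategy class above is the whole contract between a kernel family and the generic interleaved driver: typedefs, blocking constants, and a kern_type pointer chosen at construction from the CPUInfo. A hedged sketch of how such a class is consumed follows; the real driver lives in gemm_interleaved.hpp, and panel packing is omitted here, the panels being assumed pre-packed in the layout the A_/B_ constants describe.

// Illustrative only: generic code needs nothing from sgemm_12x8 beyond its
// typedefs, blocking constants and kernel pointer.
template <typename strategy>
void run_packed_block(const CPUInfo *ci,
                      const typename strategy::operand_type *Apanel,
                      const typename strategy::operand_type *Bpanel,
                      typename strategy::result_type *Cpanel,
                      int ablocks, int bblocks, int K) {
    strategy strat(ci);  // the constructor picks the generic/A53/A55/A55r1 variant
    // Produces ablocks x bblocks C blocks, each strategy::out_width wide and
    // strategy::out_height tall, exactly as the kernel comments document.
    strat.kernel(Apanel, Bpanel, Cpanel, ablocks, bblocks, K);
}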
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
deleted file mode 100644
index 1c9b4b38fc..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
+++ /dev/null
@@ -1,368 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k = ((K+1)/2) - 1;
-
- register float32x4_t a0 asm("v0");
- register float32x4_t a1 asm("v1");
- register float32x4_t b0 asm("v2");
- register float32x4_t b1 asm("v3");
- register float32x4_t b2 asm("v4");
- register float32x4_t a0a asm("v5");
- register float32x4_t a1a asm("v6");
-
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- "1:\n"
- // Unroll 0
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "nop\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
-
- "ldr %d[a0a], [%[a_ptr], #32]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
-
- "ldr %d[a1a], [%[a_ptr], #48]\n"
- "ins %[a0a].d[1], x20\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
-
- "ldr %d[b0], [%[b_ptr], #48]\n"
- "ins %[a1a].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
-
- ASM_PREFETCH("[%[a_ptr], #320]")
- "ins %[b0].d[1], x20\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
-
- ASM_PREFETCH("[%[b_ptr], #448]")
- "nop\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
-
- "ldr %d[b1], [%[b_ptr], #64]\n"
- "nop\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
-
- ASM_PREFETCH("[%[b_ptr], #512]")
- "ins %[b1].d[1], x20\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- // Unroll 1
- "ldr %d[b2], [%[b_ptr], #80]\n"
- "nop\n"
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
-
- "ldr %d[a0], [%[a_ptr], #64]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
-
- "ldr %d[a1], [%[a_ptr], #80]\n"
- "ins %[a0].d[1], x20\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
-
- "ldr %d[b0], [%[b_ptr], #96]\n"
- "ins %[a1].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
-
- "nop\n"
- "ins %[b0].d[1], x20\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
-
- "nop\n"
- "nop\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
-
- "ldr %d[b1], [%[b_ptr], #112]\n"
- "nop\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
-
- "nop\n"
- "ins %[b1].d[1], x20\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
-
- "bne 1b\n"
-
- // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
- "4:\n"
- "cbnz %w[oddk], 2f\n"
-
- // Detached final iteration. (even K)
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "nop\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
-
- "ldr %d[a0a], [%[a_ptr], #32]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
-
- "ldr %d[a1a], [%[a_ptr], #48]\n"
- "ins %[a0a].d[1], x20\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
-
- "ldr %d[b0], [%[b_ptr], #48]\n"
- "ins %[a1a].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
-
- "ins %[b0].d[1], x20\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
-
- "nop\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
-
- "ldr %d[b1], [%[b_ptr], #64]\n"
- "nop\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
-
- "ins %[b1].d[1], x20\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- "ldr %d[b2], [%[b_ptr], #80]\n"
- "nop\n"
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
-
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "b 3f\n"
-
- // Detached final iteration. (odd K)
- "2:\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "nop\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
-
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- // Common tail
- "3:\n"
- "str q8, [%[c_ptr]]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "str q15, [%[c_ptr], #336]\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
- }
-}
-
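What sets this A53-scheduled variant apart from the generic 12x8 kernel is how operands are loaded: each 128-bit load is split into a 64-bit ldr into the low half of the vector register, a 64-bit ldr into x20, and an ins into the high half, with nops padding the issue slots. On the in-order, two-wide Cortex-A53 this lets the loads dual-issue alongside fmla instructions rather than stalling behind a full Q-register load. A minimal, illustrative comparison of the two load forms (not library code):

#include <arm_neon.h>

// Single 128-bit load, as issued by the generic 12x8 kernel.
static inline float32x4_t load_q(const float *ptr) {
    float32x4_t v;
    __asm __volatile("ldr %q[v], [%[ptr]]\n"
                     : [v] "=w"(v) : [ptr] "r"(ptr) : "memory");
    return v;
}

// Split load, as issued by the A53-tuned kernel: low half, GPR, insert high half.
static inline float32x4_t load_q_split(const float *ptr) {
    float32x4_t v;
    __asm __volatile("ldr %d[v], [%[ptr]]\n"
                     "ldr x20, [%[ptr], #8]\n"
                     "ins %[v].d[1], x20\n"
                     : [v] "=&w"(v) : [ptr] "r"(ptr) : "x20", "memory");
    return v;
}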
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp
deleted file mode 100644
index 85d8a502f8..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp
+++ /dev/null
@@ -1,368 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-inline void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k = ((K+1)/2) - 1;
-
- register float32x4_t a0 asm("v0");
- register float32x4_t a1 asm("v1");
- register float32x4_t b0 asm("v2");
- register float32x4_t b1 asm("v3");
- register float32x4_t b2 asm("v4");
- register float32x4_t a0a asm("v5");
- register float32x4_t a1a asm("v6");
-
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- "1:\n"
- // Unroll 0
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
-
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "subs %w[k], %w[k], #1\n"
-
-
- "ldr %d[a0a], [%[a_ptr], #32]\n"
- "ins %[b2].d[1], x20\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
-
- "ldr %d[a1a], [%[a_ptr], #48]\n"
- "ins %[a0a].d[1], x20\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
-
- "ldr %d[b0], [%[b_ptr], #48]\n"
- "ins %[a1a].d[1], x20\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
-
- "ldr %d[b1], [%[b_ptr], #64]\n"
- "ins %[b0].d[1], x20\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
-
-
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- ASM_PREFETCH("[%[b_ptr], #512]")
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- // Unroll 1
- "ldr %d[b2], [%[b_ptr], #80]\n"
- "ins %[b1].d[1], x20\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
-
- "ldr %d[a0], [%[a_ptr], #64]\n"
- "ins %[b2].d[1], x20\n"
-
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
-
- "ldr %d[a1], [%[a_ptr], #80]\n"
- "ins %[a0].d[1], x20\n"
-
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
-
-
- "ldr %d[b0], [%[b_ptr], #96]\n"
- "ins %[a1].d[1], x20\n"
-
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
-
- "ldr %d[b1], [%[b_ptr], #112]\n"
- "ins %[b0].d[1], x20\n"
-
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
-
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
-
-
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "ins %[b1].d[1], x20\n"
-
-
- "bne 1b\n"
-
- // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
- "4:\n"
- "cbnz %w[oddk], 2f\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
-
- // Detached final iteration. (even K)
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
-
- "ldr %d[a0a], [%[a_ptr], #32]\n"
- "ins %[b2].d[1], x20\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
-
- "ldr %d[a1a], [%[a_ptr], #48]\n"
- "ins %[a0a].d[1], x20\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
-
- "ldr %d[b0], [%[b_ptr], #48]\n"
- "ins %[a1a].d[1], x20\n"
-
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
-
- "ldr %d[b1], [%[b_ptr], #64]\n"
- "ins %[b0].d[1], x20\n"
-
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- "ldr %d[b2], [%[b_ptr], #80]\n"
- "ins %[b1].d[1], x20\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
-
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "b 3f\n"
-
- // Detached final iteration. (odd K)
- "2:\n"
-
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
-
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- // Common tail
- "3:\n"
- "str q8, [%[c_ptr]]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "str q15, [%[c_ptr], #336]\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
- }
-}
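All of these kernels prime and maintain the caches through the ASM_PREFETCH family of macros, defined in the likewise-removed asmlib.hpp. As an assumption for readers rather than a quote of that header, they are thin wrappers around AArch64 PRFM hints, roughly along the lines below: ASM_PREFETCH a load hint into L1, and the ASM_PREFETCHW/ASM_PREFETCHWL2 variants seen in the FP16 kernels store hints targeting L1 and L2 respectively.

// Assumed, illustrative definitions only -- the real ones live in asmlib.hpp.
#define ASM_PREFETCH(address)    "PRFM PLDL1KEEP, " address "\n"
#define ASM_PREFETCHW(address)   "PRFM PSTL1KEEP, " address "\n"
#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"

Used as in the kernels above, ASM_PREFETCH("[%[b_ptr], #64]") then pastes together into a single "PRFM PLDL1KEEP, [%[b_ptr], #64]" line inside the asm string.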
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp
deleted file mode 100644
index 295308053f..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-inline void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k = ((K+1)/2) - 1;
-
- register float32x4_t a0 asm("v0");
- register float32x4_t a1 asm("v1");
- register float32x4_t b0 asm("v2");
- register float32x4_t b1 asm("v3");
- register float32x4_t b2 asm("v4");
- register float32x4_t a0a asm("v5");
- register float32x4_t a1a asm("v6");
-
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "ldp %q[a0], %q[a1], [%[a_ptr]]\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
-
- ASM_PREFETCH("[%[a_ptr], #128]")
- ASM_PREFETCH("[%[a_ptr], #192]")
- "ldp %q[b0], %q[b1], [%[b_ptr]]\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
-
- ASM_PREFETCH("[%[b_ptr], #128]")
- ASM_PREFETCH("[%[b_ptr], #192]")
- ASM_PREFETCH("[%[b_ptr], #256]")
-
- ASM_PREFETCH("[%[a_ptr], #256]")
- ASM_PREFETCH("[%[a_ptr], #320]")
- ASM_PREFETCH("[%[a_ptr], #384]")
-
- ASM_PREFETCH("[%[b_ptr], #320]")
- ASM_PREFETCH("[%[b_ptr], #384]")
- ASM_PREFETCH("[%[b_ptr], #448]")
- ASM_PREFETCH("[%[b_ptr], #512]")
-
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
-
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- "1:\n"
- // Unroll 0
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "ins %[a0a].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "ins %[a1a].d[1], x20\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- ASM_PREFETCH("[%[a_ptr], #448]")
-
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- ASM_PREFETCH("[%[b_ptr], #576]")
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- // Unroll 1
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "ldr %d[a0], [%[a_ptr], #64]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "ldr %d[a1], [%[a_ptr], #80]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "ins %[a0].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
-
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "ins %[a1].d[1], x20\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
-
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
-
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- ASM_PREFETCH("[%[b_ptr], #640]")
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
-
- "bne 1b\n"
-
- // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
- "4:\n"
- "cbnz %w[oddk], 2f\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
-
- // Detached final iteration. (even K)
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
-
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "ins %[a0a].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
-
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "ins %[a1a].d[1], x20\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
-
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "ins %[b2].d[1], x20\n"
-
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "b 3f\n"
-
- // Detached final iteration. (odd K)
- "2:\n"
-
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- // Common tail
- "3:\n"
- "str q8, [%[c_ptr]]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "str q15, [%[c_ptr], #336]\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
- }
-}
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
deleted file mode 100644
index c4a5875a31..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <arm_neon.h>
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 12x8), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
-
-inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump=0, long int block_jump=0) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k = ((K+1)/2) - 1;
-
- register float32x4_t a0 asm("v0");
- register float32x4_t a1 asm("v1");
- register float32x4_t b0 asm("v2");
- register float32x4_t b1 asm("v3");
- register float32x4_t b2 asm("v4");
- register float32x4_t a0a asm("v5");
- register float32x4_t a1a asm("v6");
-
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- // Loop proper
- "1:\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr %q[a0], [%[a_ptr], #64]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "ldr %q[a1], [%[a_ptr], #80]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- ASM_PREFETCH("[%[b_ptr], #512]")
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "ldr %q[b1], [%[b_ptr], #112]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "bne 1b\n"
-
- // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
- "4:\n"
-
- // Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
-
- // Detached final iteration (even K)
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "add %[b_ptr], %[b_ptr], %[block_jump]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "str q8, [%[c_ptr], #0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "str q24, [%[c_ptr], #32]\n"
-
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- "b 3f\n"
-
- // Detached final iteration (odd K)
- "2:\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "str q9, [%[c_ptr], #48]\n"
-
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- // Common tail
- "3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
- }
-}
-
-inline void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0);
-}
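Each call to the 12x8 SGEMM kernel removed above processes ablocks x bblocks interleaved panels and stores the raw products as contiguous 8x12 blocks; alpha and beta are applied later by MergeResults. The following is a minimal scalar sketch of that computation, assuming the panel layout implied by the loads and stores (8 A values and 12 B values per K step, ignoring the row_jump/block_jump variants); it is an illustration, not part of the patch.

// Scalar model of a64_sgemm_asimd_12x8 (illustrative; no prefetching,
// no row_jump/block_jump handling).
inline void sgemm_12x8_reference(const float *Apanel, const float *Bpanel,
                                 float *Cpanel, int ablocks, int bblocks, int K) {
    const float *a_base = Apanel;
    float *c_ptr = Cpanel;

    for (int yb = 0; yb < ablocks; yb++) {
        const float *b_ptr = Bpanel;

        for (int xb = 0; xb < bblocks; xb++) {
            const float *a_ptr = a_base;
            float acc[8][12] = {};   // the 24 q-register accumulators v8..v31

            for (int k = 0; k < K; k++) {
                for (int r = 0; r < 8; r++) {
                    for (int c = 0; c < 12; c++) {
                        acc[r][c] += a_ptr[r] * b_ptr[c];
                    }
                }
                a_ptr += 8;    // two q registers of A per K step
                b_ptr += 12;   // three q registers of B per K step
            }

            // Written out row-major, 384 bytes per block, matching the stores above.
            for (int r = 0; r < 8; r++) {
                for (int c = 0; c < 12; c++) {
                    *c_ptr++ = acc[r][c];
                }
            }
        }
        a_base += 8 * K;       // advance to the next block of 8 A rows
    }
}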
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp
deleted file mode 100644
index 2a39ca1f07..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-// Actual kernel implementations
-#include "generic.hpp"
-
-// Transposed SGEMV strategy class.
-class sgemv_trans {
-public:
- typedef float operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
-
- /* Kernel blocking parameters */
- static const int out_width = 12;
- static const int k_unroll = 1;
-
- kern_type kernel;
-
- sgemv_trans(const CPUInfo *ci) {
- kernel = a64_sgemv_trans;
- }
-};
-
-#endif // __aarch64__
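The deleted strategy class above only describes the transposed-SGEMV kernel: its operand and result types, the blocking width, and the kernel entry point. A hedged sketch of how such a strategy is consumed follows; the wrapper name is illustrative and the real wrapper code is not part of this hunk.

// Illustrative only: pick up the kernel pointer from the strategy and call
// it with the signature declared by kern_type.
inline void run_sgemv_trans(const CPUInfo *ci, const float *A, const float *X,
                            float *Y, float alpha, int lda, int M, int N) {
    sgemv_trans strat(ci);                    // selects a64_sgemv_trans
    strat.kernel(A, X, Y, alpha, lda, M, N);  // (A, x, y, alpha, lda, M, N)
}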
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp
deleted file mode 100644
index 33f2b701cf..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp
+++ /dev/null
@@ -1,913 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <arm_neon.h>
-#include "asmlib.hpp"
-
-// Kernel implementation - transposed GEMV
-//
-// The kernel will process "M" rows of A (= steps of dot product) and "N"
-// columns (= dot products total)
-//
-// General plan is to do as many columns simultaneously as possible - a
-// reasonable limit is half the NEON regfile = 64 total accumulators.
-//
-// It's possible that messing around with sub-blocking M and N can yield
-// higher performance, but that's left to the outer loop. In this kernel we
-// process all of M at the same time.
-
-
-// How far ahead to prefetch for the first and subsequent prefetches.
-// These values work for A72 on JunoR2...
-
-#define FIRST_PFD 9
-#define PFD 6
-
-inline void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float alpha, int lda, int M, int N) {
- const float *a_ptr_base = Astart;
- float *y_ptr = Ystart;
-
- register const float32x4_t va asm("v1") = vdupq_n_f32(alpha);
-
- int firstpfd=FIRST_PFD;
- if (firstpfd > M) {
- firstpfd = (M-1);
- }
-
- int pfd = PFD;
- if (pfd > M) {
- pfd = (M-1);
- }
-
- ptrdiff_t jump = lda * sizeof(int);
-
- for (;N>=96;N-=96) {
- int k = M-1;
-
- const float *a_ptr = a_ptr_base;
- const float *x_ptr = Xstart;
- const float *pf_ptr = a_ptr;
- const float *firstpf_ptr = a_ptr;
- const float *pf_limit = a_ptr + (M * lda);
-
- for (int i=0; i<firstpfd; i++) {
- prefetch_1x(firstpf_ptr);
- firstpf_ptr += lda;
- }
-
- for (int i=0; i<pfd; i++) {
- prefetch_5x(pf_ptr + 16);
- pf_ptr += lda;
- }
-
- a_ptr_base += 96;
-
- __asm __volatile (
- "movi v8.4s,#0x0\n"
- "ldr w0, [%[x_ptr]]\n"
- "movi v9.4s,#0x0\n"
- "ldr q2, [%[a_ptr], #0]\n"
- "movi v10.4s,#0x0\n"
- "ldr q3, [%[a_ptr], #0x10]\n"
- "movi v11.4s,#0x0\n"
- "ldr q4, [%[a_ptr], #0x20]\n"
- "movi v12.4s,#0x0\n"
- "ldr q5, [%[a_ptr], #0x30]\n"
- "movi v13.4s,#0x0\n"
- "ldr q6, [%[a_ptr], #0x40]\n"
- "movi v14.4s,#0x0\n"
- "ldr q7, [%[a_ptr], #0x50]\n"
- "movi v15.4s,#0x0\n"
- ASM_PREFETCH("[%[firstpf_ptr]]")
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[pf_ptr], #64]")
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[pf_ptr], #128]")
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[pf_ptr], #192]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- ASM_PREFETCH("[%[pf_ptr], #256]")
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- ASM_PREFETCH("[%[pf_ptr], #320]")
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "add %[pf_ptr], %[pf_ptr], %[jump]\n"
- "movi v28.4s, #0x0\n"
- "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip everything if there are no iterations of the main loop to do.
- "cbz %w[k], 10f\n"
-
- // Loop with all prefetches. Exit this loop when firstpf_ptr
- // hits pf_limit.
- "1:\n"
- "dup v0.4s, w0\n"
- "ldr w0, [%[x_ptr], #4]\n"
- "add %[x_ptr], %[x_ptr], #0x4\n"
- "fmla v8.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x60]\n"
- "fmla v9.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x70]\n"
- ASM_PREFETCH("[%[firstpf_ptr]]")
- "fmla v10.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x80]\n"
- "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
- "fmla v11.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x90]\n"
- "sub %w[k], %w[k], #1\n"
- ASM_PREFETCH("[%[x_ptr], #128]")
- "fmla v12.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0xa0]\n"
- "fmla v13.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0xb0]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x40]")
- "fmla v14.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0xc0]\n"
- "fmla v15.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0xd0]\n"
- "fmla v16.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0xe0]\n"
- "fmla v17.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0xf0]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x80]")
- "fmla v18.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x100]\n"
- "fmla v19.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x110]\n"
- "fmla v20.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x120]\n"
- "fmla v21.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x130]\n"
- ASM_PREFETCH("[%[pf_ptr], #0xc0]")
- "fmla v22.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x140]\n"
- "fmla v23.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x150]\n"
- "fmla v24.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x160]\n"
- "fmla v25.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x170]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x100]")
- "add %[a_ptr], %[a_ptr], %[jump]\n"
- "fmla v26.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x00]\n"
- "fmla v27.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x10]\n"
- "fmla v28.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x20]\n"
- "fmla v29.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x30]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x140]")
- "fmla v30.4s, v6.4s, v0.4s\n"
- "add %[pf_ptr], %[pf_ptr], %[jump]\n"
- "ldr q6, [%[a_ptr], #0x40]\n"
- "fmla v31.4s, v7.4s, v0.4s\n"
- "cmp %[firstpf_ptr], %[pf_limit]\n"
- "ldr q7, [%[a_ptr], #0x50]\n"
- "blt 1b\n"
-
- // Check that there are still "main" prefetches to do.
- "cmp %[pf_ptr], %[pf_limit]\n"
- "bge 9f\n"
-
- // Just the main prefetches, exit this loop when pf_ptr hits pf_limit.
- "8:\n"
- "dup v0.4s, w0\n"
- "ldr w0, [%[x_ptr], #4]\n"
- "add %[x_ptr], %[x_ptr], #0x4\n"
- "fmla v8.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x60]\n"
- "fmla v9.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x70]\n"
- "fmla v10.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x80]\n"
- "fmla v11.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x90]\n"
- "sub %w[k], %w[k], #1\n"
- ASM_PREFETCH("[%[x_ptr], #128]")
- "fmla v12.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0xa0]\n"
- "fmla v13.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0xb0]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x40]")
- "fmla v14.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0xc0]\n"
- "fmla v15.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0xd0]\n"
- "fmla v16.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0xe0]\n"
- "fmla v17.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0xf0]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x80]")
- "fmla v18.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x100]\n"
- "fmla v19.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x110]\n"
- "fmla v20.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x120]\n"
- "fmla v21.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x130]\n"
- ASM_PREFETCH("[%[pf_ptr], #0xc0]")
- "fmla v22.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x140]\n"
- "fmla v23.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x150]\n"
- "fmla v24.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x160]\n"
- "fmla v25.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x170]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x100]")
- "add %[a_ptr], %[a_ptr], %[jump]\n"
- "fmla v26.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x00]\n"
- "fmla v27.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x10]\n"
- "fmla v28.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x20]\n"
- "fmla v29.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x30]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x140]")
- "fmla v30.4s, v6.4s, v0.4s\n"
- "add %[pf_ptr], %[pf_ptr], %[jump]\n"
- "ldr q6, [%[a_ptr], #0x40]\n"
- "fmla v31.4s, v7.4s, v0.4s\n"
- "cmp %[pf_ptr], %[pf_limit]\n"
- "ldr q7, [%[a_ptr], #0x50]\n"
- "blt 8b\n"
-
- // Check that there is still work to do.
- "9:\n"
- "cmp %w[k], #0\n"
- "beq 10f\n"
-
- // Loop without prefetches, exit when k hits 0.
- "2:\n"
- "dup v0.4s, w0\n"
- "ldr w0, [%[x_ptr], #4]\n"
- "add %[x_ptr], %[x_ptr], #0x4\n"
- "fmla v8.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x60]\n"
- "fmla v9.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x70]\n"
- "fmla v10.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x80]\n"
- "fmla v11.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x90]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v12.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0xa0]\n"
- "fmla v13.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0xb0]\n"
- "fmla v14.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0xc0]\n"
- "fmla v15.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0xd0]\n"
- "fmla v16.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0xe0]\n"
- "fmla v17.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0xf0]\n"
- "fmla v18.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x100]\n"
- "fmla v19.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x110]\n"
- "fmla v20.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x120]\n"
- "fmla v21.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x130]\n"
- "fmla v22.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x140]\n"
- "fmla v23.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x150]\n"
- "fmla v24.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x160]\n"
- "fmla v25.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x170]\n"
- "add %[a_ptr], %[a_ptr], %[jump]\n"
- "fmla v26.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x00]\n"
- "fmla v27.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x10]\n"
- "fmla v28.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x20]\n"
- "fmla v29.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x30]\n"
- "fmla v30.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x40]\n"
- "fmla v31.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x50]\n"
- "bne 2b\n"
-
- "10:\n"
-
- // Final iteration
- "dup v0.4s, w0\n"
- "fmla v8.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x60]\n"
- "fmla v9.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x70]\n"
- "fmla v10.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x80]\n"
- "fmla v11.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x90]\n"
- "fmla v12.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0xa0]\n"
- "fmla v13.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0xb0]\n"
- "fmla v14.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0xc0]\n"
- "fmla v15.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0xd0]\n"
- "fmla v16.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0xe0]\n"
- "fmla v17.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0xf0]\n"
- "fmla v18.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x100]\n"
- "fmla v19.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x110]\n"
- "fmla v20.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x120]\n"
- "fmla v21.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x130]\n"
- "fmla v22.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x140]\n"
- "fmla v23.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x150]\n"
- "fmla v24.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x160]\n"
- "fmla v25.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x170]\n"
- "fmla v26.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[y_ptr]]\n"
- "fmla v27.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[y_ptr], #0x10]\n"
- "fmla v28.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[y_ptr], #0x20]\n"
- "fmla v29.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[y_ptr], #0x30]\n"
- "fmla v30.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[y_ptr], #0x40]\n"
- "fmla v31.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[y_ptr], #0x50]\n"
-
- "fmla v2.4s, v8.4s, %[va].4s\n"
- "ldr q8, [%[y_ptr], #0x60]\n"
- "fmla v3.4s, v9.4s, %[va].4s\n"
- "ldr q9, [%[y_ptr], #0x70]\n"
- "fmla v4.4s, v10.4s, %[va].4s\n"
- "ldr q10, [%[y_ptr], #0x80]\n"
- "fmla v5.4s, v11.4s, %[va].4s\n"
- "ldr q11, [%[y_ptr], #0x90]\n"
- "fmla v6.4s, v12.4s, %[va].4s\n"
- "ldr q12, [%[y_ptr], #0xa0]\n"
- "str q2, [%[y_ptr], #0x00]\n"
- "fmla v7.4s, v13.4s, %[va].4s\n"
- "ldr q13, [%[y_ptr], #0xb0]\n"
- "str q3, [%[y_ptr], #0x10]\n"
- "fmla v8.4s, v14.4s, %[va].4s\n"
- "ldr q14, [%[y_ptr], #0xc0]\n"
- "str q4, [%[y_ptr], #0x20]\n"
- "fmla v9.4s, v15.4s, %[va].4s\n"
- "ldr q15, [%[y_ptr], #0xd0]\n"
- "str q5, [%[y_ptr], #0x30]\n"
- "fmla v10.4s, v16.4s, %[va].4s\n"
- "ldr q16, [%[y_ptr], #0xe0]\n"
- "str q6, [%[y_ptr], #0x40]\n"
- "fmla v11.4s, v17.4s, %[va].4s\n"
- "ldr q17, [%[y_ptr], #0xf0]\n"
- "str q7, [%[y_ptr], #0x50]\n"
- "fmla v12.4s, v18.4s, %[va].4s\n"
- "ldr q18, [%[y_ptr], #0x100]\n"
- "str q8, [%[y_ptr], #0x60]\n"
- "fmla v13.4s, v19.4s, %[va].4s\n"
- "ldr q19, [%[y_ptr], #0x110]\n"
- "str q9, [%[y_ptr], #0x70]\n"
- "fmla v14.4s, v20.4s, %[va].4s\n"
- "ldr q20, [%[y_ptr], #0x120]\n"
- "str q10, [%[y_ptr], #0x80]\n"
- "fmla v15.4s, v21.4s, %[va].4s\n"
- "ldr q21, [%[y_ptr], #0x130]\n"
- "str q11, [%[y_ptr], #0x90]\n"
- "fmla v16.4s, v22.4s, %[va].4s\n"
- "ldr q22, [%[y_ptr], #0x140]\n"
- "str q12, [%[y_ptr], #0xa0]\n"
- "fmla v17.4s, v23.4s, %[va].4s\n"
- "ldr q23, [%[y_ptr], #0x150]\n"
- "str q13, [%[y_ptr], #0xb0]\n"
- "fmla v18.4s, v24.4s, %[va].4s\n"
- "ldr q24, [%[y_ptr], #0x160]\n"
- "str q14, [%[y_ptr], #0xc0]\n"
- "fmla v19.4s, v25.4s, %[va].4s\n"
- "ldr q25, [%[y_ptr], #0x170]\n"
- "str q15, [%[y_ptr], #0xd0]\n"
- "fmla v20.4s, v26.4s, %[va].4s\n"
- "str q16, [%[y_ptr], #0xe0]\n"
- "fmla v21.4s, v27.4s, %[va].4s\n"
- "str q17, [%[y_ptr], #0xf0]\n"
- "fmla v22.4s, v28.4s, %[va].4s\n"
- "str q18, [%[y_ptr], #0x100]\n"
- "fmla v23.4s, v29.4s, %[va].4s\n"
- "str q19, [%[y_ptr], #0x110]\n"
- "fmla v24.4s, v30.4s, %[va].4s\n"
- "str q20, [%[y_ptr], #0x120]\n"
- "fmla v25.4s, v31.4s, %[va].4s\n"
- "str q21, [%[y_ptr], #0x130]\n"
-
- "stp q22, q23, [%[y_ptr], #0x140]\n"
- "stp q24, q25, [%[y_ptr], #0x160]\n"
- "add %[y_ptr], %[y_ptr], #0x180\n"
-
- : [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr), [y_ptr] "+r" (y_ptr), [k] "+r" (k), [pf_ptr] "+r" (pf_ptr), [firstpf_ptr] "+r" (firstpf_ptr)
- : [jump] "r" (jump), [va] "w" (va), [pf_limit] "r" (pf_limit)
- : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
- "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
-
- if (N>0) {
- // Handle N tail - up to 95 stragglers.
-        // This is 0-23 vectors, plus optionally a 64-bit vector and/or a
- // single value for the remainder.
-
- // Independent pointers into the matrix for the odd 2 and odd 1.
-        // They double up as flags to indicate whether they are needed.
- const float *odd2_aptr=NULL;
- const float *odd1_aptr=NULL;
-
- // Figure out how much work we need to do.
- int numvecs = N/4;
- int rem = N%4;
- int k=M;
-
- // Set up pointers for the odd 2/1 if needed.
- if (rem >= 2) {
- odd2_aptr = a_ptr_base + (numvecs * 4);
- }
-
- if (rem & 1) {
- odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr==NULL ? 0 : 2);
- }
-
- const float *a_ptr = a_ptr_base;
- const float *firstpf_ptr = a_ptr_base;
- const float *pf_ptr = a_ptr_base;
- const float *pf_limit = a_ptr + (M * lda);
-
- const float *x_ptr = Xstart;
- int vecs=0; // Working variable to count how many vectors to work on.
- int dopf=1; // Track whether we are doing prefetches.
-
- // Figure out how many cache lines we need to prefetch each time.
- int numpfs = (N + 15) / 16;
-
- // Do initial prefetches
- for (int i=0; i<firstpfd+1; i++) {
- prefetch_1x(firstpf_ptr);
- firstpf_ptr += lda;
- }
-
- // Do "main" prefetches - adapt number to the number we actually need.
- if (numpfs > 1) {
- for (int i=0; i<pfd+1; i++) {
- switch (numpfs) {
- case 2:
- prefetch_1x(pf_ptr + 16);
- break;
-
- case 3:
- prefetch_2x(pf_ptr + 16);
- break;
-
- case 4:
- prefetch_3x(pf_ptr + 16);
- break;
-
- case 5:
- prefetch_4x(pf_ptr + 16);
- break;
-
- case 6:
- prefetch_5x(pf_ptr + 16);
- break;
- }
- pf_ptr += lda;
- }
- } else {
- // Just disable additional prefetches
- dopf=0;
- }
-
- // Do the real work
- __asm __volatile (
- // Initialize all the vectors - not worth skipping this if only
- // some are needed.
- "movi v8.4s,#0x0\n"
- "ldr w0, [%[x_ptr]]\n"
- "movi v9.4s,#0x0\n"
- "movi v10.4s,#0x0\n"
- "movi v11.4s,#0x0\n"
- "movi v12.4s,#0x0\n"
- "movi v13.4s,#0x0\n"
- "movi v14.4s,#0x0\n"
- "movi v15.4s,#0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v6.2s, #0x0\n"
- "movi v5.2s, #0x0\n"
-
- "1:\n"
- ASM_PREFETCH("[%[firstpf_ptr]]\n")
- "11:\n"
- "dup v0.4s, w0\n"
- "ldr w0, [%[x_ptr], #4]\n"
- "add %[x_ptr], %[x_ptr], #4\n"
-
- "cbz %w[numvecs], 2f\n"
- "mov %w[vecs], %w[numvecs]\n"
-
- // Vector 0
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x00]\n"
- "fmla v8.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 1
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x10]\n"
- "fmla v9.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 2
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x20]\n"
- "fmla v10.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 3
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x30]\n"
- "fmla v11.4s, v7.4s, v0.4s\n"
- // Prefetch
- "cbz %w[dopf], 3f\n"
- ASM_PREFETCH("[%[pf_ptr], #0x40]")
- "3:\n"
- "beq 2f\n"
-
- // Vector 4
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x40]\n"
- "fmla v12.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 5
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x50]\n"
- "fmla v13.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 6
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x60]\n"
- "fmla v14.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 7
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x70]\n"
- "fmla v15.4s, v7.4s, v0.4s\n"
- // Prefetch
- "cbz %w[dopf], 4f\n"
- ASM_PREFETCH("[%[pf_ptr], #0x80]")
- "4:\n"
- "beq 2f\n"
-
- // Vector 8
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x80]\n"
- "fmla v16.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 9
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x90]\n"
- "fmla v17.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 10
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0xa0]\n"
- "fmla v18.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 11
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0xb0]\n"
- "fmla v19.4s, v7.4s, v0.4s\n"
- // Prefetch
- "cbz %w[dopf], 5f\n"
- ASM_PREFETCH("[%[pf_ptr], #0xc0]")
- "5:\n"
- "beq 2f\n"
-
- // Vector 12
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0xc0]\n"
- "fmla v20.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 13
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0xd0]\n"
- "fmla v21.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 14
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0xe0]\n"
- "fmla v22.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 15
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0xf0]\n"
- "fmla v23.4s, v7.4s, v0.4s\n"
- // Prefetch
- "cbz %w[dopf], 6f\n"
- ASM_PREFETCH("[%[pf_ptr], #0x100]")
- "6:\n"
- "beq 2f\n"
-
- // Vector 16
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x100]\n"
- "fmla v24.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 17
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x110]\n"
- "fmla v25.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 18
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x120]\n"
- "fmla v26.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 19
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x130]\n"
- "fmla v27.4s, v7.4s, v0.4s\n"
- // Prefetch
- "cbz %w[dopf], 7f\n"
- ASM_PREFETCH("[%[pf_ptr], #0x140]")
- "7:\n"
- "beq 2f\n"
-
- // Vector 20
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x140]\n"
- "fmla v28.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 21
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x150]\n"
- "fmla v29.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 22
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x160]\n"
- "fmla v30.4s, v7.4s, v0.4s\n"
-
- "2:\n"
- "add %[a_ptr], %[a_ptr], %[jump]\n"
-
- // Do the odd 2-vector, if needed
- "cbz %[odd2_aptr], 8f\n"
- "ldr d7, [%[odd2_aptr]]\n"
- "fmla v6.2s, v7.2s, v0.2s\n"
- "add %[odd2_aptr], %[odd2_aptr], %[jump]\n"
-
- "8:\n"
- // Do the odd 1-vector, if needed
- "cbz %[odd1_aptr], 9f\n"
- "ldr s7, [%[odd1_aptr]]\n"
- "fmla v5.2s, v7.2s, v0.2s\n"
- "add %[odd1_aptr], %[odd1_aptr], %[jump]\n"
-
- // Get out if needed.
- "9:\n"
- "subs %w[k], %w[k], #1\n"
- "beq 10f\n"
-
- // Update the "main" prefetch pointer, if it strays beyond the limit turn off "dopf"
- "add %[pf_ptr], %[pf_ptr], %[jump]\n"
- "cmp %[pf_ptr], %[pf_limit]\n"
- "csel %w[dopf], %w[dopf], WZR, LT\n"
-
- // Update the "leading" prefetch pointer, don't do the first
- // instruction of the loop if it's over the limit.
- "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
- "cmp %[firstpf_ptr], %[pf_limit]\n"
- "blt 1b\n"
- "b 11b\n"
-
- // Now write out the outputs
- "10:\n"
- "cbz %w[numvecs], 12f\n"
- "mov %w[vecs], %w[numvecs]\n"
-
- // Vector 0
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v8.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 1
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v9.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 2
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v10.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 3
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v11.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 4
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v12.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 5
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v13.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 6
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v14.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 7
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v15.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 8
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v16.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 9
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v17.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 10
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v18.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 11
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v19.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 12
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v20.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 13
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v21.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 14
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v22.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 15
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v23.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 16
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v24.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 17
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v25.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 18
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v26.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 19
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v27.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 20
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v28.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 21
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v29.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 22
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v30.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
-
- // Odd 2
- "12:\n"
- "cbz %[odd2_aptr], 13f\n"
- "ldr d7, [%[y_ptr]]\n"
- "fmla v7.2s, v6.2s, %[va].2s\n"
- "str d7, [%[y_ptr]], #0x8\n"
-
- // Odd 1
- "13:\n"
- "cbz %[odd1_aptr], 14f\n"
- "ldr s7, [%[y_ptr]]\n"
- "fmla v7.2s, v5.2s, %[va].2s\n"
- "str s7, [%[y_ptr]]\n"
-
- "14:\n"
- : [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr), [y_ptr] "+r" (y_ptr), [k] "+r" (k),
- [pf_ptr] "+r" (pf_ptr), [firstpf_ptr] "+r" (firstpf_ptr),
- [odd1_aptr] "+r" (odd1_aptr), [odd2_aptr] "+r" (odd2_aptr),
- [dopf] "+r" (dopf), [vecs] "+r" (vecs)
- : [jump] "r" (jump), [va] "w" (va), [pf_limit] "r" (pf_limit), [numvecs] "r" (numvecs)
- : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
- "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
-}
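The header comment of this deleted kernel describes the blocking; functionally the routine computes y += alpha * (A^T x) for an M x N matrix A stored with row stride lda, handling the N tail as up to 23 four-wide vectors plus an optional 2-wide and an optional single column. A minimal scalar sketch of the same operation (illustrative only, not a replacement):

// Scalar reference for a64_sgemv_trans: y[n] += alpha * sum_m A[m*lda + n] * x[m].
inline void sgemv_trans_reference(const float *A, const float *x, float *y,
                                  float alpha, int lda, int M, int N) {
    for (int n = 0; n < N; n++) {
        float acc = 0.0f;
        for (int m = 0; m < M; m++) {
            acc += A[m * lda + n] * x[m];
        }
        y[n] += alpha * acc;
    }
}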
diff --git a/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp b/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp
deleted file mode 100644
index 6731480fca..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-template<unsigned int width, unsigned int height, typename Tin, typename Tout>
-void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta) {
- int full_y_blocks = (ymax - y0) / height;
- int y_remainder = (ymax - y0) % height;
- int y_blocks = full_y_blocks + (y_remainder ? 1 : 0);
-
- int full_x_blocks = (xmax - x0) / width;
- int x_remainder = (xmax - x0) % width;
- int x_blocks = full_x_blocks + (x_remainder ? 1 : 0);
-
- for (int y_block = 0; y_block < y_blocks; y_block++) {
- int ybase = y0 + (y_block * height);
-
- int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
-
- for (int x_block = 0; x_block < x_blocks; x_block++) {
- int xbase = x0 + (x_block * width);
-
- int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;
-
- for (int row=0; row < fill_rows; row++) {
- for (int col=0; col < fill_cols; col++) {
- Tout &p = out[(ybase + row) * ldc + xbase + col];
-
- p = (p * alpha) + (beta * in[row * width + col]);
- }
- }
-
- in += (width * height);
- }
- }
-}
-
-#include "merges/list.hpp"
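The generic MergeResults above unblocks a width x height interleaved result panel into the output matrix, combining each element with the existing output using alpha and beta, and clamping fill_rows/fill_cols at the ragged edges. A hedged usage sketch with illustrative buffer names:

// Illustrative only: merge an interleaved 12x8 result panel (as produced by
// the AArch64 SGEMM kernel) into row-major C with leading dimension ldc,
// covering rows [0, M) and columns [0, N), scaled with alpha and beta as above.
inline void merge_sgemm_result(float *C, const float *c_panel, int ldc,
                               int M, int N, float alpha, float beta) {
    MergeResults<12, 8>(C, c_panel, ldc, 0, M, 0, N, alpha, beta);
}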
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp
deleted file mode 100644
index ddd67e8ee2..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __arm__
-
-#include "../asmlib.hpp"
-
-#include <arm_neon.h>
-
-template<>
-inline void MergeResults<8, 6>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) {
- const float *inptr = in;
-// prefetch_6x(inptr);
-// prefetch_6x(inptr + 96);
-
- float32x4_t av = vdupq_n_f32(alpha);
- float32x4_t bv = vdupq_n_f32(beta);
-
- for (int y=y0; y<ymax; y+=8) {
- float *outptr0 = out + (y * ldout) + x0;
- float *outptr1 = outptr0 + ldout;
- float *outptr2 = outptr1 + ldout;
- float *outptr3 = outptr2 + ldout;
- float *outptr4 = outptr3 + ldout;
- float *outptr5 = outptr4 + ldout;
-
-// prefetch_2x(outptr0);
-// prefetch_2x(outptr1);
-// prefetch_2x(outptr2);
-// prefetch_2x(outptr3);
-// prefetch_2x(outptr4);
-// prefetch_2x(outptr5);
-
- for (int i=x0; i<xmax; i+=8) {
- float dummyres[8];
-
- /* Make sure we throw away results if Y isn't a multiple of 8.
- * We do this by pointing the result pointer at a dummy buffer
- * we later discard. */
- if ((y+5) >= ymax) {
- switch ((y + 5) - ymax) {
- case 4:
- outptr1 = dummyres;
- case 3:
- outptr2 = dummyres;
- case 2:
- outptr3 = dummyres;
- case 1:
- outptr4 = dummyres;
- case 0:
- outptr5 = dummyres;
- default:
- break;
- }
- }
-
- /* For ragged X, manually copy over the valid results. */
- if ((i+7) >= xmax) {
- for (int xi=0; xi<8; xi++) {
- if ((i+xi) < xmax) {
- *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
- outptr0++;
- *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta);
- outptr1++;
- *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta);
- outptr2++;
- *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta);
- outptr3++;
- *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta);
- outptr4++;
- *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta);
- outptr5++;
- }
- }
- inptr += 48;
- } else {
- /* Optimized routine to copy an entire block */
- __asm __volatile (
- // Rows 0-1
- "VLD1.32 {d8-d11}, [%[outptr0]]\n"
- "VMUL.f32 q4, q4, %q[bv]\n"
- "VLD1.32 {d12-d15}, [%[outptr1]]\n"
- "VMUL.f32 q5, q5, %q[bv]\n"
- "VLD1.32 {d0-d3}, [%[inptr]]!\n"
- "VMUL.f32 q6, q6, %q[bv]\n"
- "VLD1.32 {d4-d7}, [%[inptr]]!\n"
- "VMUL.f32 q7, q7, %q[bv]\n"
-
- "VMLA.f32 q4, q0, %q[av]\n"
- ASM_PREFETCH("[%[inptr], #352]")
- "VMLA.f32 q5, q1, %q[av]\n"
- "VST1.32 {d8-d11}, [%[outptr0]]!\n"
- ASM_PREFETCH("[%[inptr], #416]")
- "VMLA.f32 q6, q2, %q[av]\n"
- ASM_PREFETCH("[%[inptr], #480]")
- "VMLA.f32 q7, q3, %q[av]\n"
- "VST1.32 {d12-d15}, [%[outptr1]]!\n"
-
- // Rows 2-3
- "VLD1.32 {d8-d11}, [%[outptr2]]\n"
- "VMUL.f32 q4, q4, %q[bv]\n"
- "VLD1.32 {d12-d15}, [%[outptr3]]\n"
- "VMUL.f32 q5, q5, %q[bv]\n"
- "VLD1.32 {d0-d3}, [%[inptr]]!\n"
- "VMUL.f32 q6, q6, %q[bv]\n"
- "VLD1.32 {d4-d7}, [%[inptr]]!\n"
- "VMUL.f32 q7, q7, %q[bv]\n"
-
- "VMLA.f32 q4, q0, %q[av]\n"
- ASM_PREFETCH("[%[outptr0], #96]")
- "VMLA.f32 q5, q1, %q[av]\n"
- "VST1.32 {d8-d11}, [%[outptr2]]!\n"
- ASM_PREFETCH("[%[outptr1], #96]")
- "VMLA.f32 q6, q2, %q[av]\n"
- ASM_PREFETCH("[%[outptr2], #96]")
- "VMLA.f32 q7, q3, %q[av]\n"
- "VST1.32 {d12-d15}, [%[outptr3]]!\n"
-
- // Rows 4-5
- "VLD1.32 {d8-d11}, [%[outptr4]]\n"
- "VMUL.f32 q4, q4, %q[bv]\n"
- "VLD1.32 {d12-d15}, [%[outptr5]]\n"
- "VMUL.f32 q5, q5, %q[bv]\n"
- "VLD1.32 {d0-d3}, [%[inptr]]!\n"
- "VMUL.f32 q6, q6, %q[bv]\n"
- "VLD1.32 {d4-d7}, [%[inptr]]!\n"
- "VMUL.f32 q7, q7, %q[bv]\n"
-
- "VMLA.f32 q4, q0, %q[av]\n"
- ASM_PREFETCH("[%[outptr3], #96]")
- "VMLA.f32 q5, q1, %q[av]\n"
- "VST1.32 {d8-d11}, [%[outptr4]]!\n"
- ASM_PREFETCH("[%[outptr4], #96]")
- "VMLA.f32 q6, q2, %q[av]\n"
- ASM_PREFETCH("[%[outptr5], #128]")
- "VMLA.f32 q7, q3, %q[av]\n"
- "VST1.32 {d12-d15}, [%[outptr5]]!\n"
- : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
- [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [inptr] "+r" (inptr)
- : [av] "w" (av), [bv] "w" (bv)
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
- );
- }
- }
- }
-}
-
-#endif // __arm__
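The dummy-buffer trick used in this deleted merge (and again in the 12x8 merge below) keeps the unrolled block copy free of per-row bounds checks: output row pointers at or beyond ymax are redirected into scratch storage that is later discarded. A standalone sketch of the idea, with an illustrative helper name:

// Illustrative helper: redirect out-of-range row pointers into a scratch
// buffer so the block writer never needs to test row validity.
inline void redirect_ragged_rows(float **rowptrs, int nrows, int y, int ymax,
                                 float *scratch) {
    for (int r = 0; r < nrows; r++) {
        if ((y + r) >= ymax) {
            rowptrs[r] = scratch;   // writes land here and are discarded
        }
    }
}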
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
deleted file mode 100644
index e8edddb4f4..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "../asmlib.hpp"
-
-template<>
-inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) {
- const float *inptr = in;
- prefetch_6x(inptr);
- prefetch_6x(inptr + 96);
-
- float32x4_t av = vdupq_n_f32(alpha);
- float32x4_t bv = vdupq_n_f32(beta);
-
- for (int y=y0; y<ymax; y+=8) {
- float *outptr0 = out + (y * ldout) + x0;
- float *outptr1 = outptr0 + ldout;
- float *outptr2 = outptr1 + ldout;
- float *outptr3 = outptr2 + ldout;
- float *outptr4 = outptr3 + ldout;
- float *outptr5 = outptr4 + ldout;
- float *outptr6 = outptr5 + ldout;
- float *outptr7 = outptr6 + ldout;
-
- prefetch_2x(outptr0);
- prefetch_2x(outptr1);
- prefetch_2x(outptr2);
- prefetch_2x(outptr3);
- prefetch_2x(outptr4);
- prefetch_2x(outptr5);
- prefetch_2x(outptr6);
- prefetch_2x(outptr7);
-
- for (int i=x0; i<xmax; i+=12) {
- float dummyres[12];
-
- /* Make sure we throw away results if Y isn't a multiple of 8.
- * We do this by pointing the result pointer at a dummy buffer
- * we later discard. */
- if ((y+7) >= ymax) {
- switch ((y + 7) - ymax) {
- case 6:
- outptr1 = dummyres;
- case 5:
- outptr2 = dummyres;
- case 4:
- outptr3 = dummyres;
- case 3:
- outptr4 = dummyres;
- case 2:
- outptr5 = dummyres;
- case 1:
- outptr6 = dummyres;
- case 0:
- outptr7 = dummyres;
- default:
- break;
- }
- }
-
- /* For ragged X, manually copy over the valid results. */
- if ((i+11) >= xmax) {
- for (int xi=0; xi<12; xi++) {
- if ((i+xi) < xmax) {
- *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
- outptr0++;
- *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
- outptr1++;
- *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
- outptr2++;
- *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
- outptr3++;
- *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
- outptr4++;
- *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
- outptr5++;
- *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
- outptr6++;
- *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
- outptr7++;
- }
- }
- inptr += 96;
- } else {
- /* Optimized routine to copy an entire block */
- __asm __volatile (
- // Rows 0-1
- "LDP q16, q17, [%[outptr0]]\n"
- "FMUL v16.4s, v16.4s, %[bv].4s\n"
- "LDR q18, [%[outptr0], #32]\n"
- "FMUL v17.4s, v17.4s, %[bv].4s\n"
- "LDP q19, q20, [%[outptr1]]\n"
- "FMUL v18.4s, v18.4s, %[bv].4s\n"
- "LDR q21, [%[outptr1], #32]\n"
- ASM_PREFETCH("[%[inptr], #768]")
- "FMUL v19.4s, v19.4s, %[bv].4s\n"
- "LDP q0, q1, [%[inptr]]\n"
- "FMUL v20.4s, v20.4s, %[bv].4s\n"
- "LDP q2, q3, [%[inptr], #32]\n"
- "FMUL v21.4s, v21.4s, %[bv].4s\n"
- "LDP q4, q5, [%[inptr], #64]\n"
- "FMLA v16.4s, v0.4s, %[av].4s\n"
- ASM_PREFETCH("[%[inptr], #832]")
- "FMLA v17.4s, v1.4s, %[av].4s\n"
- "STP q16, q17, [%[outptr0]], #32\n"
- "FMLA v18.4s, v2.4s, %[av].4s\n"
- "STR q18, [%[outptr0]], #16\n"
- "FMLA v19.4s, v3.4s, %[av].4s\n"
- ASM_PREFETCH("[%[inptr], #896]")
- "FMLA v20.4s, v4.4s, %[av].4s\n"
- "STP q19, q20, [%[outptr1]], #32\n"
- "FMLA v21.4s, v5.4s, %[av].4s\n"
- "STR q21, [%[outptr1]], #16\n"
-
- // Rows 2-3
- "LDP q16, q17, [%[outptr2]]\n"
- "FMUL v16.4s, v16.4s, %[bv].4s\n"
- "LDR q18, [%[outptr2], #32]\n"
- "FMUL v17.4s, v17.4s, %[bv].4s\n"
- "LDP q19, q20, [%[outptr3]]\n"
- "FMUL v18.4s, v18.4s, %[bv].4s\n"
- "LDR q21, [%[outptr3], #32]\n"
- ASM_PREFETCH("[%[inptr], #960]")
- "FMUL v19.4s, v19.4s, %[bv].4s\n"
- "LDP q0, q1, [%[inptr], #96]\n"
- "FMUL v20.4s, v20.4s, %[bv].4s\n"
- "LDP q2, q3, [%[inptr], #128]\n"
- "FMUL v21.4s, v21.4s, %[bv].4s\n"
- "LDP q4, q5, [%[inptr], #160]\n"
- "FMLA v16.4s, v0.4s, %[av].4s\n"
- ASM_PREFETCH("[%[inptr], #1024]")
- "FMLA v17.4s, v1.4s, %[av].4s\n"
- "STP q16, q17, [%[outptr2]], #32\n"
- "FMLA v18.4s, v2.4s, %[av].4s\n"
- "STR q18, [%[outptr2]], #16\n"
- "FMLA v19.4s, v3.4s, %[av].4s\n"
- ASM_PREFETCH("[%[inptr], #1088]")
- "FMLA v20.4s, v4.4s, %[av].4s\n"
- "STP q19, q20, [%[outptr3]], #32\n"
- "FMLA v21.4s, v5.4s, %[av].4s\n"
- "STR q21, [%[outptr3]], #16\n"
-
- // Rows 4-5
- ASM_PREFETCH("[%[outptr0], #80]")
- "LDP q16, q17, [%[outptr4]]\n"
- "FMUL v16.4s, v16.4s, %[bv].4s\n"
- "LDR q18, [%[outptr4], #32]\n"
- "FMUL v17.4s, v17.4s, %[bv].4s\n"
- "LDP q19, q20, [%[outptr5]]\n"
- "FMUL v18.4s, v18.4s, %[bv].4s\n"
- "LDR q21, [%[outptr5], #32]\n"
- ASM_PREFETCH("[%[outptr1], #80]")
- "FMUL v19.4s, v19.4s, %[bv].4s\n"
- "LDP q0, q1, [%[inptr], #192]\n"
- "FMUL v20.4s, v20.4s, %[bv].4s\n"
- "LDP q2, q3, [%[inptr], #224]\n"
- "FMUL v21.4s, v21.4s, %[bv].4s\n"
- "LDP q4, q5, [%[inptr], #256]\n"
- "FMLA v16.4s, v0.4s, %[av].4s\n"
- ASM_PREFETCH("[%[outptr2], #80]")
- "FMLA v17.4s, v1.4s, %[av].4s\n"
- "STP q16, q17, [%[outptr4]], #32\n"
- "FMLA v18.4s, v2.4s, %[av].4s\n"
- "STR q18, [%[outptr4]], #16\n"
- "FMLA v19.4s, v3.4s, %[av].4s\n"
- ASM_PREFETCH("[%[outptr3], #80]")
- "FMLA v20.4s, v4.4s, %[av].4s\n"
- "STP q19, q20, [%[outptr5]], #32\n"
- "FMLA v21.4s, v5.4s, %[av].4s\n"
- "STR q21, [%[outptr5]], #16\n"
-
- // Rows 6-7
- ASM_PREFETCH("[%[outptr4], #80]")
- "LDP q16, q17, [%[outptr6]]\n"
- "FMUL v16.4s, v16.4s, %[bv].4s\n"
- "LDR q18, [%[outptr6], #32]\n"
- "FMUL v17.4s, v17.4s, %[bv].4s\n"
- "LDP q19, q20, [%[outptr7]]\n"
- "FMUL v18.4s, v18.4s, %[bv].4s\n"
- "LDR q21, [%[outptr7], #32]\n"
- ASM_PREFETCH("[%[outptr5], #80]")
- "FMUL v19.4s, v19.4s, %[bv].4s\n"
- "LDP q0, q1, [%[inptr], #288]\n"
- "FMUL v20.4s, v20.4s, %[bv].4s\n"
- "LDP q2, q3, [%[inptr], #320]\n"
- "FMUL v21.4s, v21.4s, %[bv].4s\n"
- "LDP q4, q5, [%[inptr], #352]\n"
- "FMLA v16.4s, v0.4s, %[av].4s\n"
- ASM_PREFETCH("[%[outptr6], #128]")
- "FMLA v17.4s, v1.4s, %[av].4s\n"
- "STP q16, q17, [%[outptr6]], #32\n"
- "FMLA v18.4s, v2.4s, %[av].4s\n"
- "STR q18, [%[outptr6]], #16\n"
- "FMLA v19.4s, v3.4s, %[av].4s\n"
- ASM_PREFETCH("[%[outptr7], #128]")
- "FMLA v20.4s, v4.4s, %[av].4s\n"
- "STP q19, q20, [%[outptr7]], #32\n"
- "FMLA v21.4s, v5.4s, %[av].4s\n"
- "STR q21, [%[outptr7]], #16\n"
- "ADD %[inptr], %[inptr], #384\n"
- : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
- [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
- [inptr] "+r" (inptr)
- : [av] "w" (av), [bv] "w" (bv)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21"
- );
- }
- }
- }
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp b/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp
new file mode 100644
index 0000000000..b7cc3d773b
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <fstream>
+#include <iostream>
+#include <regex>
+#include <sstream>
+#include <thread>
+
+extern int l1_cache_size;
+extern int l2_cache_size;
+extern int force_cpu;
+
+#ifdef __ANDROID__
+inline unsigned long stoul( const std::string& str, std::size_t* pos = 0, int base = 10 )
+{
+    char *end;
+    const unsigned long ret = strtoul( str.c_str(), &end, base);
+    // Callers in this file pass nullptr for pos, so guard the write.
+    if (pos) {
+        *pos = end - str.c_str();
+    }
+    return ret;
+}
+inline int stoi( const std::string& str, std::size_t* pos = 0, int base = 10 )
+{
+    // Honour the requested base (the hex CPU fields parsed below rely on it).
+    char *end;
+    const int ret = static_cast<int>(strtol( str.c_str(), &end, base));
+    if (pos) {
+        *pos = end - str.c_str();
+    }
+    return ret;
+}
+#endif
+
+
+#ifndef BARE_METAL
+#include <sys/auxv.h>
+
+/* Get HWCAP bits from asm/hwcap.h */
+#include <asm/hwcap.h>
+#endif /* !BARE_METAL */
+
+/* Make sure the bits we care about are defined, just in case asm/hwcap.h is
+ * out of date (or for bare metal mode) */
+#ifndef HWCAP_ASIMDHP
+#define HWCAP_ASIMDHP (1 << 10)
+#endif
+
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1 << 11)
+#endif
+
+#ifndef HWCAP_ASIMDDP
+#define HWCAP_ASIMDDP (1 << 20)
+#endif
+
+#define CPUINFO_HACK
+
+//unsigned int get_cpu_impl();
+
+
+/* CPU models - we only need to detect CPUs we have
+ * microarchitecture-specific code for.
+ *
+ * Architecture features are detected via HWCAPs.
+ */
+enum class CPUModel {
+ GENERIC = 0x0001,
+ A53 = 0x0010,
+ A55r0 = 0x0011,
+ A55r1 = 0x0012,
+};
+
+class CPUInfo
+{
+private:
+ struct PerCPUData {
+ CPUModel model = CPUModel::GENERIC;
+ uint32_t midr = 0;
+ bool model_set = false;
+ };
+
+ std::vector<PerCPUData> _percpu={};
+
+ bool _cpuid = false;
+ bool _fp16 = false;
+ bool _dotprod = false;
+
+ unsigned int L1_cache_size = 32768;
+ unsigned int L2_cache_size = 262144;
+
+ /* Convert an MIDR register value to a CPUModel enum value. */
+ CPUModel midr_to_model(const unsigned int midr) const {
+ CPUModel model;
+
+ // Unpack variant and CPU ID
+ int variant = (midr >> 20) & 0xF;
+ int cpunum = (midr >> 4) & 0xFFF;
+
+ /* Only CPUs we have code paths for are detected. All other CPUs
+ * can be safely classed as "GENERIC"
+ */
+
+ switch(cpunum) {
+ case 0xd03:
+ model = CPUModel::A53;
+ break;
+
+ case 0xd05:
+ if (variant) {
+ model = CPUModel::A55r1;
+ } else {
+ model = CPUModel::A55r0;
+ }
+ break;
+
+ default:
+ model = CPUModel::GENERIC;
+ break;
+ }
+
+ return model;
+ }
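    // Worked example (illustrative): a Cortex-A53 r0p4 reports
    // MIDR_EL1 = 0x410FD034.  Unpacking as above gives
    //   variant = (0x410FD034 >> 20) & 0xF   -> 0x0
    //   cpunum  = (0x410FD034 >> 4)  & 0xFFF -> 0xD03
    // so midr_to_model(0x410FD034) returns CPUModel::A53.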
+
+    /* If the CPUID capability is present, MIDR information is provided in
+     * /sys. Use that to populate the CPU model table. */
+ void populate_models_cpuid() {
+ for (unsigned long int i=0; i<_percpu.size(); i++) {
+ std::stringstream str;
+ str << "/sys/devices/system/cpu/cpu" << i << "/regs/identification/midr_el1";
+ std::ifstream file;
+
+ file.open(str.str(), std::ios::in);
+
+ if (file.is_open()) {
+ std::string line;
+
+ if (bool(getline(file, line))) {
+ const unsigned long midr = stoul(line, nullptr, 16);
+
+ _percpu[i].midr = (midr & 0xffffffff);
+ _percpu[i].model = midr_to_model(_percpu[i].midr);
+ _percpu[i].model_set = true;
+ }
+ }
+ }
+ }
+
+ /* If "long-form" cpuinfo is present, parse that to populate models. */
+ void populate_models_cpuinfo() {
+ std::regex proc_regex("^processor.*(\\d+)$");
+ std::regex imp_regex("^CPU implementer.*0x(..)$");
+ std::regex var_regex("^CPU variant.*0x(.)$");
+ std::regex part_regex("^CPU part.*0x(...)$");
+ std::regex rev_regex("^CPU revision.*(\\d+)$");
+
+ std::ifstream file;
+ file.open("/proc/cpuinfo", std::ios::in);
+
+ if (file.is_open()) {
+ std::string line;
+ int midr=0;
+ int curcpu=-1;
+
+ while(bool(getline(file, line))) {
+ std::smatch match;
+
+ if (std::regex_match(line, match, proc_regex)) {
+ std::string id = match[1];
+ int newcpu=stoi(id, nullptr, 0);
+
+ if (curcpu >= 0 && midr==0) {
+ // Matched a new CPU ID without any description of the previous one - looks like old format.
+ return;
+ }
+
+ if (curcpu >= 0) {
+ _percpu[curcpu].midr = midr;
+ _percpu[curcpu].model = midr_to_model(midr);
+ _percpu[curcpu].model_set = true;
+
+ printf("CPU %d: %x\n",curcpu,midr);
+ }
+
+ midr=0;
+ curcpu=newcpu;
+
+ continue;
+ }
+
+ if (std::regex_match(line, match, imp_regex)) {
+ int impv = stoi(match[1], nullptr, 16);
+ midr |= (impv << 24);
+ continue;
+ }
+
+ if (std::regex_match(line, match, var_regex)) {
+ int varv = stoi(match[1], nullptr, 16);
+ midr |= (varv << 16);
+ continue;
+ }
+
+ if (std::regex_match(line, match, part_regex)) {
+ int partv = stoi(match[1], nullptr, 16);
+ midr |= (partv << 4);
+ continue;
+ }
+
+ if (std::regex_match(line, match, rev_regex)) {
+ int regv = stoi(match[1], nullptr, 10);
+ midr |= (regv);
+ midr |= (0xf << 16);
+ continue;
+ }
+ }
+
+ if (curcpu >= 0) {
+ _percpu[curcpu].midr = midr;
+ _percpu[curcpu].model = midr_to_model(midr);
+ _percpu[curcpu].model_set = true;
+
+ printf("CPU %d: %x\n",curcpu,midr);
+ }
+ }
+ }
+
+ /* Identify the maximum valid CPUID in the system. This reads
+ * /sys/devices/system/cpu/present to get the information. */
+ int get_max_cpus() {
+ int max_cpus = 1;
+
+#ifndef BARE_METAL
+ std::ifstream CPUspresent;
+ CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
+ bool success = false;
+
+ if (CPUspresent.is_open()) {
+ std::string line;
+
+ if (bool(getline(CPUspresent, line))) {
+ /* The content of this file is a list of ranges or single values, e.g.
+ * 0-5, or 1-3,5,7 or similar. As we are interested in the
+ * max valid ID, we just need to find the last valid
+ * delimiter ('-' or ',') and parse the integer immediately after that.
+ */
+ auto startfrom=line.begin();
+
+ for (auto i=line.begin(); i<line.end(); ++i) {
+ if (*i=='-' || *i==',') {
+ startfrom=i+1;
+ }
+ }
+
+ line.erase(line.begin(), startfrom);
+
+ max_cpus = stoi(line, nullptr, 0) + 1;
+ success = true;
+ }
+ }
+
+ // Return std::thread::hardware_concurrency() as a fallback.
+ if (!success) {
+ max_cpus = std::thread::hardware_concurrency();
+ }
+#endif // !BARE_METAL
+
+ return max_cpus;
+ }
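    // Worked example (illustrative): if /sys/devices/system/cpu/present
    // reads "0-3,5-7", the last '-' or ',' is the '-' before the final "7",
    // so stoi() parses "7" and get_max_cpus() returns 8.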
+
+public:
+ CPUInfo() {
+#ifndef BARE_METAL
+ unsigned long hwcaps = getauxval(AT_HWCAP);
+
+ if (hwcaps & HWCAP_CPUID) {
+ _cpuid = true;
+ }
+
+ if (hwcaps & HWCAP_ASIMDHP) {
+ _fp16 = true;
+ }
+
+ if (hwcaps & HWCAP_ASIMDDP) {
+ _dotprod = true;
+ }
+
+#ifdef __aarch64__
+ /* Pre-4.15 kernels don't have the ASIMDDP bit.
+ *
+ * Although the CPUID bit allows us to read the feature register
+ * directly, the kernel quite sensibly masks this to only show
+ * features known by it to be safe to show to userspace. As a
+ * result, pre-4.15 kernels won't show the relevant bit in the
+ * feature registers either.
+ *
+ * So for now, use a whitelist of CPUs known to support the feature.
+ */
+ if (!_dotprod && _cpuid) {
+ /* List of CPUs with dot product support: A55r1 A75r1 A75r2 */
+ const unsigned int dotprod_whitelist_masks[] = { 0xfff0fff0, 0xfff0fff0, 0xfff0fff0, 0 };
+ const unsigned int dotprod_whitelist_values[] = { 0x4110d050, 0x4110d0a0, 0x4120d0a0, 0 };
+
+ unsigned long cpuid;
+
+ __asm __volatile (
+ "mrs %0, midr_el1\n"
+ : "=r" (cpuid)
+ :
+ :
+ );
+
+ for (int i=0;dotprod_whitelist_values[i];i++) {
+ if ((cpuid & dotprod_whitelist_masks[i]) == dotprod_whitelist_values[i]) {
+ _dotprod = true;
+ break;
+ }
+ }
+ }
+#endif
+ _percpu.resize(get_max_cpus());
+#endif
+ if (_cpuid) {
+ populate_models_cpuid();
+ } else {
+ populate_models_cpuinfo();
+ }
+ }
+
+ void set_fp16(const bool fp16) {
+ _fp16 = fp16;
+ }
+
+ void set_dotprod(const bool dotprod) {
+ _dotprod = dotprod;
+ }
+
+ void set_cpu_model(unsigned long cpuid, CPUModel model) {
+ if (_percpu.size() > cpuid) {
+ _percpu[cpuid].model = model;
+ _percpu[cpuid].model_set = true;
+ }
+ }
+
+ bool has_fp16() const {
+ return _fp16;
+ }
+
+ bool has_dotprod() const {
+ return _dotprod;
+ }
+
+ CPUModel get_cpu_model(unsigned long cpuid) const {
+ if (cpuid < _percpu.size()) {
+ return _percpu[cpuid].model;
+ }
+
+ return CPUModel::GENERIC;
+ }
+
+ CPUModel get_cpu_model() const {
+#ifdef BARE_METAL
+ return get_cpu_model(0);
+#else
+ return get_cpu_model(sched_getcpu());
+#endif
+ }
+
+ unsigned int get_L1_cache_size() const {
+ return L1_cache_size;
+ }
+
+ void set_L1_cache_size(unsigned int size) {
+ L1_cache_size = size;
+ }
+
+ unsigned int get_L2_cache_size() const {
+ return L2_cache_size;
+ }
+
+ void set_L2_cache_size(unsigned int size) {
+ L2_cache_size = size;
+ }
+};
+
+CPUInfo *get_CPUInfo();
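
The dot-product whitelist in the constructor above is a masked compare against the MIDR value read from midr_el1: the mask clears the architecture and revision fields so any revision of the listed part/variant matches. As a rough standalone illustration (hypothetical helper name, not part of this patch; the mask/value pair is the first whitelist entry above):

    #include <cstdint>
    #include <cstdio>

    // Masked MIDR compare, as used by the dot-product whitelist above.
    static bool midr_matches(uint32_t midr, uint32_t mask, uint32_t value) {
        return (midr & mask) == value;
    }

    int main() {
        // 0x411fd051 is a Cortex-A55 r1p1 MIDR; the whitelist entry
        // (mask 0xfff0fff0, value 0x4110d050) accepts any A55 r1 revision.
        printf("%d\n", midr_matches(0x411fd051u, 0xfff0fff0u, 0x4110d050u)); // prints 1
        return 0;
    }
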
diff --git a/arm_compute/core/NEON/kernels/assembly/profiler.hpp b/arm_compute/core/NEON/kernels/assembly/profiler.hpp
deleted file mode 100644
index f7a1d1c70c..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/profiler.hpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef CYCLE_PROFILING
-
-#include "../perf.h"
-
-class profiler {
-private:
- static const int maxevents = 10000;
- unsigned long times[maxevents];
- unsigned long units[maxevents];
- int events[maxevents];
- int currentevent;
- int countfd;
-
-public:
- profiler() {
- currentevent=0;
- countfd=open_cycle_counter();
- }
-
- ~profiler() {
- close(countfd);
- int tots[5];
- unsigned long counts[5];
- unsigned long tunits[5];
- const char * descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
-
- for (int i=1; i<5; i++) {
- tots[i] = 0;
- counts[i] = 0;
- tunits[i] = 0;
- }
-
- printf("Profiled events:\n");
- for (int i=0; i<currentevent; i++) {
- tots[events[i]]++;
- counts[events[i]] += times[i];
- tunits[events[i]] += units[i];
- }
-
- printf("%20s %9s %9s %9s %12s %9s\n", "", "Events", "Total", "Average", "Bytes/MACs", "Per cycle");
- for (int i=1; i<5; i++) {
- printf("%20s: %9d %9ld %9ld %12lu %9.2f\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i],tunits[i],(float)tunits[i]/counts[i]);
- }
- }
-
- template <typename T>
- void operator() (int i, unsigned long u, T func) {
- if (currentevent==maxevents) {
- func();
- } else {
- events[currentevent] = i;
- units[currentevent] = u;
- start_counter(countfd);
- func();
- long long cycs = stop_counter(countfd);
- times[currentevent++] = cycs;
- }
- }
-};
-
-#else
-
-class profiler {
-public:
- template <typename T>
- void operator() (int i, unsigned long u, T func) {
- func();
- }
-};
-
-#endif
-
-#define PROFILE_PREPA 1
-#define PROFILE_PREPB 2
-#define PROFILE_KERNEL 3
-#define PROFILE_MERGE 4
-
-
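
The profiler removed above wrapped a callable and, when CYCLE_PROFILING was defined, attributed the measured cycles and work units to one of the PROFILE_* event ids. A self-contained sketch of that call shape, with std::chrono standing in for the perf cycle counter (names and numbers here are illustrative, not the library's API):

    #include <chrono>
    #include <cstdio>

    // Illustrative stand-in with the same operator()(event, units, func) shape.
    struct demo_profiler {
        template <typename T>
        void operator()(int event, unsigned long units, T func) {
            auto t0 = std::chrono::steady_clock::now();
            func(); // run the measured section
            auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                          std::chrono::steady_clock::now() - t0).count();
            printf("event %d: %lu units in %lld ns\n", event, units, (long long)ns);
        }
    };

    int main() {
        demo_profiler prof;
        // Same shape as prof(PROFILE_KERNEL, n_macs, [&] { kernel(...); }) in the removed code.
        prof(3 /* PROFILE_KERNEL */, 1024, [] {
            volatile long acc = 0;
            for (int i = 0; i < 100000; i++) acc = acc + i;
        });
        return 0;
    }
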
diff --git a/arm_compute/core/NEON/kernels/assembly/transform.hpp b/arm_compute/core/NEON/kernels/assembly/transform.hpp
deleted file mode 100644
index 717506f54c..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transform.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-/*
- * Generic transform.
- *
- * Assuming the untransposed case, this works by first reading <BlockBy>
- * consecutive values from the first input row. This same number of values
- * are then read from the next <IntBy-1> rows. Now return to the first
- * input row and repeat.
- *
- * Need to cope with the work requested in either dimension not actually
- * being a multiple of the block sizes.
- */
-template <unsigned IntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize>
-struct TransformImpl {
- template <typename TOut, typename TIn>
- static void Transform(TOut* out, const TIn* const in, const int stride,
- const int y0, const int ymax, const int x0, const int xmax) {
- const int n_whole_y_blocks = (ymax - y0) / IntBy;
- const int y_remainders = (ymax - y0) % IntBy;
- const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
-
- const int n_whole_x_blocks = (xmax - x0) / BlockBy;
- const int x_remainders = (xmax - x0) % BlockBy;
- const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
-
- // "Y" loop: advance down the rows of the source IntBy rows at a time.
-        // Set up fill_rows to show the number of rows to copy from, and blank_rows
- // for the number of blank rows to add.
- for (int y_block=0 ; y_block < n_y_blocks; y_block++) {
- int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
- int blank_rows = IntBy - fill_rows;
-
- int y_base = y0 + (y_block * IntBy);
-
- // So now advance along this block of rows, BlockBy columns at a time.
- for (int x_block=0 ; x_block < n_x_blocks; x_block++) {
- int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
- int blank_cols = BlockBy - fill_cols;
-
- int x_base = x0 + (x_block * BlockBy);
-
- for (int row = 0; row < fill_rows; row++) {
- for (int col = 0; col < fill_cols; col++) {
- // In-range copy. If it's transposed, we reverse the sense of rows and columns here.
- if (Transposed) {
- *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
- } else {
- *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
- }
- }
- // "col" tail - row is in range but column is out of range.
- for (int col=0; col < blank_cols; col++) {
- *out++ = static_cast<TOut>(0);
- }
- }
- // "row" tail - row is out of range so fill with zeros always.
- for (int row = 0; row < blank_rows; row++) {
- for (int col=0; col < (fill_cols + blank_cols); col++) {
- *out++ = static_cast<TOut>(0);
- }
- }
- }
- }
- }
-
- template <typename T>
- static inline void Transform(T* out, const T* const in, const int stride,
- const int k0, const int kmax, const int x0, const int xmax) {
- Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
- }
-};
-
-/*****************************************************************************/
-template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, typename TOut, typename TIn>
-void Transform(
- TOut* out, const TIn* const in, const int stride,
- const int k0, const int kmax, const int x0, const int xmax
-) {
- // Redirect to a specialised implementation predicated on argument size.
- TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn)>::Transform(
- out, in, stride, k0, kmax, x0, xmax
- );
-}
-/*****************************************************************************/
-
-#include "transforms/list.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp
deleted file mode 100644
index 4a1b5d2bf2..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __arm__
-
-#include <arm_neon.h>
-#include "asmlib.hpp"
-
-template<>
-template<typename T>
-inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint32_t *outptr = reinterpret_cast<uint32_t *>(out);
- const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
-
- uint32_t zerobuff[8];
-
- for (int y=y0; y<ymax; y+=6) {
- const uint32_t *inptr0 = inptr + y * ldin + k0;
- const uint32_t *inptr1 = inptr0 + ldin;
- const uint32_t *inptr2 = inptr1 + ldin;
- const uint32_t *inptr3 = inptr2 + ldin;
- const uint32_t *inptr4 = inptr3 + ldin;
- const uint32_t *inptr5 = inptr4 + ldin;
-
- //prefetch_2x(inptr0);
- //prefetch_2x(inptr1);
- //prefetch_2x(inptr2);
- //prefetch_2x(inptr3);
- //prefetch_2x(inptr4);
- //prefetch_2x(inptr5);
-
- int x=(kmax-k0);
- for (;x>7;x-=8) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- if ((y + 5) >= ymax) {
- switch ((y + 5) - ymax) {
- /* Everything falls through in here */
- case 4:
- inptr1 = zerobuff;
- case 3:
- inptr2 = zerobuff;
- case 2:
- inptr3 = zerobuff;
- case 1:
- inptr4 = zerobuff;
- case 0:
- inptr5 = zerobuff;
- default:
- break;
- }
- }
-
-
- __asm __volatile (
-            // Load up 8 elements (2 vectors) from each of 6 sources.
- "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3
- "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3
- "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3
- "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3
- "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
- "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3
- "VLD1.32 {d16-d19}, [%[inptr4]]!\n"
- "VLD1.32 {d20-d23}, [%[inptr5]]!\n"
- "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
- ASM_PREFETCH("[%[inptr0], #128]")
- "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
-
- // Store first elements
- "VST1.32 {d0-d1}, [%[outptr]]!\n"
- "VST1.32 {d16}, [%[outptr]]!\n"
-
- "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
-
- // Store second elements
- "VST1.32 {d4-d5}, [%[outptr]]!\n"
- "VZIP.32 q1, q5\n"
- ASM_PREFETCH("[%[inptr1], #128]")
- "VST1.32 {d17}, [%[outptr]]!\n"
- "VZIP.32 q3, q7\n"
-
- // Store third elements
- "VZIP.32 q9, q11\n"
- "VST1.32 {d8-d9}, [%[outptr]]!\n"
- "VZIP.32 q1, q3\n"
- ASM_PREFETCH("[%[inptr2], #128]")
- "VST1.32 {d20}, [%[outptr]]!\n"
-
- // Store fourth elements
- "VZIP.32 q5, q7\n"
- "VST1.32 {d12-d13}, [%[outptr]]!\n"
- ASM_PREFETCH("[%[inptr3], #128]")
- "VST1.32 {d21}, [%[outptr]]!\n"
-
- // Fifth
- "VST1.32 {d2-d3}, [%[outptr]]!\n"
- ASM_PREFETCH("[%[inptr4], #128]")
- "VST1.32 {d18}, [%[outptr]]!\n"
-
- // Sixth
- "VST1.32 {d6-d7}, [%[outptr]]!\n"
- ASM_PREFETCH("[%[inptr5], #128]")
- "VST1.32 {d19}, [%[outptr]]!\n"
-
- // Seventh
- "VST1.32 {d10-d11}, [%[outptr]]!\n"
- "VST1.32 {d22}, [%[outptr]]!\n"
-
-            // Eighth
- "VST1.32 {d14-d15}, [%[outptr]]!\n"
- "VST1.32 {d23}, [%[outptr]]!\n"
-
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr)
- :
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12"
- );
- }
-
- for (;x>0;x--) {
- *outptr++ = *inptr0++;
- *outptr++ = *inptr1++;
- *outptr++ = *inptr2++;
- *outptr++ = *inptr3++;
- *outptr++ = *inptr4++;
- *outptr++ = *inptr5++;
- }
- }
-}
-
-#endif // __arm__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp
deleted file mode 100644
index a7e17fa074..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __arm__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 8x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<8, 1, true, 4, 4>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a 16x uint16_t specialisation
- TransformImpl<16, 1, true, 2, 2>::Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride*2, x0*2, xmax*2, k0, kmax
- );
-}
-
-// Generic 16x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<16, 1, true, 2, 2>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a uint16_t specialisation
- Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride, x0, xmax, k0, kmax
- );
-}
-
-// Specialised 16 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
- __asm volatile (
- "VLD1.32 {d0-d3}, [%[in0]]!\n"
- "VST1.32 {d0-d3}, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- : [in0] "+r" (in0),
- [out] "+r" (out)
- :
- : "q0", "q1", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
- __asm volatile (
- "VLD1.32 {d0-d3}, [%[in0]]!\n"
- "VST1.32 {d0-d3}, [%[out]]!\n"
- ASM_PREFETCH("[%[in0], #192]")
- "VLD1.32 {d0-d3}, [%[in1]]!\n"
- "VST1.32 {d0-d3}, [%[out]]\n"
- ASM_PREFETCH("[%[in1], #192]")
- "SUB %[out], %[out], #32\n"
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [out] "+r" (out)
- :
- : "q0", "q1", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
- __asm __volatile (
- "VLD1.32 {d0-d3}, [%[in0]]!\n"
- "VST1.32 {d0-d3}, [%[out]]!\n"
- ASM_PREFETCH("[%[in0], #192]")
- "VLD1.32 {d0-d3}, [%[in1]]!\n"
- "VST1.32 {d0-d3}, [%[out]]!\n"
- ASM_PREFETCH("[%[in1], #192]")
- "VLD1.32 {d0-d3}, [%[in2]]!\n"
- "VST1.32 {d0-d3}, [%[out]]!\n"
- ASM_PREFETCH("[%[in2], #192]")
- "VLD1.32 {d0-d3}, [%[in3]]!\n"
- "VST1.32 {d0-d3}, [%[out]]\n"
- ASM_PREFETCH("[%[in3], #192]")
- "SUB %[out], %[out], #96\n"
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [in2] "+r" (in2),
- [in3] "+r" (in3),
- [out] "+r" (out)
- :
- : "q0", "q1", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<16, 1, true, 2, 2>::Transform(
- uint16_t* out, const uint16_t* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __arm__
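
The redirection above treats an 8-wide transpose of 32-bit data as a 16-wide transpose of 16-bit data: each 32-bit element is simply two adjacent 16-bit halves, so the stride and the x range double while the k range is unchanged. A trivial self-contained illustration of that reinterpretation:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        const float row[4] = { 1.0f, 2.0f, 3.0f, 4.0f }; // 4 x 32-bit values
        uint16_t halves[8];                              // the same bytes as 8 x 16-bit values
        memcpy(halves, row, sizeof(row));
        printf("32-bit elements: %zu, 16-bit elements: %zu\n",
               sizeof(row) / sizeof(float), sizeof(halves) / sizeof(uint16_t));
        return 0;
    }
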
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp
deleted file mode 100644
index ac84567b54..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-#include "asmlib.hpp"
-
-template<>
-template<typename T>
-inline void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint8_t *outptr = (uint8_t *)out;
- const uint8_t *inptr = (uint8_t *)in;
-
- uint8_t zerobuff[16];
-
- for (int y=y0; y<ymax; y+=4) {
- const uint8_t *inptr0 = inptr + y * ldin + k0;
- const uint8_t *inptr1 = inptr0 + ldin;
- const uint8_t *inptr2 = inptr1 + ldin;
- const uint8_t *inptr3 = inptr2 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
-
- int x=(kmax-k0);
- for (;x>15;x-=16) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- if ((y + 3) >= ymax) {
- switch ((y + 3) - ymax) {
- /* Everything falls through in here */
- case 2:
- inptr1 = zerobuff;
- case 1:
- inptr2 = zerobuff;
- case 0:
- inptr3 = zerobuff;
- default:
- break;
- }
- }
-
- __asm __volatile (
- "LDR q0, [%[inptr0]], #16\n"
- ASM_PREFETCH("[%[inptr0], #176]")
- "LDR q1, [%[inptr1]], #16\n"
- ASM_PREFETCH("[%[inptr1], #176]")
- "STP q0, q1, [%[outptr]], #32\n"
- "LDR q0, [%[inptr2]], #16\n"
- ASM_PREFETCH("[%[inptr2], #176]")
- "LDR q1, [%[inptr3]], #16\n"
- ASM_PREFETCH("[%[inptr3], #176]")
- "STP q0, q1, [%[outptr]], #32\n"
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [outptr] "+r" (outptr)
- :
- : "v0", "v1"
- );
- }
-
- if (x>0) {
- /* Need to duplicate this here, in case we didn't run the main loop. */
- if ((y + 3) >= ymax) {
- switch ((y + 3) - ymax) {
- /* Everything falls through in here */
- case 2:
- inptr1 = zerobuff;
- case 1:
- inptr2 = zerobuff;
- case 0:
- inptr3 = zerobuff;
- default:
- break;
- }
- }
-
- /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */
- auto f = [&outptr, x](const uint8_t *&p) {
- for (int i=0; i<16; i++) {
- if (i < x) {
- *outptr++ = *p++;
- } else {
- *outptr++ = 0;
- }
- }
- };
-
- f(inptr0);
- f(inptr1);
- f(inptr2);
- f(inptr3);
- }
- }
-}
-
-#endif // __aarch64__
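
The ragged-row handling in the kernel above (and in the other interleaves) substitutes a shared zero buffer for out-of-range row pointers via a deliberately falling-through switch, so the unrolled copy never branches per element. A compact standalone sketch of that pointer substitution (the data here is made up):

    #include <cstdio>

    int main() {
        static const float zerobuff[8] = {};
        const float rowA[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
        const float rowB[8] = { 2, 2, 2, 2, 2, 2, 2, 2 };
        const float *inptr0 = rowA, *inptr1 = rowB, *inptr2 = rowB, *inptr3 = rowB;

        int y = 4, ymax = 6; // only rows 4 and 5 are valid, so 2 of the 4 pointers
        if ((y + 3) >= ymax) {
            switch ((y + 3) - ymax) { // everything falls through, as above
                case 2: inptr1 = zerobuff; // fall through
                case 1: inptr2 = zerobuff; // fall through
                case 0: inptr3 = zerobuff; // fall through
                default: break;
            }
        }
        printf("%g %g %g %g\n", inptr0[0], inptr1[0], inptr2[0], inptr3[0]); // 1 2 0 0
        return 0;
    }
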
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp
deleted file mode 100644
index bdc05473b4..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-#include "asmlib.hpp"
-
-template<>
-template<typename T>
-void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint16_t *outptr = (uint16_t *)out;
- const uint16_t *inptr = (const uint16_t *)in;
-
- uint16_t zerobuff[24];
-
- for (int y=y0; y<ymax; y+=8) {
- const uint16_t *inptr0 = inptr + y * ldin + k0;
- const uint16_t *inptr1 = inptr0 + ldin;
- const uint16_t *inptr2 = inptr1 + ldin;
- const uint16_t *inptr3 = inptr2 + ldin;
- const uint16_t *inptr4 = inptr3 + ldin;
- const uint16_t *inptr5 = inptr4 + ldin;
- const uint16_t *inptr6 = inptr5 + ldin;
- const uint16_t *inptr7 = inptr6 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
- prefetch_2x(inptr4);
- prefetch_2x(inptr5);
- prefetch_2x(inptr6);
- prefetch_2x(inptr7);
-
- int x=(kmax-k0);
- for (;x>7;x-=8) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- if ((y + 7) >= ymax) {
- switch ((y + 7) - ymax) {
- /* Everything falls through in here */
- case 6:
- inptr1 = zerobuff;
- case 5:
- inptr2 = zerobuff;
- case 4:
- inptr3 = zerobuff;
- case 3:
- inptr4 = zerobuff;
- case 2:
- inptr5 = zerobuff;
- case 1:
- inptr6 = zerobuff;
- case 0:
- inptr7 = zerobuff;
- }
- }
-
- int skippf = (x & 31);
- __asm __volatile (
- // Load up 8 elements (1 vector) from each of 8 sources.
- "CBNZ %w[skippf], 1f\n"
- ASM_PREFETCH("[%[inptr0], #128]")
- ASM_PREFETCH("[%[inptr1], #128]")
- ASM_PREFETCH("[%[inptr2], #128]")
- ASM_PREFETCH("[%[inptr3], #128]")
- "1:\n"
-
- "LDR q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
-            "LDR q4, [%[inptr4]], #16\n" // q4=E0E1E2E3E4E5E6E7
-            "LDR q2, [%[inptr2]], #16\n" // q2=C0C1C2C3...
- "LDR q6, [%[inptr6]], #16\n"
- "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3
- "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
- "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3
- "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
- "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7
- "LDR q5, [%[inptr5]], #16\n"
- "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
- "LDR q7, [%[inptr7]], #16\n"
-            "ZIP1 v10.8h, v1.8h, v5.8h\n" // q10=B0F0B1F1B2F2B3F3
- "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
-            "ZIP1 v11.8h, v3.8h, v7.8h\n" // q11=D0H0D1H1D2H2D3H3
- "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
-
-            "ZIP1 v12.8h, v8.8h, v9.8h\n" // q12=A0C0E0G0A1C1E1G1
- "ZIP2 v20.8h, v8.8h, v9.8h\n"
-            "ZIP1 v13.8h, v10.8h, v11.8h\n" // q13=B0D0F0H0B1D1F1H1
- "ZIP2 v21.8h, v10.8h, v11.8h\n"
-
- "CBNZ %w[skippf], 2f\n"
- ASM_PREFETCH("[%[inptr4], #112]")
- ASM_PREFETCH("[%[inptr5], #112]")
- ASM_PREFETCH("[%[inptr6], #112]")
- ASM_PREFETCH("[%[inptr7], #112]")
- "2:\n"
-
- "ZIP1 v22.8h, v16.8h, v17.8h\n"
- "ZIP2 v30.8h, v16.8h, v17.8h\n"
- "ZIP1 v23.8h, v18.8h, v19.8h\n"
- "ZIP2 v31.8h, v18.8h, v19.8h\n"
-
-            "ZIP1 v14.8h, v12.8h, v13.8h\n" // q14=A0B0C0D0E0F0G0H0
-            "ZIP2 v15.8h, v12.8h, v13.8h\n" // q15=A1B1C1D1E1F1G1H1
- "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements
-
- "ZIP1 v0.8h, v20.8h, v21.8h\n"
- "ZIP2 v1.8h, v20.8h, v21.8h\n"
- "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements
-
- "ZIP1 v2.8h, v22.8h, v23.8h\n"
- "ZIP2 v3.8h, v22.8h, v23.8h\n"
- "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements
-
- "ZIP1 v4.8h, v30.8h, v31.8h\n"
- "ZIP2 v5.8h, v30.8h, v31.8h\n"
- "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
- : [skippf] "r" (skippf)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
- "v25", "v26", "v27", "v28", "v29", "v30", "v31"
- );
- }
-
- for (;x>0;x--) {
- *outptr++ = *inptr0++;
- *outptr++ = *inptr1++;
- *outptr++ = *inptr2++;
- *outptr++ = *inptr3++;
- *outptr++ = *inptr4++;
- *outptr++ = *inptr5++;
- *outptr++ = *inptr6++;
- *outptr++ = *inptr7++;
- }
- }
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
deleted file mode 100644
index bd5125afab..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-#include "asmlib.hpp"
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint32_t *outptr = (uint32_t *)out;
- const uint32_t *inptr = (uint32_t *)in;
-
- uint32_t zerobuff[8];
-
- for (int y=y0; y<ymax; y+=8) {
- const uint32_t *inptr0 = inptr + y * ldin + k0;
- const uint32_t *inptr1 = inptr0 + ldin;
- const uint32_t *inptr2 = inptr1 + ldin;
- const uint32_t *inptr3 = inptr2 + ldin;
- const uint32_t *inptr4 = inptr3 + ldin;
- const uint32_t *inptr5 = inptr4 + ldin;
- const uint32_t *inptr6 = inptr5 + ldin;
- const uint32_t *inptr7 = inptr6 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
- prefetch_2x(inptr4);
- prefetch_2x(inptr5);
- prefetch_2x(inptr6);
- prefetch_2x(inptr7);
-
- int x=(kmax-k0);
- for (;x>7;x-=8) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- if ((y + 7) >= ymax) {
- switch ((y + 7) - ymax) {
- /* Everything falls through in here */
- case 6:
- inptr1 = zerobuff;
- case 5:
- inptr2 = zerobuff;
- case 4:
- inptr3 = zerobuff;
- case 3:
- inptr4 = zerobuff;
- case 2:
- inptr5 = zerobuff;
- case 1:
- inptr6 = zerobuff;
- case 0:
- inptr7 = zerobuff;
- default:
- break;
- }
- }
-
- __asm __volatile (
- // Load up 8 elements (2 vectors) from each of 8 sources.
- "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
- "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
- "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
- "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
- ASM_PREFETCH("[%[inptr0], #128]")
- "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
- "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
- "LDP q8, q9, [%[inptr4]], #32\n"
- "LDP q10, q11, [%[inptr5]], #32\n"
- "LDP q12, q13, [%[inptr6]], #32\n"
- "ZIP1 v18.4s, v8.4s, v12.4s\n"
- ASM_PREFETCH("[%[inptr1], #128]")
- "LDP q14, q15, [%[inptr7]], #32\n"
- "ZIP1 v19.4s, v10.4s, v14.4s\n"
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
- ASM_PREFETCH("[%[inptr2], #128]")
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP2 v16.4s, v0.4s, v4.4s\n"
- ASM_PREFETCH("[%[inptr3], #128]")
- "ZIP2 v17.4s, v2.4s, v6.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
-
- "ZIP2 v18.4s, v8.4s, v12.4s\n"
- "ZIP2 v19.4s, v10.4s, v14.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- ASM_PREFETCH("[%[inptr4], #128]")
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP1 v16.4s, v1.4s, v5.4s\n"
- ASM_PREFETCH("[%[inptr5], #128]")
- "ZIP1 v17.4s, v3.4s, v7.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Third element
-
- "ZIP1 v18.4s, v9.4s, v13.4s\n"
- "ZIP1 v19.4s, v11.4s, v15.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Fourth element
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- ASM_PREFETCH("[%[inptr6], #128]")
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP2 v16.4s, v1.4s, v5.4s\n"
- "ZIP2 v17.4s, v3.4s, v7.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Fifth element
-
- "ZIP2 v18.4s, v9.4s, v13.4s\n"
- ASM_PREFETCH("[%[inptr7], #128]")
- "ZIP2 v19.4s, v11.4s, v15.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Sixth element
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Seventh element
-
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Eighth element
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
- );
- }
-
- for (;x>0;x--) {
- *outptr++ = *inptr0++;
- *outptr++ = *inptr1++;
- *outptr++ = *inptr2++;
- *outptr++ = *inptr3++;
- *outptr++ = *inptr4++;
- *outptr++ = *inptr5++;
- *outptr++ = *inptr6++;
- *outptr++ = *inptr7++;
- }
- }
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp
deleted file mode 100644
index 3c9e05223d..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined( __aarch64__) && defined( __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-
-#include <arm_neon.h>
-#include "asmlib.hpp"
-
-template<>
-template<>
-inline void TransformImpl<8, 1, false, 4, 2>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax) {
- float *outptr = out;
- const __fp16 *inptr = in;
-
- __fp16 zerobuff[8];
-
- for (int y=y0; y<ymax; y+=8) {
- const __fp16 *inptr0 = inptr + y * ldin + k0;
- const __fp16 *inptr1 = inptr0 + ldin;
- const __fp16 *inptr2 = inptr1 + ldin;
- const __fp16 *inptr3 = inptr2 + ldin;
- const __fp16 *inptr4 = inptr3 + ldin;
- const __fp16 *inptr5 = inptr4 + ldin;
- const __fp16 *inptr6 = inptr5 + ldin;
- const __fp16 *inptr7 = inptr6 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
- prefetch_2x(inptr4);
- prefetch_2x(inptr5);
- prefetch_2x(inptr6);
- prefetch_2x(inptr7);
-
- int x=(kmax-k0);
- for (;x>7;x-=8) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- if ((y + 7) >= ymax) {
- switch ((y + 7) - ymax) {
- /* Everything falls through in here */
- case 6:
- inptr1 = zerobuff;
- case 5:
- inptr2 = zerobuff;
- case 4:
- inptr3 = zerobuff;
- case 3:
- inptr4 = zerobuff;
- case 2:
- inptr5 = zerobuff;
- case 1:
- inptr6 = zerobuff;
- case 0:
- inptr7 = zerobuff;
- default:
- break;
- }
- }
-
- __asm __volatile (
- // Load up 8 elements (2 vectors) from each of 8 sources.
- "LDR q0, [%[inptr0]], #16\n"
- "LDR q2, [%[inptr1]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
- "FCVTL2 v3.4s, v2.8h\n"
- "FCVTL v2.4s, v2.4h\n"
- "FCVTL2 v5.4s, v4.8h\n"
- "FCVTL v4.4s, v4.4h\n"
- "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
- ASM_PREFETCH("[%[inptr0], #128]")
- "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
- "FCVTL2 v7.4s, v6.8h\n"
- "FCVTL v6.4s, v6.4h\n"
- "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
- "LDR q8, [%[inptr4]], #16\n"
- "LDR q10, [%[inptr5]], #16\n"
- "FCVTL2 v9.4s, v8.8h\n"
- "FCVTL v8.4s, v8.4h\n"
- ASM_PREFETCH("[%[inptr1], #128]")
- "LDR q12, [%[inptr6]], #16\n"
- "FCVTL2 v11.4s, v10.8h\n"
- "FCVTL v10.4s, v10.4h\n"
- "FCVTL2 v13.4s, v12.8h\n"
- "FCVTL v12.4s, v12.4h\n"
- "ZIP1 v18.4s, v8.4s, v12.4s\n"
- "LDR q14, [%[inptr7]], #16\n"
- "FCVTL2 v15.4s, v14.8h\n"
- "FCVTL v14.4s, v14.4h\n"
- "ZIP1 v19.4s, v10.4s, v14.4s\n"
-
- ASM_PREFETCH("[%[inptr2], #128]")
- "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
- ASM_PREFETCH("[%[inptr3], #128]")
-
- "ZIP2 v16.4s, v0.4s, v4.4s\n"
- "ZIP2 v17.4s, v2.4s, v6.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
-
- "ZIP2 v18.4s, v8.4s, v12.4s\n"
- ASM_PREFETCH("[%[inptr4], #128]")
- "ZIP2 v19.4s, v10.4s, v14.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- ASM_PREFETCH("[%[inptr5], #128]")
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP1 v16.4s, v1.4s, v5.4s\n"
- "ZIP1 v17.4s, v3.4s, v7.4s\n"
- ASM_PREFETCH("[%[inptr6], #128]")
- "STP q20, q21, [%[outptr]], #32\n" // Third element
-
- "ZIP1 v18.4s, v9.4s, v13.4s\n"
- "ZIP1 v19.4s, v11.4s, v15.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Fourth element
- ASM_PREFETCH("[%[inptr7], #128]")
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP2 v16.4s, v1.4s, v5.4s\n"
- "ZIP2 v17.4s, v3.4s, v7.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Fifth element
-
- "ZIP2 v18.4s, v9.4s, v13.4s\n"
- "ZIP2 v19.4s, v11.4s, v15.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Sixth element
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Seventh element
-
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Eighth element
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
- );
- }
-
- for (;x>0;x--) {
- *outptr++ = *inptr0++;
- *outptr++ = *inptr1++;
- *outptr++ = *inptr2++;
- *outptr++ = *inptr3++;
- *outptr++ = *inptr4++;
- *outptr++ = *inptr5++;
- *outptr++ = *inptr6++;
- *outptr++ = *inptr7++;
- }
- }
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp
deleted file mode 100644
index 6e07064a0c..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 6x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<6, 1, true, 4, 4>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a 12 x uint16_t specialisation
- TransformImpl<12, 1, true, 2, 2>::Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride*2, x0*2, xmax*2, k0, kmax
- );
-}
-
-// Generic 12x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<12, 1, true, 2, 2>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a uint16_t specialisation
- Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride, x0, xmax, k0, kmax
- );
-}
-
-// Specialised 12 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
- __asm volatile (
- "LDR q0, [%[in0]]\n"
- "STR q0, [%[out]]\n"
- "LDR d1, [%[in0], #0x10]\n"
- "STR d1, [%[out], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x18\n"
- ASM_PREFETCH("[%[in0], #192]")
- : [in0] "+r" (in0),
- [out] "+r" (out)
- :
- : "v0", "v1", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
- __asm volatile (
- "LDR q0, [%[in0]]\n"
- "LDR d1, [%[in0], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x18\n"
- ASM_PREFETCH("[%[in0], #192]")
-
- "LDR x21, [%[in1]]\n"
- "LDR q2, [%[in1], #0x08]\n"
- "INS v1.d[1], x21\n"
- "ADD %x[in1], %x[in1], #0x18\n"
- "STP q0, q1, [%[out]]\n"
- "STR q2, [%x[out], #0x20]\n"
- ASM_PREFETCH("[%[in1], #192]")
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [out] "+r" (out)
- :
- : "x21", "v0", "v1", "v2", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
- __asm __volatile (
- "LDR q0, [%x[in0]], #0x10\n"
- "STR q0, [%x[out]]\n"
- "LDR d1, [%x[in0]], #0x08\n"
- ASM_PREFETCH("[%[in0], #192]")
- "STR d1, [%x[out], #0x10]\n"
-
- "LDR q0, [%x[in1]], #0x10\n"
- "STR q0, [%x[out], #0x18]\n"
- "LDR d1, [%x[in1]], #0x08\n"
- ASM_PREFETCH("[%[in1], #192]")
- "STR d1, [%x[out], #0x28]\n"
-
- "LDR q0, [%x[in2]], #0x10\n"
- "STR q0, [%x[out], #0x30]\n"
- "LDR d1, [%x[in2]], #0x08\n"
- ASM_PREFETCH("[%[in2], #192]")
- "STR d1, [%x[out], #0x40]\n"
-
- "LDR q0, [%x[in3]], #0x10\n"
- "STR q0, [%x[out], #0x48]\n"
- "LDR d1, [%x[in3]], #0x08\n"
- ASM_PREFETCH("[%[in3], #192]")
- "STR d1, [%x[out], #0x58]\n"
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [in2] "+r" (in2),
- [in3] "+r" (in3),
- [out] "+r" (out)
- :
- : "v0", "v1", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<12, 1, true, 2, 2>::Transform(
- uint16_t* out, const uint16_t* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp
deleted file mode 100644
index 835e4d87aa..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined( __aarch64__) && defined( __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-
-#include "transpose_interleave_common.hpp"
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR d2, [%[in0]], #8\n"
- "FCVTL v2.4s, v2.4h\n"
- "STR q2, [%[out], #32]\n"
- : [in0] "+r" (in0), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR d2, [%[in0]], #8\n"
- "FCVTL v2.4s, v2.4h\n"
- "LDR q3, [%[in1]], #16\n"
- "FCVTL2 v4.4s, v3.8h\n"
- "FCVTL v3.4s, v3.4h\n"
- "STP q2, q3, [%[out], #32]\n"
- ASM_PREFETCH("[%[in1], #192]")
- "LDR d5, [%[in1]], #16\n"
- "FCVTL v5.4s, v5.4h\n"
- "STP q4, q5, [%[out], #64]\n"
- : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- "LDR d2, [%[in0]], #8\n"
- ASM_PREFETCH("[%[in0], #192]")
- "FCVTL v2.4s, v2.4h\n"
- "LDR q3, [%[in1]], #16\n"
- "FCVTL2 v4.4s, v3.8h\n"
- "FCVTL v3.4s, v3.4h\n"
- "STP q2, q3, [%[out], #32]\n"
- "LDR d5, [%[in1]], #8\n"
- "FCVTL v5.4s, v5.4h\n"
- ASM_PREFETCH("[%[in1], #192]")
- "STP q4, q5, [%[out], #64]\n"
- "LDR q6, [%[in2]], #16\n"
- "FCVTL2 v7.4s, v6.8h\n"
- "FCVTL v6.4s, v6.4h\n"
- "STP q6, q7, [%[out], #96]\n"
- "LDR d8, [%[in2]], #8\n"
- "FCVTL v8.4s, v8.4h\n"
- ASM_PREFETCH("[%[in2], #192]")
- "LDR q9, [%[in3]], #16\n"
- "FCVTL2 v10.4s, v9.8h\n"
- "FCVTL v9.4s, v9.4h\n"
- "STP q8, q9, [%[out], #128]\n"
- "LDR d11, [%[in3]], #8\n"
- "FCVTL v11.4s, v11.4h\n"
- "STP q10, q11, [%[out], #160]\n"
- ASM_PREFETCH("[%[in3], #192]")
-
- : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<12, 1, true, 4, 2>::Transform(
- float* out, const __fp16* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp
deleted file mode 100644
index b6565baa23..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 12x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<12, 1, true, 4, 4>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a 24 x uint16_t specialisation
- TransformImpl<24, 1, true, 2, 2>::Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride*2, x0*2, xmax*2, k0, kmax
- );
-}
-
-// Generic 24x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<24, 1, true, 2, 2>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a uint16_t specialisation
- Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride, x0, xmax, k0, kmax
- );
-}
-
-// Specialised 24 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
- __asm __volatile (
- "LDP q0, q1, [%[in0]], #32\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR q2, [%[in0]], #16\n"
- "STR q2, [%[out], #32]\n"
- : [in0] "+r" (in0), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1,uint16_t *out) {
- __asm __volatile (
- "LDP q0, q1, [%[in0]], #32\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR q2, [%[in0]], #16\n"
- "LDP q3, q4, [%[in1]], #32\n"
- "STP q2, q3, [%[out], #32]\n"
- ASM_PREFETCH("[%[in1], #192]")
- "LDR q5, [%[in1]], #16\n"
- "STP q4, q5, [%[out], #64]\n"
- : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
- __asm __volatile (
- "LDP q0, q1, [%[in0]], #32\n"
- "STP q0, q1, [%[out]]\n"
- "LDR q2, [%[in0]], #16\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDP q3, q4, [%[in1]], #32\n"
- "STP q2, q3, [%[out], #32]\n"
- "LDR q5, [%[in1]], #16\n"
- ASM_PREFETCH("[%[in1], #192]")
- "STP q4, q5, [%[out], #64]\n"
- "LDP q6, q7, [%[in2]], #32\n"
- "STP q6, q7, [%[out], #96]\n"
- "LDR q8, [%[in2]], #16\n"
- ASM_PREFETCH("[%[in2], #192]")
- "LDP q9, q10, [%[in3]], #32\n"
- "STP q8, q9, [%[out], #128]\n"
- "LDR q11, [%[in3]], #16\n"
- "STP q10, q11, [%[out], #160]\n"
- ASM_PREFETCH("[%[in3], #192]")
-
- : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<24, 1, true, 2, 2>::Transform(
- uint16_t* out, const uint16_t* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp
deleted file mode 100644
index 231b3f181e..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-template <unsigned int IntBy, typename TIn, typename TOut>
-struct TransposeInterleaveCommon {
- // Override the moveblock_1xY methods to improve performance
- static inline void moveblock_1x1(const TIn *&in0, TOut *out) {
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in0++);
- }
- }
-
- static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out) {
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in0++);
- }
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in1++);
- }
- }
-
- static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) {
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in0++);
- }
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in1++);
- }
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in2++);
- }
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in3++);
- }
- }
-
- static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
- const auto ldin = stride;
-
- TOut *outarray = out;
- const TIn *inarray = in;
- TOut *outptr_base = outarray;
- const TIn *inptr_base = inarray + x0 + (k0 * ldin);
- int ldout = (kmax - k0) * IntBy;
-
- int k=(kmax-k0);
- for ( ; k>3; k-=4) {
- TOut *outptr = outptr_base;
- const TIn *inptr = inptr_base;
- const TIn *inptr1 = inptr + ldin;
- const TIn *inptr2 = inptr1 + ldin;
- const TIn *inptr3 = inptr2 + ldin;
-
- prefetch_3x(inptr);
- prefetch_3x(inptr1);
- prefetch_3x(inptr2);
- prefetch_3x(inptr3);
-
- outptr_base += IntBy * 4;
- inptr_base += ldin * 4;
-
- for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
- moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
- outptr += ldout;
- }
- }
-
- if (k) {
- TOut *outptr = outptr_base;
- const TIn *inptr = inptr_base;
- const TIn *inptr1 = inptr + ldin;
- const TIn *inptr2 = inptr1 + ldin;
-
- prefetch_3x(inptr);
- prefetch_3x(inptr1);
- prefetch_3x(inptr2);
-
- for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
- switch(k) {
- case 3:
- moveblock_1x2(inptr, inptr1, outptr);
- moveblock_1x1(inptr2, outptr + IntBy * 2);
- break;
-
- case 2:
- moveblock_1x2(inptr, inptr1, outptr);
- break;
-
- case 1:
- moveblock_1x1(inptr, outptr);
- break;
- default:
- break;
- }
-
- outptr += ldout;
- }
- }
-
- // Cope with ragged X cases
- const unsigned int overflow = (xmax - x0) % IntBy;
- if (overflow) {
- const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
- TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout;
-
- for (int k=(kmax-k0); k>0; k--) {
- const TIn *inptr = inptr_base;
- inptr_base += ldin;
-
- for (unsigned int x=0; x < IntBy; x++) {
- TOut val = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
- *outptr++ = val;
- }
- }
- }
-}
-};
diff --git a/arm_compute/runtime/NEON/AssemblyHelper.h b/arm_compute/runtime/NEON/AssemblyHelper.h
new file mode 100644
index 0000000000..2b304b8022
--- /dev/null
+++ b/arm_compute/runtime/NEON/AssemblyHelper.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_ASSEMBLY_HELPER_H__
+#define __ARM_ASSEMBLY_HELPER_H__
+
+#include "arm_compute/core/ITensor.h"
+#include "support/ToolchainSupport.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Log.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h"
+#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+template <typename TypeInput, typename TypeOutput>
+class AssemblyKernelGlue final
+{
+public:
+ using TypeOperator = TypeInput;
+ using TypeResult = TypeOutput;
+ AssemblyKernelGlue()
+ : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr)
+ {
+ }
+ using AssemblyGemm = arm_gemm::GemmCommon<TypeInput, TypeOutput>;
+
+ const AssemblyKernelGlue<TypeInput, TypeOutput> &operator=(const AssemblyKernelGlue<TypeInput, TypeOutput> &) = delete;
+ AssemblyKernelGlue(const AssemblyKernelGlue<TypeInput, TypeOutput> &) = delete;
+
+ std::unique_ptr<AssemblyGemm> _gemm_kernel_asm;
+ std::unique_ptr<INEKernel> _optimised_kernel;
+ const ITensor *_a;
+ const ITensor *_b;
+ ITensor *_d;
+
+ /** Configures the array pointers and strides in the assembly kernel and then executes the assembly kernel.
+ *  The call to set_arrays is needed to deal with input sizes containing batches (dims > 2).
+ */
+ inline void run()
+ {
+ const int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
+
+ // Configure kernel window
+ Window window = calculate_max_window(*_d->info());
+ const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
+
+ // Only iterate over batches
+ Window win(window);
+ win.set(0, Window::Dimension(0, 1, 1));
+ win.set(1, Window::Dimension(0, 1, 1));
+ Iterator in0(_a, window);
+ Iterator out(_d, window);
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ const auto in0_ptr = reinterpret_cast<const TypeInput *>(in0.ptr());
+ auto out_ptr = reinterpret_cast<TypeOutput *>(out.ptr());
+ _gemm_kernel_asm->set_arrays(in0_ptr, lda, in1_ptr, ldb, out_ptr, ldd);
+ NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX);
+ },
+ in0, out);
+ }
+};
+
+using AssemblyKernelGlueF32 = AssemblyKernelGlue<float, float>;
+using AssemblyKernelGlueU8U32 = AssemblyKernelGlue<uint8_t, uint32_t>;
+using AssemblyKernelGlueS8S32 = AssemblyKernelGlue<int8_t, int32_t>;
+
+inline void allocate_workspace(size_t workspace_size, Tensor &workspace, MemoryGroup &memory_group, size_t alignment, unsigned int num_threads)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
+ workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment - 1) * num_threads }, 1, DataType::S8));
+ workspace.allocator()->allocate();
+}
+
+template <typename T>
+std::unique_ptr<NEGEMMAssemblyWrapper<T>> create_wrapper_kernel(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
+{
+ // TODO: Rework this function. Why are we checking the data type and other properties here? Should we add a separate can_run_optimised_kernel() function?
+#if defined(__arm__)
+ if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+ {
+ return support::cpp14::make_unique<NEGEMMAssemblyWrapper<T>>();
+ }
+#elif defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+ {
+ return support::cpp14::make_unique<NEGEMMAssemblyWrapper<T>>();
+ }
+ else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ return support::cpp14::make_unique<NEGEMMAssemblyWrapper<T>>();
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+ return nullptr;
+}
+
+template <typename T>
+inline bool setup_assembly_kernel(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta,
+ Tensor &workspace, MemoryGroup &memory_group, T &asm_glue)
+{
+ const ::CPUInfo *ci = get_CPUInfo();
+ const int M = d->info()->tensor_shape().y();
+ const int N = d->info()->tensor_shape().x();
+ const int K = a->info()->tensor_shape().x();
+ unsigned int num_threads = NEScheduler::get().num_threads();
+ // unique_ptr to a Gemm object
+ std::unique_ptr<typename T::AssemblyGemm> asm_gemm(arm_gemm::gemm<typename T::TypeOperator, typename T::TypeResult>(*ci, M, N, K, false, false, alpha, beta, num_threads,
+ false));
+
+ // arm_compute wrapper for the Gemm object (see above)
+ std::unique_ptr<NEGEMMAssemblyWrapper<typename T::AssemblyGemm>> acl_gemm_wrapper = create_wrapper_kernel<typename T::AssemblyGemm>(a, b, c, d, alpha, beta);
+ if(acl_gemm_wrapper != nullptr && asm_gemm != nullptr)
+ {
+ acl_gemm_wrapper->configure(asm_gemm.get());
+ const size_t workspace_size = asm_gemm->get_working_size();
+ if(workspace_size)
+ {
+ // Allocate workspace
+ allocate_workspace(workspace_size, workspace, memory_group, 4096, num_threads);
+ asm_gemm->set_working_space(reinterpret_cast<typename T::TypeResult *>(workspace.buffer()));
+ }
+ const unsigned int window_size = asm_gemm->get_window_size();
+ if(window_size < num_threads)
+ {
+ num_threads = window_size;
+ asm_gemm->set_nthreads(num_threads);
+ }
+ asm_glue._gemm_kernel_asm = std::move(asm_gemm);
+ asm_glue._optimised_kernel = std::move(acl_gemm_wrapper);
+ // The array pointers are set up in the run() method
+ asm_glue._a = a;
+ asm_glue._b = b;
+ asm_glue._d = d;
+ return true;
+ }
+ return false;
+}
+}
+#endif /* __ARM_ASSEMBLY_HELPER_H__ */
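The new AssemblyHelper.h collects the glue needed to drive an arm_gemm kernel from an arm_compute function: AssemblyKernelGlue pairs the GemmCommon object with its NEGEMMAssemblyWrapper, and setup_assembly_kernel() performs the whole configuration. Below is a minimal sketch of how a caller is expected to use it at configure() time; the function name and the fallback comment are illustrative, since the corresponding .cpp changes are outside this excerpt.

#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Illustrative only: wiring an F32 assembly GEMM through the new helper.
// Assumes a, b and d are already-initialised tensors owned by the caller.
bool configure_assembly_path(const ITensor *a, const ITensor *b, ITensor *d,
                             float alpha, float beta,
                             Tensor &workspace, MemoryGroup &memory_group,
                             AssemblyKernelGlueF32 &asm_glue)
{
    // setup_assembly_kernel() creates the arm_gemm object, wraps it in an
    // NEGEMMAssemblyWrapper, allocates the workspace if one is needed and
    // stores everything in the glue object.
    const bool ok = setup_assembly_kernel(a, b, nullptr, d, alpha, beta,
                                          workspace, memory_group, asm_glue);
    // A false return (unsupported CPU or data type) means the caller should
    // configure the generic NEON kernels instead.
    return ok;
}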
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index f2b6ef77bd..5279995be4 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -25,7 +25,6 @@
#define __ARM_COMPUTE_NEGEMM_H__
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
@@ -35,6 +34,8 @@
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
+
#include <memory>
namespace arm_compute
@@ -73,19 +74,19 @@ public:
void run() override;
private:
- MemoryGroup _memory_group;
- NEGEMMInterleave4x4Kernel _interleave_kernel;
- NEGEMMTranspose1xWKernel _transpose_kernel;
- NEGEMMMatrixMultiplyKernel _mm_kernel;
- std::unique_ptr<NEGEMMAssemblyBaseKernel> _mm_optimised_kernel;
- NEGEMMMatrixAdditionKernel _ma_kernel;
- Tensor _tmp_a;
- Tensor _tmp_b;
- Tensor _workspace;
- bool _run_vector_matrix_multiplication;
- bool _run_addition;
- bool _is_first_run;
- bool _reshape_b_only_on_first_run;
+ MemoryGroup _memory_group;
+ NEGEMMInterleave4x4Kernel _interleave_kernel;
+ NEGEMMTranspose1xWKernel _transpose_kernel;
+ NEGEMMMatrixMultiplyKernel _mm_kernel;
+ AssemblyKernelGlueF32 _asm_glue;
+ NEGEMMMatrixAdditionKernel _ma_kernel;
+ Tensor _tmp_a;
+ Tensor _tmp_b;
+ Tensor _workspace;
+ bool _run_vector_matrix_multiplication;
+ bool _run_addition;
+ bool _is_first_run;
+ bool _reshape_b_only_on_first_run;
};
}
#endif /*__ARM_COMPUTE_NEGEMM_H__ */
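In NEGEMM the old std::unique_ptr<NEGEMMAssemblyBaseKernel> member is replaced by an AssemblyKernelGlueF32, so the run-time dispatch reduces to checking whether an optimised kernel was created during configure(). The NEGEMM.cpp side of the patch is not shown here; the branch presumably looks along these lines (a hedged sketch, not the library's actual implementation).

#include "arm_compute/runtime/NEON/AssemblyHelper.h"

// Sketch of the expected run-time branch; the parameter stands in for the new
// _asm_glue member of NEGEMM.
void run_branch_sketch(arm_compute::AssemblyKernelGlueF32 &asm_glue)
{
    if(asm_glue._optimised_kernel != nullptr)
    {
        // Assembly path: run() sets the per-batch array pointers and schedules the wrapper kernel.
        asm_glue.run();
    }
    else
    {
        // Generic path: interleave/transpose kernels followed by NEGEMMMatrixMultiplyKernel.
    }
}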
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index ac5f4caa78..4ae8ee1fb3 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -36,6 +36,7 @@
#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/Tensor.h"
@@ -149,22 +150,14 @@ private:
* @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
*/
void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
- /** Prepare the appropriate assembly optimized kernel
- *
- * @param[in] ci CPU information
- * @param[in] M M parameter of matrix multiplication
- * @param[in] N N parameter of matrix multiplication
- * @param[in] K K parameter of matrix multiplication
- */
- void configure_asm_mm(const struct CPUInfo &ci, int M, int N, int K);
private:
+ AssemblyKernelGlueF32 _asm_glue;
MemoryGroup _memory_group;
NEIm2ColKernel _input_im2col_kernel;
NEGEMMInterleave4x4Kernel _input_interleave_kernel;
NEConvolutionLayerReshapeWeights _reshape_weights;
NEGEMMMatrixMultiplyKernel _mm_kernel;
- std::unique_ptr<NEGEMMAssemblyBaseKernel> _mm_optimised_kernel;
NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
NECol2ImKernel _output_col2im_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
index 3d213a7668..f09c94e726 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
@@ -1,6 +1,6 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,7 @@
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
@@ -58,6 +59,8 @@ public:
private:
MemoryGroup _memory_group;
+ AssemblyKernelGlueU8U32 _asm_glue_unsigned;
+ AssemblyKernelGlueS8S32 _asm_glue_signed;
std::unique_ptr<INEKernel> _mm_kernel;
std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
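Two glue members are added because arm_gemm instantiates a different GemmCommon for unsigned (uint8_t to uint32_t) and signed (int8_t to int32_t) inputs. The configure() logic lives in the .cpp, which is not part of this excerpt; the data-type dispatch it implies might look like the hedged sketch below, where the function name and parameter list are placeholders for the real members.

#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Illustrative dispatch between the signed and unsigned glue objects.
bool configure_lowp_assembly(const ITensor *a, const ITensor *b, ITensor *output,
                             Tensor &workspace, MemoryGroup &memory_group,
                             AssemblyKernelGlueU8U32 &asm_glue_unsigned,
                             AssemblyKernelGlueS8S32 &asm_glue_signed)
{
    switch(a->info()->data_type())
    {
        case DataType::S8:
            return setup_assembly_kernel(a, b, nullptr, output, 1.f, 0.f,
                                         workspace, memory_group, asm_glue_signed);
        case DataType::U8:
            return setup_assembly_kernel(a, b, nullptr, output, 1.f, 0.f,
                                         workspace, memory_group, asm_glue_unsigned);
        default:
            return false; // fall back to the existing NEON kernels
    }
}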
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index eddb3a26b7..95776f829a 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
@@ -48,7 +49,6 @@ class ITensor;
* otherwise if the DOT product instruction is available:
*
* -# @ref NEGEMMInterleaveBlockedKernel
- * -# @ref NEGEMMLowpAArch64V8P4Kernel
* -# @ref NEGEMMLowpOffsetContributionKernel
*
*/
@@ -90,6 +90,8 @@ public:
private:
MemoryGroup _memory_group;
+ AssemblyKernelGlueU8U32 _asm_glue_unsigned;
+ AssemblyKernelGlueS8S32 _asm_glue_signed;
std::unique_ptr<INEKernel> _mm_kernel;
std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;