From 27066c2bed8fb88843308a70f375fd49835edd55 Mon Sep 17 00:00:00 2001
From: Pablo Tello <pablo.tello@arm.com>
Date: Thu, 23 Nov 2017 11:01:10 +0000
Subject: COMPMID-632: Integrated Assembly kernel GEMM S8 for Arm Cortex-A53.

Change-Id: I07faa0c984759a1b5db1e5de71f4ab3eef5888d8
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110334
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
---
 .../functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp  | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp')

diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 1bf437eb5f..0423777217 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
 #include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
 #include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -41,10 +42,10 @@
 namespace arm_compute
 {
 #include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
-
 } // namespace arm_compute
 
 using namespace arm_compute;
@@ -91,7 +92,19 @@ void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITe
     }
     else
 #elif defined(ARM_COMPUTE_AARCH64_V8A)
-    if(1)
+    if(ci.CPU == CPUTarget::A53)
+    {
+        // Size the workspace: GEMM working size plus alignment slack, for every scheduler thread
+        GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
+        _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+        _memory_group.manage(&_workspace);
+        // Configure matrix multiplication kernel
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64A53Kernel>();
+        k->configure(a, b, output, &_workspace, 1.f, 1.f);
+        _mm_kernel = std::move(k);
+        _workspace.allocator()->allocate();
+    }
+    else if(1) // Generic v8a kernel
     {
         switch(a->info()->data_type())
         {
-- 
cgit v1.2.1