COMPMID-662: Integrated the new a64_s8_gemm_12x8 + dot product kernel into ACL.

Change-Id: Id8f919e486a132fc58346c9f84fccbeeb83d19b3
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/94233
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
author: Pablo Tello <pablo.tello@arm.com> 2017-11-02 16:09:35 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:35:24 +0000
commit: 6ff12a0f7765f62b8d0fa8554021e1cac2789f19 (patch)
tree: 9338db697789106b49ea391634be8b3c08ef9f97 /arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
parent: 97988a4b3ef0f840432daf95b6e4b2ad7e5feefd (diff)
download: ComputeLibrary-6ff12a0f7765f62b8d0fa8554021e1cac2789f19.tar.gz
1 file changed, 7 insertions, 6 deletions
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
index a186d88355..659ef837f5 100644
--- a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
@@ -24,6 +24,7 @@
 #pragma once
 
 #include <stdio.h>
+#include <cassert>
 
 #include "gemm_common.hpp"
 #include "profiler.hpp"
@@ -114,12 +115,13 @@ public:
         // Work out the rounded size of M - needed for some buffers.
         Mround = (M + (strat.out_height - 1)) / strat.out_height;
         Mround *= strat.out_height;
+
     }
 
     // Actually execute the GEMM.
     void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
+        assert(working_space);
         profiler prof;
-
         int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
         intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space_bytes);
         size_t diff = 0;
@@ -128,7 +130,6 @@ public:
             diff = 0x10 - (working_space_int & 0xF);
         }
 
-        // TODO: Multithreaded implementations could share the burden of transforming these blocks.
         Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + diff);
         Toi * const b_panel = reinterpret_cast<Toi *>(working_space_bytes + get_a_working_size() + diff);
         Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + get_a_working_size() + get_b_working_size() + diff);
@@ -141,7 +142,7 @@ public:
             int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll;
             kern_k *= strat.k_unroll;
 
-            prof(PROFILE_PREPA, [&](void) {
+            prof(PROFILE_PREPA, (M * (kmax-k0) * sizeof(Toi)), [&](void) {
                 if (trA ^ strategy::A_transpose) {
                     Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, A, lda, 0, M, k0, kmax);
                 } else {
@@ -155,7 +156,7 @@ public:
 
                 int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width;
 
-                prof(PROFILE_PREPB, [&](void) {
+                prof(PROFILE_PREPB, (xmax-x0) * (kmax-k0) * sizeof(Toi), [&](void) {
                     if (trB ^ strategy::B_transpose) {
                         Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, B, ldb, x0, xmax, k0, kmax);
                     } else {
@@ -167,8 +168,8 @@ public:
                     unsigned int ymax = y + strat.out_height;
                     if (ymax > M) ymax = M;
 
-                    prof(PROFILE_KERNEL, [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); });
-                    prof(PROFILE_MERGE, [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); });
+                    prof(PROFILE_KERNEL, (strat.out_height * bblocks * strat.out_width * kern_k), [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); });
+                    prof(PROFILE_MERGE, (strat.out_height * bblocks * strat.out_width * sizeof(Tr)), [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); });
                 }
             }
         }
author	Pablo Tello <pablo.tello@arm.com>	2017-11-02 16:09:35 +0000
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:35:24 +0000
commit	6ff12a0f7765f62b8d0fa8554021e1cac2789f19 (patch)
tree	9338db697789106b49ea391634be8b3c08ef9f97 /arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
parent	97988a4b3ef0f840432daf95b6e4b2ad7e5feefd (diff)
download	ComputeLibrary-6ff12a0f7765f62b8d0fa8554021e1cac2789f19.tar.gz