diff options
field | value | date |
---|---|---|
author | Pablo Tello <pablo.tello@arm.com> | 2017-11-02 16:09:35 +0000 |
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:35:24 +0000 |
commit | 6ff12a0f7765f62b8d0fa8554021e1cac2789f19 (patch) | |
tree | 9338db697789106b49ea391634be8b3c08ef9f97 /arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp | |
parent | 97988a4b3ef0f840432daf95b6e4b2ad7e5feefd (diff) | |
download | ComputeLibrary-6ff12a0f7765f62b8d0fa8554021e1cac2789f19.tar.gz |
COMPMID-662: Integrated the new a64_s8_gemm_12x8 + dot product kernel into ACL.
Change-Id: Id8f919e486a132fc58346c9f84fccbeeb83d19b3
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/94233
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp')
-rw-r--r-- | arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp | 13 |
1 files changed, 7 insertions, 6 deletions
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp index a186d88355..659ef837f5 100644 --- a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp +++ b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp @@ -24,6 +24,7 @@ #pragma once #include <stdio.h> +#include <cassert> #include "gemm_common.hpp" #include "profiler.hpp" @@ -114,12 +115,13 @@ public: // Work out the rounded size of M - needed for some buffers. Mround = (M + (strat.out_height - 1)) / strat.out_height; Mround *= strat.out_height; + } // Actually execute the GEMM. void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override { + assert(working_space); profiler prof; - int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space); intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space_bytes); size_t diff = 0; @@ -128,7 +130,6 @@ public: diff = 0x10 - (working_space_int & 0xF); } - // TODO: Multithreaded implementations could share the burden of transforming these blocks. 
Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + diff); Toi * const b_panel = reinterpret_cast<Toi *>(working_space_bytes + get_a_working_size() + diff); Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + get_a_working_size() + get_b_working_size() + diff); @@ -141,7 +142,7 @@ public: int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll; kern_k *= strat.k_unroll; - prof(PROFILE_PREPA, [&](void) { + prof(PROFILE_PREPA, (M * (kmax-k0) * sizeof(Toi)), [&](void) { if (trA ^ strategy::A_transpose) { Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, A, lda, 0, M, k0, kmax); } else { @@ -155,7 +156,7 @@ public: int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width; - prof(PROFILE_PREPB, [&](void) { + prof(PROFILE_PREPB, (xmax-x0) * (kmax-k0) * sizeof(Toi), [&](void) { if (trB ^ strategy::B_transpose) { Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, B, ldb, x0, xmax, k0, kmax); } else { @@ -167,8 +168,8 @@ public: unsigned int ymax = y + strat.out_height; if (ymax > M) ymax = M; - prof(PROFILE_KERNEL, [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); }); - prof(PROFILE_MERGE, [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); }); + prof(PROFILE_KERNEL, (strat.out_height * bblocks * strat.out_width * kern_k), [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); }); + prof(PROFILE_MERGE, (strat.out_height * bblocks * strat.out_width * sizeof(Tr)), [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); }); } } } |