From eb82fd2aa786715c3b6a941dc6d6deac4ce8e2a0 Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Fri, 23 Feb 2018 13:43:50 +0000 Subject: COMPMID-881: RSH new arm_gemm interface. Change-Id: I1e2a1a77097d8017c274af3f97eba6964f80f5fa Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/122592 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- .../NEON/kernels/assembly/gemm_interleaved.hpp | 177 --------------------- 1 file changed, 177 deletions(-) delete mode 100644 arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp (limited to 'arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp') diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp deleted file mode 100644 index 659ef837f5..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include -#include - -#include "gemm_common.hpp" -#include "profiler.hpp" -#include "transform.hpp" -#include "mergeresults.hpp" - -// Some macros used to decide how much working space to allocate. -// Round allocations up to the next cache line. -#define ALLOC_ROUND 64 -#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) - -// Implementation of the GemmCommon abstract class. -// -// This implementation interleaves the source matrices in blocks - good for -// larger matrices. -template -class GemmInterleaved : public GemmCommon { - typedef typename strategy::operand_type Toi; - typedef typename strategy::result_type Tri; - - const unsigned int M; - const unsigned int N; - const unsigned int K; - - const bool trA; - const bool trB; - - const strategy strat; - - unsigned int k_block = 0; - unsigned int x_block = 0; - unsigned int Mround = 0; - - size_t get_a_working_size() const { - return ROUND_UP(sizeof(Toi) * k_block * Mround); - } - - size_t get_b_working_size() const { - return ROUND_UP(sizeof(Toi) * x_block * k_block); - } - - size_t get_c_working_size() const { - return ROUND_UP(sizeof(Tri) * x_block * strat.out_height); - } - -public: - size_t get_working_size() const override { - return get_a_working_size() + get_b_working_size() + get_c_working_size(); - } - - GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB) : M(M), N(N), K(K), trA(trA), trB(trB), strat(ci) { - const unsigned int L1_size = ci->L1_size; - const unsigned int L2_size = ci->L2_size; - - // Work out blocking parameters - // k_block: Each iteration will consume (out_width + out_height) - // operands - so how many iterations will fill the L1? - k_block = L1_size / (sizeof(Toi) * (strat.out_width + strat.out_height)); - - // Needs to be a multiple of the K unroll level. - k_block /= strat.k_unroll; - k_block *= strat.k_unroll; - - // Now tune to presented problem size; this is how many blocks we need. - int num_k_blocks = (K + (k_block - 1)) / k_block; - - // So divide the space equally into that many blocks. - k_block = (K + num_k_blocks - 1) / num_k_blocks; - - // And round UP to the K unroll level required. - k_block = (k_block + strat.k_unroll - 1) / strat.k_unroll; - k_block *= strat.k_unroll; - - // x_block: Work out how many rows (of length k_block) will fit in the L2 - x_block = L2_size / (sizeof(Toi) * k_block); - - // Needs to be a multiple of the kernel output width. - x_block /= strat.out_width; - x_block *= strat.out_width; - - // And tune to the presented problem size. - int num_x_blocks = (N + (x_block - 1)) / x_block; - x_block = (N + num_x_blocks - 1) / num_x_blocks; - - x_block = (x_block + strat.out_width - 1) / strat.out_width; - x_block *= strat.out_width; - - // Work out the rounded size of M - needed for some buffers. - Mround = (M + (strat.out_height - 1)) / strat.out_height; - Mround *= strat.out_height; - - } - - // Actually execute the GEMM. - void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override { - assert(working_space); - profiler prof; - int8_t *working_space_bytes = reinterpret_cast(working_space); - intptr_t working_space_int = reinterpret_cast(working_space_bytes); - size_t diff = 0; - - if (working_space_int & 0xF) { - diff = 0x10 - (working_space_int & 0xF); - } - - Toi * const a_panel = reinterpret_cast(working_space_bytes + diff); - Toi * const b_panel = reinterpret_cast(working_space_bytes + get_a_working_size() + diff); - Tri * const c_panel = reinterpret_cast(working_space_bytes + get_a_working_size() + get_b_working_size() + diff); - - for (unsigned int k0=0; k0 K) kmax = K; - - // Figure out how many "K" the kernel will actually process. - int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll; - kern_k *= strat.k_unroll; - - prof(PROFILE_PREPA, (M * (kmax-k0) * sizeof(Toi)), [&](void) { - if (trA ^ strategy::A_transpose) { - Transform(a_panel, A, lda, 0, M, k0, kmax); - } else { - Transform(a_panel, A, lda, 0, M, k0, kmax); - } - }); - - for (unsigned int x0=0; x0 N) xmax = N; - - int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width; - - prof(PROFILE_PREPB, (xmax-x0) * (kmax-k0) * sizeof(Toi), [&](void) { - if (trB ^ strategy::B_transpose) { - Transform(b_panel, B, ldb, x0, xmax, k0, kmax); - } else { - Transform(b_panel, B, ldb, x0, xmax, k0, kmax); - } - }); - - for (unsigned int y=0; y M) ymax = M; - - prof(PROFILE_KERNEL, (strat.out_height * bblocks * strat.out_width * kern_k), [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); }); - prof(PROFILE_MERGE, (strat.out_height * bblocks * strat.out_width * sizeof(Tr)), [&](void) { MergeResults(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast(1))); }); - } - } - } - } -}; -- cgit v1.2.1