From c0b6f76561580414f08633a804fc548ccad65659 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Mon, 2 Nov 2020 01:37:17 +0000
Subject: COMPMID-3776: Indirect GEMM

Signed-off-by: Georgios Pinitas
Change-Id: I51a1b0f098bc3a8c408c50c92221e4df3061e12c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4343
Tested-by: Arm Jenkins
Reviewed-by: Sang-Hoon Park
Reviewed-by: Michele Di Giorgio
Comments-Addressed: Arm Jenkins
---
 src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp | 52 +++++++++++---------------
 1 file changed, 22 insertions(+), 30 deletions(-)

diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index 7a983ed6ac..d702cffce1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -77,51 +77,43 @@ class GemmHybrid : public GemmCommon<To, Tr> {
             return args._cfg->inner_block_size;
         }
 
-        const unsigned int L1_size = args._ci->get_L1_cache_size();
+        // Target block size (512 for FP32, scaling for other types).  Don't block until size reaches 1.5X this.
+        unsigned int target_block_size = 2048 / sizeof(To);
 
-        // k_block: Find out how much of the larger array can be loaded into half the cache.
-        // This should account for associative caches.
-        unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+        if (args._Ksize >= ((3 * target_block_size) / 2)) {
+            unsigned int target_blocks = iceildiv(args._Ksize, target_block_size);
 
-        // Needs to be (at least a single) multiple of the K unroll level.
-        k_block /= strategy::k_unroll();
-        k_block = std::max(k_block, 1U) * strategy::k_unroll();
+            unsigned int block_size = iceildiv(args._Ksize, target_blocks);
 
-        // Now tune to presented problem size; this is how many blocks we need.
-        unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
+            block_size = roundup(block_size, strategy::k_unroll());
 
-        // So divide the space equally into that many blocks.
-        k_block = iceildiv(args._Ksize, numk_blocks);
-
-        // And round UP to the K unroll level required.
-        k_block = roundup(k_block, strategy::k_unroll());
+            return block_size;
+        }
 
-        return k_block;
+        return args._Ksize;
     }
 
+    // New N blocking strategy: if it's narrow, or much taller than it is wide, do the full width.  Otherwise do a
+    // single block.
     static unsigned int compute_n_block(const GemmArgs &args) {
         if (args._cfg && args._cfg->outer_block_size) {
             return args._cfg->outer_block_size;
         }
 
-        const unsigned int k_block = compute_k_block(args);
-        const unsigned int L2_size = args._ci->get_L2_cache_size();
-
-        // n_block: Work out how many rows (of length k_block) will fit in the L2
-        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-        unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                               (sizeof(Toi) * k_block);
+        if (args._Nsize <= 64) {
+            return args._Nsize;
+        }
 
-        // Needs to be (at least a single) multiple of the kernel output width.
-        n_block /= strategy::out_width();
-        n_block = std::max(n_block, 1U) * strategy::out_width();
+        if ((args._Msize / args._Nsize) > 155) {
+            return args._Nsize;
+        }
 
-        // And tune to the presented problem size.
-        unsigned int numblocks = iceildiv(args._Nsize, n_block);
-        n_block = iceildiv(args._Nsize, numblocks);
-        n_block = roundup(n_block, strategy::out_width());
+        // Go slightly wider if thread count and depth are small.
+        if ((args._Ksize <= 128) && (args._maxthreads <= 16)) {
+            return strategy::out_width() * 3;
+        }
 
-        return n_block;
+        return strategy::out_width();
     }
 
 public:
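
For reference, a minimal standalone sketch of the new K blocking heuristic
above, under stated assumptions: iceildiv() and roundup() mirror the arm_gemm
utility functions, while the element_size and k_unroll parameters are
stand-ins for sizeof(To) and strategy::k_unroll(), which come from the kernel
strategy in the real header (the explicit _cfg override is omitted for
brevity).

#include <cstddef>

// Ceiling division and round-up-to-multiple, as in arm_gemm's utils.
static unsigned int iceildiv(unsigned int a, unsigned int b) {
    return (a + b - 1) / b;
}

static unsigned int roundup(unsigned int a, unsigned int b) {
    return iceildiv(a, b) * b;
}

// Pick a K block size: aim for 2048 bytes of operand data per block
// (512 elements for FP32), but only start blocking once K reaches
// 1.5X that target.
static unsigned int compute_k_block(unsigned int Ksize, std::size_t element_size,
                                    unsigned int k_unroll) {
    const unsigned int target_block_size = static_cast<unsigned int>(2048 / element_size);

    if (Ksize >= ((3 * target_block_size) / 2)) {
        // How many near-target-size blocks K needs...
        const unsigned int target_blocks = iceildiv(Ksize, target_block_size);

        // ...divide K evenly across them...
        unsigned int block_size = iceildiv(Ksize, target_blocks);

        // ...and round up to a whole number of unrolled K steps.
        return roundup(block_size, k_unroll);
    }

    // Below the threshold, don't block at all.
    return Ksize;
}

For FP32 (element_size == 4) the target is 512 elements, so for example
Ksize == 700 stays unblocked (700 < 768, i.e. below 1.5X the target), while
Ksize == 1000 is split into two blocks of 500, rounded up to a multiple of
the kernel's K unroll.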
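A matching sketch of the new N blocking heuristic, again under assumptions:
out_width stands in for strategy::out_width(), the Msize/Nsize/Ksize/maxthreads
parameters for the corresponding GemmArgs fields, and the 64, 155, 128 and 16
thresholds are taken directly from the patch.

// Pick an N block size based on problem shape rather than cache size.
static unsigned int compute_n_block(unsigned int Msize, unsigned int Nsize,
                                    unsigned int Ksize, int maxthreads,
                                    unsigned int out_width) {
    // Narrow output: do the full width in one block.
    if (Nsize <= 64) {
        return Nsize;
    }

    // Much taller than it is wide: also do the full width.
    if ((Msize / Nsize) > 155) {
        return Nsize;
    }

    // Small depth and few threads: go slightly wider than one kernel tile.
    if ((Ksize <= 128) && (maxthreads <= 16)) {
        return out_width * 3;
    }

    // Otherwise default to a single block of one kernel output width.
    return out_width;
}

Unlike the old code, which sized n_block so that rows of length k_block would
fit in 90% of the L2 cache, the block size here depends only on the problem
shape; the cache-driven sizing is gone from this path entirely.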