From 4ee8b1599dbaf7634d25607fa5ac96ba3dc6b0f2 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 16 Jul 2021 16:16:43 +0100
Subject: Update GEMM assembly kernels

- Introduce Fp32 kernels with internal calculations in Bfloat16 when
  fast_mode is enabled
- Improve kernel selection heuristics

Signed-off-by: Georgios Pinitas
Change-Id: I68a9e7e862b6fd2721b46e0d7cc791091c4ab279
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5965
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 .../NEON/kernels/arm_gemm/gemm_interleaved.hpp     | 94 +++++++++++++---------
 1 file changed, 58 insertions(+), 36 deletions(-)

(limited to 'src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp')

diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 7f870b83d7..5639cb4182 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -192,7 +192,7 @@ void kernel_and_merge::run(
 {
 #ifdef CYCLE_PROFILING
-        auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
+        auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr)));
 #endif
         // The interleaved kernel outputs in blocks - each block is a
         // row-major matrix of size out_width * out_height.  The merge
@@ -496,7 +496,7 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
 
     static unsigned int get_k_block_size(const GemmArgs &args) {
         if (args._cfg && args._cfg->inner_block_size) {
-            return args._cfg->inner_block_size;
+            return roundup(args._cfg->inner_block_size, strategy::k_unroll());
         }
 
         // K blocking not supported if we are requantizing.
@@ -947,47 +947,55 @@ public:
             /* Figure out the size of each block. */
             unsigned int k_size = (current.kmax() - current.k0());
 
-            // We need to insert padding at the end of each K section.
-            // The computation needed is a little delicate - the coordinates from the block walker are expressed in
-            // terms of the full, padded, _Ktotal.
-            // But we need to transform each section with reference to the original, unpadded, input, letting the
-            // transform pad each section as needed.
+            if (_Ksections > 1) {
+                // We need to insert padding at the end of each K section.
+                // The computation needed is a little delicate - the coordinates from the block walker are expressed in
+                // terms of the full, padded, _Ktotal.
+                // But we need to transform each section with reference to the original, unpadded, input, letting the
+                // transform pad each section as needed.
 
-            // This is needed for computations below.
-            const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());
+                // This is needed for computations below.
+                const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());
 
-            // The expected output format is also an entire <out_width> columns interleaved, then the next set of
-            // columns, and so on.  This means, as we are breaking it up vertically, we have to do it one column at
-            // a time.
-            for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ){
-                unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());
+                // The expected output format is also an entire <out_width> columns interleaved, then the next set of
+                // columns, and so on.  This means, as we are breaking it up vertically, we have to do it one column at
+                // a time.
+                for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ) {
+                    unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());
 
-                // Track where we are and how much work is left.
-                unsigned int kpos = current.k0();
-                unsigned int kleft = k_size;
+                    // Track where we are and how much work is left.
+                    unsigned int kpos = current.k0();
+                    unsigned int kleft = k_size;
 
-                while (kleft) {
-                    // Which section are we in?  Based on the rounded-up section size.
-                    unsigned int k_section_base = kpos / rounded_section_size;
-                    // How far into the section are we?
-                    unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
+                    while (kleft) {
+                        // Which section are we in?  Based on the rounded-up section size.
+                        unsigned int k_section_base = kpos / rounded_section_size;
+                        // How far into the section are we?
+                        unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
 
-                    // We will either copy the rest of this section, or to the end of the requested length.
-                    unsigned int k_length = std::min(_Ksize - k_offset, kleft);
+                        // We will either copy the rest of this section, or to the end of the requested length.
+                        unsigned int k_length = std::min(_Ksize - k_offset, kleft);
 
-                    strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
-                                              x0, xmax,
-                                              (k_section_base * _Ksize) + k_offset,              // K starting point - compute row to read based on our section and the true section length.
-                                              (k_section_base * _Ksize) + k_offset + k_length);  // K end point - starting point plus length computed above.
+                        strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
+                                                  x0, xmax,
+                                                  (k_section_base * _Ksize) + k_offset,              // K starting point - compute row to read based on our section and the true section length.
+                                                  (k_section_base * _Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
 
-                    // We need to modify our position based on the ROUNDED version of what we just did.
-                    unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+                        // We need to modify our position based on the ROUNDED version of what we just did.
+                        unsigned int padded_length = roundup(k_length, strategy::k_unroll());
 
-                    buffer += strategy::out_width() * padded_length;
+                        buffer += strategy::out_width() * padded_length;
 
-                    kpos += padded_length;
-                    kleft -= padded_length;
+                        kpos += padded_length;
+                        kleft -= padded_length;
+                    }
                 }
+            } else {
+                // In the single K section case, can process the whole lot in one go.
+                // Caution: 'blockwalker::kmax()' rounds up, so clamp to valid _Ksize.
+                strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
+                                          current.x0(), current.xmax(), current.k0(), std::min(current.kmax(), _Ksize));
+                buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
             }
         } while (current.advance());
     }
@@ -1019,12 +1027,15 @@ public:
     }
 
     // Estimate cycles for given problem given provided parameters
-    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+    template<typename perf_type>
+    static uint64_t estimate_cycles(const GemmArgs &args) {
         unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
 
+        const PerformanceParameters &params = strategy::template get_performance_parameters<perf_type>(args._ci);
+
         uint64_t total_macs    = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args);
         uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * get_ktotal(args) * sizeof(Toi);
-        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
+        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * args._Msize * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
 
         float mac_cycles     = static_cast<float>(total_macs) / params.kernel_macs_cycle;
         float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
@@ -1042,6 +1053,17 @@ public:
 
         return static_cast<uint64_t>(total_cycles);
     }
+
+    GemmConfig get_config() override {
+        GemmConfig c;
+
+        c.method           = GemmMethod::GEMM_INTERLEAVED;
+        c.inner_block_size = _k_block;
+        c.outer_block_size = _x_block;
+        c.filter           = get_type_name<strategy>();
+
+        return c;
+    }
 };
 
 // Aliases for the variations
-- 
cgit v1.2.1
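
For background on the fast_mode mentioned in the subject: bfloat16 keeps fp32's 8-bit exponent range but only 7 fraction bits, which is why the bf16-internal Fp32 kernels trade a little precision for higher MAC throughput. The self-contained sketch below is not part of the patch; it uses a generic round-to-nearest-even conversion (not arm_gemm's actual transforms) to show the rounding an fp32 value undergoes on the way into such a kernel:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Push an fp32 value through bf16 (the top 16 bits of the IEEE-754 encoding),
// using round-to-nearest-even, then widen it back to expose the precision loss.
// NaN handling is omitted for brevity.
static float through_bf16(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    uint32_t rounding = 0x7FFFu + ((bits >> 16) & 1u);
    uint32_t widened  = ((bits + rounding) >> 16) << 16;
    std::memcpy(&f, &widened, sizeof(f));
    return f;
}

int main() {
    // With only 7 fraction bits, values survive with roughly 2-3 significant
    // decimal digits.
    const float samples[] = {1.0f, 3.14159265f, 100.123f};
    for (float v : samples) {
        std::printf("%.8f -> %.8f\n", v, through_bf16(v));
    }
    return 0;
}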
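The get_k_block_size() change rounds any user-requested inner_block_size up to the strategy's k_unroll(), so a K block always covers whole unrolled steps. A minimal sketch of that arithmetic; roundup() here is a stand-in for arm_gemm's helper of the same name, and the k_unroll value of 8 is an invented example:

#include <cstdio>

// Stand-in for arm_gemm's roundup(): round x up to a multiple of 'rounding'.
static unsigned int roundup(unsigned int x, unsigned int rounding) {
    unsigned int remainder = x % rounding;
    return remainder ? (x + rounding - remainder) : x;
}

int main() {
    const unsigned int k_unroll = 8; // invented; real strategies report their own

    // A configured inner_block_size of 250 is not a multiple of 8; the patched
    // get_k_block_size() now returns 256, so every K block contains an integral
    // number of unrolled K steps.
    std::printf("inner_block_size %u -> k_block %u\n", 250u, roundup(250u, k_unroll));
    return 0;
}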
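The multi-section branch in the big hunk has to translate block-walker K coordinates, which are expressed against the padded _Ktotal, back to rows of the unpadded input. The standalone sketch below replays that mapping with invented sizes (three K sections of true length 10, k_unroll of 4, so each section occupies 12 padded rows) and prints which source rows each PrepareB call would read:

#include <algorithm>
#include <cstdio>

static unsigned int roundup(unsigned int x, unsigned int rounding) {
    unsigned int remainder = x % rounding;
    return remainder ? (x + rounding - remainder) : x;
}

int main() {
    // Invented sizes: 3 K sections, each with a true length of 10, k_unroll of 4.
    const unsigned int Ksize = 10, k_unroll = 4, Ksections = 3;
    const unsigned int rounded_section_size = roundup(Ksize, k_unroll); // 12

    // Walk the whole padded K range the way the patched loop does, mapping each
    // padded position back to unpadded source rows.
    unsigned int kpos = 0, kleft = rounded_section_size * Ksections;
    while (kleft) {
        unsigned int k_section_base = kpos / rounded_section_size;              // which section
        unsigned int k_offset = kpos - (k_section_base * rounded_section_size); // offset inside it
        unsigned int k_length = std::min(Ksize - k_offset, kleft);              // unpadded rows to copy

        std::printf("section %u: reads source rows [%u, %u)\n", k_section_base,
                    (k_section_base * Ksize) + k_offset,
                    (k_section_base * Ksize) + k_offset + k_length);

        // Advance by the padded length, exactly as the transform loop does.
        unsigned int padded_length = roundup(k_length, k_unroll);
        kpos += padded_length;
        kleft -= padded_length;
    }
    return 0;
}

With these numbers the three sections read rows [0,10), [10,20) and [20,30), and each advance skips the two rows of per-section padding.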
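estimate_cycles() models the cost as three terms, kernel MACs plus prepare traffic plus merge traffic, each divided by a throughput figure from the strategy's PerformanceParameters; the patch's merge_bytes change counts the true _Msize rows since merged output is written unpadded. The sketch below mirrors the shape of that model with entirely invented sizes and rates, including the merge_bytes_cycle name, which is assumed by analogy with the two rates visible in the hunk:

#include <cstdint>
#include <cstdio>

int main() {
    // All figures below are invented for illustration; the real values come
    // from GemmArgs, the strategy, and PerformanceParameters.
    const uint64_t nbatches = 1, nmulti = 1;
    const uint64_t Msize = 100, Nsize = 100, Ktotal = 512; // padded K, as per get_ktotal()
    const uint64_t out_height = 8, out_width = 12;
    const uint64_t k_blocks = 2;
    const uint64_t sizeof_Toi = 2, sizeof_Tr = 4;          // e.g. bf16 internal, fp32 result
    const double kernel_macs_cycle   = 32.0;               // MACs retired per cycle
    const double prepare_bytes_cycle = 8.0;                // transform bytes per cycle
    const double merge_bytes_cycle   = 8.0;                // merge bytes per cycle (assumed name)

    auto roundup = [](uint64_t x, uint64_t r) { return ((x + r - 1) / r) * r; };

    uint64_t total_macs    = nbatches * nmulti * roundup(Msize, out_height) * roundup(Nsize, out_width) * Ktotal;
    uint64_t prepare_bytes = nbatches * nmulti * roundup(Msize, out_height) * Ktotal * sizeof_Toi;
    // Merge traffic counts the true Msize: merged rows are not padded to out_height.
    uint64_t merge_bytes   = nbatches * nmulti * k_blocks * Msize * roundup(Nsize, out_width) * sizeof_Tr;

    double total_cycles = (total_macs / kernel_macs_cycle)
                        + (prepare_bytes / prepare_bytes_cycle)
                        + (merge_bytes / merge_bytes_cycle);

    std::printf("macs=%llu prepare=%llu merge=%llu -> ~%.0f cycles\n",
                (unsigned long long)total_macs, (unsigned long long)prepare_bytes,
                (unsigned long long)merge_bytes, total_cycles);
    return 0;
}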