author    David Mansell <David.Mansell@arm.com>    2020-07-08 13:28:45 +0100
committer Georgios Pinitas <georgios.pinitas@arm.com>    2020-07-23 11:55:45 +0000
commit    318c9f40770b2d1c06f8c0fe3f7929812503733e (patch)
tree      58926bc05cd825d885cf9af9553b1309068f36b6 /src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
parent    40aad9bbbae5308d7302e61e1372328c9b5daf99 (diff)
download  ComputeLibrary-318c9f40770b2d1c06f8c0fe3f7929812503733e.tar.gz
COMPMID-3578: Update FP32/int8 kernel selection.
Replace the current 'is_preferred()' mechanism with a new framework where kernels instead provide an estimated cycle count figure. Compatibility with the old mechanism is achieved via a wrapper which replaces a "true" result with an estimate of 0, and a "false" result with UINT64_MAX.

This mechanism is then used to select between 'interleaved' and 'hybrid' FP32 NEON kernels, using a simple model which counts the MACs performed and the bytes of data transferred (for rearrange/merge operations) and divides by fixed performance figures, provided for A53, A55, A73 and a 'default' profile (based on A76).

Separately, a new route for performing int8 GEMMs via the int16 kernel is provided. This performs significantly better for uint8, and slightly better for int8, on A53 than the existing int8 route. Optimized 8-to-16 bit transforms are also included.

Change-Id: I53b2e59eb9368793c78c2081e17d2445361bcc47
Signed-off-by: David Mansell <David.Mansell@arm.com>
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/250120
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3609
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
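The compatibility wrapper described in the first paragraph is simple to picture. The sketch below is illustrative only: the type and function names are invented here and are not ComputeLibrary's actual API.

```cpp
#include <cstdint>

// Hypothetical stand-in for the real GemmArgs; fields are illustrative.
struct HypotheticalArgs { unsigned int M, N, K; };

// A kernel that still implements the old boolean is_preferred() hook is
// adapted to the new estimated-cycle interface: "true" maps to the best
// possible estimate (0 cycles) so the kernel wins any comparison, and
// "false" maps to UINT64_MAX so it always loses.
template <typename OldKernel>
uint64_t estimate_cycles_shim(const HypotheticalArgs &args) {
    return OldKernel::is_preferred(args) ? 0 : UINT64_MAX;
}

// Example legacy kernel that prefers small-K problems:
struct LegacyKernel {
    static bool is_preferred(const HypotheticalArgs &args) { return args.K <= 128; }
};
// estimate_cycles_shim<LegacyKernel>({64, 64, 64}) == 0
```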
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp')
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp  84
1 file changed, 57 insertions(+), 27 deletions(-)
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 3b829491ca..c4dceef922 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -23,15 +23,14 @@
*/
#pragma once
-#include <stdio.h>
-#include <assert.h>
-
#include <algorithm>
+#include <cassert>
#include "arm_gemm.hpp"
#include "utils.hpp"
#include "mergeresults.hpp"
+#include "performance_parameters.hpp"
#include "transform.hpp"
#ifdef CYCLE_PROFILING
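The newly included performance_parameters.hpp is not shown in this diff. Based on the fields consumed by estimate_cycles() further down (kernel_macs_cycle, prepare_bytes_cycle, merge_bytes_cycle) and the per-CPU figures mentioned in the commit message, it plausibly amounts to something like the following sketch; the real header may differ.

```cpp
// Plausible shape of the performance figures, inferred from how they are
// used below; populated per-CPU (A53/A55/A73) with a default based on A76.
struct PerformanceParameters {
    float kernel_macs_cycle;   // MACs the inner kernel retires per cycle
    float prepare_bytes_cycle; // bytes/cycle for the rearrange (prepare) stage
    float merge_bytes_cycle;   // bytes/cycle for the merge (writeback) stage
};
```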
@@ -149,6 +148,33 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
}
+ static unsigned int get_k_block_size(const GemmArgs &args) {
+ if (args._cfg && args._cfg->inner_block_size) {
+ return args._cfg->inner_block_size;
+ }
+
+ const unsigned int L1_size = args._ci->get_L1_cache_size();
+ unsigned int k_block;
+
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+ // Needs to be (at least a single) multiple of the K unroll level.
+ k_block /= strategy::k_unroll();
+ k_block = std::max(k_block, 1U) * strategy::k_unroll();
+
+ // Now tune to presented problem size; this is how many blocks we need.
+ unsigned int num_k_blocks = iceildiv(args._Ksize, k_block);
+
+ // So divide the space equally into that many blocks.
+ k_block = iceildiv(args._Ksize, num_k_blocks);
+
+ // And round UP to the K unroll level required.
+ k_block = roundup(k_block, strategy::k_unroll());
+
+ return k_block;
+ }
public:
GemmInterleaved(GemmInterleaved &) = delete;
@@ -158,35 +184,14 @@ public:
GemmInterleaved(const GemmArgs &args)
: _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
_nbatches(args._nbatches), _nmulti(args._nmulti),
- _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads) {
- const unsigned int L1_size = _ci->get_L1_cache_size();
+ _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+ _k_block(get_k_block_size(args)) {
const unsigned int L2_size = _ci->get_L2_cache_size();
assert(_maxthreads > 0);
// Work out blocking parameters, or override from provided GemmConfig
- if (args._cfg && args._cfg->inner_block_size) {
- _k_block = args._cfg->inner_block_size;
- } else {
- // k_block: Find out how much of the larger array can be loaded into half the cache.
- // This should account for associative caches.
- _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
- // Needs to be (at least a single) multiple of the K unroll level.
- _k_block /= strategy::k_unroll();
- _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
-
- // Now tune to presented problem size; this is how many blocks we need.
- unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
-
- // So divide the space equally into that many blocks.
- _k_block = iceildiv(_Ksize, num_k_blocks);
-
- // And round UP to the K unroll level required.
- _k_block = iceildiv(_k_block, strategy::k_unroll());
- _k_block *= strategy::k_unroll();
- }
-
+ // TODO: Move outer block into a static function too.
if (args._cfg && args._cfg->outer_block_size) {
_x_block = args._cfg->outer_block_size;
} else {
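To make the blocking arithmetic in get_k_block_size() concrete, the following standalone trace uses made-up figures (32 KiB L1, 4-byte operands, a 16-wide by 8-high kernel tile, k_unroll = 4, K = 1000); the helper names mirror the arm_gemm utilities they stand in for.

```cpp
#include <algorithm>
#include <cstdio>

static unsigned int iceildiv(unsigned int a, unsigned int b) { return (a + b - 1) / b; }
static unsigned int roundup(unsigned int a, unsigned int b) { return iceildiv(a, b) * b; }

int main() {
    const unsigned int L1_size = 32768, elem_size = 4;
    const unsigned int out_width = 16, out_height = 8, k_unroll = 4, Ksize = 1000;

    // Fit the larger operand tile into half the L1 cache: 16384 / 64 = 256.
    unsigned int k_block = (L1_size / 2) / (elem_size * std::max(out_width, out_height));

    // Snap to a multiple of k_unroll (256 is already one here).
    k_block = std::max(k_block / k_unroll, 1U) * k_unroll;

    // 1000 / 256 -> 4 blocks; divide K evenly -> 250; round up to 252.
    unsigned int num_k_blocks = iceildiv(Ksize, k_block);
    k_block = roundup(iceildiv(Ksize, num_k_blocks), k_unroll);

    printf("k_block = %u\n", k_block); // prints 252
    return 0;
}
```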
@@ -422,6 +427,31 @@ public:
void set_pretransposed_B_data(void *in_buffer) override {
_B_transposed = reinterpret_cast<Toi *>(in_buffer);
}
+
+ // Estimate cycles for given problem given provided parameters
+ static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+ unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
+
+ uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+ uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
+ uint64_t merge_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
+
+ float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+ float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
+ float merge_cycles = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
+
+ float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
+
+ // We can't thread over multis or width, which makes this a poor
+ // choice in many threaded cases. Penalize that here.
+ float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches) * 0.9f;
+
+ if (parallelism_available < args._maxthreads) {
+ total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
+ }
+
+ return static_cast<uint64_t>(total_cycles);
+ }
};
} // namespace arm_gemm
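Finally, a standalone sketch of the cost model in estimate_cycles(), with hypothetical strategy and PerformanceParameters figures, shows how the parallelism penalty inflates the estimate for short GEMMs (few row blocks relative to the thread count). Batches and multis are folded out (assumed 1), and the blocking is fixed rather than derived from the cache size.

```cpp
#include <cstdint>
#include <cstdio>

static uint64_t iceildiv(uint64_t a, uint64_t b) { return (a + b - 1) / b; }
static uint64_t roundup(uint64_t a, uint64_t b) { return iceildiv(a, b) * b; }

static uint64_t estimate(uint64_t M, uint64_t N, uint64_t K, int maxthreads) {
    const uint64_t out_height = 8, out_width = 12, k_unroll = 1;
    const uint64_t k_block = 256, elem = 4;            // assumed blocking and type size
    const float macs_cycle = 8.0f, bytes_cycle = 8.0f; // hypothetical core figures

    uint64_t k_blocks = iceildiv(K, k_block);
    uint64_t macs  = roundup(M, out_height) * roundup(N, out_width) * roundup(K, k_unroll);
    uint64_t prep  = roundup(M, out_height) * roundup(K, k_unroll) * elem;
    uint64_t merge = k_blocks * roundup(M, out_height) * roundup(N, out_width) * elem;

    float cycles = macs / macs_cycle + prep / bytes_cycle + merge / bytes_cycle;

    // Only M (and batches) can be split across threads; penalize the
    // estimate when there are too few row blocks to feed every thread.
    float parallel = iceildiv(M, out_height) * 0.9f;
    if (parallel < maxthreads) cycles *= maxthreads / parallel;

    return static_cast<uint64_t>(cycles);
}

int main() {
    printf("tall GEMM : %llu cycles\n", (unsigned long long)estimate(1024, 1024, 1024, 4));
    printf("short GEMM: %llu cycles\n", (unsigned long long)estimate(16, 1024, 1024, 4)); // penalized
    return 0;
}
```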