diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp | 29 |
1 files changed, 24 insertions, 5 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp index d702cffce1..a6c9677305 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -74,7 +74,7 @@ class GemmHybrid : public GemmCommon<To, Tr> { } if (args._cfg && args._cfg->inner_block_size) { - return args._cfg->inner_block_size; + return roundup(args._cfg->inner_block_size, strategy::k_unroll()); } // Target block size (512 for FP32, scaling for other types). Don't block until size reaches 1.5X this. @@ -97,7 +97,13 @@ class GemmHybrid : public GemmCommon<To, Tr> { // single block. static unsigned int compute_n_block(const GemmArgs &args) { if (args._cfg && args._cfg->outer_block_size) { - return args._cfg->outer_block_size; + unsigned int n_block = args._cfg->outer_block_size; + + // Needs to be (at least a single) multiple of the kernel output width. + n_block /= strategy::out_width(); + n_block = std::max(n_block, 1u) * strategy::out_width(); + + return n_block; } if (args._Nsize <= 64) { @@ -215,7 +221,9 @@ public: return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi); } - void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override { + assert(!transposed); + Toi *buffer = reinterpret_cast<Toi *>(in_buffer); _B_transposed = buffer; strategy strat(_ci); @@ -231,7 +239,7 @@ public: const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size; strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb, - x0, xmax, k0, kmax); + x0, xmax, k0, kmax, false); buffer += size; } @@ -264,6 +272,17 @@ public: return total_cycles; } + + GemmConfig get_config() override { + GemmConfig c; + + c.method = GemmMethod::GEMM_HYBRID; + c.inner_block_size = _k_block; + c.outer_block_size = _n_block; + c.filter = get_type_name<strategy>(); + + return c; + } }; } // namespace arm_gemm |