author    Francesco.Petrogalli@arm.com <francesco.petrogalli@arm.com>	2022-04-05 10:31:08 +0000
committer Francesco Petrogalli <francesco.petrogalli@arm.com>	2022-05-24 14:28:27 +0000
commit    5fcf22dadf092efd7aafb359f9229aa270eb1129 (patch)
tree      f309426ed19bd6710329da3b530167db72d1c6b2 /src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
parent    a8caa023f0d7b71b3a250a14ceee935052fcc74a (diff)
[arm_gemm] Import fixed-format kernels from gemm_linux.
This is a No Functional Change Intended (NFCI) patch. It imports the kernels
into the code base, but the interface to select them and to expose the format
of the weight tensors to the user will be provided in a subsequent patch.

Kernels and kernel selection code in arm_gemm have been provided by
David.Mansell <David.Mansell@arm.com>.

The kernels are not compiled into the library by default, but need to be
selected via the scons option `experimental_fixed_format_kernels=1`.

Resolves: ONCPUML-829
Signed-off-by: Francesco.Petrogalli@arm.com <francesco.petrogalli@arm.com>
Change-Id: If00ccb2b9b7221e01b214cf9783111226ccc8bf4
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7380
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp | 140
1 file changed, 116 insertions(+), 24 deletions(-)
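As the commit message notes, these kernels are opt-in at build time. A minimal
sketch of the compile-time guard this implies, assuming the scons option maps
to a preprocessor define named ENABLE_FIXED_FORMAT_KERNELS (an assumption; the
macro name is not shown in this patch, and the kernel header is hypothetical):

#ifdef ENABLE_FIXED_FORMAT_KERNELS   // assumed to be set by experimental_fixed_format_kernels=1
#include "kernels/a64_ffhybrid_fp32_mla_6x16.hpp"   // hypothetical fixed-format kernel header
#endif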
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 5b3ef4203d..c41b0a5b3e 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -33,6 +33,7 @@
#include "arm_gemm.hpp"
#include "bias_adder.hpp"
#include "convolver.hpp"
+#include "kernel_weight_format.hpp"
#include "ndrange.hpp"
#include "performance_parameters.hpp"
#include "transform.hpp"
@@ -54,7 +55,7 @@ namespace {
// We need to invoke the kernel differently for quantizing and non-quantizing cases, so here is a shim class to do
// that.
-template<typename OutputStage, bool SeparateQuantize = false>
+template<typename OutputStage, bool SeparateQuantize, bool FixedFormat>
class run_hybrid_kernel {
public:
template<typename strategy, typename Tlo, typename Tro, typename Tr>
@@ -63,18 +64,18 @@ public:
profiler &prof,
#endif
const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
const OutputStage &os, const int32_t *col_bias, unsigned int n_0 );
};
template<>
template<typename strategy, typename Tlo, typename Tro, typename Tr>
-inline void run_hybrid_kernel<Nothing, false>::run(
+inline void run_hybrid_kernel<Nothing, false, false>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
const Nothing &, const int32_t *, unsigned int) {
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
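The shim above dispatches on its bool template parameters through explicit
specialization, so each (OutputStage, SeparateQuantize, FixedFormat)
combination gets its own run() body and no branch is taken at runtime. A
self-contained sketch of the same pattern, with hypothetical names:

#include <cstdio>

// Primary template declares the entry point; each specialization supplies
// the body for one compile-time configuration.
template<bool FixedFormat>
struct run_demo_kernel {
    static void run();
};

template<>
void run_demo_kernel<false>::run() { std::puts("pretransposed-B path"); }

template<>
void run_demo_kernel<true>::run() { std::puts("fixed-format-B path"); }

int main() {
    run_demo_kernel<false>::run();  // resolved at compile time
    run_demo_kernel<true>::run();
}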
@@ -115,12 +116,60 @@ inline void run_hybrid_kernel<Nothing, false>::run(
template<>
template<typename strategy, typename Tlo, typename Tro, typename Tr>
-inline void run_hybrid_kernel<Requantize32, false>::run(
+inline void run_hybrid_kernel<Nothing, false, true>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+ unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const Nothing &, const int32_t *, unsigned int) {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
+#endif
+ UNUSED(kern_k);
+
+ /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
+ * a partial block and pad the bias for that block. */
+ if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
+ /* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */
+ unsigned int N_remainder = N % strategy::out_width();
+ unsigned int N_bulk = N - N_remainder;
+
+ /* Output argument to be used for the tail */
+ IndirectOutputArg<Tr> offset_output = output_arg;
+
+ /* If there is a "bulk" to be processed, handle that and update "offset_output" appropriately. */
+ if (N_bulk > 0) {
+ strat.kernel(num_strings, string_ptr, A_arg, M, N_bulk, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate);
+
+ if (output_arg.is_indirect) {
+ offset_output = IndirectOutputArg<Tr>(output_arg.indirect.ptr, output_arg.indirect.offset + N_bulk);
+ } else {
+ offset_output = IndirectOutputArg<Tr>(output_arg.direct.base + N_bulk, output_arg.direct.stride);
+ }
+ }
+
+ /* Pad the bias buffer for the remainder */
+ Tr *bias_pad_buffer = reinterpret_cast<Tr *>(alloca(strategy::out_width() * sizeof(Tr)));
+ memcpy(bias_pad_buffer, bias_ptr + N_bulk, N_remainder * sizeof(Tr));
+
+ /* Process the remainder, offsetting the B pointer as needed. */
+ strat.kernel(num_strings, string_ptr, A_arg, M, N_remainder,
+ b_ptr + (N_bulk / strategy::stripe_width()) * b_stride, b_stride, offset_output,
+ bias_pad_buffer, act, accumulate);
+ } else {
+ strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate);
+ }
+}
+
+template<>
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Requantize32, false, false>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
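The new <Nothing, false, true> specialization above splits N into a bulk part
that is a multiple of strategy::out_width() plus a remainder, then stages the
remainder's bias in a padded buffer so the kernel can read a full vector width
without running past the caller's bias array. A standalone sketch of the
split, with illustrative sizes and std::vector in place of alloca:

#include <cstring>
#include <vector>

int main() {
    const unsigned int out_width = 8;                // stand-in for strategy::out_width()
    const unsigned int N = 19;
    const unsigned int N_remainder = N % out_width;  // 3 tail columns
    const unsigned int N_bulk = N - N_remainder;     // 16 columns, full-width path

    std::vector<float> bias(N, 1.0f);
    // Pad the tail bias out to a full vector width: the kernel may load all
    // out_width lanes even though only N_remainder results are stored.
    std::vector<float> bias_pad(out_width, 0.0f);
    std::memcpy(bias_pad.data(), bias.data() + N_bulk, N_remainder * sizeof(float));
    return 0;
}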
@@ -132,12 +181,12 @@ inline void run_hybrid_kernel<Requantize32, false>::run(
template<>
template<typename strategy, typename Tlo, typename Tro, typename Tr>
-inline void run_hybrid_kernel<Requantize32, true>::run(
+inline void run_hybrid_kernel<Requantize32, true, false>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+ unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
UNUSED(kern_k);
// On this route we will only process one kernel height at a time and will make sure this happens in the driver loop.
@@ -180,10 +229,38 @@ inline void run_hybrid_kernel<Requantize32, true>::run(
}
}
+template<typename strategy, bool FixedFormat>
+struct stripe_width {
+ static unsigned int get() {
+ return strategy::stripe_width();
+ }
+};
+
+template<typename strategy>
+struct stripe_width<strategy, false> {
+ static unsigned int get() {
+ return 0;
+ }
+};
+
+template<typename strategy, bool FixedFormat>
+struct kernel_weight_format {
+ static KernelWeightFormat get() {
+ return strategy::kernel_weight_format();
+ }
+};
+
+template<typename strategy>
+struct kernel_weight_format<strategy, false> {
+ static KernelWeightFormat get() {
+ return KernelWeightFormat::NON_FIXED;
+ }
+};
+
} // anonymous namespace
// Implementation of the GemmCommon abstract class.
-template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false>
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool SeparateQuantize=false, bool FixedFormat=false>
class GemmHybridIndirect : public GemmCommon<To, Tr> {
typedef typename strategy::lhs_operand_type Tloi;
typedef typename strategy::rhs_operand_type Troi;
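The stripe_width and kernel_weight_format helpers above use partial
specialization so that the primary template's call to strategy::stripe_width()
is instantiated only when FixedFormat is true; ordinary strategies without
that member still compile. A compilable sketch of the idea, with hypothetical
names:

// The FF==false specialization never mentions S::stripe_width(), so a
// strategy lacking that member is fine as long as FF is false.
struct FixedFormatStrategy { static unsigned int stripe_width() { return 4; } };
struct PlainStrategy       { /* no stripe_width() member */ };

template<typename S, bool FF>
struct stripe_width_demo {
    static unsigned int get() { return S::stripe_width(); }
};

template<typename S>
struct stripe_width_demo<S, false> {
    static unsigned int get() { return 0; }
};

int main() {
    return stripe_width_demo<FixedFormatStrategy, true>::get()   // 4
         - stripe_width_demo<PlainStrategy, false>::get() - 4;   // exit code 0
}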
@@ -425,24 +502,32 @@ public:
const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize);
const unsigned int multi = p.dim(3);
- const Troi *b_panel = _B_transposed +
- (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
- (k0 * roundup(_args._Nsize, strategy::out_width())) +
- (n0 * kern_k);
+ const Troi *b_panel;
+ if (FixedFormat) {
+ b_panel = reinterpret_cast<const Troi *>(this->_Bptr) +
+ (multi * this->_B_multi_stride) +
+ ((n0 / stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
+ (k0 * stripe_width<strategy, FixedFormat>::get());
+ } else {
+ b_panel = _B_transposed +
+ (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
+ (k0 * roundup(_args._Nsize, strategy::out_width())) +
+ (n0 * kern_k);
+ }
- IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
+ IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif
if (_indirect_buf) {
- run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+ run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
strat, sections, string_lengths.data(),
IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
- (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
(this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
last_pass ? _args._act : Activation(),
!first_pass,
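On the fixed-format path above, b_panel is computed directly from the caller's
B tensor: columns are grouped into stripes of stripe_width() elements,
consecutive stripes lie _ldb elements apart, and k0 advances within a stripe.
A worked example with hypothetical sizes:

#include <cstddef>

int main() {
    // Illustrative values only; in the library these come from the strategy
    // and the caller's tensor strides.
    const size_t stripe = 4, ldb = 1024, B_multi_stride = 1u << 20;
    const size_t multi = 0, n0 = 12, k0 = 32;
    const size_t offset = multi * B_multi_stride + (n0 / stripe) * ldb + k0 * stripe;
    return offset == 3200 ? 0 : 1;  // (12/4)*1024 + 32*4 = 3072 + 128
}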
@@ -469,13 +554,13 @@ public:
}
assert(pos == sections);
- run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+ run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
strat, sections, string_lengths.data(),
IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
- (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
(this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
last_pass ? _args._act : Activation(),
!first_pass,
@@ -485,13 +570,13 @@ public:
// Length to process. This needs to exclude padding, but 'kmax' potentially includes it.
const unsigned int len = (std::min(_args._Ksize, kmax) - k0);
- run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+ run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
strat, 1, &len,
IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
- (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
(this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
last_pass ? _args._act : Activation(),
!first_pass,
@@ -504,14 +589,18 @@ public:
// Interface implementation - pretransposed
bool B_is_pretransposed() const override {
- return true;
+ return (FixedFormat == false);
}
bool B_pretranspose_required() const override {
- return (_B_transposed==nullptr);
+ return (FixedFormat == false) && (_B_transposed==nullptr);
}
size_t get_B_pretransposed_array_size() const override {
+ if (FixedFormat) {
+ return 0;
+ }
+
// Start with actual pretransposed buffer...
size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Troi);
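With FixedFormat set, B is consumed in the caller's layout, so the
pretranspose interface above reports no work: B_is_pretransposed() and
B_pretranspose_required() return false and the scratch size is zero. A sketch
of the driver-side contract this implies (DemoGemm is a stand-in, not the
library type):

#include <cstddef>
#include <vector>

struct DemoGemm {
    bool fixed_format;
    bool B_pretranspose_required() const { return !fixed_format; }
    std::size_t get_B_pretransposed_array_size() const { return fixed_format ? 0 : 4096; }
};

int main() {
    DemoGemm gemm{true};        // fixed-format: the pretranspose pass is skipped
    std::vector<char> scratch;
    if (gemm.B_pretranspose_required()) {
        scratch.resize(gemm.get_B_pretransposed_array_size());
        // ...the real driver would now fill the buffer via pretranspose_B_array(...)...
    }
    return scratch.empty() ? 0 : 1;
}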
@@ -599,8 +688,7 @@ public:
}
}
} else {
- // In the single K section case, can process the whole lot in one go.
- // Caution: 'blockwalker::kmax()' rounds up, so clamp to valid _Ksize.
+ // In the single K section case, can process the whole lot in one go.
strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
0, _args._Nsize, k0, std::min(kmax, _args._Ksize));
buffer += roundup(_args._Nsize, strategy::out_width()) * roundup(kmax-k0, strategy::k_unroll());
@@ -694,11 +782,15 @@ public:
c.inner_block_size = _k_block;
c.outer_block_size = _n_block;
c.filter = get_type_name<strategy>();
+ c.weight_format = get_weight_format(kernel_weight_format<strategy, FixedFormat>::get(), sizeof(To));
return c;
}
};
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
+using GemmHybridIndirectFixedFormat = GemmHybridIndirect<strategy, To, Tr, OutputStage, false, true>;
+
} // namespace arm_gemm
#ifdef __I_DEFINED_UNUSED
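The GemmHybridIndirectFixedFormat alias added at the end of the file pins
SeparateQuantize=false and FixedFormat=true, so a fixed-format GEMM can be
declared without spelling out the defaulted parameters. A hypothetical
instantiation, kept as a comment because the strategy name is illustrative and
its header is not part of this file:

// using GemmFP32FixedFormat =
//     arm_gemm::GemmHybridIndirectFixedFormat<cls_a64_ffhybrid_fp32_mla_6x16, float, float>;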