aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp')
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp108
1 files changed, 105 insertions, 3 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index d8b464584a..ae344f09b5 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -29,7 +29,6 @@
#include "arm_gemm.hpp"
#include "bfloat.hpp"
#include "convolver.hpp"
-#include "kernel_weight_format.hpp"
#include "kernel_traits.hpp"
#include "kernel_weight_format.hpp"
#include "mergeresults.hpp"
@@ -247,6 +246,84 @@ void kernel_and_merge<true, false, Requantize32>::run(
}
}
+// Run a kernel with integrated merge, dequantizing to FP32
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<false, false, DequantizeFloat>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
+ Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
+ unsigned int n_0, unsigned int n_max, const Tr *bias,
+ const Activation &act, bool accumulate, const DequantizeFloat &dq, const int32_t *col_bias,
+ Tab *acc_buff)
+{
+#ifdef CYCLE_PROFILING
+ auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
+#endif
+
+ const int32_t *offset_col_bias = nullptr;
+ const Tr *offset_bias = nullptr;
+
+ if (col_bias) {
+ offset_col_bias = col_bias + n_0;
+ }
+
+ if (bias) {
+ offset_bias = bias + n_0;
+ }
+
+ strat.kernel(// A and B pointers are just the packed panels.
+ a_ptr, b_panel,
+ // Provide relevant part of output array and row stride.
+ c_ptr ? (c_ptr + m_0 * ldc + n_0) : nullptr, ldc,
+ // M, N, K sizes
+ m_max-m_0, n_max - n_0, kern_k,
+ // Bias, activation, accumulation. Need to offset the bias as needed.
+ offset_col_bias, dq, offset_bias, act, accumulate, acc_buff);
+}
+
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<true, false, DequantizeFloat>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
+ Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
+ unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *bias,
+ const Activation &act, bool accumulate, const DequantizeFloat &qp, const int32_t *,
+ Tab *)
+{
+ const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
+
+ {
+#ifdef CYCLE_PROFILING
+ auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
+#endif
+
+ strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+ }
+
+ {
+#ifdef CYCLE_PROFILING
+ auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr)));
+#endif
+ auto out_area = strategy::out_width() * strategy::out_height();
+ for (int i=0; i<bblocks; i++) {
+ const unsigned int n_start = n_0 + (strategy::out_width() * i);
+ const unsigned int n_end = std::min(n_start + strategy::out_width(), n_max);
+
+ dequantize_block_32(qp, (n_end - n_start), (m_max - m_0),
+ c_panel + (i * out_area), strategy::out_width(),
+ c_ptr + m_0 * ldc + n_start, ldc,
+ bias != nullptr ? bias + n_start : nullptr, accumulate, act);
+
+ }
+ }
+}
+
// Integer GEMMs can be used in two contexts - "normal" where the full 32-bit output is required, or in
// "requantizing" context where the output will be requantized.
//
@@ -280,6 +357,12 @@ public:
typedef int32_t type;
};
+template<typename strategy>
+class accumulate_buffer_type<strategy, DequantizeFloat, false> {
+public:
+ typedef int32_t type;
+};
+
template<typename strategy, typename OutputStage>
class accumulate_buffer_type<strategy, OutputStage, true> {
public:
@@ -764,6 +847,9 @@ public:
const bool first_pass = (k0==0);
const bool last_pass = (kmax==_Ktotal);
+ // Bias is passed for the first pass only, except for dequantizefloat nomerge cases where it's the last pass.
+ const bool bias_pass = (std::is_same<OutputStage, DequantizeFloat>::value && !MergeStep) ? last_pass : first_pass;
+
// Figure out how many "K" the kernel will actually process.
unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());
@@ -822,7 +908,7 @@ public:
// K size, and M/N ranges
kern_k, start_row, end_row, start_x, end_x,
// Only do bias on the first pass
- ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
+ ((bias_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
// Only do activation on the last pass, and accumulation on any non-first pass.
(last_pass ? _act : Activation()), (!first_pass || _accumulate),
// Pass in quantization parameters for requantizing kernels (others will ignore)
@@ -949,6 +1035,9 @@ public:
const bool first_pass = (current.k0() == 0);
const bool last_pass = (current.kmax() == _Ktotal);
+ // Bias is passed for the first pass only, except for dequantizefloat nomerge cases where it's the last pass.
+ const bool bias_pass = (std::is_same<OutputStage, DequantizeFloat>::value && !MergeStep) ? last_pass : first_pass;
+
// Pointer to appropriate part of result array.
Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride);
@@ -970,7 +1059,7 @@ public:
// K size, and M/N ranges
kern_k, y, ymax, current.x0(), current.xmax(),
// Only do bias on the first pass
- ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
+ ((bias_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
// Only do activation on the last pass, and accumulation on any non-first pass.
(last_pass ? _act : Activation()), (!first_pass || _accumulate),
// Pass in quantization parameters for requantizing kernels (others will ignore)
@@ -1185,6 +1274,13 @@ public:
}
}
+ void set_dequantize_scale(const float scale) override {
+ if(std::is_same<OutputStage, DequantizeFloat>::value) {
+ DequantizeFloat* df = reinterpret_cast<DequantizeFloat *>(&_os);
+ df->scale = scale;
+ }
+ }
+
void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
assert(string_len == _Ksize);
_indirect_buf = ptr;
@@ -1249,4 +1345,10 @@ using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strat
template<typename strategy, typename To, typename Tr>
using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>;
+template<typename strategy, typename To, typename Tr>
+using GemmInterleavedNoMergeDequantized = GemmInterleaved<strategy, To, Tr, DequantizeFloat, false>;
+
+template<typename strategy, typename To, typename Tr>
+using GemmInterleavedDequantized = GemmInterleaved<strategy, To, Tr, DequantizeFloat>;
+
} // namespace arm_gemm