diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp | 108 |
1 files changed, 105 insertions, 3 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index d8b464584a..ae344f09b5 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -29,7 +29,6 @@ #include "arm_gemm.hpp" #include "bfloat.hpp" #include "convolver.hpp" -#include "kernel_weight_format.hpp" #include "kernel_traits.hpp" #include "kernel_weight_format.hpp" #include "mergeresults.hpp" @@ -247,6 +246,84 @@ void kernel_and_merge<true, false, Requantize32>::run( } } +// Run a kernel with integrated merge, dequantizing to FP32 +template<> +template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> +void kernel_and_merge<false, false, DequantizeFloat>::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max, + unsigned int n_0, unsigned int n_max, const Tr *bias, + const Activation &act, bool accumulate, const DequantizeFloat &dq, const int32_t *col_bias, + Tab *acc_buff) +{ +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k); +#endif + + const int32_t *offset_col_bias = nullptr; + const Tr *offset_bias = nullptr; + + if (col_bias) { + offset_col_bias = col_bias + n_0; + } + + if (bias) { + offset_bias = bias + n_0; + } + + strat.kernel(// A and B pointers are just the packed panels. + a_ptr, b_panel, + // Provide relevant part of output array and row stride. + c_ptr ? (c_ptr + m_0 * ldc + n_0) : nullptr, ldc, + // M, N, K sizes + m_max-m_0, n_max - n_0, kern_k, + // Bias, activation, accumulation. Need to offset the bias as needed. + offset_col_bias, dq, offset_bias, act, accumulate, acc_buff); +} + +template<> +template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> +void kernel_and_merge<true, false, DequantizeFloat>::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, + unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *bias, + const Activation &act, bool accumulate, const DequantizeFloat &qp, const int32_t *, + Tab *) +{ + const int bblocks = iceildiv(n_max - n_0, strategy::out_width()); + + { +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k)); +#endif + + strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); + } + + { +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr))); +#endif + auto out_area = strategy::out_width() * strategy::out_height(); + for (int i=0; i<bblocks; i++) { + const unsigned int n_start = n_0 + (strategy::out_width() * i); + const unsigned int n_end = std::min(n_start + strategy::out_width(), n_max); + + dequantize_block_32(qp, (n_end - n_start), (m_max - m_0), + c_panel + (i * out_area), strategy::out_width(), + c_ptr + m_0 * ldc + n_start, ldc, + bias != nullptr ? bias + n_start : nullptr, accumulate, act); + + } + } +} + // Integer GEMMs can be used in two contexts - "normal" where the full 32-bit output is required, or in // "requantizing" context where the output will be requantized. // @@ -280,6 +357,12 @@ public: typedef int32_t type; }; +template<typename strategy> +class accumulate_buffer_type<strategy, DequantizeFloat, false> { +public: + typedef int32_t type; +}; + template<typename strategy, typename OutputStage> class accumulate_buffer_type<strategy, OutputStage, true> { public: @@ -764,6 +847,9 @@ public: const bool first_pass = (k0==0); const bool last_pass = (kmax==_Ktotal); + // Bias is passed for the first pass only, except for dequantizefloat nomerge cases where it's the last pass. + const bool bias_pass = (std::is_same<OutputStage, DequantizeFloat>::value && !MergeStep) ? last_pass : first_pass; + // Figure out how many "K" the kernel will actually process. unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll()); @@ -822,7 +908,7 @@ public: // K size, and M/N ranges kern_k, start_row, end_row, start_x, end_x, // Only do bias on the first pass - ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr), + ((bias_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr), // Only do activation on the last pass, and accumulation on any non-first pass. (last_pass ? _act : Activation()), (!first_pass || _accumulate), // Pass in quantization parameters for requantizing kernels (others will ignore) @@ -949,6 +1035,9 @@ public: const bool first_pass = (current.k0() == 0); const bool last_pass = (current.kmax() == _Ktotal); + // Bias is passed for the first pass only, except for dequantizefloat nomerge cases where it's the last pass. + const bool bias_pass = (std::is_same<OutputStage, DequantizeFloat>::value && !MergeStep) ? last_pass : first_pass; + // Pointer to appropriate part of result array. Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride); @@ -970,7 +1059,7 @@ public: // K size, and M/N ranges kern_k, y, ymax, current.x0(), current.xmax(), // Only do bias on the first pass - ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr), + ((bias_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr), // Only do activation on the last pass, and accumulation on any non-first pass. (last_pass ? _act : Activation()), (!first_pass || _accumulate), // Pass in quantization parameters for requantizing kernels (others will ignore) @@ -1185,6 +1274,13 @@ public: } } + void set_dequantize_scale(const float scale) override { + if(std::is_same<OutputStage, DequantizeFloat>::value) { + DequantizeFloat* df = reinterpret_cast<DequantizeFloat *>(&_os); + df->scale = scale; + } + } + void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override { assert(string_len == _Ksize); _indirect_buf = ptr; @@ -1249,4 +1345,10 @@ using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strat template<typename strategy, typename To, typename Tr> using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>; +template<typename strategy, typename To, typename Tr> +using GemmInterleavedNoMergeDequantized = GemmInterleaved<strategy, To, Tr, DequantizeFloat, false>; + +template<typename strategy, typename To, typename Tr> +using GemmInterleavedDequantized = GemmInterleaved<strategy, To, Tr, DequantizeFloat>; + } // namespace arm_gemm |