From cfac51c779f9bf05e8b2d386fbfb4022767d1d30 Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Fri, 18 Jun 2021 15:47:28 +0100 Subject: Port NEGEMMLowp Part 2 Details: Extend NEConvertQuantizedSignednessKernel Port NEGEMMInterleave4x4Kernel to CpuGemmInterleave4x4Kernel Port NEGEMMTranspose1xWKernel to CpuGemmTranspose1xWKernel Port NEGEMMLowpMatrixAReductionKernel to CpuGemmLowpMatrixAReductionKernel Port NEGEMMLowpMatrixBReductionKernel to CpuGemmLowpMatrixBReductionKernel Port NEGEMMLowpOffsetContributionOutputStageKernel to CpuGemmLowpOffsetContributionOutputStageKernel Port NEGEMMLowpOffsetContributionKernel to CpuGemmLowpOffsetContributionKernel Resolves: COMPMID-4403 Change-Id: I3227f052f25e7b41d073bbea1da8a881fcd78b8e Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5875 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio --- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 20 ++- arm_compute/runtime/NEON/functions/NEQLSTMLayer.h | 139 +++++++++++---------- 2 files changed, 80 insertions(+), 79 deletions(-) (limited to 'arm_compute/runtime/NEON/functions') diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index 60cfd8f91d..896ef60d6f 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -24,6 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H #define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" @@ -33,19 +34,14 @@ namespace arm_compute { class ITensor; -/** Basic function to execute GEMMLowpMatrixMultiplyCore. 
This function calls the following kernels if the DOT product instruction is not available: - * - * -# @ref cpu::kernels::CpuGemmInterleave4x4Kernel - * -# @ref cpu::kernels::CpuGemmTranspose1xWKernel - * -# @ref NEGEMMLowpMatrixMultiplyKernel - * -# @ref NEGEMMLowpOffsetContributionKernel - * -# @ref NEActivationLayer - * - * otherwise if the DOT product instruction is available: +class ITensorInfo; + +/** Function to run Gemm on quantized types. * - * -# @ref NEGEMMLowpOffsetContributionKernel + * This function calls the following: * -*/ + * -# @ref cpu::CpuGemmLowpMatrixMultiplyCore + */ class NEGEMMLowpMatrixMultiplyCore : public IFunction { public: @@ -114,7 +110,7 @@ public: private: struct Impl; - std::unique_ptr _impl; + std::unique_ptr _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */ diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h index 77adffd543..acbd92cff7 100644 --- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h @@ -43,8 +43,13 @@ namespace arm_compute class ITensor; class ITensorInfo; class NEQLSTMLayerNormalizationKernel; -class NEGEMMLowpMatrixAReductionKernel; - +namespace cpu +{ +namespace kernels +{ +class CpuGemmLowpMatrixAReductionKernel; +} // namespace kernels +} // namespace cpu /** Basic function to run @ref NEQLSTMLayer * * This function calls the following kernels: @@ -55,7 +60,7 @@ class NEGEMMLowpMatrixAReductionKernel; * -# @ref NECopy Copy kernel for copying output_state_out to output * -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. 
Accumulators are 32-bit integers * -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16 - * -# @ref NEGEMMLowpMatrixAReductionKernel For precomputing effective biases to use + * -# @ref cpu::kernels::CpuGemmLowpMatrixAReductionKernel For precomputing effective biases to use * -# @ref NEPixelWiseMultiplication Elementwise multiplication * -# @ref NETranspose Transpose function for reshaping the weights * */ @@ -250,70 +255,70 @@ private: }; // Functions used - NETranspose _transpose_input_to_forget_weights; - NETranspose _transpose_input_to_cell_weights; - NETranspose _transpose_input_to_output_weights; - NETranspose _transpose_input_to_input_weights; - NETranspose _transpose_recurrent_to_forget_weights; - NETranspose _transpose_recurrent_to_cell_weights; - NETranspose _transpose_recurrent_to_output_weights; - NETranspose _transpose_recurrent_to_input_weights; - NETranspose _transpose_projection_weights; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_input_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_output_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _projection_reduction; - NEArithmeticAddition _projection_bias_add; - NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget; - NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget; - NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget; - NEGEMMLowpOutputStage _input_to_forget_outstage; - NEGEMMLowpOutputStage _recurrent_to_forget_outstage; - NEGEMMLowpOutputStage _cell_to_forget_outstage; - NEArithmeticAddition _accumulate_input_recurrent_forget; - NEArithmeticAddition _accumulate_cell_forget; - NEActivationLayer _forget_gate_sigmoid; - NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell; - NEGEMMLowpOutputStage _input_to_cell_outstage; - NEGEMMLowpMatrixMultiplyCore
_mm_recurrent_to_cell; - NEGEMMLowpOutputStage _recurrent_to_cell_outstage; - NEArithmeticAddition _accumulate_input_recurrent_modulation; - NEActivationLayer _cell_gate_tanh; - NEArithmeticSubtraction _input_gate_sub; - NEGEMMLowpMatrixMultiplyCore _mm_input_to_input; - NEGEMMLowpOutputStage _input_to_input_outstage; - NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input; - NEGEMMLowpOutputStage _recurrent_to_input_outstage; - NEArithmeticAddition _accumulate_input_recurrent_input; - NEPixelWiseMultiplication _pixelwise_mul_cell_to_input; - NEGEMMLowpOutputStage _cell_to_input_outstage; - NEArithmeticAddition _accumulate_cell_input; - NEActivationLayer _input_gate_sigmoid; - NEPixelWiseMultiplication _pixelwise_mul_forget_cell; - NEPixelWiseMultiplication _pixelwise_mul_input_cell; - NEArithmeticAddition _add_forget_cell; - NEActivationLayer _cell_clip; - NEGEMMLowpMatrixMultiplyCore _mm_input_to_output; - NEGEMMLowpOutputStage _input_to_output_outstage; - NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output; - NEGEMMLowpOutputStage _recurrent_to_output_outstage; - NEArithmeticAddition _accumulate_input_recurrent_output; - NEPixelWiseMultiplication _pixelwise_mul_cell_to_output; - NEGEMMLowpOutputStage _cell_to_output_outstage; - NEArithmeticAddition _accumulate_cell_to_output; - NEActivationLayer _output_gate_sigmoid; - NEActivationLayer _hidden_tanh; - NEPixelWiseMultiplication _pixelwise_mul_hidden; - NEGEMMLowpOutputStage _hidden_outstage; - NEGEMMLowpMatrixMultiplyCore _mm_projection; - NEGEMMLowpOutputStage _projection_outstage; - NEArithmeticAddition _accumulate_projection; - NEActivationLayer _projection_clip; + NETranspose _transpose_input_to_forget_weights; + NETranspose _transpose_input_to_cell_weights; + NETranspose _transpose_input_to_output_weights; + NETranspose _transpose_input_to_input_weights; + NETranspose _transpose_recurrent_to_forget_weights; + NETranspose _transpose_recurrent_to_cell_weights; + NETranspose 
_transpose_recurrent_to_output_weights; + NETranspose _transpose_recurrent_to_input_weights; + NETranspose _transpose_projection_weights; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_input_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_forget_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_cell_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_output_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _projection_reduction; + NEArithmeticAddition _projection_bias_add; + NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget; + NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget; + NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget; + NEGEMMLowpOutputStage _input_to_forget_outstage; + NEGEMMLowpOutputStage _recurrent_to_forget_outstage; + NEGEMMLowpOutputStage _cell_to_forget_outstage; + NEArithmeticAddition _accumulate_input_recurrent_forget; + NEArithmeticAddition _accumulate_cell_forget; + NEActivationLayer _forget_gate_sigmoid; + NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell; + NEGEMMLowpOutputStage _input_to_cell_outstage; + NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell; + NEGEMMLowpOutputStage _recurrent_to_cell_outstage; + NEArithmeticAddition _accumulate_input_recurrent_modulation; + NEActivationLayer _cell_gate_tanh; + NEArithmeticSubtraction _input_gate_sub; + NEGEMMLowpMatrixMultiplyCore _mm_input_to_input; + NEGEMMLowpOutputStage _input_to_input_outstage; + NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input; + NEGEMMLowpOutputStage _recurrent_to_input_outstage; + NEArithmeticAddition _accumulate_input_recurrent_input; + NEPixelWiseMultiplication _pixelwise_mul_cell_to_input; + NEGEMMLowpOutputStage _cell_to_input_outstage; + NEArithmeticAddition _accumulate_cell_input; + NEActivationLayer _input_gate_sigmoid; + NEPixelWiseMultiplication _pixelwise_mul_forget_cell; + NEPixelWiseMultiplication _pixelwise_mul_input_cell; +
NEArithmeticAddition _add_forget_cell; + NEActivationLayer _cell_clip; + NEGEMMLowpMatrixMultiplyCore _mm_input_to_output; + NEGEMMLowpOutputStage _input_to_output_outstage; + NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output; + NEGEMMLowpOutputStage _recurrent_to_output_outstage; + NEArithmeticAddition _accumulate_input_recurrent_output; + NEPixelWiseMultiplication _pixelwise_mul_cell_to_output; + NEGEMMLowpOutputStage _cell_to_output_outstage; + NEArithmeticAddition _accumulate_cell_to_output; + NEActivationLayer _output_gate_sigmoid; + NEActivationLayer _hidden_tanh; + NEPixelWiseMultiplication _pixelwise_mul_hidden; + NEGEMMLowpOutputStage _hidden_outstage; + NEGEMMLowpMatrixMultiplyCore _mm_projection; + NEGEMMLowpOutputStage _projection_outstage; + NEArithmeticAddition _accumulate_projection; + NEActivationLayer _projection_clip; TensorCopyKernel _projection_bias_copy; TensorCopyKernel _projection_output_to_accumulate_copy; -- cgit v1.2.1