From bef7fa27b0d231a8649952f60808132d109b6345 Mon Sep 17 00:00:00 2001 From: Sang-Hoon Park Date: Wed, 21 Oct 2020 15:58:54 +0100 Subject: COMPMID-3639: (3RDPARTY_UPDATE) Move CL kernels to src Change-Id: I10d27db788e5086adae1841e3e2441cd9b76ef84 Signed-off-by: Sang-Hoon Park Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4310 Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- arm_compute/runtime/CL/functions/CLQLSTMLayer.h | 167 +++++++++++------------- 1 file changed, 76 insertions(+), 91 deletions(-) (limited to 'arm_compute/runtime/CL/functions/CLQLSTMLayer.h') diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h index 6e537680ee..a8f9221b3d 100644 --- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h @@ -24,9 +24,6 @@ #ifndef ARM_COMPUTE_CLQLSTMLAYER_H #define ARM_COMPUTE_CLQLSTMLAYER_H -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" -#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" @@ -40,7 +37,12 @@ namespace arm_compute { // Forward declarations +class CLCompileContext; +class CLCopyKernel; class ICLTensor; +class CLGEMMLowpMatrixAReductionKernel; +class CLQLSTMLayerNormalizationKernel; +class ITensorInfo; /** Basic function to run @ref CLQLSTMLayer * @@ -68,6 +70,8 @@ public: CLQLSTMLayer &operator=(const CLQLSTMLayer &) = delete; /** Default move assignment operator */ CLQLSTMLayer &operator=(CLQLSTMLayer &&) = default; + /** Default destructor */ + ~CLQLSTMLayer(); /** Initialize function's tensors. * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED. @@ -285,72 +289,72 @@ private: }; // Functions used - CLTranspose _transpose_input_to_forget_weights{}; - CLTranspose _transpose_input_to_cell_weights{}; - CLTranspose _transpose_input_to_output_weights{}; - CLTranspose _transpose_input_to_input_weights{}; - CLTranspose _transpose_recurrent_to_forget_weights{}; - CLTranspose _transpose_recurrent_to_cell_weights{}; - CLTranspose _transpose_recurrent_to_output_weights{}; - CLTranspose _transpose_recurrent_to_input_weights{}; - CLTranspose _transpose_projection_weights{}; - CLGEMMLowpMatrixAReductionKernel _input_to_input_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{}; - CLGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{}; - CLGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{}; - CLGEMMLowpMatrixAReductionKernel _input_to_output_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{}; - CLGEMMLowpMatrixAReductionKernel _projection_reduction{}; - CLArithmeticAddition _projection_bias_add{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{}; - CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{}; - CLGEMMLowpOutputStage _input_to_forget_outstage{}; - CLGEMMLowpOutputStage _recurrent_to_forget_outstage{}; - CLGEMMLowpOutputStage _cell_to_forget_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_forget{}; - CLArithmeticAddition _accumulate_cell_forget{}; - CLActivationLayer _forget_gate_sigmoid{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{}; - CLGEMMLowpOutputStage _input_to_cell_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{}; - CLGEMMLowpOutputStage _recurrent_to_cell_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_modulation{}; - CLActivationLayer _cell_gate_tanh{}; - CLArithmeticSubtraction _input_gate_sub{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{}; - CLGEMMLowpOutputStage _input_to_input_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{}; - CLGEMMLowpOutputStage _recurrent_to_input_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_input{}; - CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{}; - CLGEMMLowpOutputStage _cell_to_input_outstage{}; - CLArithmeticAddition _accumulate_cell_input{}; - CLActivationLayer _input_gate_sigmoid{}; - CLPixelWiseMultiplication _pixelwise_mul_forget_cell{}; - CLPixelWiseMultiplication _pixelwise_mul_input_cell{}; - CLArithmeticAddition _add_forget_cell{}; - CLActivationLayer _cell_clip{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{}; - CLGEMMLowpOutputStage _input_to_output_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{}; - CLGEMMLowpOutputStage _recurrent_to_output_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_output{}; - CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{}; - CLGEMMLowpOutputStage _cell_to_output_outstage{}; - CLArithmeticAddition _accumulate_cell_to_output{}; - CLActivationLayer _output_gate_sigmoid{}; - CLActivationLayer _hidden_tanh{}; - CLPixelWiseMultiplication _pixelwise_mul_hidden{}; - CLGEMMLowpOutputStage _hidden_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_projection{}; - CLGEMMLowpOutputStage _projection_outstage{}; - CLArithmeticAddition _accumulate_projection{}; - CLActivationLayer _projection_clip{}; - std::array _layer_norms{ {} }; - CLCopyKernel _copy_output{}; + CLTranspose _transpose_input_to_forget_weights{}; + CLTranspose _transpose_input_to_cell_weights{}; + CLTranspose _transpose_input_to_output_weights{}; + CLTranspose _transpose_input_to_input_weights{}; + CLTranspose _transpose_recurrent_to_forget_weights{}; + CLTranspose _transpose_recurrent_to_cell_weights{}; + CLTranspose _transpose_recurrent_to_output_weights{}; + CLTranspose _transpose_recurrent_to_input_weights{}; + CLTranspose _transpose_projection_weights{}; + std::unique_ptr _input_to_input_reduction; + std::unique_ptr _recurrent_to_input_reduction; + std::unique_ptr _input_to_forget_reduction; + std::unique_ptr _recurrent_to_forget_reduction; + std::unique_ptr _input_to_cell_reduction; + std::unique_ptr _recurrent_to_cell_reduction; + std::unique_ptr _input_to_output_reduction; + std::unique_ptr _recurrent_to_output_reduction; + std::unique_ptr _projection_reduction; + CLArithmeticAddition _projection_bias_add{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{}; + CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{}; + CLGEMMLowpOutputStage _input_to_forget_outstage{}; + CLGEMMLowpOutputStage _recurrent_to_forget_outstage{}; + CLGEMMLowpOutputStage _cell_to_forget_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_forget{}; + CLArithmeticAddition _accumulate_cell_forget{}; + CLActivationLayer _forget_gate_sigmoid{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{}; + CLGEMMLowpOutputStage _input_to_cell_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{}; + CLGEMMLowpOutputStage _recurrent_to_cell_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_modulation{}; + CLActivationLayer _cell_gate_tanh{}; + CLArithmeticSubtraction _input_gate_sub{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{}; + CLGEMMLowpOutputStage _input_to_input_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{}; + CLGEMMLowpOutputStage _recurrent_to_input_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_input{}; + CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{}; + CLGEMMLowpOutputStage _cell_to_input_outstage{}; + CLArithmeticAddition _accumulate_cell_input{}; + CLActivationLayer _input_gate_sigmoid{}; + CLPixelWiseMultiplication _pixelwise_mul_forget_cell{}; + CLPixelWiseMultiplication _pixelwise_mul_input_cell{}; + CLArithmeticAddition _add_forget_cell{}; + CLActivationLayer _cell_clip{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{}; + CLGEMMLowpOutputStage _input_to_output_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{}; + CLGEMMLowpOutputStage _recurrent_to_output_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_output{}; + CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{}; + CLGEMMLowpOutputStage _cell_to_output_outstage{}; + CLArithmeticAddition _accumulate_cell_to_output{}; + CLActivationLayer _output_gate_sigmoid{}; + CLActivationLayer _hidden_tanh{}; + CLPixelWiseMultiplication _pixelwise_mul_hidden{}; + CLGEMMLowpOutputStage _hidden_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_projection{}; + CLGEMMLowpOutputStage _projection_outstage{}; + CLArithmeticAddition _accumulate_projection{}; + CLActivationLayer _projection_clip{}; + std::array, _layer_norm_count> _layer_norms; + std::unique_ptr _copy_output; TensorCopyKernel _projection_bias_copy{}; TensorCopyKernel _projection_output_to_accumulate_copy{}; @@ -402,30 +406,11 @@ private: inline CLQLSTMLayerNormalizationKernel &get_layer_norm(LayerNormGate g) { - return _layer_norms[getGateIndex(g)]; + return *_layer_norms[getGateIndex(g)]; } - inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in) - { - ARM_COMPUTE_ERROR_ON(!_has_layer_norm); - - CLTensor *out = &get_layer_norm_output(g); - _memory_group.manage(out); - out->allocator()->init(*(in->info())); - - get_layer_norm(g).configure(in, out, get_layer_norm_weight(g), get_layer_norm_bias(g)); - } - - inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias) - { - // Output quantization scale will be different, but ignored here - // since it will be configured at configure() stage. - const TensorInfo out - { - in - }; - return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); - } + inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in); + inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias); // Temporary tensors CLTensor _input_to_forget_weights_transposed{ nullptr }; -- cgit v1.2.1