From 173ba9bbb19ea83f951318d9989e440768b4de8f Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Tue, 23 Jun 2020 17:25:43 +0100 Subject: COMPMID-3373: Async support to NEArithmetic* kernels/functions (Pt. 1) Added support to NEArithmeticAddition and NEArithmeticSubtraction Signed-off-by: Michalis Spyrou Change-Id: Ifa805f8455ef6eff1ee627752dc1c7fe9740ec47 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3451 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas --- .../core/NEON/kernels/NEArithmeticAdditionKernel.h | 17 +++-- .../NEON/kernels/NEArithmeticSubtractionKernel.h | 11 ++-- .../runtime/NEON/functions/NEArithmeticAddition.h | 72 +++++++++++++++++++++- .../NEON/functions/NEArithmeticSubtraction.h | 65 ++++++++++++++++++- arm_compute/runtime/NEON/functions/NEGEMM.h | 4 +- arm_compute/runtime/NEON/functions/NELSTMLayer.h | 18 +++--- arm_compute/runtime/NEON/functions/NEQLSTMLayer.h | 38 +++++++----- arm_compute/runtime/NEON/functions/NERNNLayer.h | 22 +++---- .../NEON/kernels/NEArithmeticAdditionKernel.cpp | 26 ++++---- .../NEON/kernels/NEArithmeticSubtractionKernel.cpp | 44 ++++++------- .../NEON/functions/NEArithmeticAddition.cpp | 46 +++++++++++++- .../NEON/functions/NEArithmeticSubtraction.cpp | 47 +++++++++++++- src/runtime/NEON/functions/NEGEMM.cpp | 8 +-- src/runtime/NEON/functions/NELSTMLayer.cpp | 16 ++--- src/runtime/NEON/functions/NEQLSTMLayer.cpp | 45 +++++++------- src/runtime/NEON/functions/NERNNLayer.cpp | 8 +-- 16 files changed, 347 insertions(+), 140 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h index bff34dfda2..f254027e0e 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h @@ -68,12 +68,12 @@ public: * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED * - (QSYMM16,QSYMM16) -> QSYMM16 * - * @param[in] input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 - * @param[in] input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 - * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[out] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. * @param[in] policy Overflow policy. */ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); + void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel * * @param[in] input1 First input tensor info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 @@ -86,7 +86,7 @@ public: static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy); // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) override; private: /** Common signature for all the specialised add functions @@ -99,11 +99,8 @@ private: */ using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window); /** Add function to use for the particular tensor types passed to configure() */ - AddFunction *_func; - const ITensor *_input1; - const ITensor *_input2; - ITensor *_output; - ConvertPolicy _policy; + AddFunction *_func; + ConvertPolicy _policy; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H */ diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h index f75c6bfb98..dfd08d9b06 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h @@ -71,7 +71,7 @@ public: * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32. * @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized. */ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); + void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel * * @note Convert policy cannot be WRAP if datatype is QASYMM8 @@ -86,7 +86,7 @@ public: static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy); // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) override; private: /** Common signature for all the specialised sub functions @@ -99,11 +99,8 @@ private: */ using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window, bool is_sat); /** Sub function to use for the particular tensor types passed to configure() */ - SubFunction *_func; - const ITensor *_input1; - const ITensor *_input2; - ITensor *_output; - ConvertPolicy _policy; + SubFunction *_func; + ConvertPolicy _policy; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H */ diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h index 2bf12df4df..589e0624eb 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h @@ -25,16 +25,75 @@ #define ARM_COMPUTE_NEARITHMETICADDITION_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/NEON/INEOperator.h" namespace arm_compute { class ITensor; +namespace experimental +{ /** Basic function to run @ref 
NEArithmeticAdditionKernel */ -class NEArithmeticAddition : public INESimpleFunctionNoBorder +class NEArithmeticAddition : public INEOperator { public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[out] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + */ + void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition + * + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[in] policy Policy to use to handle overflow + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + MemoryRequirements workspace() const override; +}; +} // namespace experimental + +/** Basic function to run @ref NEArithmeticAdditionKernel */ +class NEArithmeticAddition : public IFunction +{ +public: + /** Default Constructor */ + NEArithmeticAddition(); + /** Default Destructor */ + ~NEArithmeticAddition(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEArithmeticAddition(const NEArithmeticAddition &) = delete; + /** Default move constructor */ + NEArithmeticAddition(NEArithmeticAddition &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEArithmeticAddition &operator=(const NEArithmeticAddition &) = delete; + /** Default move assignment operator */ + NEArithmeticAddition &operator=(NEArithmeticAddition &&); /** Initialise the kernel's inputs, output and conversion policy. * * Valid configurations (Input1,Input2) -> Output : * * - (U8,U8) -> U8 * - (U8,U8) -> S16 * - (S16,U8) -> S16 * - (U8,S16) -> S16 * - (S16,S16) -> S16 * - (S32,S32) -> S32 * - (F16,F16) -> F16 * - (F32,F32) -> F32 * - (QASYMM8,QASYMM8) -> QASYMM8 * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED * - (QSYMM16,QSYMM16) -> QSYMM16 * * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ - void configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition * * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 @@ -69,6 +128,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEARITHMETICADDITION_H */ diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h index 31d1698aea..0bab911c1a 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h @@ -25,12 +25,52 @@ #define ARM_COMPUTE_NEARITHMETICSUBTRACTION_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/NEON/INEOperator.h" namespace arm_compute { class ITensor; +namespace experimental +{ +/** Basic function to run @ref NEArithmeticSubtractionKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/F32. + * @note The function performs an arithmetic subtraction between two tensors. + * + * This function calls the following kernels: + * -# @ref NEArithmeticSubtractionKernel + */ +class NEArithmeticSubtraction : public INEOperator +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 + * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 + * @param[out] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 + * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + */ + void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction + * + * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 + * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 + * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + MemoryRequirements workspace() const override; +}; +} // namespace experimental + /** Basic function to run @ref NEArithmeticSubtractionKernel * * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/F32. @@ -39,9 +79,21 @@ class ITensor; * This function calls the following kernels: * -# @ref NEArithmeticSubtractionKernel */ -class NEArithmeticSubtraction : public INESimpleFunction +class NEArithmeticSubtraction : public IFunction { public: + /** Default Constructor */ + NEArithmeticSubtraction(); + /** Default Destructor */ + ~NEArithmeticSubtraction(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEArithmeticSubtraction(const NEArithmeticSubtraction &) = delete; + /** Default move constructor */ + NEArithmeticSubtraction(NEArithmeticSubtraction &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEArithmeticSubtraction &operator=(const NEArithmeticSubtraction &) = delete; + /** Default move assignment operator */ + NEArithmeticSubtraction &operator=(NEArithmeticSubtraction &&); /** Initialise the kernel's inputs, output and conversion policy. * * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 @@ -50,7 +102,7 @@ public: * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction * * @param[in] input1 First tensor input. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 @@ -62,6 +114,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEARITHMETICSUBTRACTION_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index 8dc6b88bb0..b89a373c47 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_NEGEMM_H #define ARM_COMPUTE_NEGEMM_H -#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" @@ -35,6 +34,7 @@ #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" #include "arm_compute/runtime/Tensor.h" @@ -112,7 +112,7 @@ private: NEGEMMAssemblyDispatch _asm_glue; NEGEMMMatrixAdditionKernel _ma_kernel; NEActivationLayer _alpha_scale_func; - NEArithmeticAdditionKernel _add_bias_kernel; + NEArithmeticAddition _add_bias; NEActivationLayer _activation_func; Tensor _tmp_a; diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h index 64845115b8..b9b581c484 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h @@ -25,13 +25,13 @@ #define ARM_COMPUTE_NELSTMLAYER_H #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" #include "arm_compute/core/NEON/kernels/NECopyKernel.h" #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" +#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" +#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" @@ -149,7 +149,7 @@ private: MemoryGroup _memory_group; NEFullyConnectedLayer _fully_connected_input_gate; NEArithmeticAddition _accum_input_gate1; - NEArithmeticSubtractionKernel _subtract_input_gate; + NEArithmeticSubtraction _subtract_input_gate; NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate; NEActivationLayer _activation_input_gate; NEFullyConnectedLayer _fully_connected_forget_gate; @@ -159,8 +159,8 @@ private: NEFullyConnectedLayer _fully_connected_cell_state; NEGEMM _gemm_cell_state1; NETransposeKernel _transpose_cell_state; - NEArithmeticAdditionKernel _accum_cell_state1; - NEArithmeticAdditionKernel _accum_cell_state2; 
NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_state1; NEActivationLayer _activation_cell_state; NEActivationLayer _cell_clip; @@ -182,16 +182,16 @@ private: NEConcatenateLayer _concat_weights_output; NEMeanStdDevNormalizationLayer _mean_std_norm_input_gate; NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff; - NEArithmeticAdditionKernel _accum_input_gate_bias; + NEArithmeticAddition _accum_input_gate_bias; NEMeanStdDevNormalizationLayer _mean_std_norm_forget_gate; NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate_coeff; - NEArithmeticAdditionKernel _accum_forget_gate_bias; + NEArithmeticAddition _accum_forget_gate_bias; NEMeanStdDevNormalizationLayer _mean_std_norm_cell_gate; NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_gate_coeff; - NEArithmeticAdditionKernel _accum_cell_gate_bias; + NEArithmeticAddition _accum_cell_gate_bias; NEMeanStdDevNormalizationLayer _mean_std_norm_output_gate; NEPixelWiseMultiplicationKernel _pixelwise_mul_output_gate_coeff; - NEArithmeticAdditionKernel _accum_output_gate_bias; + NEArithmeticAddition _accum_output_gate_bias; Tensor _input_gate_out1; Tensor _input_gate_out2; Tensor _input_gate_out3; diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h index d1cc962940..60c8fa1226 100644 --- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h @@ -24,14 +24,14 @@ #ifndef ARM_COMPUTE_NEQLSTMLAYER_H #define ARM_COMPUTE_NEQLSTMLAYER_H -#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" #include "arm_compute/core/NEON/kernels/NECopyKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" #include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" +#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" @@ -48,7 +48,7 @@ class ITensor; * This function calls the following NEON functions/kernels: * * -# @ref NEActivationLayer Activation functions (tanh and logistic) - * -# @ref NEArithmeticAdditionKernel Elementwise addition + * -# @ref NEArithmeticAddition Elementwise addition * -# @ref NEArithmeticSubtractionKernel Elementwise subtraction * -# @ref NECopyKernel Copy kernel for copying output_state_out to output * -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. 
Accumulators are 32-bit integers @@ -254,51 +254,51 @@ private: NEGEMMLowpMatrixAReductionKernel _input_to_output_reduction{}; NEGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{}; NEGEMMLowpMatrixAReductionKernel _projection_reduction{}; - NEArithmeticAdditionKernel _projection_bias_add{}; + NEArithmeticAddition _projection_bias_add{}; NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget{}; NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{}; NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_forget{}; NEGEMMLowpOutputStage _input_to_forget_outstage{}; NEGEMMLowpOutputStage _recurrent_to_forget_outstage{}; NEGEMMLowpOutputStage _cell_to_forget_outstage{}; - NEArithmeticAdditionKernel _accumulate_input_recurrent_forget{}; - NEArithmeticAdditionKernel _accumulate_cell_forget{}; + NEArithmeticAddition _accumulate_input_recurrent_forget{}; + NEArithmeticAddition _accumulate_cell_forget{}; NEActivationLayer _forget_gate_sigmoid{}; NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell{}; NEGEMMLowpOutputStage _input_to_cell_outstage{}; NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{}; NEGEMMLowpOutputStage _recurrent_to_cell_outstage{}; - NEArithmeticAdditionKernel _accumulate_input_recurrent_modulation{}; + NEArithmeticAddition _accumulate_input_recurrent_modulation{}; NEActivationLayer _cell_gate_tanh{}; - NEArithmeticSubtractionKernel _input_gate_sub{}; + NEArithmeticSubtraction _input_gate_sub{}; NEGEMMLowpMatrixMultiplyCore _mm_input_to_input{}; NEGEMMLowpOutputStage _input_to_input_outstage{}; NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{}; NEGEMMLowpOutputStage _recurrent_to_input_outstage{}; - NEArithmeticAdditionKernel _accumulate_input_recurrent_input{}; + NEArithmeticAddition _accumulate_input_recurrent_input{}; NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_input{}; NEGEMMLowpOutputStage _cell_to_input_outstage{}; - NEArithmeticAdditionKernel _accumulate_cell_input{}; + NEArithmeticAddition _accumulate_cell_input{}; NEActivationLayer _input_gate_sigmoid{}; NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_cell{}; NEPixelWiseMultiplicationKernel _pixelwise_mul_input_cell{}; - NEArithmeticAdditionKernel _add_forget_cell{}; + NEArithmeticAddition _add_forget_cell{}; NEActivationLayer _cell_clip{}; NEGEMMLowpMatrixMultiplyCore _mm_input_to_output{}; NEGEMMLowpOutputStage _input_to_output_outstage{}; NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{}; NEGEMMLowpOutputStage _recurrent_to_output_outstage{}; - NEArithmeticAdditionKernel _accumulate_input_recurrent_output{}; + NEArithmeticAddition _accumulate_input_recurrent_output{}; NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_output{}; NEGEMMLowpOutputStage _cell_to_output_outstage{}; - NEArithmeticAdditionKernel _accumulate_cell_to_output{}; + NEArithmeticAddition _accumulate_cell_to_output{}; NEActivationLayer _output_gate_sigmoid{}; NEActivationLayer _hidden_tanh{}; NEPixelWiseMultiplicationKernel _pixelwise_mul_hidden{}; NEGEMMLowpOutputStage _hidden_outstage{}; NEGEMMLowpMatrixMultiplyCore _mm_projection{}; NEGEMMLowpOutputStage _projection_outstage{}; - NEArithmeticAdditionKernel _accumulate_projection{}; + NEArithmeticAddition _accumulate_projection{}; NEActivationLayer _projection_clip{}; TensorCopyKernel _projection_bias_copy{}; @@ -311,7 +311,10 @@ private: NECopyKernel _copy_output{}; // Tensor pointers - const ITensor *_input_to_input_weights{ nullptr }; + const ITensor *_input_to_input_weights + { + nullptr + }; const ITensor *_recurrent_to_input_weights{ nullptr }; const ITensor 
*_projection_bias{ nullptr }; const ITensor *_input_to_forget_weights{ nullptr }; @@ -370,7 +373,10 @@ { // Output quantization scale will be different, but ignored here // since it will be configured at configure() stage. - const TensorInfo out{ in }; + const TensorInfo out + { + in + }; return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); } diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h index db4134fd2d..25cb74d978 100644 --- a/arm_compute/runtime/NEON/functions/NERNNLayer.h +++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h @@ -24,11 +24,11 @@ #ifndef ARM_COMPUTE_NERNNLAYER_H #define ARM_COMPUTE_NERNNLAYER_H -#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" #include "arm_compute/core/NEON/kernels/NECopyKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" @@ -82,16 +82,16 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - NEGEMM _gemm_state_f; - NEArithmeticAdditionKernel _add_kernel; - NEActivationLayer _activation; - NEFullyConnectedLayer _fully_connected; - NECopyKernel _copy_kernel; - Tensor _fully_connected_out; - Tensor _gemm_output; - Tensor _add_output; - bool _is_prepared; + MemoryGroup _memory_group; + NEGEMM _gemm_state_f; + NEArithmeticAddition _add_f; + NEActivationLayer _activation; + NEFullyConnectedLayer _fully_connected; + NECopyKernel _copy_kernel; + Tensor _fully_connected_out; + Tensor _gemm_output; + Tensor _add_output; + bool _is_prepared; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NERNNLAYER_H */ diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp index 3878c764a6..1459f7f250 100644 --- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp @@ -853,7 +853,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &input1, const ITensorInfo &input2, ITensorInfo &output) { const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2); const TensorShape &out_shape = broadcast_pair.first; @@ -904,17 +904,17 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITe } // namespace NEArithmeticAdditionKernel::NEArithmeticAdditionKernel() - : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _policy() + : _func(nullptr), _policy() { } -void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy) +void NEArithmeticAdditionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy)); // Configure kernel window - auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info()); + auto 
win_config = validate_and_configure_window(*input1, *input2, *output); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); static std::map<std::string, AddFunction *> map_function = @@ -945,16 +945,13 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ }; - _input1 = input1; - _input2 = input2; - _output = output; _policy = policy; std::string function_to_call("add_"); function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_"; - function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; - function_to_call += string_from_data_type(input2->info()->data_type()) + "_"; - function_to_call += string_from_data_type(output->info()->data_type()); + function_to_call += string_from_data_type(input1->data_type()) + "_"; + function_to_call += string_from_data_type(input2->data_type()) + "_"; + function_to_call += string_from_data_type(output->data_type()); auto it = map_function.find(function_to_call); @@ -976,13 +973,12 @@ Status NEArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITe return Status{}; } -void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &info) +void NEArithmeticAdditionKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (*_func)(_input1, _input2, _output, _policy, window); + // Dispatch kernel + (*_func)(inputs.at(TensorType::ACL_SRC_0), inputs.at(TensorType::ACL_SRC_1), outputs.at(TensorType::ACL_DST), _policy, window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp index 2b3fce3fea..2097d761a7 100644 --- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp @@ -719,35 +719,32 @@ inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &i } // namespace NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel() - : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _policy(ConvertPolicy::WRAP) + : _func(nullptr), _policy(ConvertPolicy::WRAP) { } -void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy) +void NEArithmeticSubtractionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy)); - _input1 = input1; - _input2 = input2; - _output = output; _policy = policy; - const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); const TensorShape &out_shape = broadcast_pair.first; const ValidRegion &valid_region = broadcast_pair.second; // Auto initialize output if not initialized - set_shape_if_empty(*output->info(), out_shape); + set_shape_if_empty(*output, out_shape); - switch(input1->info()->data_type()) + switch(input1->data_type()) { case DataType::U8: - 
if(input2->info()->data_type() == DataType::U8 && output->info()->data_type() == DataType::U8) + if(input2->data_type() == DataType::U8 && output->data_type() == DataType::U8) { _func = &sub_same<uint8_t>; } - else if(input2->info()->data_type() == DataType::U8 && output->info()->data_type() == DataType::S16) + else if(input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) { _func = &sub_U8_U8_S16; } @@ -758,14 +755,14 @@ void NEArithmeticSubtractionKernel::configure(const ITens break; case DataType::QASYMM8: _func = &sub_quantized<uint8_t>; - set_data_type_if_unknown(*output->info(), DataType::QASYMM8); + set_data_type_if_unknown(*output, DataType::QASYMM8); break; case DataType::QASYMM8_SIGNED: _func = &sub_quantized<int8_t>; - set_data_type_if_unknown(*output->info(), DataType::QASYMM8_SIGNED); + set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED); break; case DataType::S16: - if(input2->info()->data_type() == DataType::U8) + if(input2->data_type() == DataType::U8) { _func = &sub_S16_U8_S16; } @@ -773,21 +770,21 @@ void NEArithmeticSubtractionKernel::configure(const ITens { _func = &sub_same<int16_t>; } - set_format_if_unknown(*output->info(), Format::S16); + set_format_if_unknown(*output, Format::S16); break; case DataType::QSYMM16: _func = &sub_QSYMM16_QSYMM16_QSYMM16; - set_data_type_if_unknown(*output->info(), DataType::QSYMM16); + set_data_type_if_unknown(*output, DataType::QSYMM16); break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: _func = &sub_same<float16_t>; - set_format_if_unknown(*output->info(), Format::F16); + set_format_if_unknown(*output, Format::F16); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: _func = &sub_same<float>; - set_format_if_unknown(*output->info(), Format::F32); + set_format_if_unknown(*output, Format::F32); break; default: _func = nullptr; @@ -795,8 +792,8 @@ void NEArithmeticSubtractionKernel::configure(const ITens // NEArithmeticSubtractionKernel doesn't need padding so update_window_and_padding() can be skipped Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(valid_region); + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(valid_region); Window win = calculate_max_window(valid_region, Steps()); INEKernel::configure(win); } @@ -810,13 +807,12 @@ Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const return Status{}; } -void NEArithmeticSubtractionKernel::run(const Window &window, const ThreadInfo &info) +void NEArithmeticSubtractionKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (*_func)(_input1, _input2, _output, window, (_policy == ConvertPolicy::SATURATE)); + // Dispatch kernel + (*_func)(inputs.at(TensorType::ACL_SRC_0), inputs.at(TensorType::ACL_SRC_1), outputs.at(TensorType::ACL_DST), window, (_policy == ConvertPolicy::SATURATE)); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp index 06c71db1bd..3a2848c3a7 100644 --- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp +++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp @@ -31,7 +31,9 @@ namespace arm_compute { -void 
NEArithmeticAddition::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +namespace experimental +{ +void NEArithmeticAddition::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>(); @@ -43,4 +45,46 @@ Status NEArithmeticAddition::validate(const ITensorIn ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return NEArithmeticAdditionKernel::validate(input1, input2, output, policy); } +MemoryRequirements NEArithmeticAddition::workspace() const +{ + return MemoryRequirements{}; +} +} // namespace experimental + +struct NEArithmeticAddition::Impl +{ + const ITensor *src_0{ nullptr }; + const ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<experimental::NEArithmeticAddition> op{ nullptr }; +}; + +NEArithmeticAddition::NEArithmeticAddition() + : _impl(support::cpp14::make_unique<Impl>()) +{ +} +NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default; +NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default; +NEArithmeticAddition::~NEArithmeticAddition() = default; + +Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + return experimental::NEArithmeticAddition::validate(input1, input2, output, policy, act_info); +} + +void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = arm_compute::support::cpp14::make_unique<experimental::NEArithmeticAddition>(); + _impl->op->configure(input1->info(), input2->info(), output->info(), policy, act_info); +} + +void NEArithmeticAddition::run() +{ + const InputTensorMap src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } }; + const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } }; + _impl->op->run(src, dst, {}); +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp index 20f930a286..043250ca68 100644 --- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp +++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp @@ -31,7 +31,9 @@ namespace arm_compute { -void NEArithmeticSubtraction::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +namespace experimental +{ +void NEArithmeticSubtraction::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); auto k = arm_compute::support::cpp14::make_unique<NEArithmeticSubtractionKernel>(); @@ -44,4 +46,47 @@ Status NEArithmeticSubtraction::validate(const ITenso ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return NEArithmeticSubtractionKernel::validate(input1, input2, output, policy); } + +MemoryRequirements NEArithmeticSubtraction::workspace() const +{ + return MemoryRequirements{}; +} +} // namespace experimental + +struct NEArithmeticSubtraction::Impl +{ + const ITensor *src_0{ nullptr }; + const ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<experimental::NEArithmeticSubtraction> op{ nullptr }; +}; + 
+NEArithmeticSubtraction::NEArithmeticSubtraction() + : _impl(support::cpp14::make_unique<Impl>()) +{ +} +NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; +NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default; +NEArithmeticSubtraction::~NEArithmeticSubtraction() = default; + +Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + return experimental::NEArithmeticSubtraction::validate(input1, input2, output, policy, act_info); +} + +void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = arm_compute::support::cpp14::make_unique<experimental::NEArithmeticSubtraction>(); + _impl->op->configure(input1->info(), input2->info(), output->info(), policy, act_info); +} + +void NEArithmeticSubtraction::run() +{ + const InputTensorMap src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } }; + const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } }; + _impl->op->run(src, dst, {}); +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 89dd4a15d0..5fc12e585a 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -43,7 +43,7 @@ namespace arm_compute { NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(), - _alpha_scale_func(nullptr), _add_bias_kernel(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false), + _alpha_scale_func(nullptr), _add_bias(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false), _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false) { } @@ -141,7 +141,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe if(_run_bias_addition) { - _add_bias_kernel.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE); + _add_bias.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE); _tmp_d.allocator()->allocate(); } } @@ -258,7 +258,7 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso if(c != nullptr && gemm_info.reshape_b_only_on_first_run()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE)); } } @@ -311,7 +311,7 @@ void NEGEMM::run() // Run bias addition kernel if(_run_bias_addition) { - NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY); + _add_bias.run(); } } diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp index 0a111363e3..d8c684bf15 100644 --- a/src/runtime/NEON/functions/NELSTMLayer.cpp +++ b/src/runtime/NEON/functions/NELSTMLayer.cpp @@ -512,7 +512,7 @@ Status NELSTMLayer::validate(const ITensorInfo 
*input, } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state @@ -610,7 +610,7 @@ void NELSTMLayer::run() { _mean_std_norm_forget_gate.run(); NEScheduler::get().schedule(&_pixelwise_mul_forget_gate_coeff, Window::DimY); - NEScheduler::get().schedule(&_accum_forget_gate_bias, Window::DimY); + _accum_forget_gate_bias.run(); } _activation_forget_gate.run(); @@ -624,7 +624,7 @@ void NELSTMLayer::run() { std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); } - NEScheduler::get().schedule(&_subtract_input_gate, Window::DimY); + _subtract_input_gate.run(); } else { @@ -640,7 +640,7 @@ void NELSTMLayer::run() { _mean_std_norm_input_gate.run(); NEScheduler::get().schedule(&_pixelwise_mul_input_gate_coeff, Window::DimY); - NEScheduler::get().schedule(&_accum_input_gate_bias, Window::DimY); + _accum_input_gate_bias.run(); } _activation_input_gate.run(); } @@ -648,17 +648,17 @@ void NELSTMLayer::run() _fully_connected_cell_state.run(); NEScheduler::get().schedule(&_transpose_cell_state, Window::DimY); _gemm_cell_state1.run(); - NEScheduler::get().schedule(&_accum_cell_state1, Window::DimY); + _accum_cell_state1.run(); if(_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); NEScheduler::get().schedule(&_pixelwise_mul_cell_gate_coeff, Window::DimY); - NEScheduler::get().schedule(&_accum_cell_gate_bias, Window::DimY); + _accum_cell_gate_bias.run(); } _activation_cell_state.run(); NEScheduler::get().schedule(&_pixelwise_mul_cell_state1, Window::DimY); NEScheduler::get().schedule(&_pixelwise_mul_cell_state2, Window::DimY); - NEScheduler::get().schedule(&_accum_cell_state2, Window::DimY); + _accum_cell_state2.run(); if(_perform_cell_clipping) { @@ -675,7 +675,7 @@ void NELSTMLayer::run() { _mean_std_norm_output_gate.run(); NEScheduler::get().schedule(&_pixelwise_mul_output_gate_coeff, Window::DimY); - NEScheduler::get().schedule(&_accum_output_gate_bias, Window::DimY); + _accum_output_gate_bias.run(); } _activation_output.run(); diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp index a22c669ca7..6eb1844a1f 100644 --- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp +++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp @@ -619,7 +619,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, if(lstm_params.projection_bias() != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE)); } } @@ -662,7 +662,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); - 
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); if(lstm_params.has_peephole_opt()) { @@ -672,7 +672,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); } if(has_layer_norm) @@ -697,7 +697,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); if(has_layer_norm) { @@ -714,7 +714,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, if(lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtractionKernel::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); } else { @@ -733,7 +733,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); if(lstm_params.has_peephole_opt()) { @@ -742,7 +742,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / 
lstm_params.input_intermediate_scale(); ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); } if(has_layer_norm) @@ -757,7 +757,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Cell. ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); if(quantized_cell_clip > 0) { ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, @@ -772,7 +772,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); if(lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); @@ -782,7 +782,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); } if(has_layer_norm) @@ -837,7 +837,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_out, projection_outstage_info)); } - 
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); if(projection_tensor_copy_required) { @@ -893,13 +893,13 @@ void NEQLSTMLayer::run() _mm_recurrent_to_forget.run(); _recurrent_to_forget_outstage.run(); - NEScheduler::get().schedule(&_accumulate_input_recurrent_forget, Window::DimY); + _accumulate_input_recurrent_forget.run(); if(_has_peephole) { NEScheduler::get().schedule(&_pixelwise_mul_cell_to_forget, Window::DimY); _cell_to_forget_outstage.run(); - NEScheduler::get().schedule(&_accumulate_cell_forget, Window::DimY); + _accumulate_cell_forget.run(); } if(_has_layer_norm) @@ -915,7 +915,7 @@ void NEQLSTMLayer::run() _mm_recurrent_to_cell.run(); _recurrent_to_cell_outstage.run(); - NEScheduler::get().schedule(&_accumulate_input_recurrent_modulation, Window::DimY); + _accumulate_input_recurrent_modulation.run(); if(_has_layer_norm) { @@ -927,7 +927,7 @@ void NEQLSTMLayer::run() // Input gate if(_has_cifg) { - NEScheduler::get().schedule(&_input_gate_sub, Window::DimY); + _input_gate_sub.run(); } else { @@ -935,13 +935,13 @@ void NEQLSTMLayer::run() _input_to_input_outstage.run(); _mm_recurrent_to_input.run(); _recurrent_to_input_outstage.run(); - NEScheduler::get().schedule(&_accumulate_input_recurrent_input, Window::DimY); + _accumulate_input_recurrent_input.run(); if(_has_peephole) { NEScheduler::get().schedule(&_pixelwise_mul_cell_to_input, Window::DimY); _cell_to_input_outstage.run(); - NEScheduler::get().schedule(&_accumulate_cell_input, Window::DimY); + _accumulate_cell_input.run(); } if(_has_layer_norm) @@ -955,7 +955,8 @@ void NEQLSTMLayer::run() // Cell. 
NEScheduler::get().schedule(&_pixelwise_mul_forget_cell, Window::DimY); NEScheduler::get().schedule(&_pixelwise_mul_input_cell, Window::DimY); - NEScheduler::get().schedule(&_add_forget_cell, Window::DimY); + _add_forget_cell.run(); + if(_has_cell_clipping) { _cell_clip.run(); } @@ -966,12 +967,12 @@ void NEQLSTMLayer::run() // Output gate. _mm_input_to_output.run(); _input_to_output_outstage.run(); _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); - NEScheduler::get().schedule(&_accumulate_input_recurrent_output, Window::DimY); + _accumulate_input_recurrent_output.run(); if(_has_peephole) { NEScheduler::get().schedule(&_pixelwise_mul_cell_to_output, Window::DimY); _cell_to_output_outstage.run(); - NEScheduler::get().schedule(&_accumulate_cell_to_output, Window::DimY); + _accumulate_cell_to_output.run(); } if(_has_layer_norm) @@ -997,7 +998,7 @@ void NEQLSTMLayer::run() _projection_output_to_accumulate_copy.run(); } - NEScheduler::get().schedule(&_accumulate_projection, Window::DimY); + _accumulate_projection.run(); if(_projection_tensor_copy_required) { @@ -1077,7 +1078,7 @@ void NEQLSTMLayer::prepare() NEScheduler::get().schedule(&_projection_reduction, Window::DimY); if(_projection_bias != nullptr) { - NEScheduler::get().schedule(&_projection_bias_add, Window::DimY); + _projection_bias_add.run(); _projection_bias->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp index 4a15777be9..19b84e7fb8 100644 --- a/src/runtime/NEON/functions/NERNNLayer.cpp +++ b/src/runtime/NEON/functions/NERNNLayer.cpp @@ -34,7 +34,7 @@ namespace arm_compute { NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(), + : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(), _is_prepared(false) { } @@ -59,7 +59,7 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; @@ -90,7 +90,7 @@ void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const I _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); _memory_group.manage(&_add_output); - _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE); + _add_f.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE); _fully_connected_out.allocator()->allocate(); _gemm_output.allocator()->allocate(); @@ -111,7 +111,7 @@ void NERNNLayer::run() _gemm_state_f.run(); - NEScheduler::get().schedule(&_add_kernel, Window::DimY); + _add_f.run(); _activation.run(); // copy hidden out to output -- cgit v1.2.1
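Usage sketch: the patch above splits NEArithmeticAddition/NEArithmeticSubtraction into two layers — a stateless operator in the experimental namespace, configured from ITensorInfo and handed the actual tensors only at run() time through InputTensorMap/OutputTensorMap, and a thin IFunction wrapper that keeps the old tensor-bound interface for existing callers. The following is a minimal sketch of how a caller might drive both paths after this change; the function name add_example and the tensors src0/src1/dst are hypothetical and their initialisation/allocation is elided, while the configure()/run() signatures and the ACL_SRC_0/ACL_SRC_1/ACL_DST map keys are taken directly from the hunks above.

    #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void add_example(Tensor &src0, Tensor &src1, Tensor &dst) // hypothetical, pre-allocated tensors of matching shape/type
    {
        // 1) Function-level API (unchanged for callers): the tensors are bound
        //    once at configure() time and run() takes no arguments.
        NEArithmeticAddition add;
        add.configure(&src0, &src1, &dst, ConvertPolicy::SATURATE);
        add.run();

        // 2) Operator-level API introduced by this patch: configure() sees only
        //    tensor metadata (ITensorInfo), so the operator keeps no tensor
        //    pointers; the tensors are supplied on every run() call through the
        //    maps, mirroring what NEArithmeticAddition::run() does internally.
        experimental::NEArithmeticAddition add_op;
        add_op.configure(src0.info(), src1.info(), dst.info(), ConvertPolicy::SATURATE);

        const InputTensorMap  inputs  = { { TensorType::ACL_SRC_0, &src0 },
                                          { TensorType::ACL_SRC_1, &src1 } };
        const OutputTensorMap outputs = { { TensorType::ACL_DST, &dst } };
        add_op.run(inputs, outputs, {}); // empty third map: workspace() reports no memory requirements
    }

Because the operator holds no tensor state, a single configured instance can be reused across different tensor sets of matching shape and type; the kernel-side switch from run() to run_op(inputs, outputs, window, info) is what makes that per-call dispatch possible.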