From ad7515d231acb075a9585e52f257373b1a1b5d1f Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Fri, 24 Jul 2020 00:02:23 +0100 Subject: COMPMID-3385: Async support to CLArithmetic* kernels/functions Pt.1 Signed-off-by: Michalis Spyrou Change-Id: I94007565e688f8a0aead4f14c9fc30bfd9f9f7eb Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3613 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas --- .../core/CL/kernels/CLElementwiseOperationKernel.h | 56 +-- arm_compute/core/CL/kernels/CLFillBorderKernel.h | 12 +- .../runtime/CL/functions/CLElementwiseOperations.h | 500 ++++++++++++++++++++- arm_compute/runtime/CL/functions/CLLSTMLayer.h | 169 ++++--- arm_compute/runtime/CL/functions/CLPReluLayer.h | 65 ++- arm_compute/runtime/CL/functions/CLQLSTMLayer.h | 137 +++--- arm_compute/runtime/CL/functions/CLRNNLayer.h | 22 +- 7 files changed, 753 insertions(+), 208 deletions(-) (limited to 'arm_compute') diff --git a/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h b/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h index 1995aed7b6..76bc879638 100644 --- a/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h +++ b/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h @@ -54,7 +54,7 @@ public: ~CLElementwiseOperationKernel() = default; // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; @@ -64,9 +64,9 @@ protected: /** Initialise the kernel's output. * - * @param[in] input1 First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32. - * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[in] output Output tensor. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. 
* * @return a pair of Status and Window */ @@ -87,18 +87,18 @@ protected: /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff) * */ - void configure_common(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + void configure_common(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output); /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff) * */ - void configure_common(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + void configure_common(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output); ActivationLayerInfo _act_info; private: - const ICLTensor *_input1; /**< Source tensor 1 */ - const ICLTensor *_input2; /**< Source tensor 2 */ - ICLTensor *_output; /**< Destination tensor */ + const ITensorInfo *_input1; /**< Source tensor info 1 */ + const ITensorInfo *_input2; /**< Source tensor info 2 */ + ITensorInfo *_output; /**< Destination tensor info */ }; /** Addition operation */ @@ -113,32 +113,32 @@ public: /** Initialise the kernel's inputs, output and conversion policy. * * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[in] output Output tensor. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output and conversion policy. * * @param[in] compile_context The compile context to be used. * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[in] output Output tensor. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/ - void configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy, + void configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel * * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @@ -170,22 +170,22 @@ public: /** Initialise the kernel's inputs and output. * * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[in] output Output tensor. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs and output. * * @param[in] compile_context The compile context to be used. * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[in] output Output tensor. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/ - void configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + void configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel diff --git a/arm_compute/core/CL/kernels/CLFillBorderKernel.h b/arm_compute/core/CL/kernels/CLFillBorderKernel.h index 0a4de25ac3..8cad68dc1a 100644 --- a/arm_compute/core/CL/kernels/CLFillBorderKernel.h +++ b/arm_compute/core/CL/kernels/CLFillBorderKernel.h @@ -49,6 +49,15 @@ public: /** Default destructor */ ~CLFillBorderKernel() = default; + /** Initialise the kernel's input, output and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] tensor Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32. + * @param[in] border_size Size of the border to fill in elements. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); /** Initialise the kernel's input, output and border mode. * * @param[in,out] tensor Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32. @@ -65,7 +74,7 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); /** Function to set the constant value on fill border kernel depending on type.
* @@ -76,6 +85,7 @@ public: void set_constant_border(unsigned int idx, const PixelValue &constant_border_value); // Inherited methods overridden: + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, cl::CommandQueue &queue) override; void run(const Window &window, cl::CommandQueue &queue) override; bool is_parallelisable() const override; diff --git a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h index 9cd3c150cc..5af24c90ac 100644 --- a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h +++ b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h @@ -24,21 +24,372 @@ #ifndef ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H #define ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/runtime/CL/ICLOperator.h" +#include "arm_compute/runtime/IFunction.h" namespace arm_compute { class ICLTensor; +namespace experimental +{ +/** Basic function to run @ref CLSaturatedArithmeticOperationKernel for addition + * + * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @note The function performs an arithmetic addition between two tensors. + */ +class CLArithmeticAddition : public ICLOperator +{ +public: + /** Default Constructor */ + CLArithmeticAddition(); + /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for addition + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] input1 First tensor input info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; + +/** Basic function to run @ref CLSaturatedArithmeticOperationKernel for subtraction + * + * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32. + * @note The function performs an arithmetic subtraction between two tensors. + */ +class CLArithmeticSubtraction : public ICLOperator +{ +public: + /** Default Constructor */ + CLArithmeticSubtraction(); + /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for subtraction + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] output Output tensor info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; + +/** Basic function to run @ref CLSaturatedArithmeticOperationKernel for division + * + * @note The tensor data type for the inputs must be F16/F32. + * @note The function performs an arithmetic division between two tensors. + */ +class CLArithmeticDivision : public ICLOperator +{ +public: + /** Default Constructor */ + CLArithmeticDivision(); + /** Initialise the kernel's inputs and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: Same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticDivision + * + * @param[in] input1 First tensor input info. Data types supported: F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; + +/** Basic function to run @ref CLArithmeticOperationKernel for max + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. + * @note The function performs a max operation between two tensors. + */ +class CLElementwiseMax : public ICLOperator +{ +public: + /** Default Constructor */ + CLElementwiseMax(); + /** Initialise the kernel's inputs and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for max + * + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: same as @p input1. + * @param[in] output Output tensor info. Data types supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; + +/** Basic function to run @ref CLArithmeticOperationKernel for min + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. + * @note The function performs a min operation between two tensors. + */ +class CLElementwiseMin : public ICLOperator +{ +public: + /** Default Constructor */ + CLElementwiseMin(); + /** Initialise the kernel's inputs and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for min + * + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: same as @p input1. + * @param[in] output Output tensor info. Data types supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; + +/** Basic function to run @ref CLArithmeticOperationKernel for squared difference + * + * @note The tensor data type for the inputs must be QASYMM8/U8/S16/QSYMM16/F16/F32. + * @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2) + */ +class CLElementwiseSquaredDiff : public ICLOperator +{ +public: + /** Default Constructor */ + CLElementwiseSquaredDiff(); + /** Initialise the kernel's inputs and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for squared difference + * + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: same as @p input1. + * @param[in] output Output tensor info. Data types supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; + +/** Basic function to run @ref CLArithmeticOperationKernel for power + * + * @note The tensor data type for the inputs must be F16/F32. + * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i]) + */ +class CLElementwisePower : public ICLOperator +{ +public: + /** Default Constructor */ + CLElementwisePower(); + /** Initialise the kernel's inputs and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: F16/F32. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for power + * + * @param[in] input1 First tensor input info. Data types supported: F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: F16/F32. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; +} // namespace experimental + /** Basic function to run @ref CLSaturatedArithmeticOperationKernel for addition * * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. * @note The function performs an arithmetic addition between two tensors. */ -class CLArithmeticAddition : public ICLSimpleFunction +class CLArithmeticAddition : public IFunction { public: + /** Default Constructor */ + CLArithmeticAddition(); + /** Default Destructor */ + ~CLArithmeticAddition(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticAddition(const CLArithmeticAddition &) = delete; + /** Default move constructor */ + CLArithmeticAddition(CLArithmeticAddition &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticAddition &operator=(const CLArithmeticAddition &) = delete; + /** Default move assignment operator */ + CLArithmeticAddition &operator=(CLArithmeticAddition &&); /** Initialise the kernel's inputs, output and conversion policy. * * Valid configurations (Input1,Input2) -> Output : @@ -89,7 +440,8 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for addition * * Valid configurations (Input1,Input2) -> Output : @@ -115,6 +467,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; /** Basic function to run @ref CLSaturatedArithmeticOperationKernel for subtraction * * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32. * @note The function performs an arithmetic subtraction between two tensors. */ -class CLArithmeticSubtraction : public ICLSimpleFunction +class CLArithmeticSubtraction : public IFunction { public: + /** Default Constructor */ + CLArithmeticSubtraction(); + /** Default Destructor */ + ~CLArithmeticSubtraction(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticSubtraction(const CLArithmeticSubtraction &) = delete; + /** Default move constructor */ + CLArithmeticSubtraction(CLArithmeticSubtraction &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticSubtraction &operator=(const CLArithmeticSubtraction &) = delete; + /** Default move assignment operator */ + CLArithmeticSubtraction &operator=(CLArithmeticSubtraction &&); /** Initialise the kernel's inputs, output and conversion policy. * * Valid configurations (Input1,Input2) -> Output : @@ -149,7 +520,7 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output and conversion policy. * * Valid configurations (Input1,Input2) -> Output : @@ -175,7 +546,8 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for subtraction * * Valid configurations (Input1,Input2) -> Output : @@ -201,6 +573,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; /** Basic function to run @ref CLSaturatedArithmeticOperationKernel for division * * @note The tensor data type for the inputs must be F16/F32. * @note The function performs an arithmetic division between two tensors. */ -class CLArithmeticDivision : public ICLSimpleFunction +class CLArithmeticDivision : public IFunction { public: + /** Default Constructor */ + CLArithmeticDivision(); + /** Default Destructor */ + ~CLArithmeticDivision(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticDivision(const CLArithmeticDivision &) = delete; + /** Default move constructor */ + CLArithmeticDivision(CLArithmeticDivision &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticDivision &operator=(const CLArithmeticDivision &) = delete; + /** Default move assignment operator */ + CLArithmeticDivision &operator=(CLArithmeticDivision &&); /** Initialise the kernel's inputs and output. * * @param[in, out] input1 First tensor input. Data types supported: F16/F32. @@ -231,7 +622,7 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticDivision * * @param[in] input1 First tensor input info. Data types supported: F16/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. * @param[in] output Output tensor info. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; /** Basic function to run @ref CLArithmeticOperationKernel for max * * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. * @note The function performs a max operation between two tensors.
*/ -class CLElementwiseMax : public ICLSimpleFunction +class CLElementwiseMax : public IFunction { public: + /** Default Constructor */ + CLElementwiseMax(); + /** Default Destructor */ + ~CLElementwiseMax(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwiseMax(const CLElementwiseMax &) = delete; + /** Default move constructor */ + CLElementwiseMax(CLElementwiseMax &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwiseMax &operator=(const CLElementwiseMax &) = delete; + /** Default move assignment operator */ + CLElementwiseMax &operator=(CLElementwiseMax &&); /** Initialise the kernel's inputs and output. * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. @@ -283,6 +693,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; /** Basic function to run @ref CLArithmeticOperationKernel for min * * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. * @note The function performs a min operation between two tensors. */ -class CLElementwiseMin : public ICLSimpleFunction +class CLElementwiseMin : public IFunction { public: + /** Default Constructor */ + CLElementwiseMin(); + /** Default Destructor */ + ~CLElementwiseMin(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwiseMin(const CLElementwiseMin &) = delete; + /** Default move constructor */ + CLElementwiseMin(CLElementwiseMin &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwiseMin &operator=(const CLElementwiseMin &) = delete; + /** Default move assignment operator */ + CLElementwiseMin &operator=(CLElementwiseMin &&); /** Initialise the kernel's inputs and output. * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. @@ -324,6 +753,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; /** Basic function to run @ref CLArithmeticOperationKernel for squared difference * * @note The tensor data type for the inputs must be QASYMM8/U8/S16/QSYMM16/F16/F32.
* @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2) */ -class CLElementwiseSquaredDiff : public ICLSimpleFunction +class CLElementwiseSquaredDiff : public IFunction { public: + /** Default Constructor */ + CLElementwiseSquaredDiff(); + /** Default Destructor */ + ~CLElementwiseSquaredDiff(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwiseSquaredDiff(const CLElementwiseSquaredDiff &) = delete; + /** Default move constructor */ + CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwiseSquaredDiff &operator=(const CLElementwiseSquaredDiff &) = delete; + /** Default move assignment operator */ + CLElementwiseSquaredDiff &operator=(CLElementwiseSquaredDiff &&); /** Initialise the kernel's inputs and output. * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. @@ -365,6 +813,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; /** Basic function to run @ref CLArithmeticOperationKernel for power * * @note The tensor data type for the inputs must be F16/F32. * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i]) */ -class CLElementwisePower : public ICLSimpleFunction +class CLElementwisePower : public IFunction { public: + /** Default Constructor */ + CLElementwisePower(); + /** Default Destructor */ + ~CLElementwisePower(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwisePower(const CLElementwisePower &) = delete; + /** Default move constructor */ + CLElementwisePower(CLElementwisePower &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwisePower &operator=(const CLElementwisePower &) = delete; + /** Default move assignment operator */ + CLElementwisePower &operator=(CLElementwisePower &&); /** Initialise the kernel's inputs and output. * * @param[in, out] input1 First tensor input. Data types supported: F16/F32.
@@ -406,6 +873,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H */ diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h index e5733cd784..abfcc3a62f 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h @@ -27,7 +27,6 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/core/CL/kernels/CLMemsetKernel.h" #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" #include "arm_compute/core/Types.h" @@ -201,90 +200,90 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - CLFullyConnectedLayer _fully_connected_input_gate; - CLArithmeticAddition _accum_input_gate1; - CLSaturatedArithmeticOperationKernel _subtract_input_gate; - CLPixelWiseMultiplicationKernel _pixelwise_mul_input_gate; - CLActivationLayer _activation_input_gate; - CLFullyConnectedLayer _fully_connected_forget_gate; - CLArithmeticAddition _accum_forget_gate1; - CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate; - CLActivationLayer _activation_forget_gate; - CLFullyConnectedLayer _fully_connected_cell_state; - CLGEMM _gemm_cell_state1; - CLTransposeKernel _transpose_cell_state; - CLSaturatedArithmeticOperationKernel _accum_cell_state1; - CLSaturatedArithmeticOperationKernel _accum_cell_state2; - CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_state1; - CLActivationLayer _activation_cell_state; - CLActivationLayer _cell_clip; - CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_state2; - CLFullyConnectedLayer _fully_connected_output; - CLPixelWiseMultiplicationKernel _pixelwise_mul_output_state1; - CLArithmeticAddition _accum_output1; - CLActivationLayer _activation_output; - CLActivationLayer _activation_output_state; - CLPixelWiseMultiplicationKernel _pixelwise_mul_output_state2; - CLFullyConnectedLayer _fully_connected_output_state; - CLActivationLayer _projection_clip; - CLCopyKernel _copy_cell_state; - CLCopyKernel _copy_output; - CLConcatenateLayer _concat_scratch_buffer; - CLConcatenateLayer _concat_inputs_forget_gate; - CLConcatenateLayer _concat_weights_forget_gate; - CLConcatenateLayer _concat_weights_input_gate; - CLConcatenateLayer _concat_weights_output; - CLMemsetKernel _ones_memset_kernel; - CLMeanStdDevNormalizationLayer _mean_std_norm_input_gate; - CLPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff; - CLSaturatedArithmeticOperationKernel _accum_input_gate_bias; - CLMeanStdDevNormalizationLayer _mean_std_norm_forget_gate; - CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate_coeff; - CLSaturatedArithmeticOperationKernel _accum_forget_gate_bias; - CLMeanStdDevNormalizationLayer _mean_std_norm_cell_gate; - CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_gate_coeff; - CLSaturatedArithmeticOperationKernel _accum_cell_gate_bias; - CLMeanStdDevNormalizationLayer _mean_std_norm_output_gate; - CLPixelWiseMultiplicationKernel _pixelwise_mul_output_gate_coeff; - CLSaturatedArithmeticOperationKernel _accum_output_gate_bias; - CLTensor _input_gate_out1; - CLTensor
_input_gate_out2; - CLTensor _input_gate_out3; - CLTensor _input_gate_out4; - CLTensor _forget_gate_out1; - CLTensor _forget_gate_out2; - CLTensor _forget_gate_out3; - CLTensor _forget_gate_out4; - CLTensor _forget_gate_out5; - CLTensor _forget_gate_out6; - CLTensor _cell_state_out1; - CLTensor _cell_state_out2; - CLTensor _cell_state_out3; - CLTensor _cell_state_out4; - CLTensor _cell_state_out5; - CLTensor _output1; - CLTensor _output2; - CLTensor _output3; - CLTensor _output4; - CLTensor _cell_state_activation; - CLTensor _output_state1; - CLTensor _ones; - CLTensor _input_layer_norm_out1; - CLTensor _input_layer_norm_out2; - CLTensor _forget_layer_norm_out1; - CLTensor _forget_layer_norm_out2; - CLTensor _cell_layer_norm_out1; - CLTensor _cell_layer_norm_out2; - CLTensor _output_layer_norm_out1; - CLTensor _output_layer_norm_out2; - bool _run_peephole_opt; - bool _run_cifg_opt; - bool _perform_cell_clipping; - bool _has_projection_weights; - bool _perform_projection_clipping; - bool _is_prepared; - bool _is_layer_norm_lstm; + MemoryGroup _memory_group; + CLFullyConnectedLayer _fully_connected_input_gate; + CLArithmeticAddition _accum_input_gate1; + CLArithmeticSubtraction _subtract_input_gate; + CLPixelWiseMultiplicationKernel _pixelwise_mul_input_gate; + CLActivationLayer _activation_input_gate; + CLFullyConnectedLayer _fully_connected_forget_gate; + CLArithmeticAddition _accum_forget_gate1; + CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate; + CLActivationLayer _activation_forget_gate; + CLFullyConnectedLayer _fully_connected_cell_state; + CLGEMM _gemm_cell_state1; + CLTransposeKernel _transpose_cell_state; + CLArithmeticAddition _accum_cell_state1; + CLArithmeticAddition _accum_cell_state2; + CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_state1; + CLActivationLayer _activation_cell_state; + CLActivationLayer _cell_clip; + CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_state2; + CLFullyConnectedLayer _fully_connected_output; + CLPixelWiseMultiplicationKernel _pixelwise_mul_output_state1; + CLArithmeticAddition _accum_output1; + CLActivationLayer _activation_output; + CLActivationLayer _activation_output_state; + CLPixelWiseMultiplicationKernel _pixelwise_mul_output_state2; + CLFullyConnectedLayer _fully_connected_output_state; + CLActivationLayer _projection_clip; + CLCopyKernel _copy_cell_state; + CLCopyKernel _copy_output; + CLConcatenateLayer _concat_scratch_buffer; + CLConcatenateLayer _concat_inputs_forget_gate; + CLConcatenateLayer _concat_weights_forget_gate; + CLConcatenateLayer _concat_weights_input_gate; + CLConcatenateLayer _concat_weights_output; + CLMemsetKernel _ones_memset_kernel; + CLMeanStdDevNormalizationLayer _mean_std_norm_input_gate; + CLPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff; + CLArithmeticAddition _accum_input_gate_bias; + CLMeanStdDevNormalizationLayer _mean_std_norm_forget_gate; + CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate_coeff; + CLArithmeticAddition _accum_forget_gate_bias; + CLMeanStdDevNormalizationLayer _mean_std_norm_cell_gate; + CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_gate_coeff; + CLArithmeticAddition _accum_cell_gate_bias; + CLMeanStdDevNormalizationLayer _mean_std_norm_output_gate; + CLPixelWiseMultiplicationKernel _pixelwise_mul_output_gate_coeff; + CLArithmeticAddition _accum_output_gate_bias; + CLTensor _input_gate_out1; + CLTensor _input_gate_out2; + CLTensor _input_gate_out3; + CLTensor _input_gate_out4; + CLTensor _forget_gate_out1; + CLTensor _forget_gate_out2; + 
CLTensor _forget_gate_out3; + CLTensor _forget_gate_out4; + CLTensor _forget_gate_out5; + CLTensor _forget_gate_out6; + CLTensor _cell_state_out1; + CLTensor _cell_state_out2; + CLTensor _cell_state_out3; + CLTensor _cell_state_out4; + CLTensor _cell_state_out5; + CLTensor _output1; + CLTensor _output2; + CLTensor _output3; + CLTensor _output4; + CLTensor _cell_state_activation; + CLTensor _output_state1; + CLTensor _ones; + CLTensor _input_layer_norm_out1; + CLTensor _input_layer_norm_out2; + CLTensor _forget_layer_norm_out1; + CLTensor _forget_layer_norm_out2; + CLTensor _cell_layer_norm_out1; + CLTensor _cell_layer_norm_out2; + CLTensor _output_layer_norm_out1; + CLTensor _output_layer_norm_out2; + bool _run_peephole_opt; + bool _run_cifg_opt; + bool _perform_cell_clipping; + bool _has_projection_weights; + bool _perform_projection_clipping; + bool _is_prepared; + bool _is_layer_norm_lstm; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLLSTMLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLPReluLayer.h b/arm_compute/runtime/CL/functions/CLPReluLayer.h index eb3d3be3e3..08567cccfb 100644 --- a/arm_compute/runtime/CL/functions/CLPReluLayer.h +++ b/arm_compute/runtime/CL/functions/CLPReluLayer.h @@ -24,20 +24,72 @@ #ifndef ARM_COMPUTE_CLPRELULAYER_H #define ARM_COMPUTE_CLPRELULAYER_H -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/runtime/CL/ICLOperator.h" +#include "arm_compute/runtime/IFunction.h" namespace arm_compute { class ICLTensor; +namespace experimental +{ /** Basic function to run @ref CLArithmeticOperationKernel for PRELU * * @note The function implements an activation layer with the PRELU activation function. */ -class CLPReluLayer : public ICLSimpleFunction +class CLPReluLayer : public ICLOperator { public: + /** Default Constructor */ + CLPReluLayer(); + /** Set the input and output tensor. + * + * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] alpha PRelu layer parameters. Data types supported: same as @p input. + * @param[out] output Destination tensor. Data type supported: same as @p input + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref CLPReluLayer + * + * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] alpha PRelu layer parameters. Data types supported: same as @p input. + * @param[in] output Destination tensor info. Data type supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; +} // namespace experimental + +/** Basic function to run @ref CLArithmeticOperationKernel for PRELU + * + * @note The function implements an activation layer with the PRELU activation function.
+ */ +class CLPReluLayer : public IFunction +{ +public: + /** Default Constructor */ + CLPReluLayer(); + /** Default Destructor */ + ~CLPReluLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPReluLayer(const CLPReluLayer &) = delete; + /** Default move constructor */ + CLPReluLayer(CLPReluLayer &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPReluLayer &operator=(const CLPReluLayer &) = delete; + /** Default move assignment operator */ + CLPReluLayer &operator=(CLPReluLayer &&); /** Set the input and output tensor. * * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place @@ -66,6 +118,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLPRELULAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h index 97ae9878ea..0aea91ae8e 100644 --- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h @@ -25,12 +25,12 @@ #define ARM_COMPUTE_CLQLSTMLAYER_H #include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" #include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" +#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/runtime/CL/functions/CLTranspose.h" @@ -48,7 +48,7 @@ class ICLTensor; * * -# @ref CLActivationLayer Activation functions (tanh and logistic) * -# @ref CLCopyKernel Copy kernel for copying output_state_out to output - * -# @ref CLSaturatedArithmeticOperationKernel Elementwise addition and subtraction + * -# @ref CLArithmeticAddition Elementwise addition and subtraction * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core.
Accumulators are 32-bit integers * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 * -# @ref CLGEMMLowpMatrixAReductionKernel For precomputing effective biases to use @@ -285,70 +285,70 @@ private: }; // Functions used - CLTranspose _transpose_input_to_forget_weights{}; - CLTranspose _transpose_input_to_cell_weights{}; - CLTranspose _transpose_input_to_output_weights{}; - CLTranspose _transpose_input_to_input_weights{}; - CLTranspose _transpose_recurrent_to_forget_weights{}; - CLTranspose _transpose_recurrent_to_cell_weights{}; - CLTranspose _transpose_recurrent_to_output_weights{}; - CLTranspose _transpose_recurrent_to_input_weights{}; - CLTranspose _transpose_projection_weights{}; - CLGEMMLowpMatrixAReductionKernel _input_to_input_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{}; - CLGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{}; - CLGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{}; - CLGEMMLowpMatrixAReductionKernel _input_to_output_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{}; - CLGEMMLowpMatrixAReductionKernel _projection_reduction{}; - CLSaturatedArithmeticOperationKernel _projection_bias_add{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{}; - CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_forget{}; - CLGEMMLowpOutputStage _input_to_forget_outstage{}; - CLGEMMLowpOutputStage _recurrent_to_forget_outstage{}; - CLGEMMLowpOutputStage _cell_to_forget_outstage{}; - CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_forget{}; - CLSaturatedArithmeticOperationKernel _accumulate_cell_forget{}; - CLActivationLayer _forget_gate_sigmoid{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{}; - CLGEMMLowpOutputStage _input_to_cell_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{}; - CLGEMMLowpOutputStage _recurrent_to_cell_outstage{}; - CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_modulation{}; - CLActivationLayer _cell_gate_tanh{}; - CLSaturatedArithmeticOperationKernel _input_gate_sub{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{}; - CLGEMMLowpOutputStage _input_to_input_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{}; - CLGEMMLowpOutputStage _recurrent_to_input_outstage{}; - CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_input{}; - CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_input{}; - CLGEMMLowpOutputStage _cell_to_input_outstage{}; - CLSaturatedArithmeticOperationKernel _accumulate_cell_input{}; - CLActivationLayer _input_gate_sigmoid{}; - CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_cell{}; - CLPixelWiseMultiplicationKernel _pixelwise_mul_input_cell{}; - CLSaturatedArithmeticOperationKernel _add_forget_cell{}; - CLActivationLayer _cell_clip{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{}; - CLGEMMLowpOutputStage _input_to_output_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{}; - CLGEMMLowpOutputStage _recurrent_to_output_outstage{}; - CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_output{}; - CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_output{}; - CLGEMMLowpOutputStage _cell_to_output_outstage{}; - CLSaturatedArithmeticOperationKernel _accumulate_cell_to_output{}; - 
-    CLActivationLayer                    _output_gate_sigmoid{};
-    CLActivationLayer                    _hidden_tanh{};
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_hidden{};
-    CLGEMMLowpOutputStage                _hidden_outstage{};
-    CLGEMMLowpMatrixMultiplyCore         _mm_projection{};
-    CLGEMMLowpOutputStage                _projection_outstage{};
-    CLSaturatedArithmeticOperationKernel _accumulate_projection{};
-    CLActivationLayer                    _projection_clip{};
+    CLTranspose                      _transpose_input_to_forget_weights{};
+    CLTranspose                      _transpose_input_to_cell_weights{};
+    CLTranspose                      _transpose_input_to_output_weights{};
+    CLTranspose                      _transpose_input_to_input_weights{};
+    CLTranspose                      _transpose_recurrent_to_forget_weights{};
+    CLTranspose                      _transpose_recurrent_to_cell_weights{};
+    CLTranspose                      _transpose_recurrent_to_output_weights{};
+    CLTranspose                      _transpose_recurrent_to_input_weights{};
+    CLTranspose                      _transpose_projection_weights{};
+    CLGEMMLowpMatrixAReductionKernel _input_to_input_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _input_to_output_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _projection_reduction{};
+    CLArithmeticAddition             _projection_bias_add{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_forget{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_forget{};
+    CLPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_forget{};
+    CLGEMMLowpOutputStage            _input_to_forget_outstage{};
+    CLGEMMLowpOutputStage            _recurrent_to_forget_outstage{};
+    CLGEMMLowpOutputStage            _cell_to_forget_outstage{};
+    CLArithmeticAddition             _accumulate_input_recurrent_forget{};
+    CLArithmeticAddition             _accumulate_cell_forget{};
+    CLActivationLayer                _forget_gate_sigmoid{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_cell{};
+    CLGEMMLowpOutputStage            _input_to_cell_outstage{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_cell{};
+    CLGEMMLowpOutputStage            _recurrent_to_cell_outstage{};
+    CLArithmeticAddition             _accumulate_input_recurrent_modulation{};
+    CLActivationLayer                _cell_gate_tanh{};
+    CLArithmeticSubtraction          _input_gate_sub{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_input{};
+    CLGEMMLowpOutputStage            _input_to_input_outstage{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_input{};
+    CLGEMMLowpOutputStage            _recurrent_to_input_outstage{};
+    CLArithmeticAddition             _accumulate_input_recurrent_input{};
+    CLPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_input{};
+    CLGEMMLowpOutputStage            _cell_to_input_outstage{};
+    CLArithmeticAddition             _accumulate_cell_input{};
+    CLActivationLayer                _input_gate_sigmoid{};
+    CLPixelWiseMultiplicationKernel  _pixelwise_mul_forget_cell{};
+    CLPixelWiseMultiplicationKernel  _pixelwise_mul_input_cell{};
+    CLArithmeticAddition             _add_forget_cell{};
+    CLActivationLayer                _cell_clip{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_output{};
+    CLGEMMLowpOutputStage            _input_to_output_outstage{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_output{};
+    CLGEMMLowpOutputStage            _recurrent_to_output_outstage{};
+    CLArithmeticAddition             _accumulate_input_recurrent_output{};
+    CLPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_output{};
+    CLGEMMLowpOutputStage            _cell_to_output_outstage{};
+    CLArithmeticAddition             _accumulate_cell_to_output{};
+    CLActivationLayer                _output_gate_sigmoid{};
+    CLActivationLayer                _hidden_tanh{};
+    CLPixelWiseMultiplicationKernel  _pixelwise_mul_hidden{};
+    CLGEMMLowpOutputStage            _hidden_outstage{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_projection{};
+    CLGEMMLowpOutputStage            _projection_outstage{};
+    CLArithmeticAddition             _accumulate_projection{};
+    CLActivationLayer                _projection_clip{};
     std::array<CLQLSTMLayerNormalizationKernel, _layer_norm_count> _layer_norms{ {} };
     CLCopyKernel _copy_output{};
@@ -358,7 +358,10 @@ private:
     TensorCopyKernel _hidden_to_output_copy{};
 
     // Tensor pointers
-    const ICLTensor *_input_to_input_weights{ nullptr };
+    const ICLTensor *_input_to_input_weights
+    {
+        nullptr
+    };
     const ICLTensor *_recurrent_to_input_weights{ nullptr };
     const ICLTensor *_projection_bias{ nullptr };
     const ICLTensor *_input_to_forget_weights{ nullptr };
diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h
index 81f7810edd..9d1cb1a724 100644
--- a/arm_compute/runtime/CL/functions/CLRNNLayer.h
+++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h
@@ -25,9 +25,9 @@
 #define ARM_COMPUTE_CLRNN_LAYER_H
 
 #include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
@@ -85,16 +85,16 @@ public:
     void prepare() override;
 
 private:
-    MemoryGroup                          _memory_group;
-    CLGEMM                               _gemm_state_f;
-    CLSaturatedArithmeticOperationKernel _add_kernel;
-    CLActivationLayer                    _activation;
-    CLFullyConnectedLayer                _fully_connected_kernel;
-    CLCopyKernel                         _copy_kernel;
-    CLTensor                             _fully_connected_out;
-    CLTensor                             _gemm_output;
-    CLTensor                             _add_output;
-    bool                                 _is_prepared;
+    MemoryGroup           _memory_group;
+    CLGEMM                _gemm_state_f;
+    CLArithmeticAddition  _add_kernel;
+    CLActivationLayer     _activation;
+    CLFullyConnectedLayer _fully_connected_kernel;
+    CLCopyKernel          _copy_kernel;
+    CLTensor              _fully_connected_out;
+    CLTensor              _gemm_output;
+    CLTensor              _add_output;
+    bool                  _is_prepared;
 };
 }
 #endif /* ARM_COMPUTE_CLRNN_LAYER_H */
-- 
cgit v1.2.1
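
For reference, since this patch moves CLRNNLayer and CLQLSTMLayer off CLSaturatedArithmeticOperationKernel and onto the CLArithmeticAddition/CLArithmeticSubtraction runtime functions, the sketch below shows how such a function is driven from user code. It is a minimal, illustrative example and not part of the patch: the tensor shape, data type, and scheduler setup are assumptions, and filling the tensors with data is omitted.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

    using namespace arm_compute;

    int main()
    {
        // Set up the OpenCL context and queue shared by all CL functions.
        CLScheduler::get().default_init();

        // Shape and data type are placeholders chosen for illustration.
        const TensorShape shape(16U, 16U);
        CLTensor a, b, sum;
        a.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        b.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        sum.allocator()->init(TensorInfo(shape, 1, DataType::F32));

        // Function-level API: configure once, run per invocation.
        CLArithmeticAddition add;
        add.configure(&a, &b, &sum, ConvertPolicy::SATURATE);

        a.allocator()->allocate();
        b.allocator()->allocate();
        sum.allocator()->allocate();
        // ... fill a and b (e.g. via map()/unmap()) before running.

        add.run();
        CLScheduler::get().sync(); // block until the enqueued OpenCL work completes
        return 0;
    }

The same configure/run pattern applies to the _accumulate_* and _input_gate_sub members introduced above; inside the functions, the asynchronous run_op path takes tensor maps at execution time instead of binding tensors at configure time.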