From 6eb73458c4869165c88d33c6a745a91cdc73a36a Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Thu, 2 Jul 2020 17:39:25 +0100 Subject: COMPMID-3373: Async support to NEArithmetic* kernels/functions (Pt. 2) Signed-off-by: Michalis Spyrou Change-Id: Iec06adb535aaf7efb1838d921e8d6bb978b7b215 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3498 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- .../NEON/kernels/NEPixelWiseMultiplicationKernel.h | 33 +--- arm_compute/runtime/NEON/functions/NELSTMLayer.h | 168 ++++++++++----------- .../runtime/NEON/functions/NENormalizationLayer.h | 12 +- .../NEON/functions/NEPixelWiseMultiplication.h | 139 ++++++++++++++++- arm_compute/runtime/NEON/functions/NEQLSTMLayer.h | 16 +- .../kernels/NEPixelWiseMultiplicationKernel.cpp | 74 ++++----- src/runtime/NEON/functions/NELSTMLayer.cpp | 48 +++--- .../NEON/functions/NENormalizationLayer.cpp | 9 +- .../NEON/functions/NEPixelWiseMultiplication.cpp | 92 ++++++++++- src/runtime/NEON/functions/NEQLSTMLayer.cpp | 38 ++--- 10 files changed, 407 insertions(+), 222 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h index 3cb0874a2f..5483fae565 100644 --- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h +++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h @@ -72,7 +72,7 @@ public: * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16. * @param[in] rounding_policy Rounding policy. 
*/ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplicationKernel * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. @@ -98,8 +98,8 @@ public: */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; + // Inherited methods overridden + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) override; private: /** Common signature for all the specialised multiplication functions with integer scaling factor @@ -136,11 +136,8 @@ private: MulFunctionQuantized *_func_quantized; private: - const ITensor *_input1; - const ITensor *_input2; - ITensor *_output; - float _scale; - int _scale_exponent; + float _scale; + int _scale_exponent; }; /** Interface for the complex pixelwise multiplication kernel. 
*/ @@ -151,23 +148,13 @@ public: { return "NEComplexPixelWiseMultiplicationKernel"; } - /** Default constructor.*/ - NEComplexPixelWiseMultiplicationKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEComplexPixelWiseMultiplicationKernel(const NEComplexPixelWiseMultiplicationKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEComplexPixelWiseMultiplicationKernel &operator=(const NEComplexPixelWiseMultiplicationKernel &) = delete; - /** Allow instances of this class to be moved */ - NEComplexPixelWiseMultiplicationKernel(NEComplexPixelWiseMultiplicationKernel &&) = default; - /** Allow instances of this class to be moved */ - NEComplexPixelWiseMultiplicationKernel &operator=(NEComplexPixelWiseMultiplicationKernel &&) = default; /** Initialise the kernel's input, output and border mode. * * @param[in] input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). * @param[in] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1. * @param[out] output The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1. */ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output); + void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplicationKernel * * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor). 
@@ -179,13 +166,7 @@ public: static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - -private: - const ITensor *_input1; - const ITensor *_input2; - ITensor *_output; + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) override; }; } // namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h index b9b581c484..2e2de61c95 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h @@ -26,7 +26,6 @@ #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" #include "arm_compute/core/NEON/kernels/NECopyKernel.h" -#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" @@ -36,6 +35,7 @@ #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/runtime/common/LSTMParams.h" namespace arm_compute @@ -146,89 +146,89 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - NEFullyConnectedLayer _fully_connected_input_gate; - NEArithmeticAddition _accum_input_gate1; - NEArithmeticSubtraction _subtract_input_gate; - NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate; - NEActivationLayer _activation_input_gate; - NEFullyConnectedLayer _fully_connected_forget_gate; - NEArithmeticAddition _accum_forget_gate1; - NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate; - NEActivationLayer 
_activation_forget_gate; - NEFullyConnectedLayer _fully_connected_cell_state; - NEGEMM _gemm_cell_state1; - NETransposeKernel _transpose_cell_state; - NEArithmeticAddition _accum_cell_state1; - NEArithmeticAddition _accum_cell_state2; - NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_state1; - NEActivationLayer _activation_cell_state; - NEActivationLayer _cell_clip; - NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_state2; - NEFullyConnectedLayer _fully_connected_output; - NEPixelWiseMultiplicationKernel _pixelwise_mul_output_state1; - NEArithmeticAddition _accum_output1; - NEActivationLayer _activation_output; - NEActivationLayer _activation_output_state; - NEPixelWiseMultiplicationKernel _pixelwise_mul_output_state2; - NEFullyConnectedLayer _fully_connected_output_state; - NEActivationLayer _projection_clip; - NECopyKernel _copy_cell_state; - NECopyKernel _copy_output; - NEConcatenateLayer _concat_scratch_buffer; - NEConcatenateLayer _concat_inputs_forget_gate; - NEConcatenateLayer _concat_weights_forget_gate; - NEConcatenateLayer _concat_weights_input_gate; - NEConcatenateLayer _concat_weights_output; - NEMeanStdDevNormalizationLayer _mean_std_norm_input_gate; - NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff; - NEArithmeticAddition _accum_input_gate_bias; - NEMeanStdDevNormalizationLayer _mean_std_norm_forget_gate; - NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate_coeff; - NEArithmeticAddition _accum_forget_gate_bias; - NEMeanStdDevNormalizationLayer _mean_std_norm_cell_gate; - NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_gate_coeff; - NEArithmeticAddition _accum_cell_gate_bias; - NEMeanStdDevNormalizationLayer _mean_std_norm_output_gate; - NEPixelWiseMultiplicationKernel _pixelwise_mul_output_gate_coeff; - NEArithmeticAddition _accum_output_gate_bias; - Tensor _input_gate_out1; - Tensor _input_gate_out2; - Tensor _input_gate_out3; - Tensor _input_gate_out4; - Tensor _forget_gate_out1; - Tensor _forget_gate_out2; - 
Tensor _forget_gate_out3; - Tensor _forget_gate_out4; - Tensor _forget_gate_out5; - Tensor _forget_gate_out6; - Tensor _cell_state_out1; - Tensor _cell_state_out2; - Tensor _cell_state_out3; - Tensor _cell_state_out4; - Tensor _cell_state_out5; - Tensor _output1; - Tensor _output2; - Tensor _output3; - Tensor _output4; - Tensor _cell_state_activation; - Tensor _output_state1; - Tensor _ones; - Tensor _input_layer_norm_out1; - Tensor _input_layer_norm_out2; - Tensor _forget_layer_norm_out1; - Tensor _forget_layer_norm_out2; - Tensor _cell_layer_norm_out1; - Tensor _cell_layer_norm_out2; - Tensor _output_layer_norm_out1; - Tensor _output_layer_norm_out2; - bool _run_peephole_opt; - bool _run_cifg_opt; - bool _perform_cell_clipping; - bool _has_projection_weights; - bool _perform_projection_clipping; - bool _is_prepared; - bool _is_layer_norm_lstm; + MemoryGroup _memory_group; + NEFullyConnectedLayer _fully_connected_input_gate; + NEArithmeticAddition _accum_input_gate1; + NEArithmeticSubtraction _subtract_input_gate; + NEPixelWiseMultiplication _pixelwise_mul_input_gate; + NEActivationLayer _activation_input_gate; + NEFullyConnectedLayer _fully_connected_forget_gate; + NEArithmeticAddition _accum_forget_gate1; + NEPixelWiseMultiplication _pixelwise_mul_forget_gate; + NEActivationLayer _activation_forget_gate; + NEFullyConnectedLayer _fully_connected_cell_state; + NEGEMM _gemm_cell_state1; + NETransposeKernel _transpose_cell_state; + NEArithmeticAddition _accum_cell_state1; + NEArithmeticAddition _accum_cell_state2; + NEPixelWiseMultiplication _pixelwise_mul_cell_state1; + NEActivationLayer _activation_cell_state; + NEActivationLayer _cell_clip; + NEPixelWiseMultiplication _pixelwise_mul_cell_state2; + NEFullyConnectedLayer _fully_connected_output; + NEPixelWiseMultiplication _pixelwise_mul_output_state1; + NEArithmeticAddition _accum_output1; + NEActivationLayer _activation_output; + NEActivationLayer _activation_output_state; + NEPixelWiseMultiplication 
_pixelwise_mul_output_state2; + NEFullyConnectedLayer _fully_connected_output_state; + NEActivationLayer _projection_clip; + NECopyKernel _copy_cell_state; + NECopyKernel _copy_output; + NEConcatenateLayer _concat_scratch_buffer; + NEConcatenateLayer _concat_inputs_forget_gate; + NEConcatenateLayer _concat_weights_forget_gate; + NEConcatenateLayer _concat_weights_input_gate; + NEConcatenateLayer _concat_weights_output; + NEMeanStdDevNormalizationLayer _mean_std_norm_input_gate; + NEPixelWiseMultiplication _pixelwise_mul_input_gate_coeff; + NEArithmeticAddition _accum_input_gate_bias; + NEMeanStdDevNormalizationLayer _mean_std_norm_forget_gate; + NEPixelWiseMultiplication _pixelwise_mul_forget_gate_coeff; + NEArithmeticAddition _accum_forget_gate_bias; + NEMeanStdDevNormalizationLayer _mean_std_norm_cell_gate; + NEPixelWiseMultiplication _pixelwise_mul_cell_gate_coeff; + NEArithmeticAddition _accum_cell_gate_bias; + NEMeanStdDevNormalizationLayer _mean_std_norm_output_gate; + NEPixelWiseMultiplication _pixelwise_mul_output_gate_coeff; + NEArithmeticAddition _accum_output_gate_bias; + Tensor _input_gate_out1; + Tensor _input_gate_out2; + Tensor _input_gate_out3; + Tensor _input_gate_out4; + Tensor _forget_gate_out1; + Tensor _forget_gate_out2; + Tensor _forget_gate_out3; + Tensor _forget_gate_out4; + Tensor _forget_gate_out5; + Tensor _forget_gate_out6; + Tensor _cell_state_out1; + Tensor _cell_state_out2; + Tensor _cell_state_out3; + Tensor _cell_state_out4; + Tensor _cell_state_out5; + Tensor _output1; + Tensor _output2; + Tensor _output3; + Tensor _output4; + Tensor _cell_state_activation; + Tensor _output_state1; + Tensor _ones; + Tensor _input_layer_norm_out1; + Tensor _input_layer_norm_out2; + Tensor _forget_layer_norm_out1; + Tensor _forget_layer_norm_out2; + Tensor _cell_layer_norm_out1; + Tensor _cell_layer_norm_out2; + Tensor _output_layer_norm_out1; + Tensor _output_layer_norm_out2; + bool _run_peephole_opt; + bool _run_cifg_opt; + bool 
_perform_cell_clipping; + bool _has_projection_weights; + bool _perform_projection_clipping; + bool _is_prepared; + bool _is_layer_norm_lstm; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NELSTMLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h index 8683e44d3c..bead01457f 100644 --- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h @@ -28,10 +28,10 @@ #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/runtime/Tensor.h" #include @@ -42,7 +42,7 @@ class ITensor; /** Basic function to compute a normalization layer. 
This function calls the following NEON kernels: * - * -# @ref NEPixelWiseMultiplicationKernel + * -# @ref NEPixelWiseMultiplication * -# @ref NEFillBorderKernel * -# @ref NENormalizationLayerKernel * @@ -75,10 +75,10 @@ public: void run() override; private: - MemoryGroup _memory_group; /**< Function memory group */ - NENormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel */ - NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */ - Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */ + MemoryGroup _memory_group; /**< Function memory group */ + NENormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel */ + NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */ + Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */ }; } #endif /* ARM_COMPUTE_NENORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h index d84dff2c13..3b1209356a 100644 --- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h +++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h @@ -25,15 +25,17 @@ #define ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunction.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/NEON/INEOperator.h" namespace arm_compute { class ITensor; +namespace experimental +{ /** Basic function to run @ref NEPixelWiseMultiplicationKernel */ -class NEPixelWiseMultiplication : public INESimpleFunctionNoBorder +class NEPixelWiseMultiplication : public INEOperator { public: /** Initialise the kernel's inputs, output and convertion policy. @@ -60,7 +62,7 @@ public: * @param[in] rounding_policy Rounding policy. 
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication * @@ -88,12 +90,132 @@ public: */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + MemoryRequirements workspace() const override; }; /** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */ -class NEComplexPixelWiseMultiplication : public INESimpleFunction +class NEComplexPixelWiseMultiplication : public INEOperator { public: + /** Initialise the kernel's inputs, output. + * + * @param[in, out] input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
+ */ + void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication + * + * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor). + * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1. + * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + MemoryRequirements workspace() const override; +}; +} // namespace experimental + +/** Basic function to run @ref NEPixelWiseMultiplicationKernel */ +class NEPixelWiseMultiplication : public IFunction +{ +public: + /** Default Constructor */ + NEPixelWiseMultiplication(); + /** Default Destructor */ + ~NEPixelWiseMultiplication(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPixelWiseMultiplication(const NEPixelWiseMultiplication &) = delete; + /** Default move constructor */ + NEPixelWiseMultiplication(NEPixelWiseMultiplication &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPixelWiseMultiplication &operator=(const NEPixelWiseMultiplication &) = delete; + /** Default move assignment operator */ + NEPixelWiseMultiplication &operator=(NEPixelWiseMultiplication &&); + /** Initialise the kernel's inputs, output and conversion policy. 
+ * + * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. + * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. + * + * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 + * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: + * - U8, only if both inputs are U8. + * - QASYMM8, only if both inputs are QASYMM8. + * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED. + * - S16. + * - QSYMM16, only if both inputs are QSYMM16. + * - S32, only if both inputs are QSYMM16. + * - F16, only if @p input1 is F16. + * - F32, only if both inputs are F32. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16. + * @param[in] rounding_policy Rounding policy. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
+ */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication + * + * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. + * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. + * + * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 + * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[in] output Output tensor info. Data types supported: + * - U8, only if both inputs are U8. + * - QASYMM8, only if both inputs are QASYMM8. + * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED. + * - S16. + * - QSYMM16, only if both inputs are QSYMM16. + * - S32, only if both inputs are QSYMM16. + * - F16, only if @p input1 is F16. + * - F32, only if both inputs are F32. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16. + * @param[in] rounding_policy Rounding policy. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr _impl; +}; + +/** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */ +class NEComplexPixelWiseMultiplication : public IFunction +{ +public: + /** Default Constructor */ + NEComplexPixelWiseMultiplication(); + /** Default Destructor */ + ~NEComplexPixelWiseMultiplication(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEComplexPixelWiseMultiplication(const NEComplexPixelWiseMultiplication &) = delete; + /** Default move constructor */ + NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEComplexPixelWiseMultiplication &operator=(const NEComplexPixelWiseMultiplication &) = delete; + /** Default move assignment operator */ + NEComplexPixelWiseMultiplication &operator=(NEComplexPixelWiseMultiplication &&); /** Initialise the kernel's inputs, output. * * @param[in, out] input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). @@ -112,6 +234,13 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr _impl; }; } #endif /*ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H */ diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h index 60c8fa1226..a19310d8ea 100644 --- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h @@ -26,7 +26,6 @@ #include "arm_compute/core/NEON/kernels/NECopyKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" #include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" @@ -34,6 +33,7 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" +#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/runtime/common/LSTMParams.h" @@ -54,7 +54,7 @@ class ITensor; * -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. 
Accumulators are 32-bit integers * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 * -# @ref NEGEMMLowpMatrixAReductionKernel For precomputing effective biases to use - * -# @ref NEPixelWiseMultiplicationKernel Elementwise multiplication + * -# @ref NEPixelWiseMultiplication Elementwise multiplication * -# @ref NETranspose Transpose function for reshaping the weights * */ class NEQLSTMLayer : public IFunction @@ -257,7 +257,7 @@ private: NEArithmeticAddition _projection_bias_add{}; NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget{}; NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{}; - NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_forget{}; + NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget{}; NEGEMMLowpOutputStage _input_to_forget_outstage{}; NEGEMMLowpOutputStage _recurrent_to_forget_outstage{}; NEGEMMLowpOutputStage _cell_to_forget_outstage{}; @@ -276,12 +276,12 @@ private: NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{}; NEGEMMLowpOutputStage _recurrent_to_input_outstage{}; NEArithmeticAddition _accumulate_input_recurrent_input{}; - NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_input{}; + NEPixelWiseMultiplication _pixelwise_mul_cell_to_input{}; NEGEMMLowpOutputStage _cell_to_input_outstage{}; NEArithmeticAddition _accumulate_cell_input{}; NEActivationLayer _input_gate_sigmoid{}; - NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_cell{}; - NEPixelWiseMultiplicationKernel _pixelwise_mul_input_cell{}; + NEPixelWiseMultiplication _pixelwise_mul_forget_cell{}; + NEPixelWiseMultiplication _pixelwise_mul_input_cell{}; NEArithmeticAddition _add_forget_cell{}; NEActivationLayer _cell_clip{}; NEGEMMLowpMatrixMultiplyCore _mm_input_to_output{}; @@ -289,12 +289,12 @@ private: NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{}; NEGEMMLowpOutputStage _recurrent_to_output_outstage{}; NEArithmeticAddition _accumulate_input_recurrent_output{}; - NEPixelWiseMultiplicationKernel 
_pixelwise_mul_cell_to_output{}; + NEPixelWiseMultiplication _pixelwise_mul_cell_to_output{}; NEGEMMLowpOutputStage _cell_to_output_outstage{}; NEArithmeticAddition _accumulate_cell_to_output{}; NEActivationLayer _output_gate_sigmoid{}; NEActivationLayer _hidden_tanh{}; - NEPixelWiseMultiplicationKernel _pixelwise_mul_hidden{}; + NEPixelWiseMultiplication _pixelwise_mul_hidden{}; NEGEMMLowpOutputStage _hidden_outstage{}; NEGEMMLowpMatrixMultiplyCore _mm_projection{}; NEGEMMLowpOutputStage _projection_outstage{}; diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp index b23a20d019..cd1c4b28cc 100644 --- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp +++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp @@ -1060,27 +1060,24 @@ void mul_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const } // namespace NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel() - : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 } + : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _scale{ 0 }, _scale_exponent{ 0 } { } -void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) { ARM_COMPUTE_UNUSED(rounding_policy); ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy)); - const 
std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); const TensorShape &out_shape = broadcast_pair.first; const ValidRegion &valid_region = broadcast_pair.second; // Auto initialize output if not initialized - set_shape_if_empty(*output->info(), out_shape); + set_shape_if_empty(*output, out_shape); - _input1 = input1; - _input2 = input2; - _output = output; _scale = scale; _scale_exponent = 0; _func_quantized = nullptr; @@ -1104,9 +1101,9 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe _scale_exponent = std::abs(exponent - 1); } - const DataType dt_input1 = input1->info()->data_type(); - const DataType dt_input2 = input2->info()->data_type(); - const DataType dt_output = output->info()->data_type(); + const DataType dt_input1 = input1->data_type(); + const DataType dt_input2 = input2->data_type(); + const DataType dt_output = output->data_type(); const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE); switch(dt_input1) @@ -1207,8 +1204,8 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe // Configure kernel window Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(valid_region); + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(valid_region); Window win = calculate_max_window(valid_region, Steps()); INEKernel::configure(win); @@ -1223,27 +1220,30 @@ Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, cons return Status{}; } -void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info) +void NEPixelWiseMultiplicationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); 
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + auto input1 = inputs.at(TensorType::ACL_SRC_0); + auto input2 = inputs.at(TensorType::ACL_SRC_1); + auto output = outputs.at(TensorType::ACL_DST); + if(_func_quantized != nullptr) { - (*_func_quantized)(_input1, _input2, _output, window, _scale); + (*_func_quantized)(input1, input2, output, window, _scale); } else if(_func_int != nullptr) { - (*_func_int)(_input1, _input2, _output, window, _scale_exponent); + (*_func_int)(input1, input2, output, window, _scale_exponent); } else { ARM_COMPUTE_ERROR_ON(_func_float == nullptr); - (*_func_float)(_input1, _input2, _output, window, _scale); + (*_func_float)(input1, input2, output, window, _scale); } } - namespace { constexpr unsigned int num_elems_processed_per_iteration_complex = 2; @@ -1296,24 +1296,15 @@ std::pair validate_and_configure_window_complex(ITensorInfo *inp } } // namespace -NEComplexPixelWiseMultiplicationKernel::NEComplexPixelWiseMultiplicationKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -void NEComplexPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +void NEComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1->info(), input2->info(), output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output)); // Configure kernel window - auto win_config = validate_and_configure_window_complex(input1->info(), input2->info(), output->info()); + auto win_config = validate_and_configure_window_complex(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - _input1 = input1; - _input2 = input2; - _output = output; - // Create kernel INEKernel::configure(win_config.second); } @@ -1327,27 +1318,24 
@@ Status NEComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input return Status{}; } -void NEComplexPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info) +void NEComplexPixelWiseMultiplicationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - Iterator input1(_input1, window.broadcast_if_dimension_le_one(_input1->info()->tensor_shape())); - Iterator input2(_input2, window.broadcast_if_dimension_le_one(_input2->info()->tensor_shape())); - Iterator output(_output, window); + auto input1 = inputs.at(TensorType::ACL_SRC_0); + auto input2 = inputs.at(TensorType::ACL_SRC_1); + auto output = outputs.at(TensorType::ACL_DST); + + Iterator input1_it(input1, window.broadcast_if_dimension_le_one(input1->info()->tensor_shape())); + Iterator input2_it(input2, window.broadcast_if_dimension_le_one(input2->info()->tensor_shape())); + Iterator output_it(output, window); execute_window_loop(window, [&](const Coordinates &) { - c_mul_F32_F32_F32_n(input1.ptr(), input2.ptr(), output.ptr()); + c_mul_F32_F32_F32_n(input1_it.ptr(), input2_it.ptr(), output_it.ptr()); }, - input1, input2, output); -} - -BorderSize NEComplexPixelWiseMultiplicationKernel::border_size() const -{ - const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = std::min(num_elems_processed_per_iteration_complex - 1U, replicateSize); - return { 0, border, 0, 0 }; + input1_it, input2_it, output_it); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp index d8c684bf15..467c51b1a6 100644 --- a/src/runtime/NEON/functions/NELSTMLayer.cpp +++ 
b/src/runtime/NEON/functions/NELSTMLayer.cpp @@ -464,14 +464,14 @@ Status NELSTMLayer::validate(const ITensorInfo *input, if(lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } if(lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); @@ -498,14 +498,14 @@ Status NELSTMLayer::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, 
lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } if(lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); @@ -522,13 +522,13 @@ Status NELSTMLayer::validate(const ITensorInfo *input, if(lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, 
ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); if(cell_threshold != 0.f) { @@ -548,22 +548,22 @@ Status NELSTMLayer::validate(const ITensorInfo *input, if(lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); } if(lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, 
output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); if(lstm_params.has_projection()) { ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); @@ -603,13 +603,13 @@ void NELSTMLayer::run() if(_run_peephole_opt) { - NEScheduler::get().schedule(&_pixelwise_mul_forget_gate, Window::DimY); + _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } if(_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); - NEScheduler::get().schedule(&_pixelwise_mul_forget_gate_coeff, Window::DimY); + _pixelwise_mul_forget_gate_coeff.run(); _accum_forget_gate_bias.run(); } _activation_forget_gate.run(); @@ -632,14 +632,14 @@ void NELSTMLayer::run() if(_run_peephole_opt) { - NEScheduler::get().schedule(&_pixelwise_mul_input_gate, Window::DimY); + _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } if(_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); - NEScheduler::get().schedule(&_pixelwise_mul_input_gate_coeff, Window::DimY); + _pixelwise_mul_input_gate_coeff.run(); _accum_input_gate_bias.run(); } _activation_input_gate.run(); @@ -652,12 +652,12 @@ void NELSTMLayer::run() if(_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); - NEScheduler::get().schedule(&_pixelwise_mul_cell_gate_coeff, Window::DimY); + 
_pixelwise_mul_cell_gate_coeff.run(); _accum_cell_gate_bias.run(); } _activation_cell_state.run(); - NEScheduler::get().schedule(&_pixelwise_mul_cell_state1, Window::DimY); - NEScheduler::get().schedule(&_pixelwise_mul_cell_state2, Window::DimY); + _pixelwise_mul_cell_state1.run(); + _pixelwise_mul_cell_state2.run(); _accum_cell_state2.run(); if(_perform_cell_clipping) @@ -668,19 +668,19 @@ void NELSTMLayer::run() _fully_connected_output.run(); if(_run_peephole_opt) { - NEScheduler::get().schedule(&_pixelwise_mul_output_state1, Window::DimY); + _pixelwise_mul_output_state1.run(); _accum_output1.run(); } if(_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); - NEScheduler::get().schedule(&_pixelwise_mul_output_gate_coeff, Window::DimY); + _pixelwise_mul_output_gate_coeff.run(); _accum_output_gate_bias.run(); } _activation_output.run(); _activation_output_state.run(); - NEScheduler::get().schedule(&_pixelwise_mul_output_state2, Window::DimY); + _pixelwise_mul_output_state2.run(); if(_has_projection_weights) { diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp index f3a3ac6322..ab8cb656bd 100644 --- a/src/runtime/NEON/functions/NENormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp @@ -33,7 +33,7 @@ namespace arm_compute { NENormalizationLayer::NENormalizationLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), _input_squared() + : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_f(), _input_squared() { } @@ -49,7 +49,7 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons // Configure kernels _norm_kernel.configure(input, &_input_squared, output, norm_info); - _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _multiply_f.configure(input, input, &_input_squared, 1.0f, 
ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); // Allocate the tensor once the configure methods have been called _input_squared.allocator()->allocate(); @@ -61,7 +61,7 @@ Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); return Status{}; } @@ -69,8 +69,7 @@ Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInf void NENormalizationLayer::run() { MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_multiply_kernel, Window::DimY); + _multiply_f.run(); NEScheduler::get().schedule(&_norm_kernel, Window::DimY); } } \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp index 95bc08a5dd..aebb8cab35 100644 --- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp +++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp @@ -31,7 +31,9 @@ namespace arm_compute { -void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +namespace experimental +{ +void NEPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); @@ -46,7 +48,12 @@ Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITen return 
NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy); } -void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +MemoryRequirements NEPixelWiseMultiplication::workspace() const +{ + return MemoryRequirements{}; +} + +void NEComplexPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); auto k = arm_compute::support::cpp14::make_unique<NEComplexPixelWiseMultiplicationKernel>(); @@ -60,4 +67,85 @@ Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, con return NEComplexPixelWiseMultiplicationKernel::validate(input1, input2, output); } +MemoryRequirements NEComplexPixelWiseMultiplication::workspace() const +{ + return MemoryRequirements{}; +} +} // namespace experimental + +struct NEPixelWiseMultiplication::Impl +{ + const ITensor *src_0{ nullptr }; + const ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<experimental::NEPixelWiseMultiplication> op{ nullptr }; +}; + +NEPixelWiseMultiplication::NEPixelWiseMultiplication() + : _impl(support::cpp14::make_unique<Impl>()) +{ +} +NEPixelWiseMultiplication::NEPixelWiseMultiplication(NEPixelWiseMultiplication &&) = default; +NEPixelWiseMultiplication &NEPixelWiseMultiplication::operator=(NEPixelWiseMultiplication &&) = default; +NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default; + +Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) +{ + return experimental::NEPixelWiseMultiplication::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); +} + +void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy,
RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) +{ + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = arm_compute::support::cpp14::make_unique<experimental::NEPixelWiseMultiplication>(); + _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); +} + +void NEPixelWiseMultiplication::run() +{ + const InputTensorMap src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } }; + const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } }; + _impl->op->run(src, dst, {}); +} + +struct NEComplexPixelWiseMultiplication::Impl +{ + ITensor *src_0{ nullptr }; + ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<experimental::NEComplexPixelWiseMultiplication> op{ nullptr }; +}; + +NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() + : _impl(support::cpp14::make_unique<Impl>()) +{ +} +NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&) = default; +NEComplexPixelWiseMultiplication &NEComplexPixelWiseMultiplication::operator=(NEComplexPixelWiseMultiplication &&) = default; +NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default; + +Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +{ + return experimental::NEComplexPixelWiseMultiplication::validate(input1, input2, output, act_info); +} + +void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +{ + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = arm_compute::support::cpp14::make_unique<experimental::NEComplexPixelWiseMultiplication>(); + _impl->op->configure(input1->info(), input2->info(), output->info(), act_info); +} + +void NEComplexPixelWiseMultiplication::run() +{ + const InputTensorMap src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1,
_impl->src_1 } }; + const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } }; + _impl->op->run(src, dst, {}); +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp index 6eb1844a1f..018d0f4d0e 100644 --- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp +++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp @@ -357,7 +357,7 @@ void NEQLSTMLayer::configure(const ITensor *input, input_activation_input->allocator()->allocate(); } // Cell. - // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel + // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); @@ -392,7 +392,7 @@ void NEQLSTMLayer::configure(const ITensor *input, if(_has_peephole) { - // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel + // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); @@ -426,7 +426,7 @@ void NEQLSTMLayer::configure(const ITensor *input, // Hidden. 
_hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); - // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel + // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); @@ -667,8 +667,8 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, if(lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); @@ -737,8 +737,8 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, if(lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, 
lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); @@ -755,8 +755,8 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); if(quantized_cell_clip > 0) { @@ -776,12 +776,12 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, if(lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); - // TODO(COMPMID-3395): Perform multiplication in 
the quantized domain in NEPixelWiseMultiplicationKernel + // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); } @@ -799,7 +799,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); 
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); @@ -897,7 +897,7 @@ void NEQLSTMLayer::run() if(_has_peephole) { - NEScheduler::get().schedule(&_pixelwise_mul_cell_to_forget, Window::DimY); + _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); _accumulate_cell_forget.run(); } @@ -939,7 +939,7 @@ void NEQLSTMLayer::run() if(_has_peephole) { - NEScheduler::get().schedule(&_pixelwise_mul_cell_to_input, Window::DimY); + _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); _accumulate_cell_input.run(); } @@ -953,8 +953,8 @@ void NEQLSTMLayer::run() } // Cell. - NEScheduler::get().schedule(&_pixelwise_mul_forget_cell, Window::DimY); - NEScheduler::get().schedule(&_pixelwise_mul_input_cell, Window::DimY); + _pixelwise_mul_forget_cell.run(); + _pixelwise_mul_input_cell.run(); _add_forget_cell.run(); if(_has_cell_clipping) @@ -970,7 +970,7 @@ void NEQLSTMLayer::run() _accumulate_input_recurrent_output.run(); if(_has_peephole) { - NEScheduler::get().schedule(&_pixelwise_mul_cell_to_output, Window::DimY); + _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); _accumulate_cell_to_output.run(); } @@ -984,7 +984,7 @@ void NEQLSTMLayer::run() // Hidden. _hidden_tanh.run(); - NEScheduler::get().schedule(&_pixelwise_mul_hidden, Window::DimY); + _pixelwise_mul_hidden.run(); _hidden_outstage.run(); // Projection. -- cgit v1.2.1