aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Michalis Spyrou <michalis.spyrou@arm.com> 2020-07-02 17:39:25 +0100
committer: Michalis Spyrou <michalis.spyrou@arm.com> 2020-07-07 09:14:00 +0000
commit: 6eb73458c4869165c88d33c6a745a91cdc73a36a (patch)
tree: 1f22bd141f420ad4e2906939bb4abf11fec3aea3
parent: 1fad814022ba98506ba30b2e25601985e7ec5259 (diff)
download: ComputeLibrary-6eb73458c4869165c88d33c6a745a91cdc73a36a.tar.gz
COMPMID-3373: Async support to NEArithmetic* kernels/functions (Pt. 2)
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: Iec06adb535aaf7efb1838d921e8d6bb978b7b215
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3498
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h | 33
-rw-r--r-- arm_compute/runtime/NEON/functions/NELSTMLayer.h | 168
-rw-r--r-- arm_compute/runtime/NEON/functions/NENormalizationLayer.h | 12
-rw-r--r-- arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h | 139
-rw-r--r-- arm_compute/runtime/NEON/functions/NEQLSTMLayer.h | 16
-rw-r--r-- src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp | 74
-rw-r--r-- src/runtime/NEON/functions/NELSTMLayer.cpp | 48
-rw-r--r-- src/runtime/NEON/functions/NENormalizationLayer.cpp | 9
-rw-r--r-- src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp | 92
-rw-r--r-- src/runtime/NEON/functions/NEQLSTMLayer.cpp | 38
10 files changed, 407 insertions, 222 deletions
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index 3cb0874a2f..5483fae565 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -72,7 +72,7 @@ public:
* @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
* @param[in] rounding_policy Rounding policy.
*/
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
/** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplicationKernel
*
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
@@ -98,8 +98,8 @@ public:
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
+ // Inherited methods overridden
+ void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) override;
private:
/** Common signature for all the specialised multiplication functions with integer scaling factor
@@ -136,11 +136,8 @@ private:
MulFunctionQuantized *_func_quantized;
private:
- const ITensor *_input1;
- const ITensor *_input2;
- ITensor *_output;
- float _scale;
- int _scale_exponent;
+ float _scale;
+ int _scale_exponent;
};
/** Interface for the complex pixelwise multiplication kernel. */
@@ -151,23 +148,13 @@ public:
{
return "NEComplexPixelWiseMultiplicationKernel";
}
- /** Default constructor.*/
- NEComplexPixelWiseMultiplicationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEComplexPixelWiseMultiplicationKernel(const NEComplexPixelWiseMultiplicationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEComplexPixelWiseMultiplicationKernel &operator=(const NEComplexPixelWiseMultiplicationKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEComplexPixelWiseMultiplicationKernel(NEComplexPixelWiseMultiplicationKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEComplexPixelWiseMultiplicationKernel &operator=(NEComplexPixelWiseMultiplicationKernel &&) = default;
/** Initialise the kernel's input, output and border mode.
*
* @param[in] input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
* @param[in] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[out] output The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
*/
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+ void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplicationKernel
*
* @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
@@ -179,13 +166,7 @@ public:
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
// Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- const ITensor *_input1;
- const ITensor *_input2;
- ITensor *_output;
+ void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) override;
};
} // namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
index b9b581c484..2e2de61c95 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
@@ -26,7 +26,6 @@
#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
@@ -36,6 +35,7 @@
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/runtime/common/LSTMParams.h"
namespace arm_compute
@@ -146,89 +146,89 @@ public:
void prepare() override;
private:
- MemoryGroup _memory_group;
- NEFullyConnectedLayer _fully_connected_input_gate;
- NEArithmeticAddition _accum_input_gate1;
- NEArithmeticSubtraction _subtract_input_gate;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate;
- NEActivationLayer _activation_input_gate;
- NEFullyConnectedLayer _fully_connected_forget_gate;
- NEArithmeticAddition _accum_forget_gate1;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate;
- NEActivationLayer _activation_forget_gate;
- NEFullyConnectedLayer _fully_connected_cell_state;
- NEGEMM _gemm_cell_state1;
- NETransposeKernel _transpose_cell_state;
- NEArithmeticAddition _accum_cell_state1;
- NEArithmeticAddition _accum_cell_state2;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_state1;
- NEActivationLayer _activation_cell_state;
- NEActivationLayer _cell_clip;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_state2;
- NEFullyConnectedLayer _fully_connected_output;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_output_state1;
- NEArithmeticAddition _accum_output1;
- NEActivationLayer _activation_output;
- NEActivationLayer _activation_output_state;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_output_state2;
- NEFullyConnectedLayer _fully_connected_output_state;
- NEActivationLayer _projection_clip;
- NECopyKernel _copy_cell_state;
- NECopyKernel _copy_output;
- NEConcatenateLayer _concat_scratch_buffer;
- NEConcatenateLayer _concat_inputs_forget_gate;
- NEConcatenateLayer _concat_weights_forget_gate;
- NEConcatenateLayer _concat_weights_input_gate;
- NEConcatenateLayer _concat_weights_output;
- NEMeanStdDevNormalizationLayer _mean_std_norm_input_gate;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff;
- NEArithmeticAddition _accum_input_gate_bias;
- NEMeanStdDevNormalizationLayer _mean_std_norm_forget_gate;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate_coeff;
- NEArithmeticAddition _accum_forget_gate_bias;
- NEMeanStdDevNormalizationLayer _mean_std_norm_cell_gate;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_gate_coeff;
- NEArithmeticAddition _accum_cell_gate_bias;
- NEMeanStdDevNormalizationLayer _mean_std_norm_output_gate;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_output_gate_coeff;
- NEArithmeticAddition _accum_output_gate_bias;
- Tensor _input_gate_out1;
- Tensor _input_gate_out2;
- Tensor _input_gate_out3;
- Tensor _input_gate_out4;
- Tensor _forget_gate_out1;
- Tensor _forget_gate_out2;
- Tensor _forget_gate_out3;
- Tensor _forget_gate_out4;
- Tensor _forget_gate_out5;
- Tensor _forget_gate_out6;
- Tensor _cell_state_out1;
- Tensor _cell_state_out2;
- Tensor _cell_state_out3;
- Tensor _cell_state_out4;
- Tensor _cell_state_out5;
- Tensor _output1;
- Tensor _output2;
- Tensor _output3;
- Tensor _output4;
- Tensor _cell_state_activation;
- Tensor _output_state1;
- Tensor _ones;
- Tensor _input_layer_norm_out1;
- Tensor _input_layer_norm_out2;
- Tensor _forget_layer_norm_out1;
- Tensor _forget_layer_norm_out2;
- Tensor _cell_layer_norm_out1;
- Tensor _cell_layer_norm_out2;
- Tensor _output_layer_norm_out1;
- Tensor _output_layer_norm_out2;
- bool _run_peephole_opt;
- bool _run_cifg_opt;
- bool _perform_cell_clipping;
- bool _has_projection_weights;
- bool _perform_projection_clipping;
- bool _is_prepared;
- bool _is_layer_norm_lstm;
+ MemoryGroup _memory_group;
+ NEFullyConnectedLayer _fully_connected_input_gate;
+ NEArithmeticAddition _accum_input_gate1;
+ NEArithmeticSubtraction _subtract_input_gate;
+ NEPixelWiseMultiplication _pixelwise_mul_input_gate;
+ NEActivationLayer _activation_input_gate;
+ NEFullyConnectedLayer _fully_connected_forget_gate;
+ NEArithmeticAddition _accum_forget_gate1;
+ NEPixelWiseMultiplication _pixelwise_mul_forget_gate;
+ NEActivationLayer _activation_forget_gate;
+ NEFullyConnectedLayer _fully_connected_cell_state;
+ NEGEMM _gemm_cell_state1;
+ NETransposeKernel _transpose_cell_state;
+ NEArithmeticAddition _accum_cell_state1;
+ NEArithmeticAddition _accum_cell_state2;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_state1;
+ NEActivationLayer _activation_cell_state;
+ NEActivationLayer _cell_clip;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_state2;
+ NEFullyConnectedLayer _fully_connected_output;
+ NEPixelWiseMultiplication _pixelwise_mul_output_state1;
+ NEArithmeticAddition _accum_output1;
+ NEActivationLayer _activation_output;
+ NEActivationLayer _activation_output_state;
+ NEPixelWiseMultiplication _pixelwise_mul_output_state2;
+ NEFullyConnectedLayer _fully_connected_output_state;
+ NEActivationLayer _projection_clip;
+ NECopyKernel _copy_cell_state;
+ NECopyKernel _copy_output;
+ NEConcatenateLayer _concat_scratch_buffer;
+ NEConcatenateLayer _concat_inputs_forget_gate;
+ NEConcatenateLayer _concat_weights_forget_gate;
+ NEConcatenateLayer _concat_weights_input_gate;
+ NEConcatenateLayer _concat_weights_output;
+ NEMeanStdDevNormalizationLayer _mean_std_norm_input_gate;
+ NEPixelWiseMultiplication _pixelwise_mul_input_gate_coeff;
+ NEArithmeticAddition _accum_input_gate_bias;
+ NEMeanStdDevNormalizationLayer _mean_std_norm_forget_gate;
+ NEPixelWiseMultiplication _pixelwise_mul_forget_gate_coeff;
+ NEArithmeticAddition _accum_forget_gate_bias;
+ NEMeanStdDevNormalizationLayer _mean_std_norm_cell_gate;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_gate_coeff;
+ NEArithmeticAddition _accum_cell_gate_bias;
+ NEMeanStdDevNormalizationLayer _mean_std_norm_output_gate;
+ NEPixelWiseMultiplication _pixelwise_mul_output_gate_coeff;
+ NEArithmeticAddition _accum_output_gate_bias;
+ Tensor _input_gate_out1;
+ Tensor _input_gate_out2;
+ Tensor _input_gate_out3;
+ Tensor _input_gate_out4;
+ Tensor _forget_gate_out1;
+ Tensor _forget_gate_out2;
+ Tensor _forget_gate_out3;
+ Tensor _forget_gate_out4;
+ Tensor _forget_gate_out5;
+ Tensor _forget_gate_out6;
+ Tensor _cell_state_out1;
+ Tensor _cell_state_out2;
+ Tensor _cell_state_out3;
+ Tensor _cell_state_out4;
+ Tensor _cell_state_out5;
+ Tensor _output1;
+ Tensor _output2;
+ Tensor _output3;
+ Tensor _output4;
+ Tensor _cell_state_activation;
+ Tensor _output_state1;
+ Tensor _ones;
+ Tensor _input_layer_norm_out1;
+ Tensor _input_layer_norm_out2;
+ Tensor _forget_layer_norm_out1;
+ Tensor _forget_layer_norm_out2;
+ Tensor _cell_layer_norm_out1;
+ Tensor _cell_layer_norm_out2;
+ Tensor _output_layer_norm_out1;
+ Tensor _output_layer_norm_out2;
+ bool _run_peephole_opt;
+ bool _run_cifg_opt;
+ bool _perform_cell_clipping;
+ bool _has_projection_weights;
+ bool _perform_projection_clipping;
+ bool _is_prepared;
+ bool _is_layer_norm_lstm;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NELSTMLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index 8683e44d3c..bead01457f 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -28,10 +28,10 @@
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
@@ -42,7 +42,7 @@ class ITensor;
/** Basic function to compute a normalization layer. This function calls the following NEON kernels:
*
- * -# @ref NEPixelWiseMultiplicationKernel
+ * -# @ref NEPixelWiseMultiplication
* -# @ref NEFillBorderKernel
* -# @ref NENormalizationLayerKernel
*
@@ -75,10 +75,10 @@ public:
void run() override;
private:
- MemoryGroup _memory_group; /**< Function memory group */
- NENormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel */
- NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */
- Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */
+ MemoryGroup _memory_group; /**< Function memory group */
+ NENormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel */
+ NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */
+ Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */
};
}
#endif /* ARM_COMPUTE_NENORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index d84dff2c13..3b1209356a 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -25,15 +25,17 @@
#define ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
namespace arm_compute
{
class ITensor;
+namespace experimental
+{
/** Basic function to run @ref NEPixelWiseMultiplicationKernel */
-class NEPixelWiseMultiplication : public INESimpleFunctionNoBorder
+class NEPixelWiseMultiplication : public INEOperator
{
public:
/** Initialise the kernel's inputs, output and convertion policy.
@@ -60,7 +62,7 @@ public:
* @param[in] rounding_policy Rounding policy.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
*
@@ -88,10 +90,13 @@ public:
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ MemoryRequirements workspace() const override;
};
/** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */
-class NEComplexPixelWiseMultiplication : public INESimpleFunction
+class NEComplexPixelWiseMultiplication : public INEOperator
{
public:
/** Initialise the kernel's inputs, output.
@@ -103,6 +108,123 @@ public:
* @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
+ void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication
+ *
+ * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+ * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+ * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ MemoryRequirements workspace() const override;
+};
+} // namespace experimental
+
+/** Basic function to run @ref NEPixelWiseMultiplicationKernel */
+class NEPixelWiseMultiplication : public IFunction
+{
+public:
+ /** Default Constructor */
+ NEPixelWiseMultiplication();
+ /** Default Destructor */
+ ~NEPixelWiseMultiplication();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPixelWiseMultiplication(const NEPixelWiseMultiplication &) = delete;
+ /** Default move constructor */
+ NEPixelWiseMultiplication(NEPixelWiseMultiplication &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPixelWiseMultiplication &operator=(const NEPixelWiseMultiplication &) = delete;
+ /** Default move assignment operator */
+ NEPixelWiseMultiplication &operator=(NEPixelWiseMultiplication &&);
+ /** Initialise the kernel's inputs, output and convertion policy.
+ *
+ * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+ * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+ *
+ * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+ * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+ * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output Output tensor. Data types supported:
+ * - U8, only if both inputs are U8.
+ * - QASYMM8, only if both inputs are QASYMM8.
+ * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
+ * - S16.
+ * - QSYMM16, only if both inputs are QSYMM16.
+ * - S32, only if both inputs are QSYMM16.
+ * - F16, only if @p input1 is F16.
+ * - F32, only if both inputs are F32.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+ * @param[in] rounding_policy Rounding policy.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
+ *
+ * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+ * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+ *
+ * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+ * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+ * @param[in] output Output tensor info. Data types supported:
+ * - U8, only if both inputs are U8.
+ * - QASYMM8, only if both inputs are QASYMM8.
+ * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
+ * - S16.
+ * - QSYMM16, only if both inputs are QSYMM16.
+ * - S32, only if both inputs are QSYMM16.
+ * - F16, only if @p input1 is F16.
+ * - F32, only if both inputs are F32.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+ * @param[in] rounding_policy Rounding policy.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+
+/** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */
+class NEComplexPixelWiseMultiplication : public IFunction
+{
+public:
+ /** Default Constructor */
+ NEComplexPixelWiseMultiplication();
+ /** Default Destructor */
+ ~NEComplexPixelWiseMultiplication();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEComplexPixelWiseMultiplication(const NEComplexPixelWiseMultiplication &) = delete;
+ /** Default move constructor */
+ NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEComplexPixelWiseMultiplication &operator=(const NEComplexPixelWiseMultiplication &) = delete;
+ /** Default move assignment operator */
+ NEComplexPixelWiseMultiplication &operator=(NEComplexPixelWiseMultiplication &&);
+ /** Initialise the kernel's inputs, output.
+ *
+ * @param[in, out] input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ */
void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication
*
@@ -112,6 +234,13 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
}
#endif /*ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H */
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index 60c8fa1226..a19310d8ea 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -26,7 +26,6 @@
#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
#include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
@@ -34,6 +33,7 @@
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
#include "arm_compute/runtime/common/LSTMParams.h"
@@ -54,7 +54,7 @@ class ITensor;
* -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
* -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
* -# @ref NEGEMMLowpMatrixAReductionKernel For precomputing effective biases to use
- * -# @ref NEPixelWiseMultiplicationKernel Elementwise multiplication
+ * -# @ref NEPixelWiseMultiplication Elementwise multiplication
* -# @ref NETranspose Transpose function for reshaping the weights
* */
class NEQLSTMLayer : public IFunction
@@ -257,7 +257,7 @@ private:
NEArithmeticAddition _projection_bias_add{};
NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget{};
NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{};
- NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_forget{};
+ NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget{};
NEGEMMLowpOutputStage _input_to_forget_outstage{};
NEGEMMLowpOutputStage _recurrent_to_forget_outstage{};
NEGEMMLowpOutputStage _cell_to_forget_outstage{};
@@ -276,12 +276,12 @@ private:
NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{};
NEGEMMLowpOutputStage _recurrent_to_input_outstage{};
NEArithmeticAddition _accumulate_input_recurrent_input{};
- NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_input{};
+ NEPixelWiseMultiplication _pixelwise_mul_cell_to_input{};
NEGEMMLowpOutputStage _cell_to_input_outstage{};
NEArithmeticAddition _accumulate_cell_input{};
NEActivationLayer _input_gate_sigmoid{};
- NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_cell{};
- NEPixelWiseMultiplicationKernel _pixelwise_mul_input_cell{};
+ NEPixelWiseMultiplication _pixelwise_mul_forget_cell{};
+ NEPixelWiseMultiplication _pixelwise_mul_input_cell{};
NEArithmeticAddition _add_forget_cell{};
NEActivationLayer _cell_clip{};
NEGEMMLowpMatrixMultiplyCore _mm_input_to_output{};
@@ -289,12 +289,12 @@ private:
NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{};
NEGEMMLowpOutputStage _recurrent_to_output_outstage{};
NEArithmeticAddition _accumulate_input_recurrent_output{};
- NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_output{};
+ NEPixelWiseMultiplication _pixelwise_mul_cell_to_output{};
NEGEMMLowpOutputStage _cell_to_output_outstage{};
NEArithmeticAddition _accumulate_cell_to_output{};
NEActivationLayer _output_gate_sigmoid{};
NEActivationLayer _hidden_tanh{};
- NEPixelWiseMultiplicationKernel _pixelwise_mul_hidden{};
+ NEPixelWiseMultiplication _pixelwise_mul_hidden{};
NEGEMMLowpOutputStage _hidden_outstage{};
NEGEMMLowpMatrixMultiplyCore _mm_projection{};
NEGEMMLowpOutputStage _projection_outstage{};
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index b23a20d019..cd1c4b28cc 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -1060,27 +1060,24 @@ void mul_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const
} // namespace
NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
- : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
+ : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
{
}
-void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
ARM_COMPUTE_UNUSED(rounding_policy);
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
const TensorShape &out_shape = broadcast_pair.first;
const ValidRegion &valid_region = broadcast_pair.second;
// Auto initialize output if not initialized
- set_shape_if_empty(*output->info(), out_shape);
+ set_shape_if_empty(*output, out_shape);
- _input1 = input1;
- _input2 = input2;
- _output = output;
_scale = scale;
_scale_exponent = 0;
_func_quantized = nullptr;
@@ -1104,9 +1101,9 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
_scale_exponent = std::abs(exponent - 1);
}
- const DataType dt_input1 = input1->info()->data_type();
- const DataType dt_input2 = input2->info()->data_type();
- const DataType dt_output = output->info()->data_type();
+ const DataType dt_input1 = input1->data_type();
+ const DataType dt_input2 = input2->data_type();
+ const DataType dt_output = output->data_type();
const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
switch(dt_input1)
@@ -1207,8 +1204,8 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
// Configure kernel window
Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(valid_region);
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(valid_region);
Window win = calculate_max_window(valid_region, Steps());
INEKernel::configure(win);
@@ -1223,27 +1220,30 @@ Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, cons
return Status{};
}
-void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
+void NEPixelWiseMultiplicationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ auto input1 = inputs.at(TensorType::ACL_SRC_0);
+ auto input2 = inputs.at(TensorType::ACL_SRC_1);
+ auto output = outputs.at(TensorType::ACL_DST);
+
if(_func_quantized != nullptr)
{
- (*_func_quantized)(_input1, _input2, _output, window, _scale);
+ (*_func_quantized)(input1, input2, output, window, _scale);
}
else if(_func_int != nullptr)
{
- (*_func_int)(_input1, _input2, _output, window, _scale_exponent);
+ (*_func_int)(input1, input2, output, window, _scale_exponent);
}
else
{
ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
- (*_func_float)(_input1, _input2, _output, window, _scale);
+ (*_func_float)(input1, input2, output, window, _scale);
}
}
-
namespace
{
constexpr unsigned int num_elems_processed_per_iteration_complex = 2;
@@ -1296,24 +1296,15 @@ std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *inp
}
} // namespace
-NEComplexPixelWiseMultiplicationKernel::NEComplexPixelWiseMultiplicationKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void NEComplexPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1->info(), input2->info(), output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output));
// Configure kernel window
- auto win_config = validate_and_configure_window_complex(input1->info(), input2->info(), output->info());
+ auto win_config = validate_and_configure_window_complex(input1, input2, output);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
// Create kernel
INEKernel::configure(win_config.second);
}
@@ -1327,27 +1318,24 @@ Status NEComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input
return Status{};
}
-void NEComplexPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
+void NEComplexPixelWiseMultiplicationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- Iterator input1(_input1, window.broadcast_if_dimension_le_one(_input1->info()->tensor_shape()));
- Iterator input2(_input2, window.broadcast_if_dimension_le_one(_input2->info()->tensor_shape()));
- Iterator output(_output, window);
+ auto input1 = inputs.at(TensorType::ACL_SRC_0);
+ auto input2 = inputs.at(TensorType::ACL_SRC_1);
+ auto output = outputs.at(TensorType::ACL_DST);
+
+ Iterator input1_it(input1, window.broadcast_if_dimension_le_one(input1->info()->tensor_shape()));
+ Iterator input2_it(input2, window.broadcast_if_dimension_le_one(input2->info()->tensor_shape()));
+ Iterator output_it(output, window);
execute_window_loop(window, [&](const Coordinates &)
{
- c_mul_F32_F32_F32_n(input1.ptr(), input2.ptr(), output.ptr());
+ c_mul_F32_F32_F32_n(input1_it.ptr(), input2_it.ptr(), output_it.ptr());
},
- input1, input2, output);
-}
-
-BorderSize NEComplexPixelWiseMultiplicationKernel::border_size() const
-{
- const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration_complex - 1U, replicateSize);
- return { 0, border, 0, 0 };
+ input1_it, input2_it, output_it);
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index d8c684bf15..467c51b1a6 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -464,14 +464,14 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
if(lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
if(lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
}
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
@@ -498,14 +498,14 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
}
if(lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
}
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
@@ -522,13 +522,13 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
if(lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
}
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
if(cell_threshold != 0.f)
{
@@ -548,22 +548,22 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
if(lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
}
if(lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
}
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate output state
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
if(lstm_params.has_projection())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
@@ -603,13 +603,13 @@ void NELSTMLayer::run()
if(_run_peephole_opt)
{
- NEScheduler::get().schedule(&_pixelwise_mul_forget_gate, Window::DimY);
+ _pixelwise_mul_forget_gate.run();
_accum_forget_gate1.run();
}
if(_is_layer_norm_lstm)
{
_mean_std_norm_forget_gate.run();
- NEScheduler::get().schedule(&_pixelwise_mul_forget_gate_coeff, Window::DimY);
+ _pixelwise_mul_forget_gate_coeff.run();
_accum_forget_gate_bias.run();
}
_activation_forget_gate.run();
@@ -632,14 +632,14 @@ void NELSTMLayer::run()
if(_run_peephole_opt)
{
- NEScheduler::get().schedule(&_pixelwise_mul_input_gate, Window::DimY);
+ _pixelwise_mul_input_gate.run();
_accum_input_gate1.run();
}
if(_is_layer_norm_lstm)
{
_mean_std_norm_input_gate.run();
- NEScheduler::get().schedule(&_pixelwise_mul_input_gate_coeff, Window::DimY);
+ _pixelwise_mul_input_gate_coeff.run();
_accum_input_gate_bias.run();
}
_activation_input_gate.run();
@@ -652,12 +652,12 @@ void NELSTMLayer::run()
if(_is_layer_norm_lstm)
{
_mean_std_norm_cell_gate.run();
- NEScheduler::get().schedule(&_pixelwise_mul_cell_gate_coeff, Window::DimY);
+ _pixelwise_mul_cell_gate_coeff.run();
_accum_cell_gate_bias.run();
}
_activation_cell_state.run();
- NEScheduler::get().schedule(&_pixelwise_mul_cell_state1, Window::DimY);
- NEScheduler::get().schedule(&_pixelwise_mul_cell_state2, Window::DimY);
+ _pixelwise_mul_cell_state1.run();
+ _pixelwise_mul_cell_state2.run();
_accum_cell_state2.run();
if(_perform_cell_clipping)
@@ -668,19 +668,19 @@ void NELSTMLayer::run()
_fully_connected_output.run();
if(_run_peephole_opt)
{
- NEScheduler::get().schedule(&_pixelwise_mul_output_state1, Window::DimY);
+ _pixelwise_mul_output_state1.run();
_accum_output1.run();
}
if(_is_layer_norm_lstm)
{
_mean_std_norm_output_gate.run();
- NEScheduler::get().schedule(&_pixelwise_mul_output_gate_coeff, Window::DimY);
+ _pixelwise_mul_output_gate_coeff.run();
_accum_output_gate_bias.run();
}
_activation_output.run();
_activation_output_state.run();
- NEScheduler::get().schedule(&_pixelwise_mul_output_state2, Window::DimY);
+ _pixelwise_mul_output_state2.run();
if(_has_projection_weights)
{
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index f3a3ac6322..ab8cb656bd 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -33,7 +33,7 @@
namespace arm_compute
{
NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), _input_squared()
+ : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_f(), _input_squared()
{
}
@@ -49,7 +49,7 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons
// Configure kernels
_norm_kernel.configure(input, &_input_squared, output, norm_info);
- _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _multiply_f.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
// Allocate the tensor once the configure methods have been called
_input_squared.allocator()->allocate();
@@ -61,7 +61,7 @@ Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInf
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
return Status{};
}
@@ -69,8 +69,7 @@ Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInf
void NENormalizationLayer::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
-
- NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
+ _multiply_f.run();
NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
}
} \ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 95bc08a5dd..aebb8cab35 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -31,7 +31,9 @@
namespace arm_compute
{
-void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+namespace experimental
+{
+void NEPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
@@ -46,7 +48,12 @@ Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITen
return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
}
-void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+MemoryRequirements NEPixelWiseMultiplication::workspace() const
+{
+ return MemoryRequirements{};
+}
+
+void NEComplexPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
auto k = arm_compute::support::cpp14::make_unique<NEComplexPixelWiseMultiplicationKernel>();
@@ -60,4 +67,85 @@ Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, con
return NEComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
}
+MemoryRequirements NEComplexPixelWiseMultiplication::workspace() const
+{
+ return MemoryRequirements{};
+}
+} // namespace experimental
+
+struct NEPixelWiseMultiplication::Impl
+{
+ const ITensor *src_0{ nullptr };
+ const ITensor *src_1{ nullptr };
+ ITensor *dst{ nullptr };
+ std::unique_ptr<experimental::NEPixelWiseMultiplication> op{ nullptr };
+};
+
+NEPixelWiseMultiplication::NEPixelWiseMultiplication()
+ : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEPixelWiseMultiplication::NEPixelWiseMultiplication(NEPixelWiseMultiplication &&) = default;
+NEPixelWiseMultiplication &NEPixelWiseMultiplication::operator=(NEPixelWiseMultiplication &&) = default;
+NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default;
+
+Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ return experimental::NEPixelWiseMultiplication::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+}
+
+void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ _impl->src_0 = input1;
+ _impl->src_1 = input2;
+ _impl->dst = output;
+ _impl->op = arm_compute::support::cpp14::make_unique<experimental::NEPixelWiseMultiplication>();
+ _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
+}
+
+void NEPixelWiseMultiplication::run()
+{
+ const InputTensorMap src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+ const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+ _impl->op->run(src, dst, {});
+}
+
+struct NEComplexPixelWiseMultiplication::Impl
+{
+ ITensor *src_0{ nullptr };
+ ITensor *src_1{ nullptr };
+ ITensor *dst{ nullptr };
+ std::unique_ptr<experimental::NEComplexPixelWiseMultiplication> op{ nullptr };
+};
+
+NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication()
+ : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&) = default;
+NEComplexPixelWiseMultiplication &NEComplexPixelWiseMultiplication::operator=(NEComplexPixelWiseMultiplication &&) = default;
+NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default;
+
+Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+ return experimental::NEComplexPixelWiseMultiplication::validate(input1, input2, output, act_info);
+}
+
+void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+ _impl->src_0 = input1;
+ _impl->src_1 = input2;
+ _impl->dst = output;
+ _impl->op = arm_compute::support::cpp14::make_unique<experimental::NEComplexPixelWiseMultiplication>();
+ _impl->op->configure(input1->info(), input2->info(), output->info(), act_info);
+}
+
+void NEComplexPixelWiseMultiplication::run()
+{
+ const InputTensorMap src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+ const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+ _impl->op->run(src, dst, {});
+}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index 6eb1844a1f..018d0f4d0e 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -357,7 +357,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
input_activation_input->allocator()->allocate();
}
// Cell.
- // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
+ // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
_pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;
const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
@@ -392,7 +392,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
if(_has_peephole)
{
- // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
+ // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
// Here we are not using the output stage because all operations are done in float
_mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_output_res);
@@ -426,7 +426,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
// Hidden.
_hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
- // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
+ // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
_memory_group.manage(&_hidden_mul_res);
const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
_hidden_mul_res.allocator()->init(hidden_mul_res);
@@ -667,8 +667,8 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
@@ -737,8 +737,8 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
if(lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
@@ -755,8 +755,8 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
}
// Cell.
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
if(quantized_cell_clip > 0)
{
@@ -776,12 +776,12 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
- // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
+ // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
// Here we are not using the output stage because all operations are done in float
// const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
// ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
}
@@ -799,7 +799,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
@@ -897,7 +897,7 @@ void NEQLSTMLayer::run()
if(_has_peephole)
{
- NEScheduler::get().schedule(&_pixelwise_mul_cell_to_forget, Window::DimY);
+ _pixelwise_mul_cell_to_forget.run();
_cell_to_forget_outstage.run();
_accumulate_cell_forget.run();
}
@@ -939,7 +939,7 @@ void NEQLSTMLayer::run()
if(_has_peephole)
{
- NEScheduler::get().schedule(&_pixelwise_mul_cell_to_input, Window::DimY);
+ _pixelwise_mul_cell_to_input.run();
_cell_to_input_outstage.run();
_accumulate_cell_input.run();
}
@@ -953,8 +953,8 @@ void NEQLSTMLayer::run()
}
// Cell.
- NEScheduler::get().schedule(&_pixelwise_mul_forget_cell, Window::DimY);
- NEScheduler::get().schedule(&_pixelwise_mul_input_cell, Window::DimY);
+ _pixelwise_mul_forget_cell.run();
+ _pixelwise_mul_input_cell.run();
_add_forget_cell.run();
if(_has_cell_clipping)
@@ -970,7 +970,7 @@ void NEQLSTMLayer::run()
_accumulate_input_recurrent_output.run();
if(_has_peephole)
{
- NEScheduler::get().schedule(&_pixelwise_mul_cell_to_output, Window::DimY);
+ _pixelwise_mul_cell_to_output.run();
_cell_to_output_outstage.run();
_accumulate_cell_to_output.run();
}
@@ -984,7 +984,7 @@ void NEQLSTMLayer::run()
// Hidden.
_hidden_tanh.run();
- NEScheduler::get().schedule(&_pixelwise_mul_hidden, Window::DimY);
+ _pixelwise_mul_hidden.run();
_hidden_outstage.run();
// Projection.