From 6eb73458c4869165c88d33c6a745a91cdc73a36a Mon Sep 17 00:00:00 2001
From: Michalis Spyrou <michalis.spyrou@arm.com>
Date: Thu, 2 Jul 2020 17:39:25 +0100
Subject: COMPMID-3373: Async support to NEArithmetic* kernels/functions (Pt.
 2)

Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: Iec06adb535aaf7efb1838d921e8d6bb978b7b215
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3498
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 .../NEON/kernels/NEPixelWiseMultiplicationKernel.h |  33 +---
 arm_compute/runtime/NEON/functions/NELSTMLayer.h   | 168 ++++++++++-----------
 .../runtime/NEON/functions/NENormalizationLayer.h  |  12 +-
 .../NEON/functions/NEPixelWiseMultiplication.h     | 139 ++++++++++++++++-
 arm_compute/runtime/NEON/functions/NEQLSTMLayer.h  |  16 +-
 .../kernels/NEPixelWiseMultiplicationKernel.cpp    |  74 ++++-----
 src/runtime/NEON/functions/NELSTMLayer.cpp         |  48 +++---
 .../NEON/functions/NENormalizationLayer.cpp        |   9 +-
 .../NEON/functions/NEPixelWiseMultiplication.cpp   |  92 ++++++++++-
 src/runtime/NEON/functions/NEQLSTMLayer.cpp        |  38 ++---
 10 files changed, 407 insertions(+), 222 deletions(-)

diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index 3cb0874a2f..5483fae565 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -72,7 +72,7 @@ public:
      * @param[in]  overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
      * @param[in]  rounding_policy Rounding policy.
      */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
     /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplicationKernel
      *
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
@@ -98,8 +98,8 @@ public:
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
 
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    // Inherited methods overridden:
+    void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) override;
 
 private:
     /** Common signature for all the specialised multiplication functions with integer scaling factor
@@ -136,11 +136,8 @@ private:
     MulFunctionQuantized *_func_quantized;
 
 private:
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
-    float          _scale;
-    int            _scale_exponent;
+    float _scale;
+    int   _scale_exponent;
 };
 
 /** Interface for the complex pixelwise multiplication kernel. */
@@ -151,23 +148,13 @@ public:
     {
         return "NEComplexPixelWiseMultiplicationKernel";
     }
-    /** Default constructor.*/
-    NEComplexPixelWiseMultiplicationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEComplexPixelWiseMultiplicationKernel(const NEComplexPixelWiseMultiplicationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEComplexPixelWiseMultiplicationKernel &operator=(const NEComplexPixelWiseMultiplicationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEComplexPixelWiseMultiplicationKernel(NEComplexPixelWiseMultiplicationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEComplexPixelWiseMultiplicationKernel &operator=(NEComplexPixelWiseMultiplicationKernel &&) = default;
     /** Initialise the kernel's input, output and border mode.
      *
      * @param[in]  input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
      * @param[in]  input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
      * @param[out] output The output tensor, Data types supported: same as @p input1.  Number of channels supported: same as @p input1.
      */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplicationKernel
      *
      * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
@@ -179,13 +166,7 @@ public:
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
+    void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) override;
 };
 
 } // namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
index b9b581c484..2e2de61c95 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
@@ -26,7 +26,6 @@
 
 #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
@@ -36,6 +35,7 @@
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 #include "arm_compute/runtime/common/LSTMParams.h"
 
 namespace arm_compute
@@ -146,89 +146,89 @@ public:
     void prepare() override;
 
 private:
-    MemoryGroup                     _memory_group;
-    NEFullyConnectedLayer           _fully_connected_input_gate;
-    NEArithmeticAddition            _accum_input_gate1;
-    NEArithmeticSubtraction         _subtract_input_gate;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate;
-    NEActivationLayer               _activation_input_gate;
-    NEFullyConnectedLayer           _fully_connected_forget_gate;
-    NEArithmeticAddition            _accum_forget_gate1;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate;
-    NEActivationLayer               _activation_forget_gate;
-    NEFullyConnectedLayer           _fully_connected_cell_state;
-    NEGEMM                          _gemm_cell_state1;
-    NETransposeKernel               _transpose_cell_state;
-    NEArithmeticAddition            _accum_cell_state1;
-    NEArithmeticAddition            _accum_cell_state2;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_state1;
-    NEActivationLayer               _activation_cell_state;
-    NEActivationLayer               _cell_clip;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_state2;
-    NEFullyConnectedLayer           _fully_connected_output;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_output_state1;
-    NEArithmeticAddition            _accum_output1;
-    NEActivationLayer               _activation_output;
-    NEActivationLayer               _activation_output_state;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_output_state2;
-    NEFullyConnectedLayer           _fully_connected_output_state;
-    NEActivationLayer               _projection_clip;
-    NECopyKernel                    _copy_cell_state;
-    NECopyKernel                    _copy_output;
-    NEConcatenateLayer              _concat_scratch_buffer;
-    NEConcatenateLayer              _concat_inputs_forget_gate;
-    NEConcatenateLayer              _concat_weights_forget_gate;
-    NEConcatenateLayer              _concat_weights_input_gate;
-    NEConcatenateLayer              _concat_weights_output;
-    NEMeanStdDevNormalizationLayer  _mean_std_norm_input_gate;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff;
-    NEArithmeticAddition            _accum_input_gate_bias;
-    NEMeanStdDevNormalizationLayer  _mean_std_norm_forget_gate;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate_coeff;
-    NEArithmeticAddition            _accum_forget_gate_bias;
-    NEMeanStdDevNormalizationLayer  _mean_std_norm_cell_gate;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_gate_coeff;
-    NEArithmeticAddition            _accum_cell_gate_bias;
-    NEMeanStdDevNormalizationLayer  _mean_std_norm_output_gate;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_output_gate_coeff;
-    NEArithmeticAddition            _accum_output_gate_bias;
-    Tensor                          _input_gate_out1;
-    Tensor                          _input_gate_out2;
-    Tensor                          _input_gate_out3;
-    Tensor                          _input_gate_out4;
-    Tensor                          _forget_gate_out1;
-    Tensor                          _forget_gate_out2;
-    Tensor                          _forget_gate_out3;
-    Tensor                          _forget_gate_out4;
-    Tensor                          _forget_gate_out5;
-    Tensor                          _forget_gate_out6;
-    Tensor                          _cell_state_out1;
-    Tensor                          _cell_state_out2;
-    Tensor                          _cell_state_out3;
-    Tensor                          _cell_state_out4;
-    Tensor                          _cell_state_out5;
-    Tensor                          _output1;
-    Tensor                          _output2;
-    Tensor                          _output3;
-    Tensor                          _output4;
-    Tensor                          _cell_state_activation;
-    Tensor                          _output_state1;
-    Tensor                          _ones;
-    Tensor                          _input_layer_norm_out1;
-    Tensor                          _input_layer_norm_out2;
-    Tensor                          _forget_layer_norm_out1;
-    Tensor                          _forget_layer_norm_out2;
-    Tensor                          _cell_layer_norm_out1;
-    Tensor                          _cell_layer_norm_out2;
-    Tensor                          _output_layer_norm_out1;
-    Tensor                          _output_layer_norm_out2;
-    bool                            _run_peephole_opt;
-    bool                            _run_cifg_opt;
-    bool                            _perform_cell_clipping;
-    bool                            _has_projection_weights;
-    bool                            _perform_projection_clipping;
-    bool                            _is_prepared;
-    bool                            _is_layer_norm_lstm;
+    MemoryGroup                    _memory_group;
+    NEFullyConnectedLayer          _fully_connected_input_gate;
+    NEArithmeticAddition           _accum_input_gate1;
+    NEArithmeticSubtraction        _subtract_input_gate;
+    NEPixelWiseMultiplication      _pixelwise_mul_input_gate;
+    NEActivationLayer              _activation_input_gate;
+    NEFullyConnectedLayer          _fully_connected_forget_gate;
+    NEArithmeticAddition           _accum_forget_gate1;
+    NEPixelWiseMultiplication      _pixelwise_mul_forget_gate;
+    NEActivationLayer              _activation_forget_gate;
+    NEFullyConnectedLayer          _fully_connected_cell_state;
+    NEGEMM                         _gemm_cell_state1;
+    NETransposeKernel              _transpose_cell_state;
+    NEArithmeticAddition           _accum_cell_state1;
+    NEArithmeticAddition           _accum_cell_state2;
+    NEPixelWiseMultiplication      _pixelwise_mul_cell_state1;
+    NEActivationLayer              _activation_cell_state;
+    NEActivationLayer              _cell_clip;
+    NEPixelWiseMultiplication      _pixelwise_mul_cell_state2;
+    NEFullyConnectedLayer          _fully_connected_output;
+    NEPixelWiseMultiplication      _pixelwise_mul_output_state1;
+    NEArithmeticAddition           _accum_output1;
+    NEActivationLayer              _activation_output;
+    NEActivationLayer              _activation_output_state;
+    NEPixelWiseMultiplication      _pixelwise_mul_output_state2;
+    NEFullyConnectedLayer          _fully_connected_output_state;
+    NEActivationLayer              _projection_clip;
+    NECopyKernel                   _copy_cell_state;
+    NECopyKernel                   _copy_output;
+    NEConcatenateLayer             _concat_scratch_buffer;
+    NEConcatenateLayer             _concat_inputs_forget_gate;
+    NEConcatenateLayer             _concat_weights_forget_gate;
+    NEConcatenateLayer             _concat_weights_input_gate;
+    NEConcatenateLayer             _concat_weights_output;
+    NEMeanStdDevNormalizationLayer _mean_std_norm_input_gate;
+    NEPixelWiseMultiplication      _pixelwise_mul_input_gate_coeff;
+    NEArithmeticAddition           _accum_input_gate_bias;
+    NEMeanStdDevNormalizationLayer _mean_std_norm_forget_gate;
+    NEPixelWiseMultiplication      _pixelwise_mul_forget_gate_coeff;
+    NEArithmeticAddition           _accum_forget_gate_bias;
+    NEMeanStdDevNormalizationLayer _mean_std_norm_cell_gate;
+    NEPixelWiseMultiplication      _pixelwise_mul_cell_gate_coeff;
+    NEArithmeticAddition           _accum_cell_gate_bias;
+    NEMeanStdDevNormalizationLayer _mean_std_norm_output_gate;
+    NEPixelWiseMultiplication      _pixelwise_mul_output_gate_coeff;
+    NEArithmeticAddition           _accum_output_gate_bias;
+    Tensor                         _input_gate_out1;
+    Tensor                         _input_gate_out2;
+    Tensor                         _input_gate_out3;
+    Tensor                         _input_gate_out4;
+    Tensor                         _forget_gate_out1;
+    Tensor                         _forget_gate_out2;
+    Tensor                         _forget_gate_out3;
+    Tensor                         _forget_gate_out4;
+    Tensor                         _forget_gate_out5;
+    Tensor                         _forget_gate_out6;
+    Tensor                         _cell_state_out1;
+    Tensor                         _cell_state_out2;
+    Tensor                         _cell_state_out3;
+    Tensor                         _cell_state_out4;
+    Tensor                         _cell_state_out5;
+    Tensor                         _output1;
+    Tensor                         _output2;
+    Tensor                         _output3;
+    Tensor                         _output4;
+    Tensor                         _cell_state_activation;
+    Tensor                         _output_state1;
+    Tensor                         _ones;
+    Tensor                         _input_layer_norm_out1;
+    Tensor                         _input_layer_norm_out2;
+    Tensor                         _forget_layer_norm_out1;
+    Tensor                         _forget_layer_norm_out2;
+    Tensor                         _cell_layer_norm_out1;
+    Tensor                         _cell_layer_norm_out2;
+    Tensor                         _output_layer_norm_out1;
+    Tensor                         _output_layer_norm_out2;
+    bool                           _run_peephole_opt;
+    bool                           _run_cifg_opt;
+    bool                           _perform_cell_clipping;
+    bool                           _has_projection_weights;
+    bool                           _perform_projection_clipping;
+    bool                           _is_prepared;
+    bool                           _is_layer_norm_lstm;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NELSTMLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index 8683e44d3c..bead01457f 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -28,10 +28,10 @@
 
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <memory>
@@ -42,7 +42,7 @@ class ITensor;
 
 /** Basic function to compute a normalization layer. This function calls the following NEON kernels:
  *
- * -# @ref NEPixelWiseMultiplicationKernel
+ * -# @ref NEPixelWiseMultiplication
  * -# @ref NEFillBorderKernel
  * -# @ref NENormalizationLayerKernel
  *
@@ -75,10 +75,10 @@ public:
     void run() override;
 
 private:
-    MemoryGroup                     _memory_group;    /**< Function memory group */
-    NENormalizationLayerKernel      _norm_kernel;     /**< Normalization layer kernel */
-    NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */
-    Tensor                          _input_squared;   /**< The intermediate buffer which stores results of squaring input */
+    MemoryGroup                _memory_group;  /**< Function memory group */
+    NENormalizationLayerKernel _norm_kernel;   /**< Normalization layer kernel */
+    NEPixelWiseMultiplication  _multiply_f;    /**< Pixel multiplication function */
+    Tensor                     _input_squared; /**< The intermediate buffer which stores results of squaring input */
 };
 }
 #endif /* ARM_COMPUTE_NENORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index d84dff2c13..3b1209356a 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -25,15 +25,17 @@
 #define ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
 
 namespace arm_compute
 {
 class ITensor;
 
+namespace experimental
+{
 /** Basic function to run @ref NEPixelWiseMultiplicationKernel */
-class NEPixelWiseMultiplication : public INESimpleFunctionNoBorder
+class NEPixelWiseMultiplication : public INEOperator
 {
 public:
     /** Initialise the kernel's inputs, output and convertion policy.
@@ -60,7 +62,7 @@ public:
      * @param[in]      rounding_policy Rounding policy.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
      *
@@ -88,12 +90,132 @@ public:
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    MemoryRequirements workspace() const override;
 };
 
 /** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */
-class NEComplexPixelWiseMultiplication : public INESimpleFunction
+class NEComplexPixelWiseMultiplication : public INEOperator
 {
 public:
+    /** Initialise the kernel's inputs, output.
+     *
+     * @param[in, out] input1   An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+     *                          The input tensor info is [in, out] because it might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2   An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     *                          The input tensor info is [in, out] because it might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output   The output tensor info. Data types supported: same as @p input1. Number of channels: same as @p input1.
+     * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     */
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication
+     *
+     * @param[in] input1   An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+     * @param[in] input2   An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in] output   The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    MemoryRequirements workspace() const override;
+};
+} // namespace experimental
+
+/** Basic function to run @ref NEPixelWiseMultiplicationKernel */
+class NEPixelWiseMultiplication : public IFunction
+{
+public:
+    /** Default Constructor */
+    NEPixelWiseMultiplication();
+    /** Default Destructor */
+    ~NEPixelWiseMultiplication();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPixelWiseMultiplication(const NEPixelWiseMultiplication &) = delete;
+    /** Default move constructor */
+    NEPixelWiseMultiplication(NEPixelWiseMultiplication &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPixelWiseMultiplication &operator=(const NEPixelWiseMultiplication &) = delete;
+    /** Default move assignment operator */
+    NEPixelWiseMultiplication &operator=(NEPixelWiseMultiplication &&);
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+     *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+     *
+     * @param[in, out] input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     *                                 This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     *                                 This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          Output tensor. Data types supported:
+     *                                 - U8, only if both inputs are U8.
+     *                                 - QASYMM8, only if both inputs are QASYMM8.
+     *                                 - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
+     *                                 - S16.
+     *                                 - QSYMM16, only if both inputs are QSYMM16.
+     *                                 - S32, only if both inputs are QSYMM16.
+     *                                 - F16, only if @p input1 is F16.
+     *                                 - F32, only if both inputs are F32.
+     * @param[in]      scale           Scale to apply after multiplication.
+     *                                 Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in]      overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+     * @param[in]      rounding_policy Rounding policy.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
+     *
+     * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+     *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+     *
+     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * @param[in] input2          An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[in] output          Output tensor info. Data types supported:
+     *                            - U8, only if both inputs are U8.
+     *                            - QASYMM8, only if both inputs are QASYMM8.
+     *                            - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
+     *                            - S16.
+     *                            - QSYMM16, only if both inputs are QSYMM16.
+     *                            - S32, only if both inputs are QSYMM16.
+     *                            - F16, only if @p input1 is F16.
+     *                            - F32, only if both inputs are F32.
+     * @param[in] scale           Scale to apply after multiplication.
+     *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+     * @param[in] rounding_policy Rounding policy.
+     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+
+/** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */
+class NEComplexPixelWiseMultiplication : public IFunction
+{
+public:
+    /** Default Constructor */
+    NEComplexPixelWiseMultiplication();
+    /** Default Destructor */
+    ~NEComplexPixelWiseMultiplication();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEComplexPixelWiseMultiplication(const NEComplexPixelWiseMultiplication &) = delete;
+    /** Default move constructor */
+    NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEComplexPixelWiseMultiplication &operator=(const NEComplexPixelWiseMultiplication &) = delete;
+    /** Default move assignment operator */
+    NEComplexPixelWiseMultiplication &operator=(NEComplexPixelWiseMultiplication &&);
     /** Initialise the kernel's inputs, output.
      *
      * @param[in, out] input1   An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
@@ -112,6 +234,13 @@ public:
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 }
 #endif /*ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H */
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index 60c8fa1226..a19310d8ea 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -26,7 +26,6 @@
 
 #include "arm_compute/core/NEON/kernels/NECopyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 #include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
@@ -34,6 +33,7 @@
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
 
 #include "arm_compute/runtime/common/LSTMParams.h"
@@ -54,7 +54,7 @@ class ITensor;
  * -# @ref NEGEMMLowpMatrixMultiplyCore                          Quantized matrix multiplication core. Accumulators are 32-bit integers
  * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint   Convert 32-bit integers into QSYMM16
  * -# @ref NEGEMMLowpMatrixAReductionKernel                      For precomputing effective biases to use
- * -# @ref NEPixelWiseMultiplicationKernel                       Elementwise multiplication
+ * -# @ref NEPixelWiseMultiplication                             Elementwise multiplication
  * -# @ref NETranspose                                           Transpose function for reshaping the weights
  * */
 class NEQLSTMLayer : public IFunction
@@ -257,7 +257,7 @@ private:
     NEArithmeticAddition             _projection_bias_add{};
     NEGEMMLowpMatrixMultiplyCore     _mm_input_to_forget{};
     NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_forget{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_forget{};
+    NEPixelWiseMultiplication        _pixelwise_mul_cell_to_forget{};
     NEGEMMLowpOutputStage            _input_to_forget_outstage{};
     NEGEMMLowpOutputStage            _recurrent_to_forget_outstage{};
     NEGEMMLowpOutputStage            _cell_to_forget_outstage{};
@@ -276,12 +276,12 @@ private:
     NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_input{};
     NEGEMMLowpOutputStage            _recurrent_to_input_outstage{};
     NEArithmeticAddition             _accumulate_input_recurrent_input{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_input{};
+    NEPixelWiseMultiplication        _pixelwise_mul_cell_to_input{};
     NEGEMMLowpOutputStage            _cell_to_input_outstage{};
     NEArithmeticAddition             _accumulate_cell_input{};
     NEActivationLayer                _input_gate_sigmoid{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_forget_cell{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_input_cell{};
+    NEPixelWiseMultiplication        _pixelwise_mul_forget_cell{};
+    NEPixelWiseMultiplication        _pixelwise_mul_input_cell{};
     NEArithmeticAddition             _add_forget_cell{};
     NEActivationLayer                _cell_clip{};
     NEGEMMLowpMatrixMultiplyCore     _mm_input_to_output{};
@@ -289,12 +289,12 @@ private:
     NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_output{};
     NEGEMMLowpOutputStage            _recurrent_to_output_outstage{};
     NEArithmeticAddition             _accumulate_input_recurrent_output{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_output{};
+    NEPixelWiseMultiplication        _pixelwise_mul_cell_to_output{};
     NEGEMMLowpOutputStage            _cell_to_output_outstage{};
     NEArithmeticAddition             _accumulate_cell_to_output{};
     NEActivationLayer                _output_gate_sigmoid{};
     NEActivationLayer                _hidden_tanh{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_hidden{};
+    NEPixelWiseMultiplication        _pixelwise_mul_hidden{};
     NEGEMMLowpOutputStage            _hidden_outstage{};
     NEGEMMLowpMatrixMultiplyCore     _mm_projection{};
     NEGEMMLowpOutputStage            _projection_outstage{};
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index b23a20d019..cd1c4b28cc 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -1060,27 +1060,24 @@ void mul_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const
 } // namespace
 
 NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
-    : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
+    : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
 {
 }
 
-void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
     ARM_COMPUTE_UNUSED(rounding_policy);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
 
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
     const TensorShape &out_shape    = broadcast_pair.first;
     const ValidRegion &valid_region = broadcast_pair.second;
 
     // Auto initialize output if not initialized
-    set_shape_if_empty(*output->info(), out_shape);
+    set_shape_if_empty(*output, out_shape);
 
-    _input1         = input1;
-    _input2         = input2;
-    _output         = output;
     _scale          = scale;
     _scale_exponent = 0;
     _func_quantized = nullptr;
@@ -1104,9 +1101,9 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
         _scale_exponent = std::abs(exponent - 1);
     }
 
-    const DataType dt_input1 = input1->info()->data_type();
-    const DataType dt_input2 = input2->info()->data_type();
-    const DataType dt_output = output->info()->data_type();
+    const DataType dt_input1 = input1->data_type();
+    const DataType dt_input2 = input2->data_type();
+    const DataType dt_output = output->data_type();
     const bool     is_sat    = (overflow_policy == ConvertPolicy::SATURATE);
 
     switch(dt_input1)
@@ -1207,8 +1204,8 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
 
     // Configure kernel window
     Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
-    output->info()->set_valid_region(valid_region);
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(valid_region);
     Window win = calculate_max_window(valid_region, Steps());
 
     INEKernel::configure(win);
@@ -1223,27 +1220,30 @@ Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, cons
     return Status{};
 }
 
-void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
+void NEPixelWiseMultiplicationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
+    auto input1 = inputs.at(TensorType::ACL_SRC_0);
+    auto input2 = inputs.at(TensorType::ACL_SRC_1);
+    auto output = outputs.at(TensorType::ACL_DST);
+
     if(_func_quantized != nullptr)
     {
-        (*_func_quantized)(_input1, _input2, _output, window, _scale);
+        (*_func_quantized)(input1, input2, output, window, _scale);
     }
     else if(_func_int != nullptr)
     {
-        (*_func_int)(_input1, _input2, _output, window, _scale_exponent);
+        (*_func_int)(input1, input2, output, window, _scale_exponent);
     }
     else
     {
         ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
-        (*_func_float)(_input1, _input2, _output, window, _scale);
+        (*_func_float)(input1, input2, output, window, _scale);
     }
 }
-
 namespace
 {
 constexpr unsigned int num_elems_processed_per_iteration_complex = 2;
@@ -1296,24 +1296,15 @@ std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *inp
 }
 } // namespace
 
-NEComplexPixelWiseMultiplicationKernel::NEComplexPixelWiseMultiplicationKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void NEComplexPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1->info(), input2->info(), output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output));
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window_complex(input1->info(), input2->info(), output->info());
+    auto win_config = validate_and_configure_window_complex(input1, input2, output);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
-
     // Create kernel
     INEKernel::configure(win_config.second);
 }
@@ -1327,27 +1318,24 @@ Status NEComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input
     return Status{};
 }
 
-void NEComplexPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
+void NEComplexPixelWiseMultiplicationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    Iterator input1(_input1, window.broadcast_if_dimension_le_one(_input1->info()->tensor_shape()));
-    Iterator input2(_input2, window.broadcast_if_dimension_le_one(_input2->info()->tensor_shape()));
-    Iterator output(_output, window);
+    auto input1 = inputs.at(TensorType::ACL_SRC_0);
+    auto input2 = inputs.at(TensorType::ACL_SRC_1);
+    auto output = outputs.at(TensorType::ACL_DST);
+
+    Iterator input1_it(input1, window.broadcast_if_dimension_le_one(input1->info()->tensor_shape()));
+    Iterator input2_it(input2, window.broadcast_if_dimension_le_one(input2->info()->tensor_shape()));
+    Iterator output_it(output, window);
 
     execute_window_loop(window, [&](const Coordinates &)
     {
-        c_mul_F32_F32_F32_n(input1.ptr(), input2.ptr(), output.ptr());
+        c_mul_F32_F32_F32_n(input1_it.ptr(), input2_it.ptr(), output_it.ptr());
     },
-    input1, input2, output);
-}
-
-BorderSize NEComplexPixelWiseMultiplicationKernel::border_size() const
-{
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration_complex - 1U, replicateSize);
-    return { 0, border, 0, 0 };
+    input1_it, input2_it, output_it);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index d8c684bf15..467c51b1a6 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -464,14 +464,14 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
 
     if(lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
     if(lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
     }
     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
@@ -498,14 +498,14 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
-            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
             ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
         }
 
         if(lstm_params.use_layer_norm())
         {
             ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
             ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
         }
         ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
@@ -522,13 +522,13 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
     if(lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
     }
     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
     ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
     if(cell_threshold != 0.f)
     {
@@ -548,22 +548,22 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
 
     if(lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
     }
     if(lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
     }
     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate output state
     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
     if(lstm_params.has_projection())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
@@ -603,13 +603,13 @@ void NELSTMLayer::run()
 
     if(_run_peephole_opt)
     {
-        NEScheduler::get().schedule(&_pixelwise_mul_forget_gate, Window::DimY);
+        _pixelwise_mul_forget_gate.run();
         _accum_forget_gate1.run();
     }
     if(_is_layer_norm_lstm)
     {
         _mean_std_norm_forget_gate.run();
-        NEScheduler::get().schedule(&_pixelwise_mul_forget_gate_coeff, Window::DimY);
+        _pixelwise_mul_forget_gate_coeff.run();
         _accum_forget_gate_bias.run();
     }
     _activation_forget_gate.run();
@@ -632,14 +632,14 @@ void NELSTMLayer::run()
 
         if(_run_peephole_opt)
         {
-            NEScheduler::get().schedule(&_pixelwise_mul_input_gate, Window::DimY);
+            _pixelwise_mul_input_gate.run();
             _accum_input_gate1.run();
         }
 
         if(_is_layer_norm_lstm)
         {
             _mean_std_norm_input_gate.run();
-            NEScheduler::get().schedule(&_pixelwise_mul_input_gate_coeff, Window::DimY);
+            _pixelwise_mul_input_gate_coeff.run();
             _accum_input_gate_bias.run();
         }
         _activation_input_gate.run();
@@ -652,12 +652,12 @@ void NELSTMLayer::run()
     if(_is_layer_norm_lstm)
     {
         _mean_std_norm_cell_gate.run();
-        NEScheduler::get().schedule(&_pixelwise_mul_cell_gate_coeff, Window::DimY);
+        _pixelwise_mul_cell_gate_coeff.run();
         _accum_cell_gate_bias.run();
     }
     _activation_cell_state.run();
-    NEScheduler::get().schedule(&_pixelwise_mul_cell_state1, Window::DimY);
-    NEScheduler::get().schedule(&_pixelwise_mul_cell_state2, Window::DimY);
+    _pixelwise_mul_cell_state1.run();
+    _pixelwise_mul_cell_state2.run();
     _accum_cell_state2.run();
 
     if(_perform_cell_clipping)
@@ -668,19 +668,19 @@ void NELSTMLayer::run()
     _fully_connected_output.run();
     if(_run_peephole_opt)
     {
-        NEScheduler::get().schedule(&_pixelwise_mul_output_state1, Window::DimY);
+        _pixelwise_mul_output_state1.run();
         _accum_output1.run();
     }
     if(_is_layer_norm_lstm)
     {
         _mean_std_norm_output_gate.run();
-        NEScheduler::get().schedule(&_pixelwise_mul_output_gate_coeff, Window::DimY);
+        _pixelwise_mul_output_gate_coeff.run();
         _accum_output_gate_bias.run();
     }
     _activation_output.run();
 
     _activation_output_state.run();
-    NEScheduler::get().schedule(&_pixelwise_mul_output_state2, Window::DimY);
+    _pixelwise_mul_output_state2.run();
 
     if(_has_projection_weights)
     {
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index f3a3ac6322..ab8cb656bd 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -33,7 +33,7 @@
 namespace arm_compute
 {
 NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), _input_squared()
+    : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_f(), _input_squared()
 {
 }
 
@@ -49,7 +49,7 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons
 
     // Configure kernels
     _norm_kernel.configure(input, &_input_squared, output, norm_info);
-    _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _multiply_f.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
 
     // Allocate the tensor once the configure methods have been called
     _input_squared.allocator()->allocate();
@@ -61,7 +61,7 @@ Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInf
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
 
     ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
 
     return Status{};
 }
@@ -69,8 +69,7 @@ Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInf
 void NENormalizationLayer::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
-
-    NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
+    _multiply_f.run();
     NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
 }
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 95bc08a5dd..aebb8cab35 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -31,7 +31,9 @@
 
 namespace arm_compute
 {
-void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+namespace experimental
+{
+void NEPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
                                           const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
@@ -46,7 +48,12 @@ Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITen
     return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
 }
 
-void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+MemoryRequirements NEPixelWiseMultiplication::workspace() const
+{
+    return MemoryRequirements{}; // Always a value-initialised MemoryRequirements; presumably this means no auxiliary memory is requested - confirm against the MemoryRequirements definition
+}
+
+void NEComplexPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEComplexPixelWiseMultiplicationKernel>();
@@ -60,4 +67,85 @@ Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, con
     return NEComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
 }
 
+MemoryRequirements NEComplexPixelWiseMultiplication::workspace() const
+{
+    return MemoryRequirements{}; // Always a value-initialised MemoryRequirements; presumably this means no auxiliary memory is requested - confirm against the MemoryRequirements definition
+}
+} // namespace experimental
+
+struct NEPixelWiseMultiplication::Impl // State held behind the function's _impl pointer
+{
+    const ITensor                                           *src_0{ nullptr }; // First input; recorded by configure(), passed as ACL_SRC_0 in run()
+    const ITensor                                           *src_1{ nullptr }; // Second input; recorded by configure(), passed as ACL_SRC_1 in run()
+    ITensor                                                 *dst{ nullptr };   // Output; recorded by configure(), passed as ACL_DST in run()
+    std::unique_ptr<experimental::NEPixelWiseMultiplication> op{ nullptr };    // Operator created in configure(); stays null until configure() is called
+};
+
+NEPixelWiseMultiplication::NEPixelWiseMultiplication()
+    : _impl(support::cpp14::make_unique<Impl>()) // Impl starts with null tensors and a null operator; configure() fills it in
+{
+}
+NEPixelWiseMultiplication::NEPixelWiseMultiplication(NEPixelWiseMultiplication &&) = default; // Defaulted after Impl is defined; presumably required because _impl is a unique_ptr to a type the header only forward-declares
+NEPixelWiseMultiplication &NEPixelWiseMultiplication::operator=(NEPixelWiseMultiplication &&) = default; // Same reasoning as the move constructor
+NEPixelWiseMultiplication::~NEPixelWiseMultiplication()                                       = default; // Same reasoning as the move constructor
+
+Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+                                           const ActivationLayerInfo &act_info)
+{
+    return experimental::NEPixelWiseMultiplication::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); // Pure forwarder: every argument goes unchanged to the experimental operator's validate(); nothing is checked here
+}
+
+void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+                                          const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1; // The tensor pointers are only recorded here; run() hands them to the operator
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEPixelWiseMultiplication>(); // A fresh operator on every call, replacing any previously configured one
+    _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); // The operator sees tensor infos only; input1/input2/output are dereferenced unchecked, so all three must be non-null
+}
+
+void NEPixelWiseMultiplication::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } }; // Inputs recorded by configure(), keyed ACL_SRC_0 / ACL_SRC_1
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };                                              // Output recorded by configure(), keyed ACL_DST
+    _impl->op->run(src, dst, {}); // _impl->op is null until configure() has been called, so configure() must come first; the third argument is passed empty
+}
+
+struct NEComplexPixelWiseMultiplication::Impl // State held behind the function's _impl pointer
+{
+    const ITensor                                                  *src_0{ nullptr }; // First input (const, matching NEPixelWiseMultiplication::Impl); recorded by configure(), passed as ACL_SRC_0 in run()
+    const ITensor                                                  *src_1{ nullptr }; // Second input (const, matching NEPixelWiseMultiplication::Impl); recorded by configure(), passed as ACL_SRC_1 in run()
+    ITensor                                                        *dst{ nullptr };   // Output; recorded by configure(), passed as ACL_DST in run()
+    std::unique_ptr<experimental::NEComplexPixelWiseMultiplication> op{ nullptr };    // Operator created in configure(); stays null until configure() is called
+};
+
+NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication()
+    : _impl(support::cpp14::make_unique<Impl>()) // Impl starts with null tensors and a null operator; configure() fills it in
+{
+}
+NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&) = default; // Defaulted after Impl is defined; presumably required because _impl is a unique_ptr to a type the header only forward-declares
+NEComplexPixelWiseMultiplication &NEComplexPixelWiseMultiplication::operator=(NEComplexPixelWiseMultiplication &&) = default; // Same reasoning as the move constructor
+NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication()                                              = default; // Same reasoning as the move constructor
+
+Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    return experimental::NEComplexPixelWiseMultiplication::validate(input1, input2, output, act_info); // Pure forwarder: every argument goes unchanged to the experimental operator's validate(); nothing is checked here
+}
+
+void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1; // The tensor pointers are only recorded here; run() hands them to the operator
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEComplexPixelWiseMultiplication>(); // A fresh operator on every call, replacing any previously configured one
+    _impl->op->configure(input1->info(), input2->info(), output->info(), act_info); // The operator sees tensor infos only; input1/input2/output are dereferenced unchecked, so all three must be non-null
+}
+
+void NEComplexPixelWiseMultiplication::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } }; // Inputs recorded by configure(), keyed ACL_SRC_0 / ACL_SRC_1
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };                                              // Output recorded by configure(), keyed ACL_DST
+    _impl->op->run(src, dst, {}); // _impl->op is null until configure() has been called, so configure() must come first; the third argument is passed empty
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index 6eb1844a1f..018d0f4d0e 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -357,7 +357,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
         input_activation_input->allocator()->allocate();
     }
     // Cell.
-    // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
+    // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
     _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     const float      cell_gate_scale      = _cell_gate.info()->quantization_info().uniform().scale;
     const float      mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
@@ -392,7 +392,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
 
     if(_has_peephole)
     {
-        // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
+        // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
         // Here we are not using the output stage because all operations are done in float
         _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
         _memory_group.manage(&_mul_cell_to_output_res);
@@ -426,7 +426,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
 
     // Hidden.
     _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
-    // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
+    // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
     _memory_group.manage(&_hidden_mul_res);
     const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
     _hidden_mul_res.allocator()->init(hidden_mul_res);
@@ -667,8 +667,8 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
         const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
         ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
@@ -737,8 +737,8 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
 
         if(lstm_params.has_peephole_opt())
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                                  RoundingPolicy::TO_ZERO));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
+                                                                            RoundingPolicy::TO_ZERO));
             const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
             ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
             ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
@@ -755,8 +755,8 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
         ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
     }
     // Cell.
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
     ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
     if(quantized_cell_clip > 0)
     {
@@ -776,12 +776,12 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
-        // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
+        // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
         // Here we are not using the output stage because all operations are done in float
         // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
         // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
     }
 
@@ -799,7 +799,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
     const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
     const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
 
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
     const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
@@ -897,7 +897,7 @@ void NEQLSTMLayer::run()
 
     if(_has_peephole)
     {
-        NEScheduler::get().schedule(&_pixelwise_mul_cell_to_forget, Window::DimY);
+        _pixelwise_mul_cell_to_forget.run();
         _cell_to_forget_outstage.run();
         _accumulate_cell_forget.run();
     }
@@ -939,7 +939,7 @@ void NEQLSTMLayer::run()
 
         if(_has_peephole)
         {
-            NEScheduler::get().schedule(&_pixelwise_mul_cell_to_input, Window::DimY);
+            _pixelwise_mul_cell_to_input.run();
             _cell_to_input_outstage.run();
             _accumulate_cell_input.run();
         }
@@ -953,8 +953,8 @@ void NEQLSTMLayer::run()
     }
 
     // Cell.
-    NEScheduler::get().schedule(&_pixelwise_mul_forget_cell, Window::DimY);
-    NEScheduler::get().schedule(&_pixelwise_mul_input_cell, Window::DimY);
+    _pixelwise_mul_forget_cell.run();
+    _pixelwise_mul_input_cell.run();
     _add_forget_cell.run();
 
     if(_has_cell_clipping)
@@ -970,7 +970,7 @@ void NEQLSTMLayer::run()
     _accumulate_input_recurrent_output.run();
     if(_has_peephole)
     {
-        NEScheduler::get().schedule(&_pixelwise_mul_cell_to_output, Window::DimY);
+        _pixelwise_mul_cell_to_output.run();
         _cell_to_output_outstage.run();
         _accumulate_cell_to_output.run();
     }
@@ -984,7 +984,7 @@ void NEQLSTMLayer::run()
 
     // Hidden.
     _hidden_tanh.run();
-    NEScheduler::get().schedule(&_pixelwise_mul_hidden, Window::DimY);
+    _pixelwise_mul_hidden.run();
     _hidden_outstage.run();
 
     // Projection.
-- 
cgit v1.2.1