From ad7515d231acb075a9585e52f257373b1a1b5d1f Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Fri, 24 Jul 2020 00:02:23 +0100 Subject: COMPMID-3385: Async support to CLArithmetic* kernels/functions Pt.1 Signed-off-by: Michalis Spyrou Change-Id: I94007565e688f8a0aead4f14c9fc30bfd9f9f7eb Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3613 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas --- .../core/CL/kernels/CLElementwiseOperationKernel.h | 56 +-- arm_compute/core/CL/kernels/CLFillBorderKernel.h | 12 +- .../runtime/CL/functions/CLElementwiseOperations.h | 500 ++++++++++++++++++++- arm_compute/runtime/CL/functions/CLLSTMLayer.h | 169 ++++--- arm_compute/runtime/CL/functions/CLPReluLayer.h | 65 ++- arm_compute/runtime/CL/functions/CLQLSTMLayer.h | 137 +++--- arm_compute/runtime/CL/functions/CLRNNLayer.h | 22 +- 7 files changed, 753 insertions(+), 208 deletions(-) (limited to 'arm_compute') diff --git a/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h b/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h index 1995aed7b6..76bc879638 100644 --- a/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h +++ b/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h @@ -54,7 +54,7 @@ public: ~CLElementwiseOperationKernel() = default; // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; @@ -64,9 +64,9 @@ protected: /** Initialise the kernel's output. * - * @param[in] input1 First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32. - * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[in] output Output tensor. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. 
* * @return a pair of Status and Window */ @@ -87,18 +87,18 @@ protected: /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff) * */ - void configure_common(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + void configure_common(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output); /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff) * */ - void configure_common(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + void configure_common(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output); ActivationLayerInfo _act_info; private: - const ICLTensor *_input1; /**< Source tensor 1 */ - const ICLTensor *_input2; /**< Source tensor 2 */ - ICLTensor *_output; /**< Destination tensor */ + const ITensorInfo *_input1; /**< Source tensor info 1 */ + const ITensorInfo *_input2; /**< Source tensor info 2 */ + ITensorInfo *_output; /**< Destination tensor info */ }; /** Addition operation */ @@ -113,32 +113,32 @@ public: /** Initialise the kernel's inputs, output and conversion policy. * * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[in] output Output tensor. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output and conversion policy. * * @param[in] compile_context The compile context to be used. * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[in] output Output tensor. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/ - void configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy, + void configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel * * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @@ -170,22 +170,22 @@ public: /** Initialise the kernel's inputs and output. * * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[in] output Output tensor. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs and output. * * @param[in] compile_context The compile context to be used. * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[in] output Output tensor. Data types supported: Same as @p input1. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/ - void configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + void configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel diff --git a/arm_compute/core/CL/kernels/CLFillBorderKernel.h b/arm_compute/core/CL/kernels/CLFillBorderKernel.h index 0a4de25ac3..8cad68dc1a 100644 --- a/arm_compute/core/CL/kernels/CLFillBorderKernel.h +++ b/arm_compute/core/CL/kernels/CLFillBorderKernel.h @@ -49,6 +49,15 @@ public: /** Default destructor */ ~CLFillBorderKernel() = default; + /** Initialise the kernel's input, output and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] tensor Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32. + * @param[in] border_size Size of the border to fill in elements. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); /** Initialise the kernel's input, output and border mode. * * @param[in,out] tensor Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32. @@ -65,7 +74,7 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); /** Function to set the constant value on fill border kernel depending on type.
* @@ -76,6 +85,7 @@ public: void set_constant_border(unsigned int idx, const PixelValue &constant_border_value); // Inherited methods overridden: + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, cl::CommandQueue &queue) override; void run(const Window &window, cl::CommandQueue &queue) override; bool is_parallelisable() const override; diff --git a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h index 9cd3c150cc..5af24c90ac 100644 --- a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h +++ b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h @@ -24,21 +24,372 @@ #ifndef ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H #define ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/runtime/CL/ICLOperator.h" +#include "arm_compute/runtime/IFunction.h" namespace arm_compute { class ICLTensor; +namespace experimental +{ +/** Basic function to run @ref CLSaturatedArithmeticOperationKernel for addition + * + * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @note The function performs an arithmetic addition between two tensors. + */ +class CLArithmeticAddition : public ICLOperator +{ +public: + /** Default Constructor */ + CLArithmeticAddition(); + /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for addition + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] input1 First tensor input info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; + +/** Basic function to run @ref CLSaturatedArithmeticOperationKernel for subtraction + * + * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32. + * @note The function performs an arithmetic subtraction between two tensors. + */ +class CLArithmeticSubtraction : public ICLOperator +{ +public: + /** Default Constructor */ + CLArithmeticSubtraction(); + /** Initialise the kernel's inputs, output and conversion policy. + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for subtraction + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] output Output tensor info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; + +/** Basic function to run @ref CLSaturatedArithmeticOperationKernel for division + * + * @note The tensor data type for the inputs must be F16/F32. + * @note The function performs an arithmetic division between two tensors. + */ +class CLArithmeticDivision : public ICLOperator +{ +public: + /** Default Constructor */ + CLArithmeticDivision(); + /** Initialise the kernel's inputs and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: Same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticDivision + * + * @param[in] input1 First tensor input info. Data types supported: F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; + +/** Basic function to run @ref CLArithmeticOperationKernel for max + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. + * @note The function performs a max operation between two tensors. + */ +class CLElementwiseMax : public ICLOperator +{ +public: + /** Default Constructor */ + CLElementwiseMax(); + /** Initialise the kernel's inputs and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for max + * + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: same as @p input1. + * @param[in] output Output tensor info. Data types supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; + +/** Basic function to run @ref CLArithmeticOperationKernel for min + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. + * @note The function performs a min operation between two tensors. + */ +class CLElementwiseMin : public ICLOperator +{ +public: + /** Default Constructor */ + CLElementwiseMin(); + /** Initialise the kernel's inputs and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for min + * + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: same as @p input1. + * @param[in] output Output tensor info. Data types supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; + +/** Basic function to run @ref CLArithmeticOperationKernel for squared difference + * + * @note The tensor data type for the inputs must be QASYMM8/U8/S16/QSYMM16/F16/F32. + * @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2) + */ +class CLElementwiseSquaredDiff : public ICLOperator +{ +public: + /** Default Constructor */ + CLElementwiseSquaredDiff(); + /** Initialise the kernel's inputs and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for squared difference + * + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: same as @p input1. + * @param[in] output Output tensor info. Data types supported: same as @p input1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; + +/** Basic function to run @ref CLArithmeticOperationKernel for power + * + * @note The tensor data type for the inputs must be F16/F32. + * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i]) + */ +class CLElementwisePower : public ICLOperator +{ +public: + /** Default Constructor */ + CLElementwisePower(); + /** Initialise the kernel's inputs and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] input1 First tensor input. Data types supported: F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 Second tensor input. Data types supported: F16/F32.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output Output tensor. Data types supported: F16/F32. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for power + * + * @param[in] input1 First tensor input info. Data types supported: F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: F16/F32. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; +} // namespace experimental + /** Basic function to run @ref CLSaturatedArithmeticOperationKernel for addition * * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. * @note The function performs an arithmetic addition between two tensors. */ -class CLArithmeticAddition : public ICLSimpleFunction +class CLArithmeticAddition : public IFunction { public: + /** Default Constructor */ + CLArithmeticAddition(); + /** Default Destructor */ + ~CLArithmeticAddition(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticAddition(const CLArithmeticAddition &) = delete; + /** Default move constructor */ + CLArithmeticAddition(CLArithmeticAddition &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticAddition &operator=(const CLArithmeticAddition &) = delete; + /** Default move assignment operator */ + CLArithmeticAddition &operator=(CLArithmeticAddition &&); /** Initialise the kernel's inputs, output and conversion policy. * * Valid configurations (Input1,Input2) -> Output : @@ -89,7 +440,8 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for addition * * Valid configurations (Input1,Input2) -> Output : @@ -115,6 +467,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; /** Basic function to run @ref CLSaturatedArithmeticOperationKernel for subtraction * * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32. * @note The function performs an arithmetic subtraction between two tensors. */ -class CLArithmeticSubtraction : public ICLSimpleFunction +class CLArithmeticSubtraction : public IFunction { public: + /** Default Constructor */ + CLArithmeticSubtraction(); + /** Default Destructor */ + ~CLArithmeticSubtraction(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticSubtraction(const CLArithmeticSubtraction &) = delete; + /** Default move constructor */ + CLArithmeticSubtraction(CLArithmeticSubtraction &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticSubtraction &operator=(const CLArithmeticSubtraction &) = delete; + /** Default move assignment operator */ + CLArithmeticSubtraction &operator=(CLArithmeticSubtraction &&); /** Initialise the kernel's inputs, output and conversion policy. * * Valid configurations (Input1,Input2) -> Output : @@ -149,7 +520,7 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Initialise the kernel's inputs, output and conversion policy. * * Valid configurations (Input1,Input2) -> Output : @@ -175,7 +546,8 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for subtraction * * Valid configurations (Input1,Input2) -> Output : @@ -201,6 +573,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; /** Basic function to run @ref CLSaturatedArithmeticOperationKernel for division * * @note The tensor data type for the inputs must be F16/F32. * @note The function performs an arithmetic division between two tensors. */ -class CLArithmeticDivision : public ICLSimpleFunction +class CLArithmeticDivision : public IFunction { public: + /** Default Constructor */ + CLArithmeticDivision(); + /** Default Destructor */ + ~CLArithmeticDivision(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticDivision(const CLArithmeticDivision &) = delete; + /** Default move constructor */ + CLArithmeticDivision(CLArithmeticDivision &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticDivision &operator=(const CLArithmeticDivision &) = delete; + /** Default move assignment operator */ + CLArithmeticDivision &operator=(CLArithmeticDivision &&); /** Initialise the kernel's inputs and output. * * @param[in, out] input1 First tensor input. Data types supported: F16/F32. @@ -231,7 +622,7 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticDivision * * @param[in] input1 First tensor input info. Data types supported: F16/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. * @param[in] output Output tensor info. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; /** Basic function to run @ref CLArithmeticOperationKernel for max * * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. * @note The function performs a max operation between two tensors.
*/ -class CLElementwiseMax : public ICLSimpleFunction +class CLElementwiseMax : public IFunction { public: + /** Default Constructor */ + CLElementwiseMax(); + /** Default Destructor */ + ~CLElementwiseMax(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwiseMax(const CLElementwiseMax &) = delete; + /** Default move constructor */ + CLElementwiseMax(CLElementwiseMax &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwiseMax &operator=(const CLElementwiseMax &) = delete; + /** Default move assignment operator */ + CLElementwiseMax &operator=(CLElementwiseMax &&); /** Initialise the kernel's inputs and output. * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. @@ -283,6 +693,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; /** Basic function to run @ref CLArithmeticOperationKernel for min * * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. * @note The function performs a min operation between two tensors. */ -class CLElementwiseMin : public ICLSimpleFunction +class CLElementwiseMin : public IFunction { public: + /** Default Constructor */ + CLElementwiseMin(); + /** Default Destructor */ + ~CLElementwiseMin(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwiseMin(const CLElementwiseMin &) = delete; + /** Default move constructor */ + CLElementwiseMin(CLElementwiseMin &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwiseMin &operator=(const CLElementwiseMin &) = delete; + /** Default move assignment operator */ + CLElementwiseMin &operator=(CLElementwiseMin &&); /** Initialise the kernel's inputs and output. * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. @@ -324,6 +753,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; /** Basic function to run @ref CLArithmeticOperationKernel for squared difference * * @note The tensor data type for the inputs must be QASYMM8/U8/S16/QSYMM16/F16/F32.
* @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2) */ -class CLElementwiseSquaredDiff : public ICLSimpleFunction +class CLElementwiseSquaredDiff : public IFunction { public: + /** Default Constructor */ + CLElementwiseSquaredDiff(); + /** Default Destructor */ + ~CLElementwiseSquaredDiff(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwiseSquaredDiff(const CLElementwiseSquaredDiff &) = delete; + /** Default move constructor */ + CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwiseSquaredDiff &operator=(const CLElementwiseSquaredDiff &) = delete; + /** Default move assignment operator */ + CLElementwiseSquaredDiff &operator=(CLElementwiseSquaredDiff &&); /** Initialise the kernel's inputs and output. * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. @@ -365,6 +813,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; /** Basic function to run @ref CLArithmeticOperationKernel for power * * @note The tensor data type for the inputs must be F16/F32. * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i]) */ -class CLElementwisePower : public ICLSimpleFunction +class CLElementwisePower : public IFunction { public: + /** Default Constructor */ + CLElementwisePower(); + /** Default Destructor */ + ~CLElementwisePower(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwisePower(const CLElementwisePower &) = delete; + /** Default move constructor */ + CLElementwisePower(CLElementwisePower &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLElementwisePower &operator=(const CLElementwisePower &) = delete; + /** Default move assignment operator */ + CLElementwisePower &operator=(CLElementwisePower &&); /** Initialise the kernel's inputs and output. * * @param[in, out] input1 First tensor input. Data types supported: F16/F32.
@@ -406,6 +873,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H */ diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h index e5733cd784..abfcc3a62f 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h @@ -27,7 +27,6 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/core/CL/kernels/CLMemsetKernel.h" #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" #include "arm_compute/core/Types.h" @@ -201,90 +200,90 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - CLFullyConnectedLayer _fully_connected_input_gate; - CLArithmeticAddition _accum_input_gate1; - CLSaturatedArithmeticOperationKernel _subtract_input_gate; - CLPixelWiseMultiplicationKernel _pixelwise_mul_input_gate; - CLActivationLayer _activation_input_gate; - CLFullyConnectedLayer _fully_connected_forget_gate; - CLArithmeticAddition _accum_forget_gate1; - CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate; - CLActivationLayer _activation_forget_gate; - CLFullyConnectedLayer _fully_connected_cell_state; - CLGEMM _gemm_cell_state1; - CLTransposeKernel _transpose_cell_state; - CLSaturatedArithmeticOperationKernel _accum_cell_state1; - CLSaturatedArithmeticOperationKernel _accum_cell_state2; - CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_state1; - CLActivationLayer _activation_cell_state; - CLActivationLayer _cell_clip; - CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_state2; - CLFullyConnectedLayer _fully_connected_output; - CLPixelWiseMultiplicationKernel _pixelwise_mul_output_state1; - CLArithmeticAddition _accum_output1; - CLActivationLayer _activation_output; - CLActivationLayer _activation_output_state; - CLPixelWiseMultiplicationKernel _pixelwise_mul_output_state2; - CLFullyConnectedLayer _fully_connected_output_state; - CLActivationLayer _projection_clip; - CLCopyKernel _copy_cell_state; - CLCopyKernel _copy_output; - CLConcatenateLayer _concat_scratch_buffer; - CLConcatenateLayer _concat_inputs_forget_gate; - CLConcatenateLayer _concat_weights_forget_gate; - CLConcatenateLayer _concat_weights_input_gate; - CLConcatenateLayer _concat_weights_output; - CLMemsetKernel _ones_memset_kernel; - CLMeanStdDevNormalizationLayer _mean_std_norm_input_gate; - CLPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff; - CLSaturatedArithmeticOperationKernel _accum_input_gate_bias; - CLMeanStdDevNormalizationLayer _mean_std_norm_forget_gate; - CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate_coeff; - CLSaturatedArithmeticOperationKernel _accum_forget_gate_bias; - CLMeanStdDevNormalizationLayer _mean_std_norm_cell_gate; - CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_gate_coeff; - CLSaturatedArithmeticOperationKernel _accum_cell_gate_bias; - CLMeanStdDevNormalizationLayer _mean_std_norm_output_gate; - CLPixelWiseMultiplicationKernel _pixelwise_mul_output_gate_coeff; - CLSaturatedArithmeticOperationKernel _accum_output_gate_bias; - CLTensor _input_gate_out1; - CLTensor
_input_gate_out2; - CLTensor _input_gate_out3; - CLTensor _input_gate_out4; - CLTensor _forget_gate_out1; - CLTensor _forget_gate_out2; - CLTensor _forget_gate_out3; - CLTensor _forget_gate_out4; - CLTensor _forget_gate_out5; - CLTensor _forget_gate_out6; - CLTensor _cell_state_out1; - CLTensor _cell_state_out2; - CLTensor _cell_state_out3; - CLTensor _cell_state_out4; - CLTensor _cell_state_out5; - CLTensor _output1; - CLTensor _output2; - CLTensor _output3; - CLTensor _output4; - CLTensor _cell_state_activation; - CLTensor _output_state1; - CLTensor _ones; - CLTensor _input_layer_norm_out1; - CLTensor _input_layer_norm_out2; - CLTensor _forget_layer_norm_out1; - CLTensor _forget_layer_norm_out2; - CLTensor _cell_layer_norm_out1; - CLTensor _cell_layer_norm_out2; - CLTensor _output_layer_norm_out1; - CLTensor _output_layer_norm_out2; - bool _run_peephole_opt; - bool _run_cifg_opt; - bool _perform_cell_clipping; - bool _has_projection_weights; - bool _perform_projection_clipping; - bool _is_prepared; - bool _is_layer_norm_lstm; + MemoryGroup _memory_group; + CLFullyConnectedLayer _fully_connected_input_gate; + CLArithmeticAddition _accum_input_gate1; + CLArithmeticSubtraction _subtract_input_gate; + CLPixelWiseMultiplicationKernel _pixelwise_mul_input_gate; + CLActivationLayer _activation_input_gate; + CLFullyConnectedLayer _fully_connected_forget_gate; + CLArithmeticAddition _accum_forget_gate1; + CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate; + CLActivationLayer _activation_forget_gate; + CLFullyConnectedLayer _fully_connected_cell_state; + CLGEMM _gemm_cell_state1; + CLTransposeKernel _transpose_cell_state; + CLArithmeticAddition _accum_cell_state1; + CLArithmeticAddition _accum_cell_state2; + CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_state1; + CLActivationLayer _activation_cell_state; + CLActivationLayer _cell_clip; + CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_state2; + CLFullyConnectedLayer _fully_connected_output; + CLPixelWiseMultiplicationKernel _pixelwise_mul_output_state1; + CLArithmeticAddition _accum_output1; + CLActivationLayer _activation_output; + CLActivationLayer _activation_output_state; + CLPixelWiseMultiplicationKernel _pixelwise_mul_output_state2; + CLFullyConnectedLayer _fully_connected_output_state; + CLActivationLayer _projection_clip; + CLCopyKernel _copy_cell_state; + CLCopyKernel _copy_output; + CLConcatenateLayer _concat_scratch_buffer; + CLConcatenateLayer _concat_inputs_forget_gate; + CLConcatenateLayer _concat_weights_forget_gate; + CLConcatenateLayer _concat_weights_input_gate; + CLConcatenateLayer _concat_weights_output; + CLMemsetKernel _ones_memset_kernel; + CLMeanStdDevNormalizationLayer _mean_std_norm_input_gate; + CLPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff; + CLArithmeticAddition _accum_input_gate_bias; + CLMeanStdDevNormalizationLayer _mean_std_norm_forget_gate; + CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate_coeff; + CLArithmeticAddition _accum_forget_gate_bias; + CLMeanStdDevNormalizationLayer _mean_std_norm_cell_gate; + CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_gate_coeff; + CLArithmeticAddition _accum_cell_gate_bias; + CLMeanStdDevNormalizationLayer _mean_std_norm_output_gate; + CLPixelWiseMultiplicationKernel _pixelwise_mul_output_gate_coeff; + CLArithmeticAddition _accum_output_gate_bias; + CLTensor _input_gate_out1; + CLTensor _input_gate_out2; + CLTensor _input_gate_out3; + CLTensor _input_gate_out4; + CLTensor _forget_gate_out1; + CLTensor _forget_gate_out2; + 
CLTensor _forget_gate_out3; + CLTensor _forget_gate_out4; + CLTensor _forget_gate_out5; + CLTensor _forget_gate_out6; + CLTensor _cell_state_out1; + CLTensor _cell_state_out2; + CLTensor _cell_state_out3; + CLTensor _cell_state_out4; + CLTensor _cell_state_out5; + CLTensor _output1; + CLTensor _output2; + CLTensor _output3; + CLTensor _output4; + CLTensor _cell_state_activation; + CLTensor _output_state1; + CLTensor _ones; + CLTensor _input_layer_norm_out1; + CLTensor _input_layer_norm_out2; + CLTensor _forget_layer_norm_out1; + CLTensor _forget_layer_norm_out2; + CLTensor _cell_layer_norm_out1; + CLTensor _cell_layer_norm_out2; + CLTensor _output_layer_norm_out1; + CLTensor _output_layer_norm_out2; + bool _run_peephole_opt; + bool _run_cifg_opt; + bool _perform_cell_clipping; + bool _has_projection_weights; + bool _perform_projection_clipping; + bool _is_prepared; + bool _is_layer_norm_lstm; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLLSTMLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLPReluLayer.h b/arm_compute/runtime/CL/functions/CLPReluLayer.h index eb3d3be3e3..08567cccfb 100644 --- a/arm_compute/runtime/CL/functions/CLPReluLayer.h +++ b/arm_compute/runtime/CL/functions/CLPReluLayer.h @@ -24,20 +24,72 @@ #ifndef ARM_COMPUTE_CLPRELULAYER_H #define ARM_COMPUTE_CLPRELULAYER_H -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/runtime/CL/ICLOperator.h" +#include "arm_compute/runtime/IFunction.h" namespace arm_compute { class ICLTensor; +namespace experimental +{ /** Basic function to run @ref CLArithmeticOperationKernel for PRELU * * @note The function implements an activation layer with the PRELU activation function. */ -class CLPReluLayer : public ICLSimpleFunction +class CLPReluLayer : public ICLOperator { public: + /** Default Constructor */ + CLPReluLayer(); + /** Set the input and output tensor. + * + * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] alpha PRelu layer parameters. Data types supported: same as @p input. + * @param[out] output Destination tensor. Data type supported: same as @p input + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref CLPReluLayer + * + * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] alpha PRelu layer parameters. Data types supported: same as @p input. + * @param[in] output Destination tensor info. Data type supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output); + + // Inherited methods overridden: + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; + +private: + CLFillBorderKernel _border_handler; +}; +} // namespace experimental + +/** Basic function to run @ref CLArithmeticOperationKernel for PRELU + * + * @note The function implements an activation layer with the PRELU activation function.
+ */ +class CLPReluLayer : public IFunction +{ +public: + /** Default Constructor */ + CLPReluLayer(); + /** Default Destructor */ + ~CLPReluLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPReluLayer(const CLPReluLayer &) = delete; + /** Default move constructor */ + CLPReluLayer(CLPReluLayer &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPReluLayer &operator=(const CLPReluLayer &) = delete; + /** Default move assignment operator */ + CLPReluLayer &operator=(CLPReluLayer &&); /** Set the input and output tensor. * * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place @@ -66,6 +118,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLPRELULAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h index 97ae9878ea..0aea91ae8e 100644 --- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h @@ -25,12 +25,12 @@ #define ARM_COMPUTE_CLQLSTMLAYER_H #include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" #include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" +#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/runtime/CL/functions/CLTranspose.h" @@ -48,7 +48,7 @@ class ICLTensor; * * -# @ref CLActivationLayer Activation functions (tanh and logistic) * -# @ref CLCopyKernel Copy kernel for copying output_state_out to output - * -# @ref CLSaturatedArithmeticOperationKernel Elementwise addition and subtraction + * -# @ref CLArithmeticAddition Elementwise addition and subtraction * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core.
Accumulators are 32-bit integers * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 * -# @ref CLGEMMLowpMatrixAReductionKernel For precomputing effective biases to use @@ -285,70 +285,70 @@ private: }; // Functions used - CLTranspose _transpose_input_to_forget_weights{}; - CLTranspose _transpose_input_to_cell_weights{}; - CLTranspose _transpose_input_to_output_weights{}; - CLTranspose _transpose_input_to_input_weights{}; - CLTranspose _transpose_recurrent_to_forget_weights{}; - CLTranspose _transpose_recurrent_to_cell_weights{}; - CLTranspose _transpose_recurrent_to_output_weights{}; - CLTranspose _transpose_recurrent_to_input_weights{}; - CLTranspose _transpose_projection_weights{}; - CLGEMMLowpMatrixAReductionKernel _input_to_input_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{}; - CLGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{}; - CLGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{}; - CLGEMMLowpMatrixAReductionKernel _input_to_output_reduction{}; - CLGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{}; - CLGEMMLowpMatrixAReductionKernel _projection_reduction{}; - CLSaturatedArithmeticOperationKernel _projection_bias_add{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{}; - CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_forget{}; - CLGEMMLowpOutputStage _input_to_forget_outstage{}; - CLGEMMLowpOutputStage _recurrent_to_forget_outstage{}; - CLGEMMLowpOutputStage _cell_to_forget_outstage{}; - CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_forget{}; - CLSaturatedArithmeticOperationKernel _accumulate_cell_forget{}; - CLActivationLayer _forget_gate_sigmoid{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{}; - CLGEMMLowpOutputStage _input_to_cell_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{}; - CLGEMMLowpOutputStage _recurrent_to_cell_outstage{}; - CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_modulation{}; - CLActivationLayer _cell_gate_tanh{}; - CLSaturatedArithmeticOperationKernel _input_gate_sub{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{}; - CLGEMMLowpOutputStage _input_to_input_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{}; - CLGEMMLowpOutputStage _recurrent_to_input_outstage{}; - CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_input{}; - CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_input{}; - CLGEMMLowpOutputStage _cell_to_input_outstage{}; - CLSaturatedArithmeticOperationKernel _accumulate_cell_input{}; - CLActivationLayer _input_gate_sigmoid{}; - CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_cell{}; - CLPixelWiseMultiplicationKernel _pixelwise_mul_input_cell{}; - CLSaturatedArithmeticOperationKernel _add_forget_cell{}; - CLActivationLayer _cell_clip{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{}; - CLGEMMLowpOutputStage _input_to_output_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{}; - CLGEMMLowpOutputStage _recurrent_to_output_outstage{}; - CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_output{}; - CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_output{}; - CLGEMMLowpOutputStage _cell_to_output_outstage{}; - CLSaturatedArithmeticOperationKernel _accumulate_cell_to_output{}; - 
-    CLActivationLayer                    _output_gate_sigmoid{};
-    CLActivationLayer                    _hidden_tanh{};
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_hidden{};
-    CLGEMMLowpOutputStage                _hidden_outstage{};
-    CLGEMMLowpMatrixMultiplyCore         _mm_projection{};
-    CLGEMMLowpOutputStage                _projection_outstage{};
-    CLSaturatedArithmeticOperationKernel _accumulate_projection{};
-    CLActivationLayer                    _projection_clip{};
+    CLTranspose                      _transpose_input_to_forget_weights{};
+    CLTranspose                      _transpose_input_to_cell_weights{};
+    CLTranspose                      _transpose_input_to_output_weights{};
+    CLTranspose                      _transpose_input_to_input_weights{};
+    CLTranspose                      _transpose_recurrent_to_forget_weights{};
+    CLTranspose                      _transpose_recurrent_to_cell_weights{};
+    CLTranspose                      _transpose_recurrent_to_output_weights{};
+    CLTranspose                      _transpose_recurrent_to_input_weights{};
+    CLTranspose                      _transpose_projection_weights{};
+    CLGEMMLowpMatrixAReductionKernel _input_to_input_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _input_to_output_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _projection_reduction{};
+    CLArithmeticAddition             _projection_bias_add{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_forget{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_forget{};
+    CLPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_forget{};
+    CLGEMMLowpOutputStage            _input_to_forget_outstage{};
+    CLGEMMLowpOutputStage            _recurrent_to_forget_outstage{};
+    CLGEMMLowpOutputStage            _cell_to_forget_outstage{};
+    CLArithmeticAddition             _accumulate_input_recurrent_forget{};
+    CLArithmeticAddition             _accumulate_cell_forget{};
+    CLActivationLayer                _forget_gate_sigmoid{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_cell{};
+    CLGEMMLowpOutputStage            _input_to_cell_outstage{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_cell{};
+    CLGEMMLowpOutputStage            _recurrent_to_cell_outstage{};
+    CLArithmeticAddition             _accumulate_input_recurrent_modulation{};
+    CLActivationLayer                _cell_gate_tanh{};
+    CLArithmeticSubtraction          _input_gate_sub{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_input{};
+    CLGEMMLowpOutputStage            _input_to_input_outstage{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_input{};
+    CLGEMMLowpOutputStage            _recurrent_to_input_outstage{};
+    CLArithmeticAddition             _accumulate_input_recurrent_input{};
+    CLPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_input{};
+    CLGEMMLowpOutputStage            _cell_to_input_outstage{};
+    CLArithmeticAddition             _accumulate_cell_input{};
+    CLActivationLayer                _input_gate_sigmoid{};
+    CLPixelWiseMultiplicationKernel  _pixelwise_mul_forget_cell{};
+    CLPixelWiseMultiplicationKernel  _pixelwise_mul_input_cell{};
+    CLArithmeticAddition             _add_forget_cell{};
+    CLActivationLayer                _cell_clip{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_output{};
+    CLGEMMLowpOutputStage            _input_to_output_outstage{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_output{};
+    CLGEMMLowpOutputStage            _recurrent_to_output_outstage{};
+    CLArithmeticAddition             _accumulate_input_recurrent_output{};
+    CLPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_output{};
+    CLGEMMLowpOutputStage            _cell_to_output_outstage{};
+    CLArithmeticAddition             _accumulate_cell_to_output{};
+    CLActivationLayer                _output_gate_sigmoid{};
+    CLActivationLayer                _hidden_tanh{};
+    CLPixelWiseMultiplicationKernel  _pixelwise_mul_hidden{};
+    CLGEMMLowpOutputStage            _hidden_outstage{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_projection{};
+    CLGEMMLowpOutputStage            _projection_outstage{};
+    CLArithmeticAddition             _accumulate_projection{};
+    CLActivationLayer                _projection_clip{};
     std::array<CLQLSTMLayerNormalizationKernel, _layer_norm_count> _layer_norms{ {} };
     CLCopyKernel _copy_output{};
@@ -358,7 +358,10 @@ private:
     TensorCopyKernel _hidden_to_output_copy{};
 
     // Tensor pointers
-    const ICLTensor *_input_to_input_weights{ nullptr };
+    const ICLTensor *_input_to_input_weights
+    {
+        nullptr
+    };
     const ICLTensor *_recurrent_to_input_weights{ nullptr };
     const ICLTensor *_projection_bias{ nullptr };
     const ICLTensor *_input_to_forget_weights{ nullptr };
diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h
index 81f7810edd..9d1cb1a724 100644
--- a/arm_compute/runtime/CL/functions/CLRNNLayer.h
+++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h
@@ -25,9 +25,9 @@
 #define ARM_COMPUTE_CLRNN_LAYER_H
 
 #include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
@@ -85,16 +85,16 @@ public:
     void prepare() override;
 
 private:
-    MemoryGroup                          _memory_group;
-    CLGEMM                               _gemm_state_f;
-    CLSaturatedArithmeticOperationKernel _add_kernel;
-    CLActivationLayer                    _activation;
-    CLFullyConnectedLayer                _fully_connected_kernel;
-    CLCopyKernel                         _copy_kernel;
-    CLTensor                             _fully_connected_out;
-    CLTensor                             _gemm_output;
-    CLTensor                             _add_output;
-    bool                                 _is_prepared;
+    MemoryGroup           _memory_group;
+    CLGEMM                _gemm_state_f;
+    CLArithmeticAddition  _add_kernel;
+    CLActivationLayer     _activation;
+    CLFullyConnectedLayer _fully_connected_kernel;
+    CLCopyKernel          _copy_kernel;
+    CLTensor              _fully_connected_out;
+    CLTensor              _gemm_output;
+    CLTensor              _add_output;
+    bool                  _is_prepared;
 };
 }
 #endif /* ARM_COMPUTE_CLRNN_LAYER_H */
-- 
cgit v1.2.1
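
For reference, since this patch moves CLRNNLayer and CLQLSTMLayer off CLSaturatedArithmeticOperationKernel and onto the CLArithmeticAddition/CLArithmeticSubtraction runtime functions, the sketch below shows how such a function is driven from user code. It is a minimal, illustrative example and not part of the patch: the tensor shape, data type, and scheduler setup are assumptions, and filling the tensors with data is omitted.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

    using namespace arm_compute;

    int main()
    {
        // Set up the OpenCL context and queue shared by all CL functions.
        CLScheduler::get().default_init();

        // Shape and data type are placeholders chosen for illustration.
        const TensorShape shape(16U, 16U);
        CLTensor a, b, sum;
        a.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        b.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        sum.allocator()->init(TensorInfo(shape, 1, DataType::F32));

        // Function-level API: configure once, run per invocation.
        CLArithmeticAddition add;
        add.configure(&a, &b, &sum, ConvertPolicy::SATURATE);

        a.allocator()->allocate();
        b.allocator()->allocate();
        sum.allocator()->allocate();
        // ... fill a and b (e.g. via map()/unmap()) before running.

        add.run();
        CLScheduler::get().sync(); // block until the enqueued OpenCL work completes
        return 0;
    }

The same configure/run pattern applies to the _accumulate_* and _input_gate_sub members introduced above; inside the functions, the asynchronous run_op path takes tensor maps at execution time instead of binding tensors at configure time.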