71 files changed, 1778 insertions, 1339 deletions
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
index b39a8d7701..5584fdc783 100644
--- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,9 @@
 #ifndef ARM_COMPUTE_NEACTIVATIONLAYER_H
 #define ARM_COMPUTE_NEACTIVATIONLAYER_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IRuntimeContext.h"
 
 #include <memory>
@@ -101,5 +101,5 @@ private:
     struct Impl;
     std::unique_ptr<Impl> _impl;
 };
-} // namespace arm_computes
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_NEACTIVATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEAddMulAdd.h b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h
new file mode 100644
index 0000000000..6c65c055dd
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD
+#define ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+class ITensorInfo;
+class ActivationLayerInfo;
+
+/** Function to compute Add+Mul+Add fused operation */
+class NEAddMulAdd : public IFunction
+{
+public:
+    /** Constructor */
+    NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEAddMulAdd(const NEAddMulAdd &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEAddMulAdd(NEAddMulAdd &&) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEAddMulAdd &operator=(const NEAddMulAdd &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEAddMulAdd &operator=(NEAddMulAdd &&) = delete;
+    /** Destructor */
+    ~NEAddMulAdd();
+    /** Initialize the function's inputs and outputs.
+     *
+     * Valid data layouts:
+     * - Any
+     *
+     * Valid data type configurations:
+     * |input1         |input2         |bn_mul         |bn_add         |add_output     |final_output   |
+     * |:--------------|:--------------|:--------------|:--------------|:--------------|:--------------|
+     * |QASYMM8        |QASYMM8        |QASYMM8        |QASYMM8        |QASYMM8        |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+     * |F16            |F16            |F16            |F16            |F16            |F16            |
+     * |F32            |F32            |F32            |F32            |F32            |F32            |
+     *
+     * This is what this composite function (tailored for add followed by a batch norm operation) does:
+     *      add_output <- input1 + input2 (add)
+     *      final_output <- add_output * bn_mul + bn_add  (batch norm = mul+add)
+     *
+     * @param[in]  input1       First tensor input. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  input2       Second tensor input. Data types supported: Same as @p input.
+     * @param[in]  bn_mul       The multiplication coefficient on the feature dimension. Data types supported: Same as @p input.
+     *                          It's one dimensional tensor with size equal to the feature maps [FM]
+     * @param[in]  bn_add       The addition coefficient on the feature dimension. Data types supported: Same as @p input.
+     *                          It's one dimensional tensor with size equal to the feature maps [FM]
+     * @param[out] add_output   Output of the first add. Data type supported: Same as @p input.
+     * @param[out] final_output Output of the add+mul+add+act composite operation. Data type supported: Same as @p input.
+     * @param[in]  policy       Policy to handle overflow
+     * @param[in]  act_info     (Optional) Activation layer information in case of a fused activation.
+     *
+     */
+    void configure(ITensor                   *input1,
+                   ITensor                   *input2,
+                   ITensor                   *bn_mul,
+                   ITensor                   *bn_add,
+                   ITensor                   *add_output,
+                   ITensor                   *final_output,
+                   ConvertPolicy              policy,
+                   const ActivationLayerInfo &act_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEAddMulAdd
+     *
+     * Similar to @ref NEAddMulAdd::configure() except the arguments are @ref ITensorInfo * instead of @ref ITensor *
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *bn_mul,
+                           const ITensorInfo         *bn_add,
+                           const ITensorInfo         *add_output,
+                           const ITensorInfo         *final_output,
+                           ConvertPolicy              policy,
+                           const ActivationLayerInfo &act_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD */
diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
index 4392de7b28..3bb50a0f90 100644
--- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,8 +24,6 @@
 #ifndef ARM_COMPUTE_NEARGMINMAXLAYER_H
 #define ARM_COMPUTE_NEARGMINMAXLAYER_H
 
-#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
-
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/INESimpleFunction.h"
@@ -33,7 +31,6 @@
 namespace arm_compute
 {
 class ITensor;
-
 /** Function to calculate the index of the minimum or maximum values in a
  *  tensor based on an axis.
  *
@@ -68,13 +65,13 @@ public:
      * - All
      *
      * Valid data type configurations:
-     * |src            |dst        |
-     * |:--------------|:----------|
-     * |QASYMM8        |U32, S32   |
-     * |QASYMM8_SIGNED |U32, S32   |
-     * |S32            |U32, S32   |
-     * |F16            |U32, S32   |
-     * |F32            |U32, S32   |
+     * |src            |dst           |
+     * |:--------------|:-------------|
+     * |QASYMM8        |U32, S32      |
+     * |QASYMM8_SIGNED |U32, S32      |
+     * |S32            |U32, S32, S64 |
+     * |F16            |U32, S32      |
+     * |F32            |U32, S32      |
      *
      * @param[in]  input  Input source tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32.
      * @param[in]  axis   Axis to find max/min index.
@@ -86,7 +83,7 @@ public:
      *
      * @param[in] input  Input source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32.
      * @param[in] axis   Axis to find max/min index.
-     * @param[in] output Output source tensor info. Data types supported: U32/S32.
+     * @param[in] output Output source tensor info. Data types supported: U32/S32/S64.
      * @param[in] op     Operation to perform: min or max
      *
      * @return a status
@@ -97,7 +94,8 @@ public:
     void run() override;
 
 private:
-    std::unique_ptr<NEReductionOperation> _reduction_function;
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEARGMINMAXLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
index b8e46ff36e..73a43dbc44 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,9 @@
 #define ARM_COMPUTE_NEARITHMETICADDITION_H
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/IFunction.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -62,9 +64,6 @@ public:
      * |QSYMM16        |QSYMM16        |QASYMM16       |
      * |QSYMM16        |QSYMM16        |S32            |
      * |U8             |U8             |U8             |
-     * |U8             |U8             |S16            |
-     * |U8             |S16            |S16            |
-     * |S16            |U8             |S16            |
      * |S16            |S16            |S16            |
      * |S32            |S32            |S32            |
      * |F16            |F16            |F16            |
@@ -76,7 +75,11 @@ public:
      * @param[in]  policy   Policy to use to handle overflow.
      * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ITensor             *input1,
+                   const ITensor             *input2,
+                   ITensor                   *output,
+                   ConvertPolicy              policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition
      *
      * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
@@ -87,7 +90,11 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *output,
+                           ConvertPolicy              policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
index 0c72e946f6..3e4f6356c5 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NEARITHMETICSUBTRACTION_H
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/NEON/INEOperator.h"
 
@@ -68,9 +69,6 @@ public:
      * |QSYMM16        |QSYMM16        |QASYMM16       |
      * |QSYMM16        |QSYMM16        |S32            |
      * |U8             |U8             |U8             |
-     * |U8             |U8             |S16            |
-     * |U8             |S16            |S16            |
-     * |S16            |U8             |S16            |
      * |S16            |S16            |S16            |
      * |S32            |S32            |S32            |
      * |F16            |F16            |F16            |
@@ -82,7 +80,11 @@ public:
      * @param[in]  policy   Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
      * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ITensor             *input1,
+                   const ITensor             *input2,
+                   ITensor                   *output,
+                   ConvertPolicy              policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction
      *
      * @param[in] input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32
@@ -93,7 +95,11 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *output,
+                           ConvertPolicy              policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
index ec00fbdbf2..99e2dcadbb 100644
--- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
@@ -81,7 +81,13 @@ public:
      * @param[in]      epsilon  (Optional) Small value to avoid division with zero. Default value is 0.001f.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
      */
-    void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f,
+    void configure(ITensor            *input,
+                   ITensor            *output,
+                   const ITensor      *mean,
+                   const ITensor      *var,
+                   const ITensor      *beta     = nullptr,
+                   const ITensor      *gamma    = nullptr,
+                   float               epsilon  = 0.001f,
                    ActivationLayerInfo act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayer
      *
@@ -98,10 +104,14 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const ITensorInfo *mean, const ITensorInfo *var,
-                           const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
-                           float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo  *input,
+                           const ITensorInfo  *output,
+                           const ITensorInfo  *mean,
+                           const ITensorInfo  *var,
+                           const ITensorInfo  *beta     = nullptr,
+                           const ITensorInfo  *gamma    = nullptr,
+                           float               epsilon  = 0.001f,
+                           ActivationLayerInfo act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run() override;
@@ -109,5 +119,5 @@ public:
 private:
     std::unique_ptr<NEBatchNormalizationLayerKernel> _norm_kernel; /**< Batch normalization layer kernel */
 };
-}
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
index 810bf81a22..ebed0bea29 100644
--- a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NEBATCHTOSPACELAYER_H
 #define ARM_COMPUTE_NEBATCHTOSPACELAYER_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 
 namespace arm_compute
@@ -64,7 +63,10 @@ public:
      * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
      * @param[in]  block_shape 1-D tensor with shape [M]. Data types supported: S32
      * @param[out] output      Tensor output. Data types supported: same as @p input
+     *
+     * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
      */
+    ARM_COMPUTE_DEPRECATED_REL(23.05)
     void configure(const ITensor *input, const ITensor *block_shape, ITensor *output);
     /** Set the input and output tensors. (Static block shape).
      *
@@ -72,8 +74,13 @@ public:
      * @param[in]  block_shape_x Block shape x value.
      * @param[in]  block_shape_y Block shape y value.
      * @param[out] output        Tensor output. Data types supported: same as @p input
+     * @param[in]  crop_info     Specifies how the output shape is cropped after batch to space is performed
      */
-    void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output);
+    void configure(const ITensor  *input,
+                   int32_t         block_shape_x,
+                   int32_t         block_shape_y,
+                   ITensor        *output,
+                   const CropInfo &crop_info = CropInfo{});
     /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer
      *
      * @param[in]  input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -81,7 +88,9 @@ public:
      * @param[out] output      Tensor output info. Data types supported: same as @p input
      *
      * @return a status
+     * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
      */
+    ARM_COMPUTE_DEPRECATED_REL(23.05)
     static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer (Static block shape).
      *
@@ -89,10 +98,15 @@ public:
      * @param[in]  block_shape_x Block shape x value.
      * @param[in]  block_shape_y Block shape y value.
      * @param[out] output        Tensor output info. Data types supported: same as @p input
+     * @param[in]  crop_info     Specifies how the output shape is cropped after batch to space is performed
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           int32_t            block_shape_x,
+                           int32_t            block_shape_y,
+                           const ITensorInfo *output,
+                           const CropInfo    &crop_info = CropInfo{});
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEBATCHTOSPACELAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
index 2a196a2de5..aa41fc0df2 100644
--- a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
+++ b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
@@ -57,7 +57,8 @@ public:
      *
      * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
      */
-    void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
+    void
+    configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEBoundingBoxTransform
      *
@@ -71,7 +72,10 @@ public:
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+    static Status validate(const ITensorInfo              *boxes,
+                           const ITensorInfo              *pred_boxes,
+                           const ITensorInfo              *deltas,
+                           const BoundingBoxTransformInfo &info);
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEBOUNDINGBOXTRANSFORM_H */
diff --git a/arm_compute/runtime/NEON/functions/NECast.h b/arm_compute/runtime/NEON/functions/NECast.h
index 30499f5ecf..43cae777f6 100644
--- a/arm_compute/runtime/NEON/functions/NECast.h
+++ b/arm_compute/runtime/NEON/functions/NECast.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NECAST_H
 #define ARM_COMPUTE_NECAST_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
 
@@ -85,7 +84,7 @@ public:
      *
      * @return a status
      */
-    static Status validate(ITensorInfo *input, ITensorInfo *output, ConvertPolicy policy);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy);
 
     // Inherited methods overridden
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h
index 8888efec4f..bc19e1a4af 100644
--- a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h
@@ -46,6 +46,7 @@ public:
      *
      * Valid data layouts:
      * - NCHW
+     * - NHWC
      *
      * Valid data type configurations:
      * |src            |dst            |
diff --git a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
index dd1c709d76..1600f85488 100644
--- a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NECONCATENATELAYER_H
 #define ARM_COMPUTE_NECONCATENATELAYER_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
 
@@ -87,7 +86,8 @@ public:
      *
      * @return a status
      */
-    static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
+    static Status
+    validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEConv3D.h b/arm_compute/runtime/NEON/functions/NEConv3D.h
new file mode 100644
index 0000000000..525f37f3e7
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEConv3D.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECONV3D_H
+#define ARM_COMPUTE_NECONV3D_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Basic function to simulate a 3d convolution. This function calls one of the following functions:
+ * -# @ref cpu::CpuDirectConv3d
+ *
+ */
+class NEConv3D : public IFunction
+{
+public:
+    /** Constructor */
+    NEConv3D();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConv3D(const NEConv3D &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConv3D &operator=(const NEConv3D &) = delete;
+    /** Default move constructor */
+    NEConv3D(NEConv3D &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEConv3D &operator=(NEConv3D &&) = default;
+    /** Default destructor */
+    ~NEConv3D();
+    /** Set the input and output tensors.
+     *
+     * Valid data layouts:
+     * - NDHWC
+     *
+     * Valid data type configurations:
+     * |src0           |src1               |src2   |dst            |
+     * |:--------------|:------------------|:------|:--------------|
+     * |F16            |F16                |F16    |F16            |
+     * |F32            |F32                |F32    |F32            |
+     * |QASYMM8        |QASYMM8            |S32    |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32    |QASYMM8_SIGNED |
+     *
+     * @param[in]  input     Source tensor. 4 lower dimensions represent a single input [IFM, width, height, depth],
+     *                       while every optional dimension from 5 and above represent a batch of inputs.
+     * @param[in]  weights   Weights tensor. Weights are 5D tensor with dimensions [OFM, IFM, kernel_x, kernel_y, kernel_z].
+     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     * @param[out] output    Destination tensor. 4 lower dimensions represent a single output [OFM, width, height, depth], while the rest represent batch of outputs.
+     * @param[in]  conv_info Contains padding, stride, acitvation information described in @ref Conv3dInfo.
+     */
+    void configure(
+        ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to NEConv3D::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *weights,
+                           const ITensorInfo *biases,
+                           const ITensorInfo *output,
+                           const Conv3dInfo  &conv_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NECONV3D_H */
diff --git a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
index 218877d421..dc6b22d717 100644
--- a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
+++ b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
@@ -24,14 +24,14 @@
 #ifndef ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H
 #define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H
 
+#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/ITransformWeights.h"
-#include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
 {
 // Forward declarations
 class ITensor;
+class ITensorInfo;
 
 /** Basic function to run @ref cpu::kernels::CpuConvertFullyConnectedWeightsKernel. */
 class NEConvertFullyConnectedWeights : public IFunction
@@ -65,7 +65,8 @@ public:
      * @param[in]  original_input_shape Shape of the original input tensor (the one entering fully connected layer).
      * @param[in]  data_layout          The data layout the weights have been trained in.
      */
-    void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+    void
+    configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
     /** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeights
      *
      * @param[in] input                Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
@@ -75,7 +76,10 @@ public:
      *
      * @return A Status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           const TensorShape &original_input_shape,
+                           DataLayout         data_layout);
 
     // Inherited methods overriden:
     void run() override;
@@ -84,45 +88,5 @@ private:
     struct Impl;
     std::unique_ptr<Impl> _impl;
 };
-
-namespace weights_transformations
-{
-/** Basic function to manage @ref NEConvertFullyConnectedWeights. */
-class NEConvertFullyConnectedWeightsManaged : public ITransformWeights
-{
-public:
-    void run() override
-    {
-        _output.allocator()->allocate();
-        _func.run();
-        _reshape_run = true;
-    }
-
-    void release() override
-    {
-        _output.allocator()->free();
-    }
-
-    ITensor *get_weights() override
-    {
-        return &_output;
-    }
-
-    uint32_t uid() override
-    {
-        return _uid;
-    }
-
-    void configure(const ITensor *input, const TensorShape &original_input_shape, DataLayout data_layout)
-    {
-        _func.configure(input, &_output, original_input_shape, data_layout);
-    }
-
-private:
-    static constexpr uint32_t      _uid = 0x4;
-    Tensor                         _output{};
-    NEConvertFullyConnectedWeights _func{};
-};
-} // namespace weights_transformations
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H */
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index f19aa8008b..2d07980ade 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,13 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NECONVOLUTIONLAYER_H
-#define ARM_COMPUTE_NECONVOLUTIONLAYER_H
-
-#include "arm_compute/runtime/IFunction.h"
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H
 
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 
 #include <memory>
@@ -38,9 +38,9 @@ namespace arm_compute
 class ITensor;
 
 /** Basic function to simulate a convolution layer. This function calls one of the following functions:
- * -# @ref NEGEMMConvolutionLayer     (executed only in case GEMM is required for the operation)
- * -# @ref NEWinogradConvolutionLayer (executed only in case Winograd is required for the operation)
- * -# @ref NEDirectConvolutionLayer   (executed only in case Direct Convolution is required for the operation)
+ * -# @ref cpu::CpuGemmConv2d     (executed only in case GEMM is required for the operation)
+ * -# @ref cpu::CpuWinogradConv2d (executed only in case Winograd is required for the operation)
+ * -# @ref cpu::CpuDirectConv2d   (executed only in case Direct Convolution is required for the operation)
  * -# @ref NEFFTConvolutionLayer      (executed only in case FFT is required for the operation)
  *
  *
@@ -78,12 +78,12 @@ public:
     NEConvolutionLayer(const NEConvolutionLayer &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEConvolutionLayer &operator=(const NEConvolutionLayer &) = delete;
+    /** Default move constructor */
+    NEConvolutionLayer(NEConvolutionLayer &&) = default;
     /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEConvolutionLayer(NEConvolutionLayer &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEConvolutionLayer &operator=(NEConvolutionLayer &&) = delete;
+    NEConvolutionLayer &operator=(NEConvolutionLayer &&) = default;
     /** Default destructor */
-    ~NEConvolutionLayer() = default;
+    ~NEConvolutionLayer();
     /** Set the input and output tensors.
      *
      * Valid data layouts:
@@ -111,15 +111,23 @@ public:
      *                              Data types supported: Same as @p input.
      * @param[in]  conv_info        Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in]  weights_info     Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
-     *                              tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+     *                              tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
      * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
      * @param[in]  enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
      *                              available which may introduce a drop of accuracy as well. Default is false
      * @param[in]  num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
      */
-    void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
-                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+    void configure(ITensor                   *input,
+                   const ITensor             *weights,
+                   const ITensor             *biases,
+                   ITensor                   *output,
+                   const PadStrideInfo       &conv_info,
+                   const WeightsInfo         &weights_info     = WeightsInfo(),
+                   const Size2D              &dilation         = Size2D(1U, 1U),
+                   const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                   bool                       enable_fast_math = false,
+                   unsigned int               num_groups       = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayer
      *
      * @param[in] input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -133,7 +141,7 @@ public:
      *                             Data types supported: Same as @p input.
      * @param[in] conv_info        Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in] weights_info     Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
-     *                             tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+     *                             tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
      * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in] act_info         (Optional) Activation layer information in case of a fused activation.
      * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
@@ -142,9 +150,16 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
-                           unsigned int num_groups = 1);
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *output,
+                           const PadStrideInfo       &conv_info,
+                           const WeightsInfo         &weights_info     = WeightsInfo(),
+                           const Size2D              &dilation         = Size2D(1U, 1U),
+                           const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                           bool                       enable_fast_math = false,
+                           unsigned int               num_groups       = 1);
     /** Static function to check if given info will return the convolution called by @ref NEConvolutionLayer
      *
      * @param[in] input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -156,7 +171,7 @@ public:
      *                             Data types supported: Same as @p input.
      * @param[in] conv_info        Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in] weights_info     Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
-     *                             tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+     *                             tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
      * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in] act_info         (Optional) Activation layer information in case of a fused activation.
      * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
@@ -164,15 +179,21 @@ public:
      *
      * @return the Convolution Method Hint
      */
-    static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                    const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+    static ConvolutionMethod get_convolution_method(const ITensorInfo         *input,
+                                                    const ITensorInfo         *weights,
+                                                    const ITensorInfo         *output,
+                                                    const PadStrideInfo       &conv_info,
+                                                    const WeightsInfo         &weights_info     = WeightsInfo(),
+                                                    const Size2D              &dilation         = Size2D(1U, 1U),
+                                                    const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                                                    bool                       enable_fast_math = false);
     // Inherited methods overridden:
     void run() override;
     void prepare() override;
 
 private:
-    std::shared_ptr<IMemoryManager> _memory_manager;
-    std::unique_ptr<IFunction>      _function; /**< Function to run */
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H
diff --git a/arm_compute/runtime/NEON/functions/NECopy.h b/arm_compute/runtime/NEON/functions/NECopy.h
index ee02c259f4..840c03e968 100644
--- a/arm_compute/runtime/NEON/functions/NECopy.h
+++ b/arm_compute/runtime/NEON/functions/NECopy.h
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NECOPY_H
 #define ARM_COMPUTE_NECOPY_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
 
diff --git a/arm_compute/runtime/NEON/functions/NECropResize.h b/arm_compute/runtime/NEON/functions/NECropResize.h
index 143bbbc6f1..f806762158 100644
--- a/arm_compute/runtime/NEON/functions/NECropResize.h
+++ b/arm_compute/runtime/NEON/functions/NECropResize.h
@@ -75,8 +75,13 @@ public:
      * @param[in]  method              The policy to be used when resizing image. Default is bilinear.
      * @param[in]  extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0.
      */
-    void configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size,
-                   InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0);
+    void configure(const ITensor      *input,
+                   const ITensor      *boxes,
+                   const ITensor      *box_ind,
+                   ITensor            *output,
+                   Coordinates2D       crop_size,
+                   InterpolationPolicy method              = InterpolationPolicy::BILINEAR,
+                   float               extrapolation_value = 0);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NESlice
      *
@@ -96,8 +101,13 @@ public:
      *
      * @return A status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output,
-                           Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value);
+    static Status validate(const ITensorInfo  *input,
+                           const ITensorInfo  *boxes,
+                           const ITensorInfo  *box_ind,
+                           const ITensorInfo  *output,
+                           Coordinates2D       crop_size,
+                           InterpolationPolicy method,
+                           float               extrapolation_value);
 
     void run() override;
 
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index 34ab0707c2..aabe42f928 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,15 +24,14 @@
 #ifndef ARM_COMPUTE_NEDECONVOLUTIONLAYER_H
 #define ARM_COMPUTE_NEDECONVOLUTIONLAYER_H
 
-#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEReverse.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEReverse.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <memory>
@@ -76,17 +75,16 @@ class NEDeconvolutionLayer : public IFunction
 public:
     /** Constructor */
     NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEDeconvolutionLayer(const NEDeconvolutionLayer &) = delete;
+    /** Default move constructor */
+    NEDeconvolutionLayer(NEDeconvolutionLayer &&) = default;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEDeconvolutionLayer &operator=(const NEDeconvolutionLayer &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains pointers) */
-    NEDeconvolutionLayer(NEDeconvolutionLayer &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains pointers) */
-    NEDeconvolutionLayer &operator=(NEDeconvolutionLayer &&) = delete;
+    /** Default move assignment operator */
+    NEDeconvolutionLayer &operator=(NEDeconvolutionLayer &&) = default;
     /** Default destructor */
-    virtual ~NEDeconvolutionLayer() = default;
+    ~NEDeconvolutionLayer() = default;
 
     /** Set the input, weights, biases and output tensors.
      *
@@ -104,25 +102,50 @@ public:
      * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32    |QASYMM8_SIGNED |
      * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32    |QASYMM8_SIGNED |
      *
-     * @param[in,out] input   Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
-     * @param[in]     weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]     bias    Optional, ignored if NULL. The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
-     * @param[out]    output  Output tensor. The output has the same number of dimensions as the @p input.
-     * @param[in]     info    Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
+     * @param[in,out] input            Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
+     *                                 Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+     * @param[in]     weights          The 4d weights with dimensions [width, height, IFM, OFM].
+     *                                 Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]     bias             Optional, ignored if NULL. The biases have one dimension.
+     *                                 Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+     * @param[out]    output           Output tensor. The output has the same number of dimensions as the @p input.
+     * @param[in]     info             Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
+     * @param[in]     enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+     *                                 available which may introduce a drop of accuracy as well. Default is false
+     * @param[in]     weights_info     (Optional) Specifies the weight format. Default is unspecified. This parameter can be used to specify the weight format that is optimal for
+     *                                 the GEMM convolution.
      *
      */
-    void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info);
+    void configure(ITensor             *input,
+                   const ITensor       *weights,
+                   const ITensor       *bias,
+                   ITensor             *output,
+                   const PadStrideInfo &info,
+                   bool                 enable_fast_math = false,
+                   const WeightsInfo   &weights_info     = WeightsInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEDeconvolutionLayer
      *
-     * @param[in] input   Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
-     * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] bias    (Optional) The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
-     * @param[in] output  Output tensor info. The output has the same number of dimensions as the @p input.
-     * @param[in] info    Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
+     * @param[in] input            Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
+     *                             Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+     * @param[in] weights          The 4d weights info with dimensions [width, height, IFM, OFM].
+     *                             Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] bias             (Optional) The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+     * @param[in] output           Output tensor info. The output has the same number of dimensions as the @p input.
+     * @param[in] info             Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
+     * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+     *                             available which may introduce a drop of accuracy as well. Default is false
+     * @param[in] weights_info     (Optional) Specifies the weight format. Default is unspecified. This parameter can be used to specify the weight format that is optimal for
+     *                             the GEMM convolution.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info);
+    static Status validate(const ITensorInfo   *input,
+                           const ITensorInfo   *weights,
+                           const ITensorInfo   *bias,
+                           const ITensorInfo   *output,
+                           const PadStrideInfo &info,
+                           bool                 enable_fast_math = false,
+                           const WeightsInfo   &weights_info     = WeightsInfo());
 
     // Inherited methods overridden:
     void run() override;
@@ -140,6 +163,7 @@ private:
     ITensor           *_input;
     PadStrideInfo      _info;
     bool               _is_prepared;
+    bool               _do_upsampling;
 };
-} // arm_compute
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_NEDECONVOLUTIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
index eb0724ae12..7bfdfbd13d 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NEDEPTHCONVERT_H
 #define ARM_COMPUTE_NEDEPTHCONVERT_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
 
@@ -84,7 +83,8 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0);
 
     // Inherited methods overridden
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
index b9bdcd1f11..d27369670e 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,26 +21,27 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYER_H
-#define ARM_COMPUTE_NEDEPTHTOSPACELAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
 
+#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
 
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include <memory>
 
 namespace arm_compute
 {
 // Forward declarations
 class ITensor;
 class ITensorInfo;
+class NEDepthToSpaceLayerKernel;
 
 /** Basic function to run @ref NEDepthToSpaceLayerKernel. */
-class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder
+class NEDepthToSpaceLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEDepthToSpaceLayer() = default;
+    NEDepthToSpaceLayer();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEDepthToSpaceLayer(const NEDepthToSpaceLayer &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -50,7 +51,7 @@ public:
     /** Prevent instances of this class from being moved (As this class contains non movable objects) */
     NEDepthToSpaceLayer &operator=(NEDepthToSpaceLayer &&) = delete;
     /** Default destructor */
-    ~NEDepthToSpaceLayer() = default;
+    ~NEDepthToSpaceLayer();
     /** Set the input and output tensors.
      *
      * Valid data layouts:
@@ -76,6 +77,11 @@ public:
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+    void run() override;
+
+private:
+    std::unique_ptr<NEDepthToSpaceLayerKernel> _kernel;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 2f541758f4..6ad5aa7bfa 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -28,6 +28,7 @@
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -80,8 +81,14 @@ public:
      * @param[in]      act_info         (Optional) Activation layer information in case of a fused activation.
      * @param[in]      dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      */
-    void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                   unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+    void configure(ITensor                   *input,
+                   const ITensor             *weights,
+                   const ITensor             *biases,
+                   ITensor                   *output,
+                   const PadStrideInfo       &conv_info,
+                   unsigned int               depth_multiplier = 1,
+                   const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                   const Size2D              &dilation         = Size2D(1U, 1U));
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer
      *
@@ -98,8 +105,14 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *output,
+                           const PadStrideInfo       &conv_info,
+                           unsigned int               depth_multiplier = 1,
+                           const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                           const Size2D              &dilation         = Size2D(1U, 1U));
 
     // Inherited methods overriden:
     void run() override;
@@ -112,7 +125,7 @@ private:
     *
     * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
     * -# @ref NEDepthwiseConvolutionLayer3x3Kernel if 3x3 and no assembly kernel implementation is present
-    * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
+    * -# @ref cpu::CpuDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
     * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required
     * -# @ref NEActivationLayer if fused activation is required
     *
@@ -127,9 +140,11 @@ private:
         /** Default move constructor */
         NEDepthwiseConvolutionLayerOptimizedInternal(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default;
         /** Prevent instances of this class from being copied (As this class contains pointers) */
-        NEDepthwiseConvolutionLayerOptimizedInternal &operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete;
+        NEDepthwiseConvolutionLayerOptimizedInternal &
+        operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete;
         /** Default move assignment operator */
-        NEDepthwiseConvolutionLayerOptimizedInternal &operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default;
+        NEDepthwiseConvolutionLayerOptimizedInternal &
+        operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default;
         /** Default destructor */
         ~NEDepthwiseConvolutionLayerOptimizedInternal() = default;
         /** Initialize the function's source, destination, kernels and border_size.
@@ -144,8 +159,14 @@ private:
          * @param[in]      act_info         (Optional) Activation layer information in case of a fused activation.
          * @param[in]      dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
          */
-        void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                       unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+        void configure(ITensor                   *input,
+                       const ITensor             *weights,
+                       const ITensor             *biases,
+                       ITensor                   *output,
+                       const PadStrideInfo       &conv_info,
+                       unsigned int               depth_multiplier = 1,
+                       const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                       const Size2D              &dilation         = Size2D(1U, 1U));
 
         /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3
          *
@@ -161,8 +182,14 @@ private:
          *
          * @return a status
          */
-        static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                               unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+        static Status validate(const ITensorInfo         *input,
+                               const ITensorInfo         *weights,
+                               const ITensorInfo         *biases,
+                               const ITensorInfo         *output,
+                               const PadStrideInfo       &conv_info,
+                               unsigned int               depth_multiplier = 1,
+                               const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                               const Size2D              &dilation         = Size2D(1U, 1U));
 
         // Inherited methods overriden:
         void run() override;
@@ -207,8 +234,14 @@ private:
          * @param[in]      act_info         (Optional) Activation layer information in case of a fused activation.
          * @param[in]      dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
          */
-        void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                       unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+        void configure(ITensor                   *input,
+                       const ITensor             *weights,
+                       const ITensor             *biases,
+                       ITensor                   *output,
+                       const PadStrideInfo       &conv_info,
+                       unsigned int               depth_multiplier = 1,
+                       const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                       const Size2D              &dilation         = Size2D(1U, 1U));
 
         /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerGeneric
          *
@@ -225,8 +258,14 @@ private:
          *
          * @return a status
          */
-        static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                               unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+        static Status validate(const ITensorInfo         *input,
+                               const ITensorInfo         *weights,
+                               const ITensorInfo         *biases,
+                               const ITensorInfo         *output,
+                               const PadStrideInfo       &conv_info,
+                               unsigned int               depth_multiplier = 1,
+                               const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                               const Size2D              &dilation         = Size2D(1U, 1U));
 
         // Inherited methods overriden:
         void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
index 2affa8d49e..7a94833d10 100644
--- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
@@ -24,13 +24,12 @@
 #ifndef ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H
 #define ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H
 
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <map>
@@ -78,8 +77,14 @@ public:
      *
      * @note Output contains all the detections. Of those, only the ones selected by the valid region are valid.
      */
-    void configure(const ITensor *input_box_encoding, const ITensor *input_score, const ITensor *input_anchors,
-                   ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
+    void configure(const ITensor                *input_box_encoding,
+                   const ITensor                *input_score,
+                   const ITensor                *input_anchors,
+                   ITensor                      *output_boxes,
+                   ITensor                      *output_classes,
+                   ITensor                      *output_scores,
+                   ITensor                      *num_detection,
+                   DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEDetectionPostProcessLayer
      *
      * @param[in] input_box_encoding The bounding box input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32.
@@ -93,8 +98,13 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
-                           ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection,
+    static Status validate(const ITensorInfo            *input_box_encoding,
+                           const ITensorInfo            *input_class_score,
+                           const ITensorInfo            *input_anchors,
+                           ITensorInfo                  *output_boxes,
+                           ITensorInfo                  *output_classes,
+                           ITensorInfo                  *output_scores,
+                           ITensorInfo                  *num_detection,
                            DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index 70352fdfaa..3ae3b2a15c 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
@@ -84,7 +85,12 @@ public:
      * @param[in]      conv_info Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in]      act_info  (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ITensor                   *input,
+                   const ITensor             *weights,
+                   const ITensor             *bias,
+                   ITensor                   *output,
+                   const PadStrideInfo       &conv_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer
      *
      * @note: DirectConvolution only works in the following configurations:
@@ -105,7 +111,11 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *bias,
+                           const ITensorInfo         *output,
+                           const PadStrideInfo       &conv_info,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
index 95274bdb0c..ebf2277d1f 100644
--- a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
+++ b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NEELEMENTWISEOPERATIONS_H
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/NEON/INEOperator.h"
 
@@ -72,7 +73,10 @@ public:
      * @param[out]     output   Output tensor. Data types supported: Same as @p input1.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ITensor                   *input1,
+                   ITensor                   *input2,
+                   ITensor                   *output,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for max
      *
      * @param[in] input1   First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -82,7 +86,10 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *output,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run() override;
@@ -132,7 +139,10 @@ public:
      * @param[out]     output   Output tensor. Data types supported: Same as @p input1.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ITensor                   *input1,
+                   ITensor                   *input2,
+                   ITensor                   *output,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for min
      *
      * @param[in] input1   First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -142,7 +152,10 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *output,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run() override;
@@ -192,7 +205,10 @@ public:
      * @param[out]     output   Output tensor. Data types supported: Same as @p input1.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ITensor                   *input1,
+                   ITensor                   *input2,
+                   ITensor                   *output,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for squared difference
      *
      * @param[in] input1   First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -202,7 +218,10 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *output,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run() override;
@@ -248,7 +267,10 @@ public:
      * @param[out]     output   Output tensor. Data types supported: Same as @p input1.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ITensor                   *input1,
+                   ITensor                   *input2,
+                   ITensor                   *output,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for division
      *
      * @param[in] input1   First tensor input info. Data types supported: F16/F32.
@@ -258,7 +280,10 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *output,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run() override;
@@ -305,7 +330,10 @@ public:
      * @param[out]     output   Output tensor. Data types supported: Same as @p input1.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ITensor                   *input1,
+                   ITensor                   *input2,
+                   ITensor                   *output,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for power
      *
      * @param[in] input1   First tensor input info. Data types supported: F16/F32.
@@ -315,7 +343,10 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *output,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run() override;
@@ -376,7 +407,8 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op);
+    static Status
+    validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h
index 9654b1e604..99c6fd4eb4 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT1D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NEFFT1D_H
 #define ARM_COMPUTE_NEFFT1D_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
diff --git a/arm_compute/runtime/NEON/functions/NEFFT2D.h b/arm_compute/runtime/NEON/functions/NEFFT2D.h
index 57f38d1942..cefd3df17a 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT2D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT2D.h
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NEFFT2D_H
 #define ARM_COMPUTE_NEFFT2D_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
 #include "arm_compute/runtime/Tensor.h"
diff --git a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
index c5f4d45b6b..84bfe6b02f 100644
--- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H
 #define ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEFFT2D.h"
@@ -94,8 +93,13 @@ public:
      * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
      * @param[in]  enable_fast_math (Optional) Enable fast math computation. Unused for CPU backend.
      */
-    void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                   const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+    void configure(ITensor                   *input,
+                   const ITensor             *weights,
+                   const ITensor             *biases,
+                   ITensor                   *output,
+                   const PadStrideInfo       &conv_info,
+                   const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                   bool                       enable_fast_math = false);
     /** Static function to check if given info will lead to a valid configuration of @ref NEFFTConvolutionLayer
      *
      * @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout
@@ -113,8 +117,13 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *output,
+                           const PadStrideInfo       &conv_info,
+                           const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                           bool                       enable_fast_math = false);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEFill.h b/arm_compute/runtime/NEON/functions/NEFill.h
index e923ce33e1..1829c71fef 100644
--- a/arm_compute/runtime/NEON/functions/NEFill.h
+++ b/arm_compute/runtime/NEON/functions/NEFill.h
@@ -24,10 +24,9 @@
 #ifndef ARM_COMPUTE_NEFILL_H
 #define ARM_COMPUTE_NEFILL_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
 
diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h
index ab77c28839..44b1d4a62b 100644
--- a/arm_compute/runtime/NEON/functions/NEFillBorder.h
+++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -57,7 +58,10 @@ public:
      * @param[in]      border_mode           Strategy to use for borders.
      * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    void configure(ITensor          *input,
+                   unsigned int      border_width,
+                   BorderMode        border_mode,
+                   const PixelValue &constant_border_value = PixelValue());
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEFloor.h b/arm_compute/runtime/NEON/functions/NEFloor.h
index 4d47b068db..77ac484bab 100644
--- a/arm_compute/runtime/NEON/functions/NEFloor.h
+++ b/arm_compute/runtime/NEON/functions/NEFloor.h
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NEFLOOR_H
 #define ARM_COMPUTE_NEFLOOR_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
 
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 9727e108a5..885f8430cf 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,16 +24,15 @@
 #ifndef ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H
 #define ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H
 
+#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
 #include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
-#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
 #include "arm_compute/runtime/Tensor.h"
 
+#include <memory>
+
 namespace arm_compute
 {
 namespace weights_transformations
@@ -77,10 +76,10 @@ private:
 } // namespace weights_transformations
 
 /** Basic function to compute a Fully Connected layer. This function calls the following kernels:
- *  -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
+ *  -# @ref cpu::kernels::CpuIm2ColKernel (called when the input comes from a convolutional layer)
  *  -# @ref NETranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once)
- *  -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
- *  -# @ref NEGEMMMatrixAdditionKernel or @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is not equal to nullptr)
+ *  -# @ref NEGEMM or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
+ *  -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel or @ref NEGEMMLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr)
  *
  * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
  */
@@ -88,7 +87,8 @@ class NEFullyConnectedLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+    NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager  = nullptr,
+                          IWeightsManager                *weights_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEFullyConnectedLayer(const NEFullyConnectedLayer &) = delete;
     /** Prevent instances of this class from being moved (As this class contains pointers) */
@@ -113,66 +113,65 @@ public:
      * |QASYMM8        |QASYMM8            |S32    |QASYMM8        |
      * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32    |QASYMM8_SIGNED |
      *
-     * @param[in]  input   Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights Weights tensor. The weights must be 2 dimensional.
-     *                     If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
-     *                     If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
-     *                     Data type supported: Same as @p input.
-     * @param[in]  biases  Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output  Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
-     *                     - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
-     *                     - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
-     *                     Data type supported: Same as @p input.
-     * @param[in]  fc_info (Optional) Fully connected layer additional info
+     * @param[in]  input        Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  weights      Weights tensor. The weights must be 2 dimensional.
+     *                          If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
+     *                          If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
+     *                          Data type supported: Same as @p input.
+     * @param[in]  biases       Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output       Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
+     *                          - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
+     *                          - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
+     *                          Data type supported: Same as @p input.
+     * @param[in]  fc_info      (Optional) Fully connected layer additional info
+     * @param[in]  weights_info (Optional) Stores neccessary compute information when weights are already reshaped
      */
-    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
-                   FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+    void configure(const ITensor          *input,
+                   const ITensor          *weights,
+                   const ITensor          *biases,
+                   ITensor                *output,
+                   FullyConnectedLayerInfo fc_info      = FullyConnectedLayerInfo(),
+                   const WeightsInfo      &weights_info = WeightsInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEFullyConnectedLayer
      *
-     * @param[in] input   Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
-     *                    If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
-     *                    If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
-     *                    Data type supported: Same as @p input.
-     * @param[in] biases  Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] output  Destination tensor info. Its shape should be equal to the output of a matrix multiplication between:
-     *                    - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
-     *                    - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
-     *                    Data type supported: Same as @p input.
-     * @param[in] fc_info (Optional) Fully connected layer additional info
+     * Similar to @ref NEFullyConnectedLayer::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo      *input,
+                           const ITensorInfo      *weights,
+                           const ITensorInfo      *biases,
+                           const ITensorInfo      *output,
+                           FullyConnectedLayerInfo fc_info      = FullyConnectedLayerInfo(),
+                           const WeightsInfo      &weights_info = WeightsInfo());
+
+    /** Static function that queries whether fixed-format kernel exists for a given problem description
+     *
+     * @param[out] expected_weight_format Format in which weights should be for found fixed format kernel
+     * @param[in]  input                  Source tensor
+     * @param[in]  weights                Weights tensor.
+     * @param[in]  biases                 Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  output                 Destination tensor
+     * @param[in]  fc_info                Fully connected layer additional info
+     * @param[in]  weights_info           Describes weights shape
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                           FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+    static Status has_opt_impl(arm_compute::WeightFormat     &expected_weight_format,
+                               const ITensorInfo             *input,
+                               const ITensorInfo             *weights,
+                               const ITensorInfo             *biases,
+                               const ITensorInfo             *output,
+                               const FullyConnectedLayerInfo &fc_info,
+                               const WeightsInfo             &weights_info);
 
     //Inherited methods override
     void run() override;
     void prepare() override;
 
 private:
-    void configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act);
-    void configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act);
-    void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act);
-
-    MemoryGroup                                                         _memory_group;
-    IWeightsManager                                                    *_weights_manager;
-    NEFlattenLayer                                                      _flatten;
-    NEConvertFullyConnectedWeights                                      _convert_weights;
-    weights_transformations::NEConvertFullyConnectedWeightsManaged      _convert_weights_managed;
-    NETranspose                                                         _reshape_weights_function;
-    weights_transformations::NEFullyConnectedLayerReshapeWeightsManaged _reshape_weights_managed_function;
-    NEGEMM                                                              _mm_gemm;
-    NEGEMMLowpMatrixMultiplyCore                                        _mm_gemmlowp;
-    Tensor                                                              _flatten_output;
-    Tensor                                                              _converted_weights_output;
-    Tensor                                                              _reshape_weights_output;
-    const ITensor                                                      *_original_weights;
-    bool                                                                _are_weights_converted;
-    bool                                                                _are_weights_reshaped;
-    bool                                                                _is_fc_after_conv;
-    bool                                                                _is_quantized_asymmetric;
-    bool                                                                _is_prepared;
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
index 3dd7f49044..f53b3de7f6 100644
--- a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
+++ b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
@@ -75,9 +75,16 @@ public:
      * @param[in]  epsilon       (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
      * @param[in]  fbn_type      (Optional) Fused batch normalization type. Defaults to Convolution.
      */
-    void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias,
-                   const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr,
-                   float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    void configure(const ITensor             *input_weights,
+                   const ITensor             *bn_mean,
+                   const ITensor             *bn_var,
+                   ITensor                   *fused_weights,
+                   ITensor                   *fused_bias,
+                   const ITensor             *input_bias = nullptr,
+                   const ITensor             *bn_beta    = nullptr,
+                   const ITensor             *bn_gamma   = nullptr,
+                   float                      epsilon    = 0.001f,
+                   FuseBatchNormalizationType fbn_type   = FuseBatchNormalizationType::CONVOLUTION);
     /** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalization
      *
      * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -95,10 +102,16 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                           const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                           const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
-                           float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    static Status validate(const ITensorInfo         *input_weights,
+                           const ITensorInfo         *bn_mean,
+                           const ITensorInfo         *bn_var,
+                           const ITensorInfo         *fused_weights,
+                           const ITensorInfo         *fused_bias,
+                           const ITensorInfo         *input_bias = nullptr,
+                           const ITensorInfo         *bn_beta    = nullptr,
+                           const ITensorInfo         *bn_gamma   = nullptr,
+                           float                      epsilon    = 0.001f,
+                           FuseBatchNormalizationType fbn_type   = FuseBatchNormalizationType::CONVOLUTION);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index 6fa30bd545..29650a5eca 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,44 +24,18 @@
 #ifndef ARM_COMPUTE_NEGEMM_H
 #define ARM_COMPUTE_NEGEMM_H
 
-#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/function_info/GEMMInfo.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
-#include "arm_compute/runtime/Tensor.h"
 
 #include <memory>
 
 namespace arm_compute
 {
-// Forward declarations
-class NEGEMMInterleave4x4Kernel;
-class NEGEMMMatrixAdditionKernel;
-class NEGEMMMatrixMultiplyKernel;
-class NEGEMMTranspose1xWKernel;
-namespace cpu
-{
-class CpuGemmAssemblyDispatch;
-}
-
 /** Basic function to execute GEMM. This function calls the following kernels:
  *
- * If optimized assembly is available:
- *  -# @ref cpu::CpuGemmAssemblyDispatch
- *  -# @ref NEActivationLayer (if alpha != 1.0)
- * Else:
- *  -# @ref NEGEMMInterleave4x4Kernel (if the output tensor is a matrix)
- *  -# @ref NEGEMMTranspose1xWKernel (if the output tensor is a matrix)
- *  -# @ref NEGEMMMatrixMultiplyKernel
- * In both cases:
- *  -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once)
- * Else:
- *  -# @ref NEArithmeticAddition (if c != nullptr and is reshaped once and not optimized assembly in place)
- *
- *  -# @ref NEActivationLayer (if activation is specified in GEMMInfo)
+ *  -# @ref cpu::CpuGemm
  */
 class NEGEMM : public IFunction
 {
@@ -93,6 +67,8 @@ public:
      * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
      * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function.
      *
+     * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around
+     *
      * @param[in]  a         First input tensor  (Matrix A or Vector A). Data type supported: BFLOAT16/F16/F32
      * @param[in]  b         Second input tensor (Matrix B). Data type supported: same as @p a
      * @param[in]  c         Third input tensor  (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
@@ -102,51 +78,49 @@ public:
      * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
      *                       if the reshape of matrix B should happen only for the first run
      */
-    void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+    void configure(const ITensor  *a,
+                   const ITensor  *b,
+                   const ITensor  *c,
+                   ITensor        *d,
+                   float           alpha,
+                   float           beta,
+                   const GEMMInfo &gemm_info = GEMMInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMM.
      *
-     * @param[in]  a         First input tensor info  (Matrix or Vector A). Data types supported: BFLOAT16/F16/F32
-     * @param[in]  b         Second input tensor info (Matrix B). Data type supported: same as @p a.
-     * @param[in]  c         Third input tensor info  (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
-     * @param[out] output    Output tensor info. Data type supported: same as @p a
-     * @param[in]  alpha     Weight of the matrix product
-     * @param[in]  beta      Weight of matrix C
-     * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
-     *                       if the reshape of matrix B should happen only for the first run
+     * Similar to @ref NEGEMM::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *output,
+                           float              alpha,
+                           float              beta,
+                           const GEMMInfo    &gemm_info = GEMMInfo());
+
+    /** Static function that queries whether there exists fixed-format kernel and if it exists it will return in the first argument in what format
+     * weights are expected to be reshaped as defined by WeightFormat class. Apart from the first argument the rest of the arguments are the same
+     * as in @ref NEGEMM::validate() except that all arguments are required.
+     *
+     * @return a status
+     */
+    static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                               const ITensorInfo         *a,
+                               const ITensorInfo         *b,
+                               const ITensorInfo         *c,
+                               const ITensorInfo         *output,
+                               float                      alpha,
+                               float                      beta,
+                               const GEMMInfo            &gemm_info = GEMMInfo());
 
     // Inherited methods overridden:
     void run() override;
     void prepare() override;
 
 private:
-    MemoryGroup                                   _memory_group;
-    IWeightsManager                              *_weights_manager;
-    std::unique_ptr<NEGEMMInterleave4x4Kernel>    _interleave_kernel;
-    std::unique_ptr<NEGEMMTranspose1xWKernel>     _transpose_kernel;
-    std::unique_ptr<NEGEMMMatrixMultiplyKernel>   _mm_kernel;
-    std::unique_ptr<cpu::CpuGemmAssemblyDispatch> _asm_glue;
-    std::unique_ptr<NEGEMMMatrixAdditionKernel>   _ma_kernel;
-    NEActivationLayer                             _alpha_scale_func;
-    NEArithmeticAddition                          _add_bias;
-    NEActivationLayer                             _activation_func;
-
-    Tensor         _tmp_a;
-    Tensor         _tmp_b;
-    Tensor         _tmp_d;
-    const ITensor *_original_b;
-    bool           _run_vector_matrix_multiplication;
-    bool           _run_alpha_scale;
-    bool           _run_addition;
-    bool           _run_bias_addition;
-    bool           _run_activation;
-    bool           _reshape_b_only_on_first_run;
-    bool           _is_prepared;
-
-    ITensorPack _asm_glue_tensors{};
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEGEMM_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
index f39ce4dfa3..d1c5a1c9b3 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
@@ -29,15 +29,12 @@
 #include "arm_compute/runtime/IMemoryManager.h"
 
 #include <memory>
+
 namespace arm_compute
 {
 // Forward declarations
 class ITensor;
 class ITensorInfo;
-namespace cpu
-{
-class CpuGemmAssemblyDispatch;
-}
 
 /** Basic function to compute the convolution layer. This function calls the following kernels/functions:
  *
@@ -89,7 +86,8 @@ public:
      *                     Data types supported: Same as @p input.
      * @param[in]  info    Convolution layer descriptor
      */
-    void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info);
+    void
+    configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info);
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConv2d
      *
      * @param[in] input   Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
@@ -105,7 +103,11 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *weights,
+                           const ITensorInfo *biases,
+                           const ITensorInfo *output,
+                           const Conv2dInfo  &info);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index e89eae1d31..3e84c3e2cf 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,147 +24,31 @@
 #ifndef ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H
 #define ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
-#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
-#include "arm_compute/runtime/Tensor.h"
 
 #include <memory>
 
 namespace arm_compute
 {
 class ITensor;
-class NECol2ImKernel;
-class NEIm2ColKernel;
-class NEWeightsReshapeKernel;
-
-/** Function to reshape the weights. This function calls the following kernel:
- * -# @ref NEWeightsReshapeKernel
- */
-class NEConvolutionLayerReshapeWeights : public IFunction
-{
-public:
-    /** Constructor */
-    NEConvolutionLayerReshapeWeights() noexcept;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionLayerReshapeWeights(const NEConvolutionLayerReshapeWeights &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEConvolutionLayerReshapeWeights(NEConvolutionLayerReshapeWeights &&) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionLayerReshapeWeights &operator=(const NEConvolutionLayerReshapeWeights &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEConvolutionLayerReshapeWeights &operator=(NEConvolutionLayerReshapeWeights &&) = delete;
-    /** Default destructor */
-    ~NEConvolutionLayerReshapeWeights();
-    /** Set the input and output tensors.
-     *
-     * @param[in]  weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                     Data type supported: All.
-     * @param[in]  biases  Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                     Data type supported: same as @p weights.
-     *                     @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[out] output  Destination tensor. Data types supported: same as @p weights.
-     */
-    void configure(const ITensor *weights, const ITensor *biases, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights
-     *
-     * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                    Data type supported: All.
-     * @param[in] biases  Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                    Data type supported: same as @p weights.
-     *                    @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[in] output  Destination tensor. Data types supported: same as @p weights.
-     *
-     * @return an error status
-     */
-    static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    std::unique_ptr<NEWeightsReshapeKernel> _weights_reshape_kernel;
-};
-
-namespace weights_transformations
-{
-/** Basic function to manage the reshape weights generated from @ref NEConvolutionLayerReshapeWeights */
-class NEConvolutionLayerReshapeWeightsTransform : public ITransformWeights
-{
-public:
-    /** Constructor */
-    NEConvolutionLayerReshapeWeightsTransform() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionLayerReshapeWeightsTransform(const NEConvolutionLayerReshapeWeightsTransform &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionLayerReshapeWeightsTransform &operator=(const NEConvolutionLayerReshapeWeightsTransform &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEConvolutionLayerReshapeWeightsTransform(NEConvolutionLayerReshapeWeightsTransform &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEConvolutionLayerReshapeWeightsTransform &operator=(NEConvolutionLayerReshapeWeightsTransform &&) = delete;
-    /** Default destructor */
-    ~NEConvolutionLayerReshapeWeightsTransform() = default;
-    void configure(const ITensor *input, const ITensor *biases)
-    {
-        _bias_bit = (biases != nullptr) ? 1 : 0;
-        _func.configure(input, biases, &_output);
-    }
-
-    void run() override
-    {
-        _output.allocator()->allocate();
-        _func.run();
-        _reshape_run = true;
-    }
-
-    ITensor *get_weights() override
-    {
-        return &_output;
-    }
-
-    void release() override
-    {
-        _output.allocator()->free();
-    }
-
-    uint32_t uid() override
-    {
-        return ((0x8) | (_bias_bit << 7));
-    }
-
-    bool is_reshape_run()
-    {
-        return _reshape_run;
-    }
-
-private:
-    Tensor                           _output{};
-    NEConvolutionLayerReshapeWeights _func{};
-    int32_t                          _bias_bit{ 0 };
-};
-} // namespace weights_transformations
+class ITensorInfo;
 
 /** Basic function to compute the convolution layer. This function calls the following kernels/functions:
  *
- * -# @ref NEIm2ColKernel
- * -# @ref NEGEMM (if the data type is BFLOAT16/FP16/FP32)
- * -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref NEArithmeticAddition (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout)
- * -# @ref NECol2ImKernel (if NCHW data layout)
+ * -# @ref cpu::CpuGemmConv2d
  *
  */
 class NEGEMMConvolutionLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+    NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager  = nullptr,
+                           IWeightsManager                       *weights_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEGEMMConvolutionLayer(const NEGEMMConvolutionLayer &) = delete;
     /** Prevent instances of this class from being moved (As this class contains non movable objects) */
@@ -192,118 +76,139 @@ public:
      * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |QASYMM8_SIGNED |
      * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |QASYMM8_SIGNED |
      *
-     * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
-     *                          while every optional dimension from 4 and above represent a batch of inputs.
-     *                          Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
-     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                          Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
-     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                          Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[out] output       Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                          Data types supported: Same as @p input.
-     * @param[in]  conv_info    Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
-     *                          tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
-     * @param[in]  dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  act_info     (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     * @param[in]  num_groups   (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+     * @param[in]  input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                              while every optional dimension from 4 and above represent a batch of inputs.
+     *                              Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in]  weights          Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                              Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     * @param[in]  biases           Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                              Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[out] output           Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                              Data types supported: Same as @p input.
+     * @param[in]  conv_info        Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  weights_info     Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+     *                              tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
+     * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in]  enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+     *                              available which may introduce a drop of accuracy as well. Default is false
+     * @param[in]  num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
      */
-    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
-                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+    void configure(const ITensor             *input,
+                   const ITensor             *weights,
+                   const ITensor             *biases,
+                   ITensor                   *output,
+                   const PadStrideInfo       &conv_info,
+                   const WeightsInfo         &weights_info     = WeightsInfo(),
+                   const Size2D              &dilation         = Size2D(1U, 1U),
+                   const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                   bool                       enable_fast_math = false,
+                   unsigned int               num_groups       = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
      *
-     * @param[in] input        Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
-     *                         while every optional dimension from 4 and above represent a batch of inputs.
-     *                         Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
-     * @param[in] weights      Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                         Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
-     * @param[in] biases       Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                         Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[in] output       Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                         Data types supported: Same as @p input.
-     * @param[in] conv_info    Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
-     *                         tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
-     * @param[in] dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] act_info     (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     * @param[in] num_groups   (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+     * @param[in] input            Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+     *                             while every optional dimension from 4 and above represent a batch of inputs.
+     *                             Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in] weights          Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                             Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     * @param[in] biases           Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                             Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[in] output           Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                             Data types supported: Same as @p input.
+     * @param[in] conv_info        Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in] weights_info     Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+     *                             tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
+     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in] act_info         (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+     *                             available which may introduce a drop of accuracy as well. Default is false
+     * @param[in] num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
-
-    // Inherited methods overridden:
-    void run() override;
-    void prepare() override;
-
-private:
-    /** Configures the appropriate matrix multiply routine
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *output,
+                           const PadStrideInfo       &conv_info,
+                           const WeightsInfo         &weights_info     = WeightsInfo(),
+                           const Size2D              &dilation         = Size2D(1U, 1U),
+                           const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                           bool                       enable_fast_math = false,
+                           unsigned int               num_groups       = 1);
+
+    /** Static function to check if there is an optimized version of
+     * GEMM available for the input parameters.
      *
-     * @param[in]  input         Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
-     * @param[in]  weights       Weights tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
-     * @param[in]  biases        Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                           Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[out] output        Output tensor. Data types supported: Same as @p input,
-     *                           except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
-     * @param[in]  act_info      (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     * @param[in]  gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
-     */
-    void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), int gemm_3d_depth = 1);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines
+     * The method is intended to be used to find out the optimal
+     * memory layout to be used for the weights tensor when running
+     * variable weights execution.
      *
-     * @param[in] input         Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
-     * @param[in] weights       Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
-     * @param[in] biases        Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                          Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[in] output        Output tensor info. Data types supported: Same as @p input,
-     *                          except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
-     * @param[in] act_info      (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
-     * @param[in] skip_im2col   (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. (Default to false)
+     * The user can query the database of optimised kernels in
+     * arm_gemm by specifying one of the enumerations of
+     * arm_compute::WeightFormat in the weight_format field of the input
+     * parameter weights_info. In case of success, the method
+     * writes the expected format in the output parameter
+     * expected_weight_format. The expected_weight_format can than be
+     * used in the configure method of the class for retrieving the
+     * best optimal kernel.
      *
-     * @return a status
-     */
-    static Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
-                              int gemm_3d_depth = 1, bool skip_im2col = false);
-    /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref NEGEMMLowpMatrixMultiplyCore
+     * Use case one - query for a specific format:
      *
-     * @param[in] input_info    Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
-     * @param[in] weights_info  Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
-     * @param[in] act_info      Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     * @param[in] gemm_3d_depth Depth of GEMM 3D
-     * @param[in] skip_im2col   Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout
+     *     WeightInfo weights_info(..., arm_compute::WeightFormat::OHWIo4, ...); // Set the value of the input query.
+     *     if (NEGEMMConvolutionlayer::has_opt_impl(WeightFormat(), ...., weights_info, ...))
+     *     {
+     *       auto conv = std::unique_ptr<NEGEMMConvolutionlayer>();
+     *       conv->configure(..., weights_info, ...);  // uses the same WeightFormat the user wanted originally, OHWYo4.
+     *       conv->run(...);
+     *     }
      *
-     * @return a status
+     * Use case two - query for any format that would be optimal for the GEMM to execute:
+     *
+     *     WeightInfo weights_info(..., arm_compute::WeightFormat::ANY, ...); // Set the value of the input query.
+     *     arm_compute::WeightFormat expected_wf;
+     *     if (NEGEMMConvolutionlayer::has_opt_impl(expected_wf, ...., weights_info, ...))
+     *     {
+     *       auto conv = std::unique_ptr<NEGEMMConvolutionlayer>();
+     *       // ... code to convert the layout of the weights tensor to the layout returned by has_opt_impl
+     *       WeightInfo new_weights_info(..., expected_wf, ...); // Set the value of the WeightFormat returned by has_opt_impl.
+     *       conv->configure(..., new_weights_info, ...);
+     *       conv->run(...);
+     *     }
+     *
+     * Notice that a GEMM configured with a WeightFormat other than
+     * UNSPECIFIED will run GEMM with variable weights mode.
+     *
+     * @param[out] expected_weight_format The arm_compute::WeightFormat expected by the kernel.
+     * @param[in]  src                    Source tensor info.
+     * @param[in]  weights                Weights tensor info.
+     * @param[in]  biases                 Biases tensor info. Shared biases supported.
+     * @param[in]  dst                    Destination tensor info.
+     * @param[in]  conv_info              Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  weights_info           (optional) Specifies additional configuration parameters for the weights of the GEMM computation.
+     * @param[in]  dilation               (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  act_info               (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. And no activation (i.e. Linear) which is the default value.
+     * @param[in]  enable_fast_math       (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+     *
+     * @return a Status
      */
-    static Status validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
+    static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                               const ITensorInfo         *src,
+                               const ITensorInfo         *weights,
+                               const ITensorInfo         *biases,
+                               const ITensorInfo         *dst,
+                               const PadStrideInfo       &conv_info,
+                               const WeightsInfo         &weights_info     = WeightsInfo(),
+                               const Size2D              &dilation         = Size2D(1U, 1U),
+                               const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                               bool                       enable_fast_math = false);
+    // Inherited methods overridden:
+    void run() override;
+    void prepare() override;
 
 private:
-    MemoryGroup                                                        _memory_group;
-    IWeightsManager                                                   *_weights_manager;
-    NEConvolutionLayerReshapeWeights                                   _reshape_weights;
-    weights_transformations::NEConvolutionLayerReshapeWeightsTransform _reshape_weights_managed;
-    std::unique_ptr<NEIm2ColKernel>                                    _im2col_kernel;
-    NEGEMM                                                             _mm_gemm;
-    NEGEMMLowpMatrixMultiplyCore                                       _mm_gemmlowp;
-    std::unique_ptr<NECol2ImKernel>                                    _col2im_kernel;
-    NEReshapeLayer                                                     _reshape_layer;
-
-    const ITensor *_original_weights;
-    const ITensor *_original_output;
-
-    Tensor _im2col_output;
-    Tensor _weights_reshaped;
-    Tensor _gemm_output;
-    Tensor _gemm_output_3d;
-    Tensor _tmp_output;
-
-    DataLayout _data_layout;
-
-    bool _skip_im2col;
-    bool _skip_col2im;
-    bool _is_quantized;
-    bool _is_prepared;
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONGEMMLAYER_H */
+#endif /* ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index dc9783f9eb..6d07675d3d 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,53 +21,34 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
-#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H
 
-#include "NEActivationLayer.h"
-#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/GEMMInfo.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
 
 #include <memory>
 
 namespace arm_compute
 {
 class ITensor;
-class NEConvertQuantizedSignednessKernel;
-class NEGEMMInterleave4x4Kernel;
-class NEGEMMLowpMatrixMultiplyKernel;
-class NEGEMMLowpOffsetContributionKernel;
-class NEGEMMLowpOffsetContributionOutputStageKernel;
-class NEGEMMLowpMatrixAReductionKernel;
-class NEGEMMLowpMatrixBReductionKernel;
-class NEGEMMTranspose1xWKernel;
-namespace cpu
-{
-class CpuGemmAssemblyDispatch;
-}
+class ITensorInfo;
 
-/** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
- *
- *  -# @ref NEGEMMInterleave4x4Kernel
- *  -# @ref NEGEMMTranspose1xWKernel
- *  -# @ref NEGEMMLowpMatrixMultiplyKernel
- *  -# @ref NEGEMMLowpOffsetContributionKernel
- *  -# @ref NEActivationLayer
+/** Function to run Gemm on quantized types.
  *
- * otherwise if the DOT product instruction is available:
+ *  This function calls the following:
  *
- *  -# @ref NEGEMMLowpOffsetContributionKernel
- *
-*/
+ * -# @ref cpu::CpuGemmLowpMatrixMultiplyCore
+ */
 class NEGEMMLowpMatrixMultiplyCore : public IFunction
 {
 public:
     /** Constructor */
-    NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+    NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager  = nullptr,
+                                 IWeightsManager                *weights_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete;
     /** Default move constructor */
@@ -99,6 +80,7 @@ public:
      * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |S32            |
      * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |S32            |
      * |QASYMM8_SIGNED |QSYMM8             |S32      |S32            |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |F32      |F32            |
      *
      * @note GEMM_LOWP:  low precision GEMM kernel
      *  This kernel performs the following computations:
@@ -107,71 +89,36 @@ public:
      *  -# Convert b values from QASYMM8 to int32 add b_offset to each of them.
      *  -# Compute the matrix product of the resulting a * b in int32.
      *
-     * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
+     * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED/F32 otherwise
      *
      * @param[in]  a         First input tensor  (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
      * @param[in]  b         Second input tensor (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
-     * @param[in]  c         Third input tensor  (Matrix C). It can be a nullptr. Data type supported: S32
-     * @param[out] output    Output tensor. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED
+     * @param[in]  c         Third input tensor  (Matrix C). It can be a nullptr. Data type supported: S32/F32
+     * @param[out] output    Output tensor. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED/F32
      * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
      *                       if the reshape of matrix B should be executed only for the first run
      */
-    void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+    void configure(
+        const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore
      *
-     * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
-     *
-     * @param[in] a         First input tensor info  (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in] b         Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
-     * @param[in] c         Third input tensor  info (Matrix C). It can be a nullptr. Data type supported: S32
-     * @param[in] output    Output tensor info. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED
-     * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
-     *                      if the reshape of matrix B should be executed only for the first run
+     * Similar to @ref NEGEMMLowpMatrixMultiplyCore::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *output,
+                           const GEMMInfo    &gemm_info = GEMMInfo());
 
     // Inherited methods overridden
     void run() override;
     void prepare() override;
 
 private:
-    MemoryGroup                                                    _memory_group;
-    IWeightsManager                                               *_weights_manager;
-    std::unique_ptr<cpu::CpuGemmAssemblyDispatch>                  _asm_glue;
-    std::unique_ptr<NEGEMMLowpMatrixMultiplyKernel>                _mm_kernel;
-    std::unique_ptr<NEGEMMInterleave4x4Kernel>                     _mtx_a_reshape_kernel;
-    std::unique_ptr<NEGEMMTranspose1xWKernel>                      _mtx_b_reshape_kernel;
-    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel>              _mtx_a_reduction_kernel;
-    std::unique_ptr<NEGEMMLowpMatrixBReductionKernel>              _mtx_b_reduction_kernel;
-    std::unique_ptr<NEGEMMLowpOffsetContributionKernel>            _offset_contribution_kernel;
-    std::unique_ptr<NEGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
-    NEActivationLayer                                              _activation_func;
-    std::unique_ptr<NEConvertQuantizedSignednessKernel>            _convert_to_signed_asymm;
-    std::unique_ptr<NEConvertQuantizedSignednessKernel>            _convert_from_signed_asymm;
-
-    Tensor         _vector_sum_col;
-    Tensor         _vector_sum_row;
-    Tensor         _tmp_a;
-    Tensor         _tmp_b;
-    Tensor         _mm_result_s32;
-    Tensor         _signed_a;
-    Tensor         _signed_output;
-    const ITensor *_original_b;
-    int32_t        _a_offset;
-    int32_t        _b_offset;
-
-    bool _run_vector_matrix_multiplication;
-    bool _assembly_path;
-    bool _fused_assembly_path;
-    bool _reshape_b_only_on_first_run;
-    bool _is_prepared;
-    bool _fuse_output_stage;
-    bool _run_activation;
-    bool _flip_signedness;
-
-    ITensorPack _asm_glue_tensors{};
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index fa5f5e3826..0d932bb4af 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -25,7 +25,7 @@
 #define ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
 
 /** This file contains all available output stages for GEMMLowp.
  *
@@ -39,237 +39,17 @@ namespace arm_compute
 {
 class ITensor;
 class ITensorInfo;
-
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint.
- *
- *  NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
- *
- *  result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- *  This function calls the following kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- *       after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public INESimpleFunctionNoBorder
-{
-public:
-    /** Constructor */
-    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &&) = delete;
-    /** Default destructor */
-    ~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint();
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
-     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
-     *
-     * @param[in] input  Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint.
- *
- *  NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters:
- *
- *  result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- *  This function calls the following kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- *       after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public INESimpleFunctionNoBorder
-{
-public:
-    /** Constructor */
-    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &&) = delete;
-    /** Default destructor */
-    ~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint();
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
-     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8_SIGNED
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
-     *
-     * @param[in] input  Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint.
- *
- *  NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters:
- *
- *  result_fixedpoint_multiplier, result_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift)
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- *  This function calls the following kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- *       after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public INESimpleFunctionNoBorder
-{
-public:
-    /** Constructor */
-    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &&) = delete;
-    /** Default destructor */
-    ~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint();
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: Data type supported: QSYMM16
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
-                   int max = std::numeric_limits<int32_t>::max());
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
-     *
-     * @param[in] input  Input tensor info. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in] bias   Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor info. Data type supported: Data type supported: QSYMM16
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QSYMM16,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
 /** Basic function to execute GEMMLowpQuantizeDown kernels.
  *
- *  This function calls the following kernels:
+ *  This function calls the following operators:
  *
- * -# @ref NEGEMMLowpQuantizeDownInt32ScaleKernel
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+ * -# @ref cpu::CpuGemmLowpOutputStage
 */
-class NEGEMMLowpOutputStage : public INESimpleFunctionNoBorder
+class NEGEMMLowpOutputStage : public IFunction
 {
 public:
     /** Constructor */
-    NEGEMMLowpOutputStage() = default;
+    NEGEMMLowpOutputStage();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEGEMMLowpOutputStage(const NEGEMMLowpOutputStage &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -309,7 +89,17 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info);
+    static Status validate(const ITensorInfo             *input,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *output,
+                           const GEMMLowpOutputStageInfo &info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGather.h b/arm_compute/runtime/NEON/functions/NEGather.h
index 393a38ee4d..9c7ae0134d 100644
--- a/arm_compute/runtime/NEON/functions/NEGather.h
+++ b/arm_compute/runtime/NEON/functions/NEGather.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,18 +49,17 @@ public:
      * |All            |All            |
      *
      * @param[in]  input   Source tensor. Supported tensor rank: up to 4. Data type supported: All
-     * @param[in]  indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following type: U32/S32. Each value Must be in range [0, input.shape[@p axis])
+     * @param[in]  indices Indices tensor. Supported tensor rank: up to 3. Must be one of the following type: U32/S32. Each value must be in range [0, input.shape[@p axis]), otherwise the result will become unpredictable.
+     *                     @note The "axis" must be in the range [0, input.rank -1] when indices is a vector, and must be 1 when indices is a 2D or 3D tensor.
      * @param[out] output  Destination tensor. Data type supported: Same as @p input
      * @param[in]  axis    (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+     *
      */
     void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
 
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGatherKernel
+    /** Static function to check if given info will lead to a valid configuration
      *
-     * @param[in] input   Source tensor info. Supported tensor rank: up to 4. Data type supported: All
-     * @param[in] indices Indices tensor info. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value Must be in range [0, input.shape[@p axis])
-     * @param[in] output  Destination tensor info. Data type supported: Same as @p input
-     * @param[in] axis    (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+     * Similar to @ref NEGather::configure()
      *
      * @return a status
      */
diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
index 3b683382ec..0f294fde22 100644
--- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
@@ -95,7 +95,12 @@ public:
      * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct.
      * @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid.
      */
-    void configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals,
+    void configure(const ITensor               *scores,
+                   const ITensor               *deltas,
+                   const ITensor               *anchors,
+                   ITensor                     *proposals,
+                   ITensor                     *scores_out,
+                   ITensor                     *num_valid_proposals,
                    const GenerateProposalsInfo &info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEGenerateProposalsLayer
@@ -112,7 +117,11 @@ public:
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
+    static Status validate(const ITensorInfo           *scores,
+                           const ITensorInfo           *deltas,
+                           const ITensorInfo           *anchors,
+                           const ITensorInfo           *proposals,
+                           const ITensorInfo           *scores_out,
                            const ITensorInfo           *num_valid_proposals,
                            const GenerateProposalsInfo &info);
 
diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
index bb0697072b..0bc57be09e 100644
--- a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
@@ -89,7 +89,11 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           float              gamma   = 1.0f,
+                           float              beta    = 0.0f,
+                           float              epsilon = 1e-12f);
 
     // Inherited methods overridden:
     void run() override;
@@ -103,5 +107,5 @@ private:
     Tensor                                              _permuted_input;
     Tensor                                              _permuted_output;
 };
-}
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
index 7f1a5e785e..8502cee5d2 100644
--- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
@@ -97,5 +97,5 @@ private:
     std::unique_ptr<NEL2NormalizeLayerKernel> _normalize_kernel;
     Tensor                                    _sumsq;
 };
-}
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_NEL2NORMALIZELAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
index 075fb4530a..629c5d10a0 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
@@ -25,7 +25,8 @@
 #define ARM_COMPUTE_NELSTMLAYER_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
 #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
@@ -35,7 +36,6 @@
 #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
 
 namespace arm_compute
 {
@@ -104,13 +104,26 @@ public:
      * @param[in]  projection_threshold        The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip].
      *                                         If set to 0.0 then clipping is disabled.
      */
-    void configure(const ITensor *input,
-                   const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
-                   const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
-                   const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
-                   const ITensor *output_state_in, const ITensor *cell_state_in,
-                   ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output,
-                   const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+    void configure(const ITensor             *input,
+                   const ITensor             *input_to_forget_weights,
+                   const ITensor             *input_to_cell_weights,
+                   const ITensor             *input_to_output_weights,
+                   const ITensor             *recurrent_to_forget_weights,
+                   const ITensor             *recurrent_to_cell_weights,
+                   const ITensor             *recurrent_to_output_weights,
+                   const ITensor             *forget_gate_bias,
+                   const ITensor             *cell_bias,
+                   const ITensor             *output_gate_bias,
+                   const ITensor             *output_state_in,
+                   const ITensor             *cell_state_in,
+                   ITensor                   *scratch_buffer,
+                   ITensor                   *output_state_out,
+                   ITensor                   *cell_state_out,
+                   ITensor                   *output,
+                   const LSTMParams<ITensor> &lstm_params,
+                   const ActivationLayerInfo &activation_info,
+                   float                      cell_threshold       = 0.f,
+                   float                      projection_threshold = 0.f);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer
      *
@@ -151,13 +164,26 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input,
-                           const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                           const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                           const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                           const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
-                           const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
-                           const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+    static Status validate(const ITensorInfo             *input,
+                           const ITensorInfo             *input_to_forget_weights,
+                           const ITensorInfo             *input_to_cell_weights,
+                           const ITensorInfo             *input_to_output_weights,
+                           const ITensorInfo             *recurrent_to_forget_weights,
+                           const ITensorInfo             *recurrent_to_cell_weights,
+                           const ITensorInfo             *recurrent_to_output_weights,
+                           const ITensorInfo             *forget_gate_bias,
+                           const ITensorInfo             *cell_bias,
+                           const ITensorInfo             *output_gate_bias,
+                           const ITensorInfo             *output_state_in,
+                           const ITensorInfo             *cell_state_in,
+                           const ITensorInfo             *scratch_buffer,
+                           const ITensorInfo             *output_state_out,
+                           const ITensorInfo             *cell_state_out,
+                           const ITensorInfo             *output,
+                           const LSTMParams<ITensorInfo> &lstm_params,
+                           const ActivationLayerInfo     &activation_info,
+                           float                          cell_threshold       = 0.f,
+                           float                          projection_threshold = 0.f);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
index 2f0c753691..ae951669b3 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NELSTMLAYERQUANTIZED_H
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
@@ -38,8 +39,6 @@
 #include "arm_compute/runtime/NEON/functions/NESlice.h"
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
 
-#include "arm_compute/runtime/common/LSTMParams.h"
-
 namespace arm_compute
 {
 // Forward declarations
@@ -50,7 +49,7 @@ class ITensor;
  * This function calls the following functions/kernels:
  *
  * -# @ref NEGEMMLowpMatrixMultiplyCore                          Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint   Convert 32-bit integers into QSYMM16
+ * -# @ref NEGEMMLowpOutputStage                                 Convert 32-bit integers into QSYMM16
  * -# @ref NETranspose                                           Matrix transpose
  * -# @ref NEConcatenateLayer                                    Tensor concatenation
  * -# @ref NEActivationLayer                                     Activation functions (tanh and logistic)
@@ -104,11 +103,22 @@ public:
      * @param[out] output_state_out            Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input.
      */
     void configure(const ITensor *input,
-                   const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
-                   const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
-                   const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
-                   ITensor *cell_state_in, const ITensor *output_state_in,
-                   ITensor *cell_state_out, ITensor *output_state_out);
+                   const ITensor *input_to_input_weights,
+                   const ITensor *input_to_forget_weights,
+                   const ITensor *input_to_cell_weights,
+                   const ITensor *input_to_output_weights,
+                   const ITensor *recurrent_to_input_weights,
+                   const ITensor *recurrent_to_forget_weights,
+                   const ITensor *recurrent_to_cell_weights,
+                   const ITensor *recurrent_to_output_weights,
+                   const ITensor *input_gate_bias,
+                   const ITensor *forget_gate_bias,
+                   const ITensor *cell_bias,
+                   const ITensor *output_gate_bias,
+                   ITensor       *cell_state_in,
+                   const ITensor *output_state_in,
+                   ITensor       *cell_state_out,
+                   ITensor       *output_state_out);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer
      *
@@ -133,11 +143,22 @@ public:
      * @return a status
      */
     static Status validate(const ITensorInfo *input,
-                           const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                           const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                           const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                           const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
-                           const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out);
+                           const ITensorInfo *input_to_input_weights,
+                           const ITensorInfo *input_to_forget_weights,
+                           const ITensorInfo *input_to_cell_weights,
+                           const ITensorInfo *input_to_output_weights,
+                           const ITensorInfo *recurrent_to_input_weights,
+                           const ITensorInfo *recurrent_to_forget_weights,
+                           const ITensorInfo *recurrent_to_cell_weights,
+                           const ITensorInfo *recurrent_to_output_weights,
+                           const ITensorInfo *input_gate_bias,
+                           const ITensorInfo *forget_gate_bias,
+                           const ITensorInfo *cell_bias,
+                           const ITensorInfo *output_gate_bias,
+                           const ITensorInfo *cell_state_in,
+                           const ITensorInfo *output_state_in,
+                           const ITensorInfo *cell_state_out,
+                           const ITensorInfo *output_state_out);
 
     // Inherited methods overridden:
     void run() override;
@@ -147,30 +168,30 @@ private:
     MemoryGroup _memory_group;
 
     // Functions used
-    NEGEMMLowpMatrixMultiplyCore                        _gemmlowp;
-    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage;
-    NETranspose                                         _transpose_weights;
-    NEConcatenateLayer                                  _concat_input_weights;
-    NEConcatenateLayer                                  _concat_recurrent_weights;
-    NEConcatenateLayer                                  _concat_weights;
-    NEConcatenateLayer                                  _concat_inputs;
-    NEConcatenateLayer                                  _concat_bias;
-    NEActivationLayer                                   _sigmoid_forget_gate;
-    NEActivationLayer                                   _sigmoid_input_gate;
-    NEActivationLayer                                   _sigmoid_output_gate;
-    NEActivationLayer                                   _tanh_modulation_gate;
-    NEActivationLayer                                   _tanh_output_state;
-    NEArithmeticAddition                                _add1;
-    NEArithmeticAddition                                _add2;
-    NEPixelWiseMultiplication                           _mul1;
-    NEPixelWiseMultiplication                           _mul2;
-    NEPixelWiseMultiplication                           _mul3;
-    NESlice                                             _slice_input_tensor;
-    NESlice                                             _slice_forget_tensor;
-    NESlice                                             _slice_cell_tensor;
-    NESlice                                             _slice_output_tensor;
-    NEDequantizationLayer                               _dequantize;
-    NEQuantizationLayer                                 _quantize;
+    NEGEMMLowpMatrixMultiplyCore _gemmlowp;
+    NEGEMMLowpOutputStage        _output_stage;
+    NETranspose                  _transpose_weights;
+    NEConcatenateLayer           _concat_input_weights;
+    NEConcatenateLayer           _concat_recurrent_weights;
+    NEConcatenateLayer           _concat_weights;
+    NEConcatenateLayer           _concat_inputs;
+    NEConcatenateLayer           _concat_bias;
+    NEActivationLayer            _sigmoid_forget_gate;
+    NEActivationLayer            _sigmoid_input_gate;
+    NEActivationLayer            _sigmoid_output_gate;
+    NEActivationLayer            _tanh_modulation_gate;
+    NEActivationLayer            _tanh_output_state;
+    NEArithmeticAddition         _add1;
+    NEArithmeticAddition         _add2;
+    NEPixelWiseMultiplication    _mul1;
+    NEPixelWiseMultiplication    _mul2;
+    NEPixelWiseMultiplication    _mul3;
+    NESlice                      _slice_input_tensor;
+    NESlice                      _slice_forget_tensor;
+    NESlice                      _slice_cell_tensor;
+    NESlice                      _slice_output_tensor;
+    NEDequantizationLayer        _dequantize;
+    NEQuantizationLayer          _quantize;
 
     // Tensor pointers
     const ITensor *_input_to_input_weights;
diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h
new file mode 100644
index 0000000000..58dd7a6f6b
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMatMul.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Settings for MatMul Cpu implementation*/
+class CpuMatMulSettings
+{
+public:
+    // get fast math flag
+    bool fast_math() const
+    {
+        return _fast_math;
+    }
+    // get fixed format flag
+    bool fixed_format() const
+    {
+        return _fixed_format;
+    }
+    // Set fast math flag
+    CpuMatMulSettings &fast_math(bool fmath)
+    {
+        _fast_math = fmath;
+        return *this;
+    }
+    // Set fixed format flag
+    CpuMatMulSettings &fixed_format(bool fixed_format)
+    {
+        _fixed_format = fixed_format;
+        return *this;
+    }
+
+private:
+    bool _fast_math{false};
+    bool _fixed_format{false};
+};
+
+// Forward declarations
+class ITensor;
+class ITensorInfo;
+class MatMulInfo;
+class Status;
+
+/** Basic function to run the following operators:
+ *
+ * -# @ref cpu::CpuMatMul
+ */
+class NEMatMul : public IFunction
+{
+public:
+    /** Constructor */
+    NEMatMul();
+    /** Destructor */
+    ~NEMatMul();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMatMul(const NEMatMul &) = delete;
+    /** Default move constructor */
+    NEMatMul(NEMatMul &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMatMul &operator=(const NEMatMul &) = delete;
+    /** Default move assignment operator */
+    NEMatMul &operator=(NEMatMul &&) = default;
+    /** Initialize
+     *
+     * Valid data layouts:
+     * - Any
+     *
+     * Valid data type configurations:
+     * |lhs            |rhs                |dst            |
+     * |:--------------|:------------------|:--------------|
+     * |F32            |F32                |F32            |
+     * |F16            |F16                |F16            |
+     * |BFLOAT16       |BFLOAT16           |BFLOAT16       |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |QASYMM8_SIGNED |
+     * |QASYMM8        |QASYMM8            |QASYMM8        |
+     *
+     * @param[in]  lhs      Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+     * @param[in]  rhs      Right-hand side tensor info. Data types supported: same as @p lhs.
+     * @param[out] dst      Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
+     * @param[in]  info     Contains MatMul operation information described in @ref MatMulInfo.
+     * @param[in]  settings Contains flags for function level settings i.e fast math
+     * @param[in]  act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
+     */
+    void configure(ITensor                   *lhs,
+                   ITensor                   *rhs,
+                   ITensor                   *dst,
+                   const MatMulInfo          &info,
+                   const CpuMatMulSettings   &settings,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEMatMul
+     *
+     * @param[in]  lhs      Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+     * @param[in]  rhs      Right-hand side tensor info. Data types supported: same as @p lhs.
+     * @param[out] dst      Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
+     * @param[in]  info     Contains MatMul operation information described in @ref MatMulInfo.
+     * @param[in]  settings Contains flags for function level settings i.e fast math
+     * @param[in]  act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
+     *
+     * @return Status
+     */
+    static Status validate(const ITensorInfo         *lhs,
+                           const ITensorInfo         *rhs,
+                           const ITensorInfo         *dst,
+                           const MatMulInfo          &info,
+                           const CpuMatMulSettings   &settings,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H
diff --git a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
index 41ea040457..e00fc4544f 100644
--- a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -33,12 +34,10 @@ namespace arm_compute
 class ITensor;
 class ITensorInfo;
 class NEFill;
-class NEMaxUnpoolingLayerKernel;
 
 /** Function to perform MaxUnpooling. This function calls the following kernels:
  *
  * -# @ref NEFill
- * -# @ref NEMaxUnpoolingLayerKernel
  */
 class NEMaxUnpoolingLayer : public IFunction
 {
@@ -88,14 +87,18 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo      *input,
+                           const ITensorInfo      *indices,
+                           const ITensorInfo      *output,
+                           const PoolingLayerInfo &pool_info);
 
     // Inherited methods overridden:
     void run() override;
 
 private:
-    std::unique_ptr<NEFill>                    _fill_func;
-    std::unique_ptr<NEMaxUnpoolingLayerKernel> _unpooling_layer_kernel;
+    std::unique_ptr<NEFill> _fill_func;
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
-}
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index fbe000445c..27e3fa674e 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NENORMALIZATIONLAYER_H
 #define ARM_COMPUTE_NENORMALIZATIONLAYER_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
@@ -88,16 +87,17 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
 
     // Inherited methods overridden:
     void run() override;
 
 private:
-    MemoryGroup                                 _memory_group;  /**< Function memory group */
-    std::unique_ptr<NENormalizationLayerKernel> _norm_kernel;   /**< Normalization layer kernel */
-    NEPixelWiseMultiplication                   _multiply_f;    /**< Pixel multiplication function */
-    Tensor                                      _input_squared; /**< The intermediate buffer which stores results of squaring input */
+    MemoryGroup                                 _memory_group; /**< Function memory group */
+    std::unique_ptr<NENormalizationLayerKernel> _norm_kernel;  /**< Normalization layer kernel */
+    NEPixelWiseMultiplication                   _multiply_f;   /**< Pixel multiplication function */
+    Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */
 };
-}
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_NENORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPadLayer.h b/arm_compute/runtime/NEON/functions/NEPadLayer.h
index 4aa6725496..494b1c0641 100644
--- a/arm_compute/runtime/NEON/functions/NEPadLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPadLayer.h
@@ -24,14 +24,14 @@
 #ifndef ARM_COMPUTE_NEPADLAYER_H
 #define ARM_COMPUTE_NEPADLAYER_H
 
+#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
 #include "arm_compute/runtime/NEON/functions/NECopy.h"
 #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
 #include "arm_compute/runtime/SubTensor.h"
-
-#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -82,7 +82,11 @@ public:
      * @param[in]  mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
      *                            or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
      */
-    void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+    void configure(ITensor           *input,
+                   ITensor           *output,
+                   const PaddingList &padding,
+                   const PixelValue   constant_value = PixelValue(),
+                   const PaddingMode  mode           = PaddingMode::CONSTANT);
     /**  Static function to check if given info will lead to a valid configuration of @ref NEPadLayer.
      *
      * @param[in] input          Source tensor info. Data types supported: All.
@@ -95,7 +99,11 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           const PaddingList &padding,
+                           const PixelValue   constant_value = PixelValue(),
+                           const PaddingMode  mode           = PaddingMode::CONSTANT);
 
     // Inherited methods overridden:
     void run() override;
@@ -109,7 +117,10 @@ private:
      *                            specifies the front and the end padding in the i-th dimension.
      * @param[in]  constant_value Constant value to be used for the padding
      */
-    void configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value);
+    void configure_constant_mode(ITensor           *input,
+                                 ITensor           *output,
+                                 const PaddingList &padding,
+                                 const PixelValue   constant_value);
     /** Configure functions for when reflect or symmetric padding is used.
      *
      * @param[in]  input  Source tensor. Data types supported: All.
diff --git a/arm_compute/runtime/NEON/functions/NEPermute.h b/arm_compute/runtime/NEON/functions/NEPermute.h
index c863fde0ac..2cef64764d 100644
--- a/arm_compute/runtime/NEON/functions/NEPermute.h
+++ b/arm_compute/runtime/NEON/functions/NEPermute.h
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NEPERMUTE_H
 #define ARM_COMPUTE_NEPERMUTE_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
 
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 4684c2d4b8..3d81bf6087 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,9 @@
 #ifndef ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H
 #define ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H
 
+#include "arm_compute/core/Rounding.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
@@ -93,7 +95,12 @@ public:
      * @param[in]      rounding_policy Rounding policy.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+    void configure(const ITensor             *input1,
+                   const ITensor             *input2,
+                   ITensor                   *output,
+                   float                      scale,
+                   ConvertPolicy              overflow_policy,
+                   RoundingPolicy             rounding_policy,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
      *
@@ -120,7 +127,12 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+    static Status validate(const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *output,
+                           float                      scale,
+                           ConvertPolicy              overflow_policy,
+                           RoundingPolicy             rounding_policy,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
@@ -156,7 +168,10 @@ public:
      * @param[out]     output   The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ITensor                   *input1,
+                   ITensor                   *input2,
+                   ITensor                   *output,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication
      *
      * @param[in] input1   An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
@@ -164,7 +179,10 @@ public:
      * @param[in] output   The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *output,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
new file mode 100644
index 0000000000..09251f2a5f
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEPOOLING3DLAYER_H
+#define ARM_COMPUTE_NEPOOLING3DLAYER_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+class ITensorInfo;
+class IMemoryManager;
+/** Basic function to simulate a pooling 3d layer with the specified pooling operation. This function calls the following kernels:
+ *
+ * -# @ref cpu::CpuPool3d
+ */
+class NEPooling3dLayer : public IFunction
+{
+public:
+    /** Constructor */
+    NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPooling3dLayer(const NEPooling3dLayer &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPooling3dLayer &operator=(const NEPooling3dLayer &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEPooling3dLayer(NEPooling3dLayer &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEPooling3dLayer &operator=(NEPooling3dLayer &&) = delete;
+    /** Default destructor */
+    ~NEPooling3dLayer();
+    /** Set the input and output tensors.
+     *
+     * Valid data layouts:
+     * - NDHWC
+     *
+     * Valid data type configurations:
+     * |src            |dst            |
+     * |:--------------|:--------------|
+     * |F16            |F16            |
+     * |F32            |F32            |
+     * |QASYMM8        |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+     *
+     * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
+     *
+     * @param[in]  input     Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output    Destination tensor.
+     * @param[in]  pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+     */
+    void configure(const ITensor *input, ITensor *output, const Pooling3dLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEPooling3dLayer
+     *
+     *
+     * @param[in] input     Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+     * @param[in] output    Destination tensor info.
+     * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEPOOLING3DLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index 9398e1fce9..768ad0d818 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -71,6 +71,8 @@ public:
      * |F32            |F32            |
      *
      * @note F16 is supported for pool sizes 2 and 3 only
+     * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
+     *       Cases where pooling region is completely outside input tensor are only supported for floating point data type
      *
      * @param[in, out] input     Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[out]     output    Destination tensor. Data types supported: Same as @p input.
@@ -89,7 +91,10 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+    static Status validate(const ITensorInfo      *input,
+                           const ITensorInfo      *output,
+                           const PoolingLayerInfo &pool_info,
+                           const ITensorInfo      *indices = nullptr);
 
     // Inherited methods overridden:
     void run() override;
@@ -98,5 +103,5 @@ private:
     struct Impl;
     std::unique_ptr<Impl> _impl;
 };
-}
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_NEPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
index 38e0c9f3ad..858e3299af 100644
--- a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
@@ -62,7 +62,10 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+    static Status validate(const ITensorInfo       *input1,
+                           const ITensorInfo       *input2,
+                           const ITensorInfo       *output,
+                           const PriorBoxLayerInfo &info);
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEPRIORBOXLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index 7c2e9bc5a1..009a4e0911 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,15 +25,17 @@
 #define ARM_COMPUTE_NEQLSTMLAYER_H
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
 #include "arm_compute/runtime/NEON/functions/NECopy.h"
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
 
 #include <memory>
 
@@ -43,8 +45,13 @@ namespace arm_compute
 class ITensor;
 class ITensorInfo;
 class NEQLSTMLayerNormalizationKernel;
-class NEGEMMLowpMatrixAReductionKernel;
-
+namespace cpu
+{
+namespace kernels
+{
+class CpuGemmLowpMatrixAReductionKernel;
+} // namespace kernels
+} // namespace cpu
 /** Basic function to run @ref NEQLSTMLayer
  *
  * This function calls the following kernels:
@@ -54,8 +61,8 @@ class NEGEMMLowpMatrixAReductionKernel;
  * -# @ref NEArithmeticSubtraction                               Elementwise subtraction
  * -# @ref NECopy                                                Copy kernel for copying output_state_out to output
  * -# @ref NEGEMMLowpMatrixMultiplyCore                          Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint   Convert 32-bit integers into QSYMM16
- * -# @ref NEGEMMLowpMatrixAReductionKernel                      For precomputing effective biases to use
+ * -# @ref NEGEMMLowpOutputStage                                 Convert 32-bit integers into QSYMM16
+ * -# @ref cpu::kernels::CpuGemmLowpMatrixAReductionKernel            For precomputing effective biases to use
  * -# @ref NEPixelWiseMultiplication                             Elementwise multiplication
  * -# @ref NETranspose                                           Transpose function for reshaping the weights
  * */
@@ -123,12 +130,21 @@ public:
      *                                         projection_threshold       (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
      *                                                                               [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
      */
-    void configure(const ITensor *input,
-                   const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
-                   const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
-                   const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
-                   const ITensor *cell_state_in, ITensor *output_state_in,
-                   ITensor *cell_state_out, ITensor *output_state_out, ITensor *output,
+    void configure(const ITensor             *input,
+                   const ITensor             *input_to_forget_weights,
+                   const ITensor             *input_to_cell_weights,
+                   const ITensor             *input_to_output_weights,
+                   const ITensor             *recurrent_to_forget_weights,
+                   const ITensor             *recurrent_to_cell_weights,
+                   const ITensor             *recurrent_to_output_weights,
+                   const ITensor             *forget_gate_bias,
+                   const ITensor             *cell_bias,
+                   const ITensor             *output_gate_bias,
+                   const ITensor             *cell_state_in,
+                   ITensor                   *output_state_in,
+                   ITensor                   *cell_state_out,
+                   ITensor                   *output_state_out,
+                   ITensor                   *output,
                    const LSTMParams<ITensor> &lstm_params);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayer
@@ -173,12 +189,21 @@ public:
      *                                                                              [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
      * @return a status
      */
-    static Status validate(const ITensorInfo *input,
-                           const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                           const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                           const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                           const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
-                           const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+    static Status validate(const ITensorInfo             *input,
+                           const ITensorInfo             *input_to_forget_weights,
+                           const ITensorInfo             *input_to_cell_weights,
+                           const ITensorInfo             *input_to_output_weights,
+                           const ITensorInfo             *recurrent_to_forget_weights,
+                           const ITensorInfo             *recurrent_to_cell_weights,
+                           const ITensorInfo             *recurrent_to_output_weights,
+                           const ITensorInfo             *forget_gate_bias,
+                           const ITensorInfo             *cell_bias,
+                           const ITensorInfo             *output_gate_bias,
+                           const ITensorInfo             *cell_state_in,
+                           const ITensorInfo             *output_state_in,
+                           const ITensorInfo             *cell_state_out,
+                           const ITensorInfo             *output_state_out,
+                           const ITensorInfo             *output,
                            const LSTMParams<ITensorInfo> &lstm_params);
 
     // Inherited methods overridden:
@@ -211,10 +236,17 @@ private:
      * @param[in] mm_res_info    Tensor info to be used to initialize output stage result tensor.
      *
      */
-    void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
-                      const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, Tensor *mm_res,
-                      Tensor *outstage_res, float gemmlowp_scale,
-                      const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info);
+    void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm,
+                      NEGEMMLowpOutputStage        &outstage,
+                      GEMMLowpOutputStageInfo      &gemmlowp_info,
+                      const ITensor                *mm_input,
+                      const ITensor                *mm_weights,
+                      const ITensor                *bias,
+                      Tensor                       *mm_res,
+                      Tensor                       *outstage_res,
+                      float                         gemmlowp_scale,
+                      const TensorInfo             &mm_res_info,
+                      const TensorInfo             &outstage_tensor_info);
 
     MemoryGroup _memory_group;
 
@@ -223,8 +255,8 @@ private:
     {
         static constexpr uint32_t max_dimension_supported = 2;
 
-        ITensor *_src{ nullptr };
-        ITensor *_dst{ nullptr };
+        ITensor *_src{nullptr};
+        ITensor *_dst{nullptr};
         size_t   _row_size{};
         Window   _window{};
 
@@ -250,70 +282,73 @@ private:
     };
 
     // Functions used
-    NETranspose                                       _transpose_input_to_forget_weights;
-    NETranspose                                       _transpose_input_to_cell_weights;
-    NETranspose                                       _transpose_input_to_output_weights;
-    NETranspose                                       _transpose_input_to_input_weights;
-    NETranspose                                       _transpose_recurrent_to_forget_weights;
-    NETranspose                                       _transpose_recurrent_to_cell_weights;
-    NETranspose                                       _transpose_recurrent_to_output_weights;
-    NETranspose                                       _transpose_recurrent_to_input_weights;
-    NETranspose                                       _transpose_projection_weights;
-    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_input_reduction;
-    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
-    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction;
-    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
-    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction;
-    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
-    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_output_reduction;
-    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
-    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _projection_reduction;
-    NEArithmeticAddition                              _projection_bias_add;
-    NEGEMMLowpMatrixMultiplyCore                      _mm_input_to_forget;
-    NEGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_forget;
-    NEPixelWiseMultiplication                         _pixelwise_mul_cell_to_forget;
-    NEGEMMLowpOutputStage                             _input_to_forget_outstage;
-    NEGEMMLowpOutputStage                             _recurrent_to_forget_outstage;
-    NEGEMMLowpOutputStage                             _cell_to_forget_outstage;
-    NEArithmeticAddition                              _accumulate_input_recurrent_forget;
-    NEArithmeticAddition                              _accumulate_cell_forget;
-    NEActivationLayer                                 _forget_gate_sigmoid;
-    NEGEMMLowpMatrixMultiplyCore                      _mm_input_to_cell;
-    NEGEMMLowpOutputStage                             _input_to_cell_outstage;
-    NEGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_cell;
-    NEGEMMLowpOutputStage                             _recurrent_to_cell_outstage;
-    NEArithmeticAddition                              _accumulate_input_recurrent_modulation;
-    NEActivationLayer                                 _cell_gate_tanh;
-    NEArithmeticSubtraction                           _input_gate_sub;
-    NEGEMMLowpMatrixMultiplyCore                      _mm_input_to_input;
-    NEGEMMLowpOutputStage                             _input_to_input_outstage;
-    NEGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_input;
-    NEGEMMLowpOutputStage                             _recurrent_to_input_outstage;
-    NEArithmeticAddition                              _accumulate_input_recurrent_input;
-    NEPixelWiseMultiplication                         _pixelwise_mul_cell_to_input;
-    NEGEMMLowpOutputStage                             _cell_to_input_outstage;
-    NEArithmeticAddition                              _accumulate_cell_input;
-    NEActivationLayer                                 _input_gate_sigmoid;
-    NEPixelWiseMultiplication                         _pixelwise_mul_forget_cell;
-    NEPixelWiseMultiplication                         _pixelwise_mul_input_cell;
-    NEArithmeticAddition                              _add_forget_cell;
-    NEActivationLayer                                 _cell_clip;
-    NEGEMMLowpMatrixMultiplyCore                      _mm_input_to_output;
-    NEGEMMLowpOutputStage                             _input_to_output_outstage;
-    NEGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_output;
-    NEGEMMLowpOutputStage                             _recurrent_to_output_outstage;
-    NEArithmeticAddition                              _accumulate_input_recurrent_output;
-    NEPixelWiseMultiplication                         _pixelwise_mul_cell_to_output;
-    NEGEMMLowpOutputStage                             _cell_to_output_outstage;
-    NEArithmeticAddition                              _accumulate_cell_to_output;
-    NEActivationLayer                                 _output_gate_sigmoid;
-    NEActivationLayer                                 _hidden_tanh;
-    NEPixelWiseMultiplication                         _pixelwise_mul_hidden;
-    NEGEMMLowpOutputStage                             _hidden_outstage;
-    NEGEMMLowpMatrixMultiplyCore                      _mm_projection;
-    NEGEMMLowpOutputStage                             _projection_outstage;
-    NEArithmeticAddition                              _accumulate_projection;
-    NEActivationLayer                                 _projection_clip;
+
+    NEDequantizationLayer                                            _dequantize_input_to_forget_weights;
+    NEQuantizationLayer                                              _quantize_input_to_forget_weights;
+    NETranspose                                                      _transpose_input_to_forget_weights;
+    NETranspose                                                      _transpose_input_to_cell_weights;
+    NETranspose                                                      _transpose_input_to_output_weights;
+    NETranspose                                                      _transpose_input_to_input_weights;
+    NETranspose                                                      _transpose_recurrent_to_forget_weights;
+    NETranspose                                                      _transpose_recurrent_to_cell_weights;
+    NETranspose                                                      _transpose_recurrent_to_output_weights;
+    NETranspose                                                      _transpose_recurrent_to_input_weights;
+    NETranspose                                                      _transpose_projection_weights;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_input_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_forget_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_cell_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_output_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _projection_reduction;
+    NEArithmeticAddition                                             _projection_bias_add;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_input_to_forget;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_recurrent_to_forget;
+    NEPixelWiseMultiplication                                        _pixelwise_mul_cell_to_forget;
+    NEGEMMLowpOutputStage                                            _input_to_forget_outstage;
+    NEGEMMLowpOutputStage                                            _recurrent_to_forget_outstage;
+    NEGEMMLowpOutputStage                                            _cell_to_forget_outstage;
+    NEArithmeticAddition                                             _accumulate_input_recurrent_forget;
+    NEArithmeticAddition                                             _accumulate_cell_forget;
+    NEActivationLayer                                                _forget_gate_sigmoid;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_input_to_cell;
+    NEGEMMLowpOutputStage                                            _input_to_cell_outstage;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_recurrent_to_cell;
+    NEGEMMLowpOutputStage                                            _recurrent_to_cell_outstage;
+    NEArithmeticAddition                                             _accumulate_input_recurrent_modulation;
+    NEActivationLayer                                                _cell_gate_tanh;
+    NEArithmeticSubtraction                                          _input_gate_sub;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_input_to_input;
+    NEGEMMLowpOutputStage                                            _input_to_input_outstage;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_recurrent_to_input;
+    NEGEMMLowpOutputStage                                            _recurrent_to_input_outstage;
+    NEArithmeticAddition                                             _accumulate_input_recurrent_input;
+    NEPixelWiseMultiplication                                        _pixelwise_mul_cell_to_input;
+    NEGEMMLowpOutputStage                                            _cell_to_input_outstage;
+    NEArithmeticAddition                                             _accumulate_cell_input;
+    NEActivationLayer                                                _input_gate_sigmoid;
+    NEPixelWiseMultiplication                                        _pixelwise_mul_forget_cell;
+    NEPixelWiseMultiplication                                        _pixelwise_mul_input_cell;
+    NEArithmeticAddition                                             _add_forget_cell;
+    NEActivationLayer                                                _cell_clip;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_input_to_output;
+    NEGEMMLowpOutputStage                                            _input_to_output_outstage;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_recurrent_to_output;
+    NEGEMMLowpOutputStage                                            _recurrent_to_output_outstage;
+    NEArithmeticAddition                                             _accumulate_input_recurrent_output;
+    NEPixelWiseMultiplication                                        _pixelwise_mul_cell_to_output;
+    NEGEMMLowpOutputStage                                            _cell_to_output_outstage;
+    NEArithmeticAddition                                             _accumulate_cell_to_output;
+    NEActivationLayer                                                _output_gate_sigmoid;
+    NEActivationLayer                                                _hidden_tanh;
+    NEPixelWiseMultiplication                                        _pixelwise_mul_hidden;
+    NEGEMMLowpOutputStage                                            _hidden_outstage;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_projection;
+    NEGEMMLowpOutputStage                                            _projection_outstage;
+    NEArithmeticAddition                                             _accumulate_projection;
+    NEActivationLayer                                                _projection_clip;
 
     TensorCopyKernel _projection_bias_copy;
     TensorCopyKernel _projection_output_to_accumulate_copy;
@@ -325,19 +360,16 @@ private:
     NECopy _copy_output;
 
     // Tensor pointers
-    const ITensor *_input_to_input_weights
-    {
-        nullptr
-    };
-    const ITensor *_recurrent_to_input_weights{ nullptr };
-    const ITensor *_projection_bias{ nullptr };
-    const ITensor *_input_to_forget_weights{ nullptr };
-    const ITensor *_input_to_cell_weights{ nullptr };
-    const ITensor *_input_to_output_weights{ nullptr };
-    const ITensor *_recurrent_to_forget_weights{ nullptr };
-    const ITensor *_recurrent_to_cell_weights{ nullptr };
-    const ITensor *_recurrent_to_output_weights{ nullptr };
-    const ITensor *_projection_weights{ nullptr };
+    const ITensor                                 *_input_to_input_weights{nullptr};
+    const ITensor                                 *_recurrent_to_input_weights{nullptr};
+    const ITensor                                 *_projection_bias{nullptr};
+    const ITensor                                 *_input_to_forget_weights{nullptr};
+    const ITensor                                 *_input_to_cell_weights{nullptr};
+    const ITensor                                 *_input_to_output_weights{nullptr};
+    const ITensor                                 *_recurrent_to_forget_weights{nullptr};
+    const ITensor                                 *_recurrent_to_cell_weights{nullptr};
+    const ITensor                                 *_recurrent_to_output_weights{nullptr};
+    const ITensor                                 *_projection_weights{nullptr};
     std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{};
     std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{};
 
@@ -372,63 +404,66 @@ private:
         return _layer_norms[getGateIndex(g)];
     }
 
-    void configure_layer_norm(LayerNormGate g, const ITensor *in);
+    void          configure_layer_norm(LayerNormGate g, const ITensor *in);
     static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);
 
     // Temporary tensors
-    Tensor _input_to_forget_weights_transposed{ nullptr };
-    Tensor _input_to_cell_weights_transposed{ nullptr };
-    Tensor _input_to_output_weights_transposed{ nullptr };
-    Tensor _input_to_input_weights_transposed{ nullptr };
-    Tensor _recurrent_to_forget_weights_transposed{ nullptr };
-    Tensor _recurrent_to_cell_weights_transposed{ nullptr };
-    Tensor _recurrent_to_output_weights_transposed{ nullptr };
-    Tensor _recurrent_to_input_weights_transposed{ nullptr };
-    Tensor _projection_weights_transposed{ nullptr };
-    Tensor _input_to_input_eff_bias{ nullptr };
-    Tensor _recurrent_to_input_eff_bias{ nullptr };
-    Tensor _input_to_forget_eff_bias{ nullptr };
-    Tensor _recurrent_to_forget_eff_bias{ nullptr };
-    Tensor _input_to_cell_eff_bias{ nullptr };
-    Tensor _recurrent_to_cell_eff_bias{ nullptr };
-    Tensor _input_to_output_eff_bias{ nullptr };
-    Tensor _recurrent_to_output_eff_bias{ nullptr };
-    Tensor _projection_reduction_res{ nullptr };
-    Tensor _projection_eff_bias{ nullptr };
-    Tensor _mm_input_to_forget_res{ nullptr };
-    Tensor _mm_recurrent_to_forget_res{ nullptr };
-    Tensor _mul_cell_to_forget_res{ nullptr };
-    Tensor _input_to_forget_outstage_res{ nullptr };
-    Tensor _cell_to_forget_outstage_res{ nullptr };
-    Tensor _recurrent_to_forget_outstage_res{ nullptr };
-    Tensor _forget_gate{ nullptr };
-    Tensor _mm_input_to_cell_res{ nullptr };
-    Tensor _input_to_cell_outstage_res{ nullptr };
-    Tensor _mm_recurrent_to_cell_res{ nullptr };
-    Tensor _recurrent_to_cell_outstage_res{ nullptr };
-    Tensor _cell_gate{ nullptr };
-    Tensor _mul_input_cell_res{ nullptr };
-    Tensor _mm_input_to_input_res{ nullptr };
-    Tensor _input_to_input_outstage_res{ nullptr };
-    Tensor _mm_recurrent_to_input_res{ nullptr };
-    Tensor _mul_cell_to_input_res{ nullptr };
-    Tensor _cell_to_input_outstage_res{ nullptr };
-    Tensor _recurrent_to_input_outstage_res{ nullptr };
-    Tensor _input_gate{ nullptr };
-    Tensor _mm_input_to_output_res{ nullptr };
-    Tensor _input_to_output_outstage_res{ nullptr };
-    Tensor _mm_recurrent_to_output_res{ nullptr };
-    Tensor _mul_cell_to_output_res{ nullptr };
-    Tensor _cell_to_output_outstage_res{ nullptr };
-    Tensor _recurrent_to_output_outstage_res{ nullptr };
-    Tensor _output_gate{ nullptr };
-    Tensor _hidden_mul_res{ nullptr };
-    Tensor _hidden_gate{ nullptr };
-    Tensor _mm_projection_res{ nullptr };
-    Tensor _projection_outstage_res{ nullptr };
-    Tensor _projection_out_res{ nullptr };
-    Tensor _projection_accumulate_res{ nullptr };
-    Tensor _ones{ nullptr };
+    Tensor _input_to_forget_weights_f32{nullptr};
+    Tensor _input_to_forget_weights_symm8{nullptr};
+
+    Tensor                                _input_to_forget_weights_transposed{nullptr};
+    Tensor                                _input_to_cell_weights_transposed{nullptr};
+    Tensor                                _input_to_output_weights_transposed{nullptr};
+    Tensor                                _input_to_input_weights_transposed{nullptr};
+    Tensor                                _recurrent_to_forget_weights_transposed{nullptr};
+    Tensor                                _recurrent_to_cell_weights_transposed{nullptr};
+    Tensor                                _recurrent_to_output_weights_transposed{nullptr};
+    Tensor                                _recurrent_to_input_weights_transposed{nullptr};
+    Tensor                                _projection_weights_transposed{nullptr};
+    Tensor                                _input_to_input_eff_bias{nullptr};
+    Tensor                                _recurrent_to_input_eff_bias{nullptr};
+    Tensor                                _input_to_forget_eff_bias{nullptr};
+    Tensor                                _recurrent_to_forget_eff_bias{nullptr};
+    Tensor                                _input_to_cell_eff_bias{nullptr};
+    Tensor                                _recurrent_to_cell_eff_bias{nullptr};
+    Tensor                                _input_to_output_eff_bias{nullptr};
+    Tensor                                _recurrent_to_output_eff_bias{nullptr};
+    Tensor                                _projection_reduction_res{nullptr};
+    Tensor                                _projection_eff_bias{nullptr};
+    Tensor                                _mm_input_to_forget_res{nullptr};
+    Tensor                                _mm_recurrent_to_forget_res{nullptr};
+    Tensor                                _mul_cell_to_forget_res{nullptr};
+    Tensor                                _input_to_forget_outstage_res{nullptr};
+    Tensor                                _cell_to_forget_outstage_res{nullptr};
+    Tensor                                _recurrent_to_forget_outstage_res{nullptr};
+    Tensor                                _forget_gate{nullptr};
+    Tensor                                _mm_input_to_cell_res{nullptr};
+    Tensor                                _input_to_cell_outstage_res{nullptr};
+    Tensor                                _mm_recurrent_to_cell_res{nullptr};
+    Tensor                                _recurrent_to_cell_outstage_res{nullptr};
+    Tensor                                _cell_gate{nullptr};
+    Tensor                                _mul_input_cell_res{nullptr};
+    Tensor                                _mm_input_to_input_res{nullptr};
+    Tensor                                _input_to_input_outstage_res{nullptr};
+    Tensor                                _mm_recurrent_to_input_res{nullptr};
+    Tensor                                _mul_cell_to_input_res{nullptr};
+    Tensor                                _cell_to_input_outstage_res{nullptr};
+    Tensor                                _recurrent_to_input_outstage_res{nullptr};
+    Tensor                                _input_gate{nullptr};
+    Tensor                                _mm_input_to_output_res{nullptr};
+    Tensor                                _input_to_output_outstage_res{nullptr};
+    Tensor                                _mm_recurrent_to_output_res{nullptr};
+    Tensor                                _mul_cell_to_output_res{nullptr};
+    Tensor                                _cell_to_output_outstage_res{nullptr};
+    Tensor                                _recurrent_to_output_outstage_res{nullptr};
+    Tensor                                _output_gate{nullptr};
+    Tensor                                _hidden_mul_res{nullptr};
+    Tensor                                _hidden_gate{nullptr};
+    Tensor                                _mm_projection_res{nullptr};
+    Tensor                                _projection_outstage_res{nullptr};
+    Tensor                                _projection_out_res{nullptr};
+    Tensor                                _projection_accumulate_res{nullptr};
+    Tensor                                _ones{nullptr};
     std::array<Tensor, _layer_norm_count> _layer_norm_output{};
 
     inline Tensor &get_layer_norm_output(LayerNormGate g)
@@ -436,14 +471,15 @@ private:
         return _layer_norm_output[getGateIndex(g)];
     }
 
-    bool _is_prepared{ false };
-    bool _has_cifg{ false };
-    bool _has_cell_clipping{ false };
-    bool _has_projection{ false };
-    bool _has_projection_clipping{ false };
-    bool _has_peephole{ false };
-    bool _has_layer_norm{ false };
-    bool _projection_tensor_copy_required{ false };
+    bool _is_prepared{false};
+    bool _has_cifg{false};
+    bool _has_cell_clipping{false};
+    bool _has_projection{false};
+    bool _has_projection_clipping{false};
+    bool _has_peephole{false};
+    bool _has_layer_norm{false};
+    bool _projection_tensor_copy_required{false};
+    bool _convert_input_to_forget_weights_to_qsymm8{false};
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEQLSTMLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h
index 667d3144ac..af7f464ac9 100644
--- a/arm_compute/runtime/NEON/functions/NERNNLayer.h
+++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h
@@ -72,7 +72,13 @@ public:
      * @param[in,out] hidden_state      Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input
      * @param[in]     info              Activation layer parameter.
      */
-    void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info);
+    void configure(const ITensor       *input,
+                   const ITensor       *weights,
+                   const ITensor       *recurrent_weights,
+                   const ITensor       *bias,
+                   ITensor             *hidden_state,
+                   ITensor             *output,
+                   ActivationLayerInfo &info);
     /** Initialize the function
      *
      * @param[in] input             Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32
@@ -85,7 +91,12 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output,
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *recurrent_weights,
+                           const ITensorInfo         *bias,
+                           const ITensorInfo         *hidden_state,
+                           const ITensorInfo         *output,
                            const ActivationLayerInfo &info);
 
     // Inherited methods overridden:
diff --git a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
index ea1af4daea..b06ebe899d 100644
--- a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
@@ -77,7 +77,10 @@ public:
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *rois,
+                           ITensorInfo               *output,
+                           const ROIPoolingLayerInfo &pool_info);
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEROIALIGNLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
index 2992b3eb95..929111ad4b 100644
--- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/IArray.h"
 #include "arm_compute/runtime/IFunction.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -73,7 +74,8 @@ public:
      * @note The z dimensions of @p output tensor and @p input tensor must be the same.
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
-    void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
+    void
+    configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
 
     // Inherited methods overridden:
     void run() override;
@@ -91,7 +93,10 @@ public:
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      * @return a Status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *rois,
+                           const ITensorInfo         *output,
+                           const ROIPoolingLayerInfo &pool_info);
 
 private:
     std::unique_ptr<NEROIPoolingLayerKernel> _roi_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NERange.h b/arm_compute/runtime/NEON/functions/NERange.h
index cb14c8fdde..609456a4ef 100644
--- a/arm_compute/runtime/NEON/functions/NERange.h
+++ b/arm_compute/runtime/NEON/functions/NERange.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+
 #include <memory>
 
 namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h
index 7512115a3f..5b8d8cdf2b 100644
--- a/arm_compute/runtime/NEON/functions/NEReduceMean.h
+++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,12 +24,9 @@
 #ifndef ARM_COMPUTE_NEON_REDUCE_MEAN_H
 #define ARM_COMPUTE_NEON_REDUCE_MEAN_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"
@@ -83,7 +80,8 @@ public:
      *
      * @return A status
      */
-    static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
+    static Status
+    validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run() override;
@@ -93,13 +91,8 @@ private:
     std::vector<NEReductionOperation> _reduction_kernels;
     std::vector<Tensor>               _reduced_outs;
     NEReshapeLayer                    _reshape;
-    NEDequantizationLayer             _dequant;
-    NEQuantizationLayer               _requant;
     int                               _reduction_ops;
     bool                              _keep_dims;
-    bool                              _do_requant;
-    Tensor                            _input_no_quant;
-    Tensor                            _output_no_quant;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEON_REDUCE_MEAN_H */
diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
index 533c10adcf..f5391a6d0e 100644
--- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h
+++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
@@ -25,9 +25,9 @@
 #define ARM_COMPUTE_NEREDUCTIONOPERATION_H
 
 #include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -88,7 +88,11 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           unsigned int       axis,
+                           ReductionOperation op,
+                           bool               keep_dims = true);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h
deleted file mode 100644
index 271ac9739b..0000000000
--- a/arm_compute/runtime/NEON/functions/NERemap.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREMAP_H
-#define ARM_COMPUTE_NEREMAP_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute remap. This function calls the following kernels:
- *
- * -# @ref NERemapKernel
- */
-class NERemap : public INESimpleFunctionNoBorder
-{
-public:
-    /** Initialise the function's sources, destination, interpolation policy and border mode.
-     *
-     * Valid data layouts:
-     * - All
-     *
-     * Valid data type configurations:
-     * |src0   |src1   |src2   |dst    |
-     * |:------|:------|:------|:------|
-     * |U8     |F32    |F32    |U 8    |
-     *
-     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[in]      map_x                 Map for X coordinates. Data type supported: F32.
-     * @param[in]      map_y                 Map for Y coordinates. Data type supported: F32.
-     * @param[out]     output                Output tensor. Data type supported: U8.
-     * @param[in]      policy                Interpolation policy to use. Only NEAREST and BILINEAR are supported.
-     * @param[in]      border_mode           Border mode to use on the input tensor.
-     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. Defaults to 0.
-     *
-     */
-    void configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output,
-                   InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NEREMAP_H */
diff --git a/arm_compute/runtime/NEON/functions/NEReorderLayer.h b/arm_compute/runtime/NEON/functions/NEReorderLayer.h
new file mode 100644
index 0000000000..e3fa7b9c16
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEReorderLayer.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__)
+
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+class ITensorInfo;
+class NEReorderKernel;
+/** Function to compute blocked reorder. */
+class NEReorderLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    NEReorderLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReorderLayer(const NEReorderLayer &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReorderLayer &operator=(const NEReorderLayer &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEReorderLayer(NEReorderLayer &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEReorderLayer &operator=(NEReorderLayer &&) = delete;
+    /** Default destructor */
+    ~NEReorderLayer();
+    /** Set the input and output tensors.
+     *
+     * Valid data layouts:
+     * - NCHW
+     *
+     * Valid data type configurations:
+     * |src      |dst       |
+     * |:--------|:---------|
+     * |F32      |F32       |
+     *
+     * @param[in]  input     Source tensor. Data type supported: F32. Data layouts supported: NCHW.
+     * @param[out] output    Destination with the same dimensions, data type, data layout as  @p input
+     *                       except last dimension of data layout which needs to be multiple of blocking parameter ksize
+     * @param[in]  input_wf  WeightFormat of input.
+     * @param[in]  output_wf WeightFormat of output.
+     */
+    void configure(const ITensor            *input,
+                   ITensor                  *output,
+                   arm_compute::WeightFormat input_wf,
+                   arm_compute::WeightFormat output_wf);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEReorderLayer
+     *
+     * Similar to @ref NEReorderLayer::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo        *input,
+                           const ITensorInfo        *output,
+                           arm_compute::WeightFormat input_wf,
+                           arm_compute::WeightFormat output_wf);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::unique_ptr<NEReorderKernel> _reorder_kernel; /**< Reorder layer kernel */
+};
+} // namespace arm_compute
+#endif /* ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER */
+
+#endif // defined(__aarch64__)
diff --git a/arm_compute/runtime/NEON/functions/NEReverse.h b/arm_compute/runtime/NEON/functions/NEReverse.h
index c02fff54a5..e03e415068 100644
--- a/arm_compute/runtime/NEON/functions/NEReverse.h
+++ b/arm_compute/runtime/NEON/functions/NEReverse.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,12 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NEREVERSE_H
-#define ARM_COMPUTE_NEREVERSE_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 
 namespace arm_compute
 {
@@ -45,22 +44,33 @@ public:
      * Valid data type configurations:
      * |src0           |src1           |dst            |
      * |:--------------|:--------------|:--------------|
-     * |All            |U32            |All            |
+     * |All            |U32, S32       |All            |
+     *
+     * @param[in]  input             Input tensor. Data types supported: All
+     * @param[out] output            Output tensor. Data type supported: Same as @p input
+     * @param[in]  axis              Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+     * @param[in]  use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis
+     *
+     * @note The value of each axis should be between [-rank, rank)
+     * @note If there are duplicate values in the tensor, the subsequent axis values are ignored. e.g. an array of [2, 2] has the same effects as [2].
+     *
+     * @deprecated Support for U32 in axis tensor will be removed in 24.02 release
      *
-     * @param[in]  input  Input tensor. Data types supported: All
-     * @param[out] output Output tensor. Data type supported: Same as @p input
-     * @param[in]  axis   Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
      */
-    void configure(const ITensor *input, ITensor *output, const ITensor *axis);
+    void configure(const ITensor *input, ITensor *output, const ITensor *axis, const bool use_inverted_axis = false);
     /** Static function to check if given info will lead to a valid configuration of @ref NEReverseKernel
      *
-     * @param[in] input  Input tensor info. Data types supported: All
-     * @param[in] output Output tensor info. Data type supported: Same as @p input
-     * @param[in] axis   Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
+     * @param[in] input             Input tensor info. Data types supported: All
+     * @param[in] output            Output tensor info. Data type supported: Same as @p input
+     * @param[in] axis              Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+     * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           const ITensorInfo *axis,
+                           const bool         use_inverted_axis = false);
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NEREVERSE_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index 0b7dddacb2..72dfa3bda4 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,10 +24,9 @@
 #ifndef ARM_COMPUTE_NESCALEIMAGE_H
 #define ARM_COMPUTE_NESCALEIMAGE_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
 
@@ -66,16 +65,19 @@ public:
      * |F16            |F16            |
      * |F32            |F32            |
      * |U8             |U8             |
+     * |S8             |S8             |
      * |S16            |S16            |
      *
-     * @param[in, out] input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in, out] input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
      * @param[out]     output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]      info   @ref ScaleKernelInfo to be used for configuration
+     *
+     * @note Using S8 data type only supports NHWC, @p border_mode Replicate, and @p policy Bilinear
      */
     void configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info);
     /** Static function to check if given info will lead to a valid configuration of @ref NEScale
      *
-     * @param[in] input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in] input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
      * @param[in] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in] info   @ref ScaleKernelInfo to be used for validation
      *
diff --git a/arm_compute/runtime/NEON/functions/NESlice.h b/arm_compute/runtime/NEON/functions/NESlice.h
index ac79a5c633..70a688d3b0 100644
--- a/arm_compute/runtime/NEON/functions/NESlice.h
+++ b/arm_compute/runtime/NEON/functions/NESlice.h
@@ -85,7 +85,8 @@ public:
      *
      * @return A status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
 
     // Inherited methods overridden:
     void run() override;
@@ -129,7 +130,8 @@ public:
      *
      * @return A status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
 };
 } // namespace experimental
 } // namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
index ad8c1467d0..5dee61a4a8 100644
--- a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
@@ -24,9 +24,9 @@
 #ifndef ARM_COMPUTE_NESPACETOBATCHLAYER_H
 #define ARM_COMPUTE_NESPACETOBATCHLAYER_H
 
+#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
 
-#include "arm_compute/core/Types.h"
 #include <memory>
 
 namespace arm_compute
@@ -82,7 +82,12 @@ public:
      * @param[in]  padding_right The padding at the end of every dimension of the output tensor.
      * @param[out] output        Tensor output. Data types supported: same as @p input
      */
-    void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+    void configure(const ITensor *input,
+                   const int      block_shape_x,
+                   const int      block_shape_y,
+                   const Size2D  &padding_left,
+                   const Size2D  &padding_right,
+                   ITensor       *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer
      *
      * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -92,7 +97,10 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *block_shape,
+                           const ITensorInfo *paddings,
+                           const ITensorInfo *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer (Static block shape and paddings)
      *
      * @param[in] input         Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -104,7 +112,12 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           const int          block_shape_x,
+                           const int          block_shape_y,
+                           const Size2D      &padding_left,
+                           const Size2D      &padding_right,
+                           const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NESplit.h b/arm_compute/runtime/NEON/functions/NESplit.h
index 206f299c06..36358a7094 100644
--- a/arm_compute/runtime/NEON/functions/NESplit.h
+++ b/arm_compute/runtime/NEON/functions/NESplit.h
@@ -26,7 +26,6 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
-
 #include "arm_compute/runtime/CPP/functions/CPPSplit.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/NEON/functions/NESlice.h"
diff --git a/arm_compute/runtime/NEON/functions/NEStackLayer.h b/arm_compute/runtime/NEON/functions/NEStackLayer.h
index ae4e468f21..98dacde0c1 100644
--- a/arm_compute/runtime/NEON/functions/NEStackLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEStackLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NESTACKLAYER_H
-#define ARM_COMPUTE_NESTACKLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -91,9 +91,8 @@ public:
     void run() override;
 
 private:
-    std::vector<ITensor *>                           _input;
-    std::vector<std::unique_ptr<NEStackLayerKernel>> _stack_kernels;
-    unsigned int                                     _num_inputs;
+    std::unique_ptr<NEStackLayerKernel> _stack_kernel;
+    bool                                _is_prepared;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NESTACKLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H
diff --git a/arm_compute/runtime/NEON/functions/NEStridedSlice.h b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
index 4b14d946f6..fa1113ffec 100644
--- a/arm_compute/runtime/NEON/functions/NEStridedSlice.h
+++ b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
@@ -71,9 +71,14 @@ public:
      * @param[in]  shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    void configure(const ITensor *input, ITensor *output,
-                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                   int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+    void configure(const ITensor     *input,
+                   ITensor           *output,
+                   const Coordinates &starts,
+                   const Coordinates &ends,
+                   const BiStrides   &strides,
+                   int32_t            begin_mask       = 0,
+                   int32_t            end_mask         = 0,
+                   int32_t            shrink_axis_mask = 0);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice
      *
@@ -89,9 +94,14 @@ public:
      * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                             A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                           int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           const Coordinates &starts,
+                           const Coordinates &ends,
+                           const BiStrides   &strides,
+                           int32_t            begin_mask       = 0,
+                           int32_t            end_mask         = 0,
+                           int32_t            shrink_axis_mask = 0);
 
     // Inherited methods overridden:
     void run() override;
@@ -121,9 +131,14 @@ public:
      * @param[in]  shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    void configure(const ITensorInfo *input, ITensorInfo *output,
-                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                   int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+    void configure(const ITensorInfo *input,
+                   ITensorInfo       *output,
+                   const Coordinates &starts,
+                   const Coordinates &ends,
+                   const BiStrides   &strides,
+                   int32_t            begin_mask       = 0,
+                   int32_t            end_mask         = 0,
+                   int32_t            shrink_axis_mask = 0);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice
      *
@@ -139,9 +154,14 @@ public:
      * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                             A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                           int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           const Coordinates &starts,
+                           const Coordinates &ends,
+                           const BiStrides   &strides,
+                           int32_t            begin_mask       = 0,
+                           int32_t            end_mask         = 0,
+                           int32_t            shrink_axis_mask = 0);
 };
 } // namespace experimental
 } // namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NETile.h b/arm_compute/runtime/NEON/functions/NETile.h
index 915e5aa1da..001a0a4128 100644
--- a/arm_compute/runtime/NEON/functions/NETile.h
+++ b/arm_compute/runtime/NEON/functions/NETile.h
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NETILE_H
 #define ARM_COMPUTE_NETILE_H
 
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 
 namespace arm_compute
 {
diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h
index 581fe74309..5d2d1f1b01 100644
--- a/arm_compute/runtime/NEON/functions/NETranspose.h
+++ b/arm_compute/runtime/NEON/functions/NETranspose.h
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_NETRANSPOSE_H
 #define ARM_COMPUTE_NETRANSPOSE_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
 
@@ -83,4 +82,4 @@ private:
     std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NETRANSPOSE_H */
-\ No newline at end of file
+#endif /* ARM_COMPUTE_NETRANSPOSE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEUnstack.h b/arm_compute/runtime/NEON/functions/NEUnstack.h
index 079fee5b9e..e1af96d08d 100644
--- a/arm_compute/runtime/NEON/functions/NEUnstack.h
+++ b/arm_compute/runtime/NEON/functions/NEUnstack.h
@@ -26,7 +26,6 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
 
 #include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index f9ebf608cb..6caa2aeb59 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,17 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H
-
-#include "arm_compute/runtime/IFunction.h"
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CPP/functions/CPPPermute.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <memory>
@@ -40,13 +35,11 @@ namespace arm_compute
 {
 // Forward declarations
 class ITensor;
-class ICPPKernel;
 
 /** Basic function to simulate a convolution layer. This function calls the following kernels:
  *
- * -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method )
- * -# @ref NEWinogradLayerTransformInputKernel
- * -# @ref NEWinogradLayerTransformOutputKernel
+ * -# @ref cpu::CpuWinogradConv2dTransformInputKernel
+ * -# @ref cpu::CpuWinogradConv2dTransformOutputKernel
  * -# @ref cpu::CpuGemmAssemblyDispatch
  * -# @ref CPPPermute (three times: weights, input and output)
  *
@@ -57,12 +50,16 @@ class NEWinogradConvolutionLayer : public IFunction
 public:
     /** Constructor */
     NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEWinogradConvolutionLayer(NEWinogradConvolutionLayer &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEWinogradConvolutionLayer &operator=(NEWinogradConvolutionLayer &&) = delete;
-    /** Default destructor */
-    ~NEWinogradConvolutionLayer() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEWinogradConvolutionLayer(const NEWinogradConvolutionLayer &) = delete;
+    /** Default move constructor */
+    NEWinogradConvolutionLayer(NEWinogradConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEWinogradConvolutionLayer &operator=(const NEWinogradConvolutionLayer &) = delete;
+    /** Default move assignment operator */
+    NEWinogradConvolutionLayer &operator=(NEWinogradConvolutionLayer &&) = default;
+    /** Destructor */
+    ~NEWinogradConvolutionLayer();
 
     /** Set the input and output tensors.
      *
@@ -80,7 +77,8 @@ public:
      *                              while every optional dimension from 4 and above represent a batch of inputs.
      *                              Data types supported: F16/F32.
      * @param[in]  weights          Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
-     *                              Currently only 3x3 and 5x5 kernels are supported.
+     *                              Supported kernel sizes: (height, width) -> 3x3, 1x3, 3x1, 5x5, 1x5, 5x1 for Fp32
+     *                              -> 3x3 for Fp16
      * @param[in]  biases           Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
      * @param[out] output           Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
      *                              Data types supported: Same as @p input.
@@ -89,63 +87,35 @@ public:
      * @param[in]  enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
      *                              available which may introduce a drop of accuracy as well. Default is false
      */
-    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
-                   bool enable_fast_math = false);
+    void configure(const ITensor             *input,
+                   const ITensor             *weights,
+                   const ITensor             *biases,
+                   ITensor                   *output,
+                   const PadStrideInfo       &conv_info,
+                   const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                   bool                       enable_fast_math = false);
 
     // Inherited methods overridden:
     void run() override;
     void prepare() override;
 
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
+    /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradConvolutionLayer
      *
-     * @param[in] input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
-     *                             while every optional dimension from 4 and above represent a batch of inputs.
-     *                             Data types supported: F16/F32.
-     * @param[in] weights          Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
-     *                             Currently only 3x3 and 5x5 kernels are supported.
-     * @param[in] biases           Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
-     * @param[in] output           Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                             Data types supported: Same as @p input.
-     * @param[in] conv_info        Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported.
-     * @param[in] act_info         (Optional) Activation layer information in case of a fused activation.
-     * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
-     *                              available which may introduce a drop of accuracy as well. Default is false
+     * Similar to @ref NEWinogradConvolutionLayer::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
-
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWinogradConvolutionLayer(const NEWinogradConvolutionLayer &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWinogradConvolutionLayer &operator=(const NEWinogradConvolutionLayer &) = delete;
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *output,
+                           const PadStrideInfo       &conv_info,
+                           const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                           bool                       enable_fast_math = false);
 
 private:
-    MemoryGroup                 _memory_group;
-    NEGEMM                      _gemm_function;
-    std::unique_ptr<ICPPKernel> _transform_input_kernel;
-    std::unique_ptr<ICPPKernel> _transform_output_kernel;
-    std::unique_ptr<ICPPKernel> _transform_weights_kernel;
-    NEActivationLayer           _activationlayer_function;
-
-    CPPPermute     _permute_input;
-    CPPPermute     _permute_weights;
-    CPPPermute     _permute_output;
-    Tensor         _input_transformed;
-    Tensor         _output_transformed;
-    Tensor         _input_workspace;
-    Tensor         _output_workspace;
-    Tensor         _kernel_storage;
-    Tensor         _input_nhwc;
-    Tensor         _output_nhwc;
-    Tensor         _weights_hwio;
-    const ITensor *_input;
-    const ITensor *_weights;
-    ITensor       *_output;
-    bool           _is_prepared;
-    bool           _is_activationlayer_enabled;
-    DataLayout     _data_layout;
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H