Diffstat (limited to 'src')
35 files changed, 143 insertions, 5134 deletions
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index f508b7ee2e..a02739f339 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -72,8 +72,6 @@ filegroup(
         "graph/nodes/FlattenLayerNode.cpp",
         "graph/nodes/FullyConnectedLayer.cpp",
         "graph/nodes/FusedConvolutionBatchNormalizationNode.cpp",
-        "graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp",
-        "graph/nodes/FusedConvolutionWithPostOpNode.cpp",
         "graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp",
         "graph/nodes/GenerateProposalsLayerNode.cpp",
         "graph/nodes/InputNode.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 76409239ea..39fba860fa 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -73,8 +73,6 @@ target_sources(
     graph/nodes/FlattenLayerNode.cpp
     graph/nodes/FullyConnectedLayer.cpp
     graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
-    graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp
-    graph/nodes/FusedConvolutionWithPostOpNode.cpp
     graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
     graph/nodes/GenerateProposalsLayerNode.cpp
     graph/nodes/InputNode.cpp
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 03f78697bc..7e56a3ba18 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -23,16 +23,14 @@
  */
 #include "src/core/CL/CLUtils.h"

-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/StringUtils.h"
 #include "support/StringSupport.h"

-#include "src/core/experimental/PostOpUtils.h"
-
 namespace arm_compute
 {
 cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType image_type)
@@ -40,7 +38,7 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im
     ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);

     const cl::Context &ctx    = CLKernelLibrary::get().context();
-    const cl::Buffer &buffer  = tensor->cl_buffer();
+    const cl::Buffer  &buffer = tensor->cl_buffer();
     const ITensorInfo *info   = tensor->info();
     ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), "Tensor paddings must not be locked to allow extending paddings to satisfy cl_image pitch alignment requirement");
@@ -113,112 +111,4 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer
     return cl::Image2D(cl_image);
 }
-
-namespace experimental
-{
-PostOpCLKernelUtils::PostOpCLKernelUtils(const Config &supported_config)
-    : _supported_config(supported_config)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(supported_config.empty(), "Empty PostOp CL kernel support configuration is not allowed");
-    for(auto it = _supported_config.begin(); it != _supported_config.end(); ++it)
-    {
-        auto post_op_sequence = it->first;
-        auto post_op_slots    = std::get<1>(it->second);
-        ARM_COMPUTE_ERROR_ON_MSG(post_op_sequence.size() != post_op_slots.size(), "The number of PostOps must be the same as that of the assigned slots");
-    }
-}
-
-bool PostOpCLKernelUtils::are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList<ITensorInfo *> &post_ops)
-{
-    for(const auto &op : post_ops.get_list())
-    {
-        for(const auto &tensor : op->arguments())
-        {
-            const TensorShape &out_shape = TensorShape::broadcast_shape(dst->tensor_shape(), (*tensor)->tensor_shape());
-            // All post ops must be elementwise and must not alter the shape of the original dst tensor after broadcasting
-            if(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0))
-            {
-                return false;
-            }
-            // NOTE: Kernel limitation: currently only the following broadcasting types are supported:
-            //  1. Post op arg is scalar, broadcast in both first and second dims
-            //  2. Post op arg is of shape: second dim=1, first dim=N, broadcast only in second dim
-            //  This means this case: Post op arg is of shape: second dim=M, first dim=1, broadcast only in first dim, is NOT supported
-            if(dst->dimension(0) > 1 && dst->dimension(1) > 1 && (*tensor)->dimension(0) == 1 && (*tensor)->dimension(1) > 1)
-            {
-                return false;
-            }
-        }
-    }
-    return true;
-}
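In practice the rule above means: for a dst of width N and height M, a post-op operand may match dst exactly, be a scalar, or be a row of width N and height 1; a column of width 1 and height M is rejected. A standalone sketch of the same predicate, with hypothetical width/height pairs in place of ITensorInfo:

#include <cstdio>

// Mirrors the broadcast rule in are_post_op_shapes_compliant (sketch, not the ACL API):
// an operand may match dst, be a scalar, or broadcast along y only; a width-1,
// height>1 operand (broadcast along x only) is rejected by the kernels.
static bool broadcast_supported(unsigned dst_w, unsigned dst_h, unsigned arg_w, unsigned arg_h)
{
    if(arg_w == dst_w && arg_h == dst_h) return true;  // no broadcasting
    if(arg_w == 1 && arg_h == 1)         return true;  // case 1: scalar, broadcast in x and y
    if(arg_w == dst_w && arg_h == 1)     return true;  // case 2: broadcast in y only
    return false;                                      // e.g. arg_w == 1, arg_h == dst_h
}

int main()
{
    std::printf("%d %d %d\n",
                broadcast_supported(8, 4, 1, 1),   // 1: scalar
                broadcast_supported(8, 4, 8, 1),   // 1: row vector
                broadcast_supported(8, 4, 1, 4));  // 0: column vector, unsupported
}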
-
-bool PostOpCLKernelUtils::is_post_op_sequence_supported(const PostOpList<ITensorInfo *> &post_ops) const
-{
-    if(post_ops.size() == 0)
-    {
-        return true; // Always support cases where no post op is specified
-    }
-    const auto post_op_sequence = get_post_op_sequence(post_ops);
-
-    return _supported_config.find(post_op_sequence) != _supported_config.end();
-}
-
-void PostOpCLKernelUtils::set_post_ops_cl_build_options(CLBuildOptions &build_opts, const PostOpList<ITensorInfo *> &post_ops) const
-{
-    const auto post_op_sequence = get_post_op_sequence(post_ops);
-    const auto slots            = std::get<1>(_supported_config.at(post_op_sequence));
-    for(size_t post_op_id = 0; post_op_id < post_ops.size(); ++post_op_id)
-    {
-        const auto &post_op     = post_ops.get_list().at(post_op_id);
-        const auto  slot_prefix = "-DP" + support::cpp11::to_string(slots[post_op_id]);
-        if(post_op->type() == experimental::PostOpType::Activation)
-        {
-            const auto _post_op  = utils::cast::polymorphic_downcast<const experimental::PostOpAct<ITensorInfo *> *>(post_op.get());
-            const auto act_type  = slot_prefix + "_ACTIVATION_TYPE=" + lower_string(string_from_activation_func(_post_op->_act_info.activation()));
-            const auto act_a_val = slot_prefix + "_ACTIVATION_A_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.a());
-            const auto act_b_val = slot_prefix + "_ACTIVATION_B_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.b());
-            build_opts.add_option(act_type);
-            build_opts.add_option(act_a_val);
-            build_opts.add_option(act_b_val);
-        }
-        else if(post_op->type() == experimental::PostOpType::Eltwise_Add)
-        {
-            size_t     arg_id     = 1;
-            const auto eltwise_op = slot_prefix + "_ELTWISE_OP=ADD" + "_X_POS_" + support::cpp11::to_string(post_op->prev_dst_pos());
-            build_opts.add_option(eltwise_op);
-            for(const auto &tensor : post_op->arguments())
-            {
-                const auto height = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_HEIGHT=" + support::cpp11::to_string((*tensor)->dimension(1));
-                const auto width  = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_WIDTH=" + support::cpp11::to_string((*tensor)->dimension(0));
-                build_opts.add_option(height);
-                build_opts.add_option(width);
-                ++arg_id;
-            }
-        }
-        else if(post_op->type() == experimental::PostOpType::Eltwise_PRelu)
-        {
-            size_t     arg_id     = 1;
-            const auto eltwise_op = slot_prefix + "_ELTWISE_OP=PRELU" + "_X_POS_" + support::cpp11::to_string(post_op->prev_dst_pos());
-            build_opts.add_option(eltwise_op);
-            for(const auto &tensor : post_op->arguments())
-            {
-                const auto height = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_HEIGHT=" + support::cpp11::to_string((*tensor)->dimension(1));
-                const auto width  = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_WIDTH=" + support::cpp11::to_string((*tensor)->dimension(0));
-                build_opts.add_option(height);
-                build_opts.add_option(width);
-                ++arg_id;
-            }
-        }
-    }
-}
-
-void PostOpCLKernelUtils::set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList<ITensorInfo *> &post_ops) const
-{
-    const auto post_op_sequence = get_post_op_sequence(post_ops);
-    const auto postfix          = std::get<0>(_supported_config.at(post_op_sequence));
-    kernel_name += postfix;
-}
-} // namespace experimental
-
 } // namespace arm_compute
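For reference, a sequence { activation, eltwise add } mapped to slots {1, 2} would be communicated to the kernel through defines of the following shape. The values below are illustrative, derived from the string-building code above rather than from a real trace (ACL's float formatting prints full precision):

#include <cstdio>
#include <string>
#include <vector>

// Illustrative only: the option strings the removed set_post_ops_cl_build_options
// would assemble for a post-op list { Activation(RELU), Eltwise_Add } assigned to
// slots {1, 2}, with an eltwise operand of width 128, height 1, and the previous
// op's dst at argument position 0.
int main()
{
    const std::vector<std::string> build_opts = {
        "-DP1_ACTIVATION_TYPE=relu",
        "-DP1_ACTIVATION_A_VAL=0.000000",
        "-DP1_ACTIVATION_B_VAL=0.000000",
        "-DP2_ELTWISE_OP=ADD_X_POS_0",
        "-DP2_ELTWISE_ARG1_HEIGHT=1",
        "-DP2_ELTWISE_ARG1_WIDTH=128",
    };
    for(const auto &opt : build_opts)
    {
        std::printf("%s\n", opt.c_str());
    }
}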
diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h
index e3f12d4b53..f0e79bccfc 100644
--- a/src/core/CL/CLUtils.h
+++ b/src/core/CL/CLUtils.h
@@ -22,11 +22,10 @@
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CL_CLUTILS_H
-#define ARM_COMPUTE_CL_CLUTILS_H
+#ifndef ACL_SRC_CORE_CL_CLUTILS_H
+#define ACL_SRC_CORE_CL_CLUTILS_H

 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/experimental/IPostOp.h"

 #include <map>

@@ -74,88 +73,6 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im
  * @return cl::Image2D object
  */
 cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type);
+} // namespace arm_compute

-namespace experimental
-{
-/** @name (EXPERIMENTAL_POST_OPS)
- * @{
- */
-
-/** Manage validation, building and configurations of PostOp CL kernels */
-class PostOpCLKernelUtils final
-{
-public:
-    /** CL kernel name postfix for post ops */
-    using NamePostfix = std::string;
-    /** CL kernels that support post ops assign each post op to a 'slot', in accordance with the postfix
-     * For example, for a kernel with postfix '_act_prelu_eltwiseadd', there are 3 slots
-     * slot 1: (unary) activation, slot 2: pRelu, slot 3: elementwise addition
-     *
-     * Some kernels may allow some slots to be optional, to support multiple combinations of post op sequences.
-     * In such cases, we need to explicitly set up a mapping between each post op and the slots for that kernel.
-     * For example, suppose we have 2 kernels with postfixes: _eltwiseadd_prelu, _act_eltwiseadd_act_prelu, where the activations in the
-     * second kernel are optional. Say we want to support an eltwise addition, followed by a prelu (sequence { eltwiseadd, prelu }).
-     * Now we can choose which one of the 2 kernels to use, since they both support this post op sequence.
-     * We can either:
-     *      1. assign the eltwise add to slot 1 and the prelu to slot 2 of kernel 1
-     *          { { Eltwise_Add, PRelu } -> {"_eltwiseadd_prelu", {1, 2} } } or
-     *      2. assign the eltwise add to slot 2 and the prelu to slot 4 of kernel 2
-     *          { { Eltwise_Add, PRelu } -> {"_act_eltwiseadd_act_prelu", {2, 4} } }
-     */
-    using Slots  = std::vector<unsigned int>;
-    using Config = std::map<PostOpTypeSequence, std::tuple<NamePostfix, Slots>>;
-
-public:
-    explicit PostOpCLKernelUtils(const Config &config);
-
-    /** Check if post op argument tensor shapes are compliant
-     * All post ops must not alter the shape of the original dst tensor (even after broadcasting)
-     *
-     * @param[in] dst      Dst tensor to apply the post ops to
-     * @param[in] post_ops Post ops
-     *
-     * @return true if shapes are compliant and false otherwise
-     */
-    static bool are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList<ITensorInfo *> &post_ops);
-    /** Check if the post op sequence is supported in the current configuration
-     *
-     * @param[in] post_ops Post ops
-     *
-     * @return true if the post op sequence is supported and false otherwise
-     */
-    bool is_post_op_sequence_supported(const PostOpList<ITensorInfo *> &post_ops) const;
-    /** Helper function to set PostOp related build options
-     * @note Convention
-     *      1. Each post op "slot" is prefixed with "P<slot number>", followed by the usual parameters for that post op.
-     *          E.g. If the first slot is an activation, we need to pass 3 definitions in this way:
-     *              -DP1_ACTIVATION_TYPE=... -DP1_ACTIVATION_A_VAL=... -DP1_ACTIVATION_B_VAL=...
-     *
-     *      2. For multi-ary post ops, to pass the position of the previous op's dest tensor,
-     *         we append "_X_POS_<pos>" to the post op type.
-     *          E.g. for a single post op add(dst, x), where dst is the result of the main op.
-     *              In this case, the position of the previous op's dest is 0, so we pass
-     *              -DP1_ELTWISE_OP=ADD_X_POS_0
-     *
-     * @param[out] built_opts OpenCL kernel build options
-     * @param[in]  post_ops   Post ops
-     *
-     */
-    void set_post_ops_cl_build_options(CLBuildOptions &built_opts, const PostOpList<ITensorInfo *> &post_ops) const;
-    /** Helper function to set PostOp kernel name
-     *
-     * @param[out] kernel_name OpenCL kernel name
-     * @param[in]  post_ops    Post ops
-     *
-     */
-    void set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList<ITensorInfo *> &post_ops) const;
-
-private:
-    Config _supported_config{};
-};
-/** @} */ // end of group (EXPERIMENTAL_POST_OPS)
-
-} // namespace experimental
-
-} // arm_compute
-
-#endif /* ARM_COMPUTE_CL_CLUTILS_H */
+#endif // ACL_SRC_CORE_CL_CLUTILS_H
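A standalone sketch of how such a Config could have been populated for the act/eltwise/act kernels deleted below. PostOpType and the concrete mapping values are simplified stand-ins; the actual registration lived in the ClGemm* kernels, which are not part of this diff:

#include <map>
#include <string>
#include <tuple>
#include <vector>

// Standalone sketch of the removed PostOpCLKernelUtils::Config shape.
enum class PostOpType { Activation, Eltwise_Add, Eltwise_PRelu };

using PostOpTypeSequence = std::vector<PostOpType>;
using NamePostfix        = std::string;
using Slots              = std::vector<unsigned int>;
using Config             = std::map<PostOpTypeSequence, std::tuple<NamePostfix, Slots>>;

int main()
{
    // The kernel name postfix and slot numbers 1-3 match the
    // *_post_act_eltwise_op_act kernels and P1/P2/P3 defines below;
    // the exact set of supported sequences here is illustrative.
    const Config supported = {
        { { PostOpType::Activation, PostOpType::Eltwise_Add, PostOpType::Activation },
          { "_post_act_eltwise_op_act", { 1, 2, 3 } } },
        { { PostOpType::Eltwise_Add },
          { "_post_act_eltwise_op_act", { 2 } } }, // optional activation slots left unused
    };
    return supported.size() == 2 ? 0 : 1;
}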
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h
deleted file mode 100644
index 2c2d60ed13..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h"
-
-/** (EXPERIMENTAL_POST_OPS) Post Op expansions for the post op sequence:
- * act (optional): POST_OP1_ACTIVATION_OPTIONAL
- * eltwise_op    : POST_OP2_ELTWISE_OP
- * act (optional): POST_OP3_ACTIVATION_OPTIONAL
- */
-
-/** Post Op 1: Activation Block (Optional)
- * @name POST_OP1_ACTIVATION_OPTIONAL
- * Toggled by -DP1_ACTIVATION_TYPE
- * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK
- * @{
- */
-#if defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
-#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \
-    MIXED_PRECISION_ACTIVATION_BLOCK(N, P1_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P1_ACTIVATION_A_VAL, P1_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR);
-#else // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
-#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // noop
-#endif // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
-/** @} */ // end of group POST_OP1_ACTIVATION_OPTIONAL
-
-/** Post Op 2: Eltwise Op Block
- * Handles both broadcasting and non-broadcasting cases
- * @name POST_OP2_ELTWISE_OP
- *
- * @param[in] P2_ELTWISE_ARG1_HEIGHT Height (number of rows) of the @ref ELTWISE_OPERAND_NAME tensor
- * @param[in] P2_ELTWISE_ARG1_WIDTH  Width (number of columns) of the @ref ELTWISE_OPERAND_NAME tensor
- * @param[in] OP                     The elementwise post op
- * @param[in] M0                     The number of consecutive rows
- * @param[in] N0                     The number of consecutive columns
- * @param[in] BASENAME               The basename of the result variables
- * @param[in] ELTWISE_OPERAND_NAME   The basename of the other operand variables
- * @param[in] ELTWISE_OPERAND_ROW    The starting row of the other operand variables. Required as different boundary handling strategies are used by different kernels
- *                                   E.g. reshaped_only_rhs and native kernels shift rows (by using COMPUTE_M0_START_ROW) to handle boundary rows,
- *                                   whereas reshaped kernels do not shift rows
- * @param[in] DATA_TYPE              Data type of the result variables
- * @param[in] DATA_TYPE_ACCUMULATOR  Higher-precision accumulator data type in case of mixed-precision op
- * @param[in] ZERO                   Zero vector for z offset
- * @param[in] PARTIAL_LOAD_M0        The partial size in y, for partial blocks. Supported: [0, @p M0)
- * @param[in] PARTIAL_LOAD_N0        The partial size in x, for partial blocks. Supported: [0, @p N0)
- * @param[in] PARTIAL_COND_Y         Condition on the y axis to perform the partial load Y. True to use PARTIAL_LOAD_M0 rather than M0.
- * @param[in] PARTIAL_COND_X         Condition on the x axis to perform the partial load X. True to use PARTIAL_LOAD_N0 rather than N0.
- * @{
- */
-#if defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#if P2_ELTWISE_ARG1_HEIGHT == 1
-#if P2_ELTWISE_ARG1_WIDTH == 1 // Case 1: Broadcasting in both X and Y; op2 arg tile shape[YxX] == [1x1]
-#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
-    __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
-    VEC_DATA_TYPE(DATA_TYPE, 1) \
-    ELTWISE_OPERAND_NAME##0 = VLOAD(1)(0, (__global DATA_TYPE *)ELTWISE_OPERAND_NAME##_addr); \
-    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, 1, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
-#else // P2_ELTWISE_ARG1_WIDTH == 1; Case 2: Broadcasting in only Y; op2 arg tile shape[YxX] == [1xN0]
-#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
-    __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
-    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, 1, PARTIAL_LOAD_N0, false, PARTIAL_COND_X); \
-    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
-#endif // P2_ELTWISE_ARG1_WIDTH == 1
-#else // P2_ELTWISE_ARG1_HEIGHT == 1; Case 3: No broadcasting; op2 arg tile shape[YxX] == [M0xN0]
-#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
-    __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (ELTWISE_OPERAND_ROW * ELTWISE_OPERAND_NAME##_stride_y) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
-    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X); \
-    MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
-#endif // P2_ELTWISE_ARG1_HEIGHT == 1
-#endif // defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-/** @} */ // end of group POST_OP2_ELTWISE_OP
-
-/** Post Op 3: Activation Block (Optional)
- * @name POST_OP3_ACTIVATION_OPTIONAL
- * Toggled by -DP3_ACTIVATION_TYPE
- * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK
- * @{
- */
-#if defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
-#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \
-    MIXED_PRECISION_ACTIVATION_BLOCK(N, P3_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P3_ACTIVATION_A_VAL, P3_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR);
-#else // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
-#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // noop
-#endif // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
-/** @} */ // end of group POST_OP3_ACTIVATION_OPTIONAL
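The header resolves the broadcast case at compile time from P2_ELTWISE_ARG1_HEIGHT and P2_ELTWISE_ARG1_WIDTH, producing one kernel binary per shape class. The same selection, written as a runtime function purely for illustration:

#include <cstdio>

// Sketch of the preprocessor case selection above, as a runtime function.
enum class EltwiseCase { BroadcastXY, BroadcastY, NoBroadcast };

static EltwiseCase select_case(unsigned arg_height, unsigned arg_width)
{
    if(arg_height == 1)
        return (arg_width == 1) ? EltwiseCase::BroadcastXY  // case 1: [1x1] operand tile
                                : EltwiseCase::BroadcastY;  // case 2: [1xN0] operand tile
    return EltwiseCase::NoBroadcast;                        // case 3: [M0xN0] operand tile
}

int main()
{
    std::printf("%d %d %d\n",
                static_cast<int>(select_case(1, 1)),    // 0: broadcast in x and y
                static_cast<int>(select_case(1, 64)),   // 1: broadcast in y only
                static_cast<int>(select_case(32, 64))); // 2: no broadcasting
}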
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
deleted file mode 100644
index 22ae098772..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h"
-#include "common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h"
-#include "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h"
-
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-/** (EXPERIMENTAL_POST_OPS) gemm_mm_native kernel */
-#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-
-#define VFMA(a, b, c)     \
-    ({                    \
-        c = fma(a, b, c); \
-    })
-
-#if M0 == 1
-#define RHS_VFMA_M0xN0(i, a, b, c)                                   \
-    ({                                                               \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-    })
-#elif M0 == 2 // M0 == 2
-#define RHS_VFMA_M0xN0(i, a, b, c)                                   \
-    ({                                                               \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-    })
-#elif M0 == 3 // M0 == 3
-#define RHS_VFMA_M0xN0(i, a, b, c)                                   \
-    ({                                                               \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-    })
-#elif M0 == 4 // M0 == 4
-#define RHS_VFMA_M0xN0(i, a, b, c)                                   \
-    ({                                                               \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-    })
-#elif M0 == 5 // M0 == 5
-#define RHS_VFMA_M0xN0(i, a, b, c)                                   \
-    ({                                                               \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-    })
-#elif M0 == 6 // M0 == 6
-#define RHS_VFMA_M0xN0(i, a, b, c)                                   \
-    ({                                                               \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
-    })
-#elif M0 == 7 // M0 == 7
-#define RHS_VFMA_M0xN0(i, a, b, c)                                   \
-    ({                                                               \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
-    })
-#elif M0 == 8 // M0 == 8
-#define RHS_VFMA_M0xN0(i, a, b, c)                                   \
-    ({                                                               \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
-    })
-#else // M0 not supported
-#error "M0 not supported"
-#endif // M0 not supported
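Each RHS_VFMA_M0xN0(i, a, b, c) above is a rank-1 update: element i of each of the M0 LHS rows scales the same N0-wide RHS row vector, accumulating into the matching row of the C tile. A scalar C++ analogue of the expansion:

#include <cstddef>

// Scalar analogue of the RHS_VFMA_M0xN0 macro family: c[m][n] += a[m][i] * b[n]
// for an m0 x n0 accumulator tile. The OpenCL macros unroll the m-loop
// explicitly at compile time and use vector fma for the n-loop.
void rhs_vfma_m0xn0(std::size_t m0, std::size_t n0, std::size_t i,
                    const float *a, std::size_t lda, // LHS tile, element a[m * lda + i]
                    const float *b,                  // one RHS row, n0 elements wide
                    float *c)                        // accumulator tile, row-major m0 x n0
{
    for(std::size_t m = 0; m < m0; ++m)
    {
        for(std::size_t n = 0; n < n0; ++n)
        {
            c[m * n0 + n] += a[m * lda + i] * b[n];
        }
    }
}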
-
-#if defined(GEMM_MM_NATIVE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_native, with these additions:
- *
- * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_native_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
-                                                     IMAGE_DECLARATION(rhs),
-#if defined(BETA)
-                                                     IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
-                                                     IMAGE_DECLARATION(dst),
-                                                     // Post Op arguments
-                                                     IMAGE_DECLARATION(eltwise_operand),
-                                                     uint lhs_stride_z,
-                                                     uint rhs_stride_z,
-#if defined(BETA)
-                                                     uint bias_stride_z,
-#endif //defined(BETA)
-                                                     uint dst_stride_z,
-                                                     uint eltwise_operand_stride_z,
-                                                     const int M,
-                                                     const int N,
-                                                     const int K
-#if defined(REINTERPRET_INPUT_AS_3D)
-                                                     ,
-                                                     uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                                     ,
-                                                     uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                                    )
-{
-    // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
-    // RHS offset and step X
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-
-    uint x = get_global_id(0);
-    uint y = get_global_id(1);
-    uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
-    if((x * N0 >= N) || (y * M0 >= M))
-    {
-        return;
-    }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-    // Compute LHS matrix address
-    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
-    // Compute RHS matrix address
-    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
-
-#if defined(MATRIX_B_DEPTH)
-    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
-    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
-    rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
-    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply lhs_stride_z by DEPTH_GEMM3D
-    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
-
-    int i = 0;
-#if K0 > 1
-    for(; i <= (K - K0); i += K0)
-    {
-        // Supported cases (M0, K0):
-        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
-        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
-        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
-        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
-        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
-        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
-        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
-        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
-        // Load values from RHS matrix
-        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
-
-        RHS_VFMA_M0xN0(0, a, b0, c);
-        RHS_VFMA_M0xN0(1, a, b1, c);
-#if K0 > 2
-        RHS_VFMA_M0xN0(2, a, b2, c);
-#endif // K0 > 2
-#if K0 > 3
-        RHS_VFMA_M0xN0(3, a, b3, c);
-#endif // K0 > 3
-#if K0 > 4
-        RHS_VFMA_M0xN0(4, a, b4, c);
-        RHS_VFMA_M0xN0(5, a, b5, c);
-        RHS_VFMA_M0xN0(6, a, b6, c);
-        RHS_VFMA_M0xN0(7, a, b7, c);
-#endif // K0 > 4
-#if K0 > 8
-        RHS_VFMA_M0xN0(8, a, b8, c);
-        RHS_VFMA_M0xN0(9, a, b9, c);
-        RHS_VFMA_M0xN0(A, a, bA, c);
-        RHS_VFMA_M0xN0(B, a, bB, c);
-        RHS_VFMA_M0xN0(C, a, bC, c);
-        RHS_VFMA_M0xN0(D, a, bD, c);
-        RHS_VFMA_M0xN0(E, a, bE, c);
-        RHS_VFMA_M0xN0(F, a, bF, c);
-#endif // K0 > 8
-
-        lhs_offset += K0 * sizeof(DATA_TYPE);
-        rhs_offset += K0 * rhs_stride_y;
-    }
-#endif // K0 > 1
-    // Left-over accumulations
-    for(; i < K; ++i)
-    {
-        // Load values from LHS matrix
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
-#if M0 > 1
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
-#endif // M0 > 1
-#if M0 > 2
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
-#endif // M0 > 2
-#if M0 > 3
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
-#endif // M0 > 3
-#if M0 > 4
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
-#endif // M0 > 4
-#if M0 > 5
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
-#endif // M0 > 5
-#if M0 > 6
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
-#endif // M0 > 6
-#if M0 > 7
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
-#endif // M0 > 7
-
-        VEC_DATA_TYPE(DATA_TYPE, N0)
-        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
-        RHS_VFMA_M0xN0(0, a, b, c);
-
-        lhs_offset += sizeof(DATA_TYPE);
-        rhs_offset += rhs_stride_y;
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
-    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    const bool cond_y = y == 0;
-    const bool cond_x = ((x + 1) * N0 >= N);
-
-    // c = act(c)
-    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
-    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-    // c = act(c)
-    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
-    // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-#endif // defined(GEMM_MM_NATIVE_POST_ACT_ELTWISE_OP_ACT)
-#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
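The epilogue order is the same across every kernel in this family: alpha/beta handling first, then the fused act -> eltwise -> act sequence, then the boundary-aware store. A scalar sketch of that ordering (RELU stands in for the optional activations, which the real kernels toggle via the P1_/P3_ACTIVATION_* defines):

#include <algorithm>

// Scalar sketch of the fused epilogue of the deleted kernels:
// c = act3(act1(alpha * acc + beta * bias) + eltwise_operand)
float fused_epilogue(float acc, float alpha, float beta, float bias, float eltwise_operand)
{
    float c = alpha * acc + beta * bias; // GEMM result with alpha/beta applied
    c = std::max(c, 0.0f);               // post op 1: activation (optional)
    c += eltwise_operand;                // post op 2: elementwise add
    c = std::max(c, 0.0f);               // post op 3: activation (optional)
    return c;
}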
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
deleted file mode 100644
index 89577e9ebd..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
+++ /dev/null
@@ -1,1424 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "fp_post_ops_act_eltwise_op_act.h"
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped kernel */
-
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
-#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-
-#if defined(MIXED_PRECISION)
-#if K0 == 2
-#define ARM_DOT_K0(a, b, c) \
-    ({                      \
-        c += a.s0 * b.s0;   \
-        c += a.s1 * b.s1;   \
-    })
-#elif K0 == 3 // K0 == 3
-#define ARM_DOT_K0(a, b, c) \
-    ({                      \
-        c += a.s0 * b.s0;   \
-        c += a.s1 * b.s1;   \
-        c += a.s2 * b.s2;   \
-    })
-#elif K0 == 4 // K0 == 4
-#define ARM_DOT_K0(a, b, c) \
-    ({                      \
-        c += a.s0 * b.s0;   \
-        c += a.s1 * b.s1;   \
-        c += a.s2 * b.s2;   \
-        c += a.s3 * b.s3;   \
-    })
-#elif K0 == 8 // K0 == 8
-#define ARM_DOT_K0(a, b, c) \
-    ({                      \
-        c += a.s0 * b.s0;   \
-        c += a.s1 * b.s1;   \
-        c += a.s2 * b.s2;   \
-        c += a.s3 * b.s3;   \
-        c += a.s4 * b.s4;   \
-        c += a.s5 * b.s5;   \
-        c += a.s6 * b.s6;   \
-        c += a.s7 * b.s7;   \
-    })
-#elif K0 == 16 // K0 == 16
-#define ARM_DOT_K0(a, b, c) \
-    ({                      \
-        c += a.s0 * b.s0;   \
-        c += a.s1 * b.s1;   \
-        c += a.s2 * b.s2;   \
-        c += a.s3 * b.s3;   \
-        c += a.s4 * b.s4;   \
-        c += a.s5 * b.s5;   \
-        c += a.s6 * b.s6;   \
-        c += a.s7 * b.s7;   \
-        c += a.s8 * b.s8;   \
-        c += a.s9 * b.s9;   \
-        c += a.sA * b.sA;   \
-        c += a.sB * b.sB;   \
-        c += a.sC * b.sC;   \
-        c += a.sD * b.sD;   \
-        c += a.sE * b.sE;   \
-        c += a.sF * b.sF;   \
-    })
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-#else // defined(MIXED_PRECISION)
-#if K0 == 2
-#define ARM_DOT_K0(a, b, c)     \
-    ({                          \
-        c = fma(a.s0, b.s0, c); \
-        c = fma(a.s1, b.s1, c); \
-    })
-#elif K0 == 3 // K0 == 3
-#define ARM_DOT_K0(a, b, c)     \
-    ({                          \
-        c = fma(a.s0, b.s0, c); \
-        c = fma(a.s1, b.s1, c); \
-        c = fma(a.s2, b.s2, c); \
-    })
-#elif K0 == 4 // K0 == 4
-#define ARM_DOT_K0(a, b, c)     \
-    ({                          \
-        c = fma(a.s0, b.s0, c); \
-        c = fma(a.s1, b.s1, c); \
-        c = fma(a.s2, b.s2, c); \
-        c = fma(a.s3, b.s3, c); \
-    })
-#elif K0 == 8 // K0 == 8
-#define ARM_DOT_K0(a, b, c)     \
-    ({                          \
-        c = fma(a.s0, b.s0, c); \
-        c = fma(a.s1, b.s1, c); \
-        c = fma(a.s2, b.s2, c); \
-        c = fma(a.s3, b.s3, c); \
-        c = fma(a.s4, b.s4, c); \
-        c = fma(a.s5, b.s5, c); \
-        c = fma(a.s6, b.s6, c); \
-        c = fma(a.s7, b.s7, c); \
-    })
-#elif K0 == 16 // K0 == 16
-#define ARM_DOT_K0(a, b, c)     \
-    ({                          \
-        c = fma(a.s0, b.s0, c); \
-        c = fma(a.s1, b.s1, c); \
-        c = fma(a.s2, b.s2, c); \
-        c = fma(a.s3, b.s3, c); \
-        c = fma(a.s4, b.s4, c); \
-        c = fma(a.s5, b.s5, c); \
-        c = fma(a.s6, b.s6, c); \
-        c = fma(a.s7, b.s7, c); \
-        c = fma(a.s8, b.s8, c); \
-        c = fma(a.s9, b.s9, c); \
-        c = fma(a.sA, b.sA, c); \
-        c = fma(a.sB, b.sB, c); \
-        c = fma(a.sC, b.sC, c); \
-        c = fma(a.sD, b.sD, c); \
-        c = fma(a.sE, b.sE, c); \
-        c = fma(a.sF, b.sF, c); \
-    })
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-#endif // defined(MIXED_PRECISION)
-
-#if defined(ARM_DOT_K0XN0)
-#undef ARM_DOT_K0XN0
-#endif // defined(ARM_DOT_K0XN0)
-
-#if N0 == 2
-#define ARM_DOT_K0XN0(a, b, c)           \
-    ({                                   \
-        ARM_DOT_K0((a), (b##0), (c.s0)); \
-        ARM_DOT_K0((a), (b##1), (c.s1)); \
-    })
-#elif N0 == 3 // N0 == 3
-#define ARM_DOT_K0XN0(a, b, c)           \
-    ({                                   \
-        ARM_DOT_K0((a), (b##0), (c.s0)); \
-        ARM_DOT_K0((a), (b##1), (c.s1)); \
-        ARM_DOT_K0((a), (b##2), (c.s2)); \
-    })
-#elif N0 == 4 // N0 == 4
-#define ARM_DOT_K0XN0(a, b, c)           \
-    ({                                   \
-        ARM_DOT_K0((a), (b##0), (c.s0)); \
-        ARM_DOT_K0((a), (b##1), (c.s1)); \
-        ARM_DOT_K0((a), (b##2), (c.s2)); \
-        ARM_DOT_K0((a), (b##3), (c.s3)); \
-    })
-#elif N0 == 8 // N0 == 8
-#define ARM_DOT_K0XN0(a, b, c)           \
-    ({                                   \
-        ARM_DOT_K0((a), (b##0), (c.s0)); \
-        ARM_DOT_K0((a), (b##1), (c.s1)); \
-        ARM_DOT_K0((a), (b##2), (c.s2)); \
-        ARM_DOT_K0((a), (b##3), (c.s3)); \
-        ARM_DOT_K0((a), (b##4), (c.s4)); \
-        ARM_DOT_K0((a), (b##5), (c.s5)); \
-        ARM_DOT_K0((a), (b##6), (c.s6)); \
-        ARM_DOT_K0((a), (b##7), (c.s7)); \
-    })
-#elif N0 == 16 // N0 == 16
-#define ARM_DOT_K0XN0(a, b, c)           \
-    ({                                   \
-        ARM_DOT_K0((a), (b##0), (c.s0)); \
-        ARM_DOT_K0((a), (b##1), (c.s1)); \
-        ARM_DOT_K0((a), (b##2), (c.s2)); \
-        ARM_DOT_K0((a), (b##3), (c.s3)); \
-        ARM_DOT_K0((a), (b##4), (c.s4)); \
-        ARM_DOT_K0((a), (b##5), (c.s5)); \
-        ARM_DOT_K0((a), (b##6), (c.s6)); \
-        ARM_DOT_K0((a), (b##7), (c.s7)); \
-        ARM_DOT_K0((a), (b##8), (c.s8)); \
-        ARM_DOT_K0((a), (b##9), (c.s9)); \
-        ARM_DOT_K0((a), (b##A), (c.sA)); \
-        ARM_DOT_K0((a), (b##B), (c.sB)); \
-        ARM_DOT_K0((a), (b##C), (c.sC)); \
-        ARM_DOT_K0((a), (b##D), (c.sD)); \
-        ARM_DOT_K0((a), (b##E), (c.sE)); \
-        ARM_DOT_K0((a), (b##F), (c.sF)); \
-    })
-#else // N0 not supported
-#error "N0 value not supported"
-#endif // N0 conditions
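The two ARM_DOT_K0 families differ only in how products are accumulated: with MIXED_PRECISION each product is widened to DATA_TYPE_ACCUMULATOR and summed with +=, while the default stays in DATA_TYPE and uses fused multiply-add. A C++ sketch of the distinction, with float/double standing in for the kernel's typical half/float pairing so the sketch compiles everywhere:

#include <cmath>
#include <cstddef>

// Widen each product into the accumulator type before summing
// (the MIXED_PRECISION branch of ARM_DOT_K0).
double dot_mixed_precision(const float *a, const float *b, std::size_t k0)
{
    double c = 0.0;
    for(std::size_t k = 0; k < k0; ++k)
        c += static_cast<double>(a[k]) * static_cast<double>(b[k]);
    return c;
}

// Stay in the data type and rely on fma for accuracy
// (the default branch of ARM_DOT_K0).
float dot_native_precision(const float *a, const float *b, std::size_t k0)
{
    float c = 0.0f;
    for(std::size_t k = 0; k < k0; ++k)
        c = std::fma(a[k], b[k], c);
    return c;
}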
-
-#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t, with these additions:
- *
- * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
-                                                                    IMAGE_DECLARATION(rhs),
-#if defined(BETA)
-                                                                    IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
-                                                                    IMAGE_DECLARATION(dst),
-                                                                    // Post Op arguments
-                                                                    IMAGE_DECLARATION(eltwise_operand),
-                                                                    uint lhs_stride_z,
-                                                                    uint rhs_stride_z,
-#if defined(BETA)
-                                                                    uint bias_stride_z,
-#endif //defined(BETA)
-                                                                    uint dst_stride_z,
-                                                                    uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                                                    ,
-                                                                    uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                                                    ,
-                                                                    const int M,
-                                                                    const int N,
-                                                                    const int K)
-{
-    // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (K0)
-#define LHS_STEP_X ((K0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (K0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
-    // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
-    // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-#if defined(DUMMY_WORK_ITEMS)
-    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
-    {
-        return;
-    }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-    // Compute LHS matrix address
-    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
-                               (get_global_id(2) * lhs_stride_z);
-
-    // Compute RHS matrix address
-    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
-    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
-    rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
-    rhs_addr += get_global_id(2) * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
-    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-    for(int i = 0; i < K; i += K0)
-    {
-        // Supported cases (M0, K0):
-        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
-        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
-        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
-        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
-        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
-        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
-        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
-        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
-
-        // Load values from RHS matrix
-        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
-        // Accumulate
-        ARM_DOT_K0XN0(a0, b, c0);
-#if M0 > 1
-        ARM_DOT_K0XN0(a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
-        ARM_DOT_K0XN0(a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
-        ARM_DOT_K0XN0(a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
-        ARM_DOT_K0XN0(a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
-        ARM_DOT_K0XN0(a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
-        ARM_DOT_K0XN0(a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
-        ARM_DOT_K0XN0(a7, b, c7);
-#endif // M0 > 7
-
-        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
-        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-    // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
-    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
-    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += get_global_id(2) * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
-                                    2) * bias_stride_z;
-
-    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    // c = act(c)
-    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
-    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-    // c = act(c)
-    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
-    // Store output block
-    MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_POST_ACT_ELTWISE_OP_ACT)
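The _texture variant that follows reads the RHS through a read_only image2d_t rather than a buffer, which is why create_image2d_from_buffer survives this commit in CLUtils.h. A hedged sketch of wrapping a buffer-backed RHS in a cl_image with that retained helper; the shape, data type and row pitch below are made-up example values, and the function name is hypothetical:

#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/CLUtils.h"

namespace arm_compute
{
// Sketch only: wraps an existing cl_buffer-backed RHS so a *_texture kernel
// can sample it, using the helper kept by this commit. Real callers must
// satisfy the cl_image pitch alignment noted in create_image2d_from_tensor.
cl::Image2D make_rhs_image(const cl::Context &ctx, const cl::Buffer &rhs_buffer)
{
    const TensorShape shape2d(512 / 4, 256);            // width in 4-element pixels, height
    const size_t      row_pitch = 512 * sizeof(float);  // bytes per row of the buffer
    return create_image2d_from_buffer(ctx, rhs_buffer, shape2d, DataType::F32, row_pitch, CLImage2DType::ReadOnly);
}
} // namespace arm_compute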
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
-                                                                            __read_only image2d_t rhs_img,
-#if defined(BETA)
-                                                                            IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
-                                                                            IMAGE_DECLARATION(dst),
-                                                                            // Post Op arguments
-                                                                            IMAGE_DECLARATION(eltwise_operand),
-                                                                            uint lhs_stride_z,
-                                                                            uint rhs_stride_z,
-#if defined(BETA)
-                                                                            uint bias_stride_z,
-#endif //defined(BETA)
-                                                                            uint dst_stride_z,
-                                                                            uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                                                            ,
-                                                                            uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                                                            ,
-                                                                            const int M,
-                                                                            const int N,
-                                                                            const int K)
-{
-    // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-
-    // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (K0)
-#define LHS_STEP_X ((K0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (K0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
-    // Block size
-#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
-
-    // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X (PIXEL_UNIT * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X PIXEL_UNIT
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-#if defined(DUMMY_WORK_ITEMS)
-    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
-    {
-        return;
-    }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-    // Compute LHS matrix address
-    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
-                               (get_global_id(2) * lhs_stride_z);
-
-#if defined(MATRIX_B_DEPTH)
-    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
-    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
-    const uint z_rhs = get_global_id(2);
-#endif // defined(MATRIX_B_DEPTH)
-
-    // Compute RHS matrix coordinates
-    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
-    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
-    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-    for(int i = 0; i < K; i += K0)
-    {
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
-
-        // Load values from RHS matrix stored in a cl_image
-        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
-        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
-        // Accumulate
-        ARM_DOT_K0XN0(a0, b, c0);
-#if M0 > 1
-        ARM_DOT_K0XN0(a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
-        ARM_DOT_K0XN0(a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
-        ARM_DOT_K0XN0(a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
-        ARM_DOT_K0XN0(a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
-        ARM_DOT_K0XN0(a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
-        ARM_DOT_K0XN0(a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
-        ARM_DOT_K0XN0(a7, b, c7);
-#endif // M0 > 7
-
-        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
-
-        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-    // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
-    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
-    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += get_global_id(2) * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
-                                    2) * bias_stride_z;
-
-    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    // c = act(c)
-    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
-    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-    // c = act(c)
-    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
-    // Store output block
-    MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef PIXEL_UNIT
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(LHS_TRANSPOSE)
-
-#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
-
-#if defined(MIXED_PRECISION)
-
-#if(GPU_ARCH == GPU_ARCH_MIDGARD)
-#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
-#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
-
-#else // defined(MIXED_PRECISION)
-
-#if(GPU_ARCH == GPU_ARCH_MIDGARD)
-#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
-#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
-
-#endif // defined(MIXED_PRECISION)
-
N0))(a), b, (C##0)); \ - }) -#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \ - }) -#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \ - }) -#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \ - }) -#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \ - }) - -// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1 -// a is the column-vector (transposed) -// b is the row-vector (not transposed) -// C is the output matrix -// Lower case is a vector (a, b) -// Upper case is a matrix (C) -#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C) - -#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \ - }) -#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \ - }) -#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \ - }) -#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \ - }) -#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \ - }) -#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \ - }) - -// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication. 
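// Editor's illustration (added in review, not part of the original source): assuming M0 = 2, N0 = 4,
// K0 = 1, TYPE = float and a non-Midgard, non-mixed-precision build, ARM_MM_T_NT(2, 4, 1, float, a, b, c)
// expands to one fused multiply-accumulate per accumulator row, broadcasting each transposed LHS
// element against the N0-wide RHS row vector:
//   c0 = fma((float4)(a0.s0), b0, c0);
//   c1 = fma((float4)(a0.s1), b0, c1);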
-// The dimensions for this matrix multiplication are defined through M0, N0 and K0 -// The dimensions supported are: -// M0: 1, 2, 3, 4, 8 -// N0: 1, 2, 3, 4, 8, 16 -// K0: 1, 2, 3, 4, 8, 16 -// This macro calls the vector-by-vector macro K0 times -// A, B and C are matrices -#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \ - CONCAT(ARM_MM_T_NT_M0xN0x, K0) \ - (M0, N0, TYPE, A, B, C) - -#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops: - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
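 *
 * Editor's illustration (added in review; every value below is hypothetical, not taken from the original source): one possible set of build options satisfying the required defines for this kernel could be
 *     -DGEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT -DLHS_TRANSPOSE -DM0=4 -DN0=4 -DK0=4 -DV0=2 -DH0=4
 *     -DDATA_TYPE=float -DDATA_TYPE_ACCUMULATOR=float -DP2_ELTWISE_OP=ADD -DP2_ELTWISE_ARG1_HEIGHT=24 -DP2_ELTWISE_ARG1_WIDTH=32
 * with the optional P1/P3 activation defines added only when those post op slots are in use.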
- */ -__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Block size -#define LHS_BLOCK_SIZE ((K0) * (M0)) - -#if defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (M0) -#define LHS_STEP_X ((M0) * (V0)) -#define LHS_STEP_LOOP (1) -#else // defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (LHS_BLOCK_SIZE) -#define LHS_STEP_X (M0) -#define LHS_STEP_LOOP (V0) -#endif // defined(LHS_INTERLEAVE) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (N0) -#define RHS_STEP_X ((N0) * (H0)) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (N0) -#endif // defined(RHS_INTERLEAVE) - - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - // Boundary conditions: detect if current block is at the "bottom" or "right" boundary - const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); - const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); - - // Compute RHS matrix address - __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_addr += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); - - __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); - __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr); - - for(int i = 0; i < K; i += K0) - { - VEC_DATA_TYPE(DATA_TYPE, M0) - a0; - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - -#if K0 > 1 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 1 - -#if K0 > 2 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 2 - -#if K0 > 3 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 3 - -#if K0 > 4 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += 
LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 4 - -#if K0 > 8 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 8 - -#ifndef LHS_INTERLEAVE - lhs += (M0 * K0 * (V0 - 1)); -#endif // LHS_INTERLEAVE - -#ifndef RHS_INTERLEAVE - rhs += (N0 * K0 * (H0 - 1)); -#endif // RHS_INTERLEAVE - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias[broadcasted] - MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( - 2) * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias - MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp); - -#undef LHS_BLOCK_SIZE -#undef LHS_OFFSET_X -#undef LHS_STEP_X -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -} -#endif // defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT) - -#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt_texture, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - */ -__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) - - // Block size -#define LHS_BLOCK_SIZE ((K0) * (M0)) - -#if defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (M0) -#define LHS_STEP_X ((M0) * (V0)) -#define LHS_STEP_LOOP (1) -#else // defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (LHS_BLOCK_SIZE) -#define LHS_STEP_X (M0) -#define LHS_STEP_LOOP (V0) -#endif // defined(LHS_INTERLEAVE) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (PIXEL_UNIT) -#endif // defined(RHS_INTERLEAVE) - - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (z % MATRIX_B_DEPTH); -#else // 
defined(MATRIX_B_DEPTH) - const uint z_rhs = z; -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); - - __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); - - for(int i = 0; i < K; i += K0) - { - VEC_DATA_TYPE(DATA_TYPE, M0) - a0; - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - -#if K0 > 1 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 1 - -#if K0 > 2 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 2 - -#if K0 > 3 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 3 - -#if K0 > 4 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 4 - -#if K0 > 8 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = 
READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 8 - -#ifndef LHS_INTERLEAVE - lhs += (M0 * K0 * (V0 - 1)); -#endif // LHS_INTERLEAVE - - x_rhs += K0 * RHS_STEP_X; -#ifndef RHS_INTERLEAVE - x_rhs += (PIXEL_UNIT * K0 * (H0 - 1)); -#endif // RHS_INTERLEAVE - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); - - // Boundary conditions: detect if current block is at the "bottom" or "right" boundary - const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); - const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias[broadcasted] - MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias - MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp); - -#undef LHS_BLOCK_SIZE -#undef LHS_OFFSET_X -#undef LHS_STEP_X -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef PIXEL_UNIT -#undef LHS_STEP_LOOP -#undef RHS_STEP_LOOP -} -#endif // defined(OPENCL_IMAGE_SUPPORT) && 
defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT) - -#endif // defined(LHS_TRANSPOSE) -#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) -#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl deleted file mode 100644 index 09ddcde043..0000000000 --- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl +++ /dev/null @@ -1,1399 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "fp_post_ops_act_eltwise_op_act.h" -#include "gemm_helpers.h" -#include "repeat.h" - -/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped_only_rhs kernel */ -#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) -#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) - -#define CONCAT(a, b) a##b - -#define ARM_DOT1(a, b, c) \ - ({ \ - c = fma(a, b, c); \ - }) -#define ARM_DOT2(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - }) -#define ARM_DOT3(a, b, c) \ - ({ \ - ARM_DOT2(a, b, c); \ - c = fma((a.s2), (b.s2), c); \ - }) -#define ARM_DOT4(a, b, c) \ - ({ \ - ARM_DOT3(a, b, c); \ - c = fma((a.s3), (b.s3), c); \ - }) -#define ARM_DOT8(a, b, c) \ - ({ \ - ARM_DOT4((a.lo), (b.lo), c); \ - ARM_DOT4((a.hi), (b.hi), c); \ - }) -#define ARM_DOT16(a, b, c) \ - ({ \ - ARM_DOT8((a.lo), (b.lo), c); \ - ARM_DOT8((a.hi), (b.hi), c); \ - }) - -#if N0 == 2 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - }) -#elif N0 == 3 // N0 == 3 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - }) -#elif N0 == 4 // N0 == 4 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - }) -#elif N0 == 8 // N0 == 8 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##4), (c.s4)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##5), (c.s5)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##6), (c.s6)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##7), (c.s7)); \ - }) -#elif N0 == 16 // N0 == 16 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##4), (c.s4)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##5), (c.s5)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##6), (c.s6)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##7), (c.s7)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##8), (c.s8)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##9), (c.s9)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##A), (c.sA)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##B), (c.sB)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##C), (c.sC)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##D), (c.sD)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##E), (c.sE)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##F), (c.sF)); \ - }) -#else // N0 not supported -#error "N0 value not supported" -#endif // N0 conditions - -#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops: - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1 - * @note (Required) 
-DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - */ -__kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (K0) -#define RHS_STEP_X ((K0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (K0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS reshaped matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... 
zlhs7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS reshaped matrix - LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); - - // Accumulate - ARM_DOT_K0XN0(K0, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(K0, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += K0 * sizeof(DATA_TYPE); - rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS reshaped matrix - LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); - - // Accumulate - ARM_DOT_K0XN0(1, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(1, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(1, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(1, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(1, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(1, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(1, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(1, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += sizeof(DATA_TYPE); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -} -#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT) - -#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object. - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t_texture, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. 
Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. - */ -__kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) - - const uint LEFTOVER_K = K % K0; - - // Block size -#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X (PIXEL_UNIT * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X PIXEL_UNIT -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - const uint z_rhs = get_global_id(2); -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT; - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS matrix stored in a cl_image - REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); - LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); - - // Accumulate - ARM_DOT_K0XN0(K0, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(K0, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += K0 * sizeof(DATA_TYPE); - x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP; - } - - if(LEFTOVER_K != 0) - { - // Note: We cannot read out-of-bound elements from the RHS matrix because - // the RHS width is always a multiple of K0. This may not be true for the LHS matrix - - union UNION_VEC_TYPE - { - DATA_TYPE s[K0]; - VEC_DATA_TYPE(DATA_TYPE, K0) - v; - }; - - union UNION_VEC_TYPE a0 = {.v = 0 }; -#if M0 > 1 - union UNION_VEC_TYPE a1 = {.v = 0 }; -#endif // M0 > 1 -#if M0 > 2 - union UNION_VEC_TYPE a2 = {.v = 0 }; -#endif // M0 > 2 -#if M0 > 3 - union UNION_VEC_TYPE a3 = {.v = 0 }; -#endif // M0 > 3 -#if M0 > 4 - union UNION_VEC_TYPE a4 = {.v = 0 }; -#endif // M0 > 4 -#if M0 > 5 - union UNION_VEC_TYPE a5 = {.v = 0 }; -#endif // M0 > 5 -#if M0 > 6 - union UNION_VEC_TYPE a6 = {.v = 0 }; -#endif // M0 > 6 -#if M0 > 7 - union UNION_VEC_TYPE a7 = {.v = 0 }; -#endif // M0 > 7 - - REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); - - // Load from RHS matrix - LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); - - // Load from LHS matrix - for(int k = 0; k < LEFTOVER_K; ++k) - { - a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0); -#if M0 > 1 - a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1); -#endif // M0 > 1 -#if M0 > 2 - a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2); -#endif // M0 > 2 -#if M0 > 3 - a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3); -#endif // M0 > 3 -#if M0 > 4 - a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4); -#endif // M0 > 4 -#if M0 > 5 - a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5); -#endif // M0 > 5 -#if M0 > 6 - a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6); -#endif // M0 > 6 -#if M0 > 7 - a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7); -#endif // M0 > 7 - - lhs_offset += sizeof(DATA_TYPE); - } - - // Accumulate - ARM_DOT_K0XN0(K0, a0.v, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1.v, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2.v, b, c2); -#endif // M0 > 2 -#if M0 > 3 - 
ARM_DOT_K0XN0(K0, a3.v, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4.v, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5.v, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6.v, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7.v, b, c7); -#endif // M0 > 7 - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef PIXEL_UNIT -} -#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT) - -#define VFMA(a, b, c) \ - ({ \ - c = fma(a, b, c); \ - }) - -#if M0 == 1 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - }) -#elif M0 == 2 // M0 == 2 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - 
VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - }) -#elif M0 == 3 // M0 == 3 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - }) -#elif M0 == 4 // M0 == 4 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - }) -#elif M0 == 5 // M0 == 5 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - }) -#elif M0 == 6 // M0 == 6 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - }) -#elif M0 == 7 // M0 == 7 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - }) -#elif M0 == 8 // M0 == 8 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ - }) -#else // M0 not supported -#error "M0 not supported" -#endif // M0 not supported - -#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops: - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * 
@note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. - */ -__kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (N0) -#define RHS_STEP_X ((N0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (N0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS reshaped matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... 
zeroF=0; - -#if defined(REINTERPRET_INPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(0, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(1, a, b0, c); -#if K0 > 2 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(2, a, b0, c); -#endif // K0 > 2 -#if K0 > 3 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(3, a, b0, c); -#endif // K0 > 3 -#if K0 > 4 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(4, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(5, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(6, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(7, a, b0, c); -#endif // K0 > 4 -#if K0 > 8 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(8, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(9, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(A, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(B, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(C, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(D, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(E, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(F, a, b0, c); -#endif // K0 > 8 - - lhs_offset += K0 * 
sizeof(DATA_TYPE); - rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE); - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(0, a, b0, c); - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef RHS_STEP_LOOP -} -#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT) - -#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object. - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt_texture, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. 
Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. - */ -__kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (PIXEL_UNIT) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (z % MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - const uint z_rhs = z; -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; - - REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(0, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(1, a, b0, c); -#if K0 > 2 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(2, a, b0, c); -#endif // K0 > 2 -#if K0 > 3 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(3, a, b0, c); -#endif // K0 > 3 -#if K0 > 4 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(4, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(5, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(6, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(7, a, b0, c); -#endif // K0 > 4 -#if K0 > 8 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(8, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(9, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(A, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(B, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(C, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(D, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(E, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(F, a, b0, c); -#endif // K0 > 8 - - lhs_offset += K0 * sizeof(DATA_TYPE); - x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP; - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * 
lhs_stride_y + zin5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - - VFMA_M0xN0(0, a, b0, c); - - lhs_offset += sizeof(DATA_TYPE); - x_rhs += RHS_STEP_X; - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef RHS_STEP_LOOP -} -#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT) -#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && 
defined(P2_ELTWISE_ARG1_WIDTH) -#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h deleted file mode 100644 index b584251c2a..0000000000 --- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** (EXPERIMENTAL_POST_OPS) Macros for (binary) elementwise operations */ - -/** List of (binary) elementwise operators, accounting for the argument position of argument X - * @note X_Pos denotes the position of argument X. e.g. X_POS_0 means X is in the first place whereas X_POS_1 means X is in the second place - * @name elementwise_post_ops - * @{ - */ -#if defined(N0) && !defined(VEC_SIZE) -#define VEC_SIZE N0 -#endif // defined(N0) && !defined(VEC_SIZE) - -#if defined(VEC_SIZE) && defined(DATA_TYPE) - -#define ADD_X_POS_0(x, y) (x) + (y) -#define SUB_X_POS_0(x, y) (x) - (y) -#define MAX_X_POS_0(x, y) max(x, y) -#define MIN_X_POS_0(x, y) min(x, y) -#define SQUARED_DIFF_X_POS_0(x, y) (x - y) * (x - y) -#define POWER_X_POS_0(x, y) pow(x, y) -#if VEC_SIZE == 1 -#define PRELU_X_POS_0(x, y) (x > 0 ? x : x * y) -#else // VEC_SIZE == 1 - -#if defined(MIXED_PRECISION) -#define PRELU_X_POS_0(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE)))) -#else // MIXED_PRECISION -#define PRELU_X_POS_0(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)))) -#endif // MIXED_PRECISION - -#endif // VEC_SIZE == 1 -#define DIV_X_POS_0(x, y) (x / y) -#define AND_X_POS_0(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1)) -#define OR_X_POS_0(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1)) - -#define ADD_X_POS_1(x, y) ADD_X_POS_0(x, y) -#define SUB_X_POS_1(x, y) (y) - (x) -#define MAX_X_POS_1(x, y) MAX_X_POS_0(x, y) -#define MIN_X_POS_1(x, y) MIN_X_POS_0(x, y) -#define SQUARED_DIFF_X_POS_1(x, y) SQUARED_DIFF_X_POS_0(x, y) -#define POWER_X_POS_1(x, y) pow(y, x) -#if VEC_SIZE == 1 -#define PRELU_X_POS_1(x, y) (y > 0 ? 
y : y * x) -#else // VEC_SIZE == 1 - -#if defined(MIXED_PRECISION) -#define PRELU_X_POS_1(x, y) (select(x * y, y, CONVERT((y > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE)))) -#else // MIXED_PRECISION -#define PRELU_X_POS_1(x, y) (select(x * y, y, CONVERT((y > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)))) -#endif // MIXED_PRECISION - -#endif // VEC_SIZE == 1 -#define DIV_X_POS_1(x, y) (y / x) -#define AND_X_POS_1(x, y) AND_X_POS_0(x, y) -#define OR_X_POS_1(x, y) OR_X_POS_0(x, y) - -// By default use the order of the arguments as they are passed in, ie. _X_POS_0 -#define ADD(x, y) ADD_X_POS_0(x, y) -#define SUB(x, y) SUB_X_POS_0(x, y) -#define MAX(x, y) MAX_X_POS_0(x, y) -#define MIN(x, y) MIN_X_POS_0(x, y) -#define SQUARED_DIFF(x, y) SQUARED_DIFF_X_POS_0(x, y) -#define POWER(x, y) POWER_X_POS_0(x, y) -#define PRELU(x, y) PRELU_X_POS_0(x, y) -#define DIV(x, y) DIV_X_POS_0(x, y) -#define AND(x, y) AND_X_POS_0(x, y) -#define OR(x, y) OR_X_POS_0(x, y) - -#endif // defined(VEC_SIZE) && defined(DATA_TYPE) -/** @} */ // end of group elementwise_post_ops - -/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) - * @name ELTWISE_OP_ROW_n - * - * @param[in] OP The elementwise post op - * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables - * @param[in] OPERAND2 The basename of the operand 2 variables - * @{ - */ -#define ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \ - OPERAND1##0 = OP(OPERAND1##0, OPERAND2##0); - -#define ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \ - OPERAND1##1 = OP(OPERAND1##1, OPERAND2##1); - -#define ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \ - OPERAND1##2 = OP(OPERAND1##2, OPERAND2##2); - -#define ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \ - OPERAND1##3 = OP(OPERAND1##3, OPERAND2##3); - -#define ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \ - OPERAND1##4 = OP(OPERAND1##4, OPERAND2##4); - -#define ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \ - OPERAND1##5 = OP(OPERAND1##5, OPERAND2##5); - -#define ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \ - OPERAND1##6 = OP(OPERAND1##6, OPERAND2##6); - -#define ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \ - OPERAND1##7 = OP(OPERAND1##7, OPERAND2##7); - -#define ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \ - OPERAND1##8 = OP(OPERAND1##8, OPERAND2##8); - -#define ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \ - OPERAND1##9 = OP(OPERAND1##9, OPERAND2##9); - -#define ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \ - OPERAND1##A = OP(OPERAND1##A, OPERAND2##A); - -#define ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \ - OPERAND1##B = OP(OPERAND1##B, OPERAND2##B); - -#define ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \ - OPERAND1##C = OP(OPERAND1##C, OPERAND2##C); - -#define ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \ - OPERAND1##D = OP(OPERAND1##D, OPERAND2##D); - -#define ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \ - OPERAND1##E = OP(OPERAND1##E, OPERAND2##E); - -#define 
ELTWISE_OP_ROW_16(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \ - OPERAND1##F = OP(OPERAND1##F, OPERAND2##F); - -/** @} */ // end of group ELTWISE_OP_ROW_n - -/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) - * @name ELTWISE_OP_BLOCK - * - * Supported cases are N=1,2,3,...,16 - * - * @param[in] OP The elementwise post op - * @param[in] N The number of vectors in the block - * @param[in] OPERAND1 The basename of the destination and operand 1 variables - * @param[in] OPERAND2 The basename of the operand 2 variables - * @{ - */ -#define ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_##N(OP, OPERAND1, OPERAND2) -#define ELTWISE_OP_BLOCK(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2) -/** @} */ // end of group ELTWISE_OP_BLOCK - -/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting - * @name ELTWISE_OP_ROW_BROADCAST_n - * - * @param[in] OP The elementwise post op - * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables - * @param[in] OPERAND2 The basename of the broadcast operand 2 variables - * @{ - */ -#define ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2) \ - OPERAND1##0 = OP(OPERAND1##0, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2) \ - OPERAND1##1 = OP(OPERAND1##1, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \ - OPERAND1##2 = OP(OPERAND1##2, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \ - OPERAND1##3 = OP(OPERAND1##3, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \ - OPERAND1##4 = OP(OPERAND1##4, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \ - OPERAND1##5 = OP(OPERAND1##5, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \ - OPERAND1##6 = OP(OPERAND1##6, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \ - OPERAND1##7 = OP(OPERAND1##7, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \ - OPERAND1##8 = OP(OPERAND1##8, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \ - OPERAND1##9 = OP(OPERAND1##9, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \ - OPERAND1##A = OP(OPERAND1##A, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \ - OPERAND1##B = OP(OPERAND1##B, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \ - OPERAND1##C = OP(OPERAND1##C, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \ - OPERAND1##D = OP(OPERAND1##D, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \ - OPERAND1##E = OP(OPERAND1##E, OPERAND2); - -#define ELTWISE_OP_ROW_BROADCAST_16(OP, 
OPERAND1, OPERAND2) \ - ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \ - OPERAND1##F = OP(OPERAND1##F, OPERAND2); - -/** @} */ // end of group ELTWISE_OP_ROW_BROADCAST_n - -/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting - * @name ELTWISE_OP_BLOCK_BROADCAST - * @note Only supports: - * case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX]; - * case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar); - * Does NOT support broadcast in X dimension: Operand1 [YxX] + Operand2 [Yx1]; - * - * Supported cases are N=1,2,3,...,16 - * - * @param[in] OP The elementwise post op - * @param[in] N The number of vectors in the block - * @param[in] OPERAND1 The basename of the destination and operand 1 variables - * @param[in] OPERAND2 The basename of the operand 2 variables - * @{ - */ -#define ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_BROADCAST_##N(OP, OPERAND1, OPERAND2) -#define ELTWISE_OP_BLOCK_BROADCAST(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2) -/** @} */ // end of group ELTWISE_OP_BLOCK_BROADCAST
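For reference, the ELTWISE_OP_* block macros in this deleted header do nothing more than unroll one statement per row, naming the rows with the suffix sequence 0..9, A..F. A minimal sketch of the preprocessor output, assuming M0 == 3 accumulator rows with the basename c and a hypothetical operand basename eltw:

// ELTWISE_OP_BLOCK(ADD, 3, c, eltw) unrolls via ELTWISE_OP_ROW_3 to:
c0 = (c0) + (eltw0); // ADD(x, y) resolves to ADD_X_POS_0(x, y), i.e. (x) + (y)
c1 = (c1) + (eltw1);
c2 = (c2) + (eltw2);
// ELTWISE_OP_BLOCK_BROADCAST(ADD, 3, c, eltw0) applies the single row eltw0 to every row:
c0 = (c0) + (eltw0);
c1 = (c1) + (eltw0);
c2 = (c2) + (eltw0);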
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h deleted file mode 100644 index e107f4452d..0000000000 --- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h" -#include "gemm_helpers.h" -#include "load_store_utility.h" - -/** (EXPERIMENTAL_POST_OPS) Convenience macros for automatically handling mixed precision (fp16 and fp32) operations - * -DMIXED_PRECISION toggles mixed precision mode - */ - -/** Mixed-Precision-Aware Activation Block - * @name MIXED_PRECISION_ACTIVATION_BLOCK - * params N ... 
B_VAL: same as those in @ref ACTIVATION_BLOCK - * - * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op - * @{ - */ -#if defined(MIXED_PRECISION) -#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \ - ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME, A_VAL, B_VAL); -#else // defined(MIXED_PRECISION) -#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \ - ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL); -#endif // defined(MIXED_PRECISION) -/** @} */ // end of group MIXED_PRECISION_ACTIVATION_BLOCK - -/** Mixed-Precision-Aware Elementwise Op Block - * Performs OPERAND1 = OP(OPERAND1, OPERAND2) - * @name MIXED_PRECISION_ELTWISE_OP_BLOCK - * - * @param[in] OP The elementwise post op - * @param[in] M0 The number of consecutive rows - * @param[in] N0 The number of consecutive columns - * @param[in] OPERAND1 The basename of the first and result operand variables - * @param[in] OPERAND2 The basename of the second operand variables - * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op - * @param[in] CONVERTED_OPERAND2 The basename of the second operand variables converted to higher-precision in case of mixed-precision op - * @{ - */ -#if defined(MIXED_PRECISION) -#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \ - CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2); \ - ELTWISE_OP_BLOCK(OP, M0, OPERAND1, CONVERTED_OPERAND2); -#else // defined(MIXED_PRECISION) -#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \ - ELTWISE_OP_BLOCK(OP, M0, OPERAND1, OPERAND2); -#endif // defined(MIXED_PRECISION) -/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK - -/** Mixed-Precision-Aware Elementwise Op Broadcast Block - * Performs OPERAND1 = OP(OPERAND1, OPERAND2) - * @name MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST - * @note Only supports: - * case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX]; this means @p N0 > 1 - * case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar) ; this means @p N0 == 1 - * Does NOT support broadcast in X dimension: Operand1 [YxX] + Operand2 [Yx1]; this means @p M0 should never == 1 - * - * @param[in] OP The elementwise post op - * @param[in] M0 The number of consecutive rows, > 1 - * @param[in] N0 The number of consecutive columns, >= 1 - * @param[in] OPERAND1 The basename of the first and result operand variables - * @param[in] OPERAND2 The basename of the second operand variables - * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op - * @param[in] CONVERTED_OPERAND2 The basename of the second operand variables converted to higher-precision in case of mixed-precision op - * @{ - */ -#if defined(MIXED_PRECISION) -#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \ - CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2); \ - ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, CONVERTED_OPERAND2##0); -#else // defined(MIXED_PRECISION) -#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, 
DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \ - ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, OPERAND2##0); -#endif // defined(MIXED_PRECISION) -/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST - -/** Mixed-Precision-Aware Boundary-Aware Store Block - * @name MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE - * params M0 ... PARTIAL_COND_X, same as those in STORE_BLOCK_BOUNDARY_AWARE - * - * @param[in] BASENAME_LP The name of the low precision variables, converted from BASENAME, in case of mixed-precision op - * @{ - */ -#if defined(MIXED_PRECISION) -#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \ - CONVERT_BLOCK(M0, N0, DATA_TYPE, BASENAME, BASENAME_LP); \ - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME_LP, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X); -#else // defined(MIXED_PRECISION) -#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \ - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X); -#endif // defined(MIXED_PRECISION) -/** @} */ // end of group MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE
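These mixed-precision wrappers switch on -DMIXED_PRECISION only to decide whether the second operand is first converted to the accumulator type before the underlying block macro runs. A minimal sketch of both expansions, assuming DATA_TYPE is half, DATA_TYPE_ACCUMULATOR is float, and hypothetical basenames eltw / eltw_hp:

// With -DMIXED_PRECISION, MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, 2, N0, c, eltw, float, eltw_hp) expands to:
CONVERT_BLOCK(2, N0, float, eltw, eltw_hp); // eltw_hp0/eltw_hp1 hold eltw0/eltw1 converted to float vectors
ELTWISE_OP_BLOCK(ADD, 2, c, eltw_hp);       // c0 = (c0) + (eltw_hp0); c1 = (c1) + (eltw_hp1);
// Without -DMIXED_PRECISION the conversion is skipped and the half-precision operand is used directly:
ELTWISE_OP_BLOCK(ADD, 2, c, eltw);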
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl index a32301d8e3..0c30c0e626 100644 --- a/src/core/CL/cl_kernels/common/gemm.cl +++ b/src/core/CL/cl_kernels/common/gemm.cl @@ -152,7 +152,6 @@ /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix is NOT reshaped * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl * * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters. @@ -453,7 +452,6 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image * The LHS matrix is NOT reshaped * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl * * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. @@ -887,7 +885,6 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix is NOT reshaped * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl * * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters. @@ -1213,7 +1210,6 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix is NOT reshaped * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl * * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. @@ -1713,7 +1709,6 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl * * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. 
-DDATA_TYPE=float) * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float) @@ -1993,7 +1988,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object. * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl * * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) @@ -2380,7 +2374,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl * * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE). * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. @@ -2767,7 +2760,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object. * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl * * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE). @@ -3226,7 +3218,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix is NOT reshaped * The RHS matrix is NOT reshaped - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl * * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters. diff --git a/src/core/experimental/PostOpUtils.h b/src/core/experimental/PostOpUtils.h deleted file mode 100644 index 6217dcc3da..0000000000 --- a/src/core/experimental/PostOpUtils.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2021, 2023 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS -#define ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS - -#include "arm_compute/core/experimental/IPostOp.h" -#include "arm_compute/core/experimental/PostOps.h" - -#include "arm_compute/core/experimental/Types.h" -#include "support/Cast.h" - -#include <vector> - -/** (EXPERIMENTAL_POST_OPS) */ -namespace arm_compute -{ -namespace experimental -{ -/** Transform a PostOpList of type FromTensorT to one of type ToTensorT */ -template <typename FromTensorT, typename ToTensorT> -PostOpList<ToTensorT> transform_post_op_list_arguments(const PostOpList<FromTensorT> &post_ops, std::function<ToTensorT(FromTensorT)> transform_arg) -{ - PostOpList<ToTensorT> transformed_post_ops; - for(const auto &post_op : post_ops.get_list()) - { - switch(post_op->type()) - { - case PostOpType::Activation: - { - const auto _post_op = utils::cast::polymorphic_downcast<const PostOpAct<FromTensorT> *>(post_op.get()); - transformed_post_ops.template push_back_op<PostOpAct<ToTensorT>>(_post_op->_act_info); - break; - } - case PostOpType::Eltwise_Add: - { - const auto _post_op = utils::cast::polymorphic_downcast<const PostOpEltwiseAdd<FromTensorT> *>(post_op.get()); - transformed_post_ops.template push_back_op<PostOpEltwiseAdd<ToTensorT>>(transform_arg(_post_op->_addend), _post_op->_prev_dst_pos, _post_op->_policy); - break; - } - case PostOpType::Eltwise_PRelu: - { - const auto _post_op = utils::cast::polymorphic_downcast<const PostOpEltwisePRelu<FromTensorT> *>(post_op.get()); - transformed_post_ops.template push_back_op<PostOpEltwisePRelu<ToTensorT>>(transform_arg(_post_op->_alpha_param), _post_op->_prev_dst_pos, _post_op->_policy); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported PostOpType"); - } - } - } - return transformed_post_ops; -} - -/** Get post op argument TensorType from post op argument index in a flattened, ordered post op argument list */ -inline TensorType get_post_op_arg_type(size_t index) -{ - ARM_COMPUTE_ERROR_ON_MSG(static_cast<int>(index) > EXPERIMENTAL_ACL_POST_OP_ARG_LAST - EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, "Post Op argument index is out of range"); - return static_cast<TensorType>(EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + static_cast<int>(index)); -} - -/** Get a sequence of PostOp Types from PostOpList */ -template <typename T> -PostOpTypeSequence get_post_op_sequence(const PostOpList<T> &post_ops) -{ - PostOpTypeSequence 
post_op_sequence; - for(const auto &op : post_ops.get_list()) - { - post_op_sequence.push_back(op->type()); - } - return post_op_sequence; -} - -} // namespace experimental -} // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp index d11e4f0b24..39b410d609 100644 --- a/src/cpu/operators/CpuGemmConv2d.cpp +++ b/src/cpu/operators/CpuGemmConv2d.cpp @@ -107,7 +107,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig // Create GEMMInfo structure const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList<ITensorInfo *>(), fixed_format, weight_format); + false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format); // Supported activations in GEMM const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, @@ -156,8 +156,8 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>(); - _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, - experimental::PostOpList<ITensorInfo *>(), fixed_format, weight_format)); + _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, fixed_format, + weight_format)); auto mm_mem_req = _mm_gemmlowp->workspace(); for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) @@ -188,7 +188,7 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei // Create GEMMInfo structure const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList<ITensorInfo *>(), fixed_format, weight_format); + false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format); if(is_quantized) { @@ -422,7 +422,7 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList<ITensorInfo *>(), fixed_format, weights_info.weight_format()); + false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format()); return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); } diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp index de2e9f9742..e4a3d30b6d 100644 --- a/src/gpu/cl/ClKernelLibrary.cpp +++ 
b/src/gpu/cl/ClKernelLibrary.cpp @@ -275,23 +275,14 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map = { "gemm_mm_native", "common/gemm.cl" }, { "gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl" }, { "gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl" }, - { "gemm_mm_native_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl" }, { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" }, { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" }, { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" }, { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" }, { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" }, { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" }, { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, - { "gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, - { "gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, { "gemm_lc_vm_f32", "common/gemm.cl" }, { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" }, { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" }, @@ -623,26 +614,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map = #include "./cl_kernels/common/gemm_utils.clembed" }, { - "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", -#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.hembed" - }, - { - "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", -#include "./cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.hembed" - }, - { - "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl", -#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.clembed" - }, - { - "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl", -#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.clembed" - }, - { - "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl", -#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.clembed" - }, - { "common/gemmlowp.cl", #include 
"./cl_kernels/common/gemmlowp.clembed" }, diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp index 5fea097ae3..b8997dfc7f 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp @@ -23,7 +23,6 @@ */ #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -31,11 +30,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLUtils.h" -#include "src/core/experimental/PostOpUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -52,25 +51,6 @@ namespace { using ElementsProcessed = Steps; -const auto post_op_utils = experimental::PostOpCLKernelUtils( -{ - // PostOp sequence -> {Kernel Postfix, PostOp Slots} - { {}, { "", {} } }, - { { experimental::PostOpType::Activation }, { "", { 1 } } }, - - { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } }, - { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } }, - - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } }, - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } }, - - { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, - { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, - - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }, - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } } -}); - Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) @@ -90,7 +70,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported"); const unsigned int m = gemm_info.m; const unsigned int n = gemm_info.n; @@ -133,7 +112,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const 
TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant"); } return Status{}; @@ -240,7 +218,6 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); _add_bias = src2 != nullptr; - _num_post_op_args = gemm_info.post_ops.total_num_arguments(); // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. @@ -298,20 +275,11 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - // If post_ops are used, then we disable the use of gemm_info.activation_info - if(gemm_info.post_ops.size() > 0) - { - post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops); - } - else - { - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); - } + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_native"); - post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -396,11 +364,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window unsigned int idx0; if(_add_bias) { - idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + (7 + _num_post_op_args); + idx0 = 4 * num_arguments_per_2D_tensor() + 7; } else { - idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + (6 + _num_post_op_args); + idx0 = 3 * num_arguments_per_2D_tensor() + 6; } const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); @@ -412,11 +380,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window unsigned int idx0; if(_add_bias) { - idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 
1 : 0) + _num_post_op_args; + idx0 = 4 * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0); } else { - idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args; + idx0 = 3 * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0); } const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); @@ -440,12 +408,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window add_2D_tensor_argument(idx, src2, slice); } add_2D_tensor_argument(idx, dst, slice); - // post op argument buffers - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - add_2D_tensor_argument(idx, post_op_arg, slice); - } + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); if(_add_bias) @@ -453,12 +416,6 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2])); } _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); - // post op argument stride_z - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2])); - } // Pass m, n and k at runtime _kernel.setArg<cl_int>(idx++, _m); diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h index e478df727a..80f8355932 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
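// ---------------------------------------------------------------------------
// [Editor's sketch - not part of the patch] The idx0 arithmetic simplified
// above is easier to follow spelled out: every 2D tensor bound to the kernel
// occupies num_arguments_per_2D_tensor() argument slots, followed by the
// scalar arguments (one stride_z per tensor, then m, n, k), so once the
// post-op tensors are gone the offsets collapse to fixed counts. A
// hypothetical helper mirroring the new layout (the name and the scalar
// breakdown are inferred from the surrounding hunks, not taken from the
// library):
//
//     unsigned int pad_arg_index(bool add_bias, bool input_is_3d, unsigned int args_per_2d_tensor)
//     {
//         const unsigned int num_tensors = add_bias ? 4u : 3u; // src0, src1, [src2], dst
//         const unsigned int num_scalars = add_bias ? 7u : 6u; // stride_z per tensor, plus m, n, k
//         // The input cross-plane pad sits right after the scalars; the dst
//         // pad shifts one slot further when the input is reinterpreted as 3D.
//         return num_tensors * args_per_2d_tensor + num_scalars + (input_is_3d ? 1u : 0u);
//     }
//
// The removed code widened both terms by _num_post_op_args, which is exactly
// what this patch strips out.
// ---------------------------------------------------------------------------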
*/ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H +#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H +#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" #include "src/core/common/Macros.h" @@ -76,17 +76,16 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; - unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments + bool _slide_matrix_b{ true }; + bool _reinterpret_input_as_3d{ false }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; + bool _add_bias{ false }; + signed int _m{ 1 }; + signed int _n{ 1 }; + signed int _k{ 1 }; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp index f14a6f1900..d72d29ea1e 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp @@ -23,7 +23,6 @@ */ #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -31,11 +30,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" -#include "src/core/experimental/PostOpUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -53,25 +52,6 @@ namespace { using ElementsProcessed = Steps; -const auto post_op_utils = experimental::PostOpCLKernelUtils( -{ - // PostOp sequence -> {Kernel Postfix, PostOp Slots} - { {}, { "", {} } }, - { { experimental::PostOpType::Activation }, { "", { 1 } } }, - - { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } }, - { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } }, - - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } }, - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } }, - - { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, - { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, - - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, 
experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }, - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } } -}); - Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) @@ -95,7 +75,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported"); const unsigned int m = gemm_info.m; const unsigned int n = gemm_info.n; @@ -139,7 +118,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant"); } return Status{}; @@ -202,7 +180,6 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); _add_bias = src2 != nullptr; _export_to_cl_image = rhs_info.export_to_cl_image; - _num_post_op_args = gemm_info.post_ops.total_num_arguments(); // Check if we need to slide the matrix B const unsigned int num_dimensions_src0 = src0->num_dimensions(); @@ -260,23 +237,14 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - // If post_ops are used, then we disable the use of gemm_info.activation_info - if(gemm_info.post_ops.size() > 0) - { - post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops); - } - else - { - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); - } + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + 
build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_reshaped_"); kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; - post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -395,13 +363,6 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind // dst buffer add_2D_tensor_argument(idx, dst, slice); - // post op argument buffers - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - add_2D_tensor_argument(idx, post_op_arg, slice); - } - // LHS stride_z _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); @@ -417,12 +378,6 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind // dst stride_z _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); - // post op argument stride_z - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2])); - } // Cross-plan padding (if _reinterpret_output_as_3d = true) if(_reinterpret_output_as_3d) { diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h index 2d668b91a3..8d25412a40 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H +#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H +#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" @@ -100,17 +100,16 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; - unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments + bool _slide_matrix_b{ true }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; + bool _add_bias{ false }; + bool _export_to_cl_image{ false }; + signed int _m{ 1 }; + signed int _n{ 1 }; + signed int _k{ 1 }; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H */
\ No newline at end of file +#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp index f780538f53..b34c17cda8 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -23,13 +23,12 @@ */ #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" -#include "src/core/experimental/PostOpUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -47,25 +46,6 @@ namespace { using ElementsProcessed = Steps; -const auto post_op_utils = experimental::PostOpCLKernelUtils( -{ - // PostOp sequence -> {Kernel Postfix, PostOp Slots} - { {}, { "", {} } }, - { { experimental::PostOpType::Activation }, { "", { 1 } } }, - - { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } }, - { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } }, - - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } }, - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } }, - - { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, - { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, - - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }, - { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } } -}); - Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -86,7 +66,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported"); const unsigned int m = gemm_info.m; const unsigned int n = gemm_info.n; @@ -132,7 +111,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); 
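// ---------------------------------------------------------------------------
// [Editor's sketch - not part of the patch] The clone-and-compare idiom that
// starts on the line above (and completes just below) is shared by all three
// GEMM kernels in this patch: clone the caller's dst info, overwrite its shape
// with the shape the GEMM would actually produce, and require an exact match.
// In isolation, assuming the usual ACL validate headers:
//
//     Status validate_dst(const ITensorInfo *src0, const ITensorInfo *src1,
//                         const ITensorInfo *dst, const GEMMKernelInfo &gemm_info)
//     {
//         const TensorInfo expected = dst->clone()->set_tensor_shape(
//             misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
//         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &expected);
//         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
//         return Status{};
//     }
//
// With the PostOp shape-compliance check deleted, dst validation reduces to
// this single exact comparison.
// ---------------------------------------------------------------------------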
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant"); } return Status{}; @@ -203,7 +181,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext _add_bias = src2 != nullptr; _export_to_cl_image = rhs_info.export_to_cl_image; _has_pad_y = gemm_info.has_pad_y; - _num_post_op_args = gemm_info.post_ops.total_num_arguments(); auto padding_info = get_padding_info({ src0, src1, src2, dst }); @@ -270,22 +247,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); } - // If post_ops are used, then we disable the use of gemm_info.activation_info - if(gemm_info.post_ops.size() > 0) - { - post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops); - } - else - { - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); - } + + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? "t" : "nt"; kernel_name += rhs_info.export_to_cl_image ? 
"_texture" : ""; - post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -411,13 +380,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con // dst buffer add_2D_tensor_argument(idx, dst, slice); - // post op argument buffers - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - add_2D_tensor_argument(idx, post_op_arg, slice); - } - // LHS stride_z _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[lhs_idx_batch_size])); @@ -432,12 +394,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con // dst stride_z _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size])); - // post op argument stride_z - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2])); - } // Cross-plan padding (if _reinterpret_input_as_3d = true) if(_reinterpret_input_as_3d && _has_pad_y) diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h index 00cdb299ce..471160c94b 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H +#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H +#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" @@ -90,19 +90,18 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - bool _has_pad_y{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; - unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments + bool _slide_matrix_b{ true }; + bool _reinterpret_input_as_3d{ false }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; + bool _add_bias{ false }; + bool _export_to_cl_image{ false }; + bool _has_pad_y{ false }; + signed int _m{ 1 }; + signed int _n{ 1 }; + signed int _k{ 1 }; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp index 51248d4a7a..eb9475ccaa 100644 --- a/src/gpu/cl/operators/ClConv2d.cpp +++ b/src/gpu/cl/operators/ClConv2d.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -90,7 +90,6 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s case ConvolutionMethod::WINOGRAD: { ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); - ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0); auto f = std::make_unique<ClWinogradConv2d>(); f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math); _operator = std::move(f); @@ -99,7 +98,6 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s case ConvolutionMethod::DIRECT: { ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); - ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0); auto f = std::make_unique<ClDirectConv2d>(); f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info); _operator = std::move(f); @@ -108,7 +106,6 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s case ConvolutionMethod::INDIRECT: { ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); - ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0); auto f = std::make_unique<ClIndirectConv2d>(); f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info); _operator = std::move(f); @@ -142,7 +139,6 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co { //Validate Winograd ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClWinogradConv2d does not support PostOps"); ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, 
conv2d_info.act_info, conv2d_info.enable_fast_math)); break; } @@ -150,7 +146,6 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co { // Validate direct convolution layer ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClDirectConv2d does not support PostOps"); ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); break; } @@ -158,7 +153,6 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co { // Validate indirect convolution layer ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClIndirectConv2d is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClIndirectConv2d does not support PostOps"); ARM_COMPUTE_RETURN_ON_ERROR(ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); break; } @@ -271,17 +265,17 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const if(is_data_type_float(src->data_type())) { // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); - const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; - const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; - const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; - const bool is_ofm_lt_64 = weights->dimension(3U) < 64; - const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; - const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); - const bool is_m_one = output_shape[1] * output_shape[2] == 1; - const bool is_unit_stride = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); - const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); + const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; + const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; + const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; + const bool is_ofm_lt_64 = weights->dimension(3U) < 64; + const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; + const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); + const bool is_m_one = output_shape[1] * output_shape[2] == 1; + const bool is_unit_stride = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); + const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); // Run Winograd if valid and IFM >= 8 if(is_wino_valid && is_ifm_ge_8) @@ -330,7 +324,7 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const { const bool is_kernel_sz_odd = kernel_sz % 2; const bool is_g77 = gpu_target == GPUTarget::G77; - preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77? 
ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT; + preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 ? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT; } // Direct/indirect convolution used for the first layer of the network diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp index 8db6dabe58..7e331a86f3 100644 --- a/src/gpu/cl/operators/ClGemm.cpp +++ b/src/gpu/cl/operators/ClGemm.cpp @@ -38,7 +38,6 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" -#include "arm_compute/core/experimental/IPostOp.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -222,7 +221,6 @@ void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorIn kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); // Set the target for the kernels _mm_native_kernel->set_target(gpu_target); @@ -254,7 +252,6 @@ void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensor kernel_info.reinterpret_input_as_3d = false; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); // Set the target for the kernels _reshape_lhs_kernel->set_target(gpu_target); @@ -299,7 +296,6 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); // Set the target for the kernels _mm_reshaped_only_rhs_kernel->set_target(gpu_target); @@ -346,7 +342,6 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); // Set the target for the kernels _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target); @@ -396,7 +391,6 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); @@ -433,7 +427,6 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con kernel_info.reinterpret_input_as_3d = false; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; @@ -482,7 +475,6 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; @@ 
-531,7 +523,6 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - kernel_info.post_ops = gemm_info.post_ops(); GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; @@ -624,7 +615,12 @@ Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso // Select GEMMType CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery { - CLScheduler::get().target(), a->data_type(), m, n, k, batch_size, + CLScheduler::get().target(), + a->data_type(), + m, + n, + k, + batch_size, }, gemm_info.reshape_b_only_on_first_run(), b->are_values_constant()); diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp index 682477e4ea..5620471ff9 100644 --- a/src/gpu/cl/operators/ClGemmConv2d.cpp +++ b/src/gpu/cl/operators/ClGemmConv2d.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -54,14 +54,14 @@ namespace opencl { ClGemmConv2d::ClGemmConv2d() : _weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(), - _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _use_post_ops(false), _aux_mem(AuxTensorIdx::Count) + _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count) { } ClGemmConv2d::~ClGemmConv2d() = default; void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops) + int gemm_3d_depth, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); @@ -76,14 +76,12 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I false, // fast_math false, // fp_mixed_precision true, // broadcast_bias - act_info, // activation_info - post_ops // post ops + act_info // activation_info ); TensorInfo tmp_src{ *src }; if(_is_quantized) { - ARM_COMPUTE_ERROR_ON_MSG(post_ops.size() > 0, "ClGemmConv2d quantized types do not support post ops"); // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset const QuantizationInfo input_quantization_info = src->quantization_info(); @@ -118,7 +116,7 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I } Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops) + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, 
const ActivationLayerInfo &act_info) { const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type()); @@ -132,13 +130,11 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig false, // fast_math false, // fp_mixed_precision true, // broadcast_bias - act_info, // activation_info - post_ops // post ops + act_info // activation_info ); if(is_quantized) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(post_ops.size() > 0, "ClGemmConv2d quantized types do not support post ops"); // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset const QuantizationInfo input_quantization_info = src->quantization_info(); @@ -189,19 +185,18 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf // Only for quantize there are few cases where we cannot fuse the activation function in GEMM _fuse_activation = true; - _use_post_ops = conv2d_info.post_ops.size() > 0; const ITensorInfo *gemm_input_to_use = src; ITensorInfo *gemm_output_to_use = dst; // Get parameters from conv_info - unsigned int stride_x = 0; - unsigned int stride_y = 0; + unsigned int stride_x = 0; + unsigned int stride_y = 0; std::tie(stride_x, stride_y) = conv2d_info.conv_info.stride(); // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; + unsigned int conv_w = 0; + unsigned int conv_h = 0; std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, @@ -318,11 +313,10 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? 
conv_h : 0; - configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info, conv2d_info.post_ops); + configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info); if(!_skip_col2im) { - ARM_COMPUTE_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClGemmConv2d does not support post ops with col2im operation"); // Post ops must be performed after every other op // Set the GPU target for col2im _col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>(); _col2im_kernel->set_target(CLScheduler::get().target()); @@ -334,8 +328,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); - // Disable running of activation kernel if post ops are used - if(!_fuse_activation && !_use_post_ops) + if(!_fuse_activation) { _activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>(); _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info); @@ -383,15 +376,11 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights const bool is_quantized = is_data_type_quantized_asymmetric(data_type); const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); - const bool skip_col2im = data_layout == DataLayout::NHWC; - bool fuse_activation = true; - bool use_post_ops = conv2d_info.post_ops.size() > 0; + const bool skip_col2im = data_layout == DataLayout::NHWC; + bool fuse_activation = true; ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!skip_im2col - && conv2d_info.post_ops.size() > 0, - "ClGemmConv2d does not support post ops with col2im or im2col operation"); // Post ops must be performed after every other op // Validate biases if(biases != nullptr) @@ -520,8 +509,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? 
conv_h : 0; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info, - conv2d_info.post_ops)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info)); // Validate Col2Im if(!skip_col2im) @@ -530,8 +518,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights } // Validate Activation Layer - // Disable running (thus validation) of activation kernel if post ops are used - if(!fuse_activation && !use_post_ops) + if(!fuse_activation) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info)); } @@ -600,8 +587,7 @@ void ClGemmConv2d::run(ITensorPack &tensors) } //Run Activation Layer if we cannot fuse in GEMM - // Disable running of activation kernel if post ops are used - if(!_fuse_activation && !_use_post_ops) + if(!_fuse_activation) { ITensorPack pack = { @@ -620,7 +606,7 @@ void ClGemmConv2d::prepare(ITensorPack &tensors) ICLTensor *weights_reshaped_p = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped))); CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p); auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensorPack pack = + ITensorPack pack = { { TensorType::ACL_SRC, weights }, { TensorType::ACL_DST, weights_reshaped.get() } diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h index afde7c511d..8a46ee2dc3 100644 --- a/src/gpu/cl/operators/ClGemmConv2d.h +++ b/src/gpu/cl/operators/ClGemmConv2d.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_GEMM_CONV2D_H -#define ARM_COMPUTE_CL_GEMM_CONV2D_H +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H +#define ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/IPostOp.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -113,8 +112,8 @@ public: const WeightsInfo &weights_info = WeightsInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -133,7 +132,7 @@ private: */ void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {}); + int gemm_3d_depth, const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
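// ---------------------------------------------------------------------------
// [Editor's note - not part of the patch] Decision table for the ClGemmConv2d
// paths touched above, reconstructed from the surrounding hunks (NHWC is the
// interesting case: it never runs col2im and instead lets the GEMM emit a 3D
// output):
//
//     skip_im2col   = (layout == NHWC) && 1x1 kernel && unit stride
//     skip_col2im   = (layout == NHWC)
//     gemm_3d_depth = (layout == NHWC) ? conv_h : 0
//     standalone ClActivationKernel runs iff !fuse_activation
//
// The last rule is the behavioural change: the extra "&& !use_post_ops" guard
// disappears because there are no longer post ops to take over the activation.
// ---------------------------------------------------------------------------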
@@ -150,7 +149,7 @@ private: * @return a status */ static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {}); + int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info); enum AuxTensorIdx { @@ -178,10 +177,9 @@ private: bool _fuse_activation; bool _append_bias; bool _is_prepared; - bool _use_post_ops; experimental::MemoryRequirements _aux_mem; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_CONV2D_H */ +#endif // ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H diff --git a/src/graph/DataLayerVisitor.cpp b/src/graph/DataLayerVisitor.cpp index 85d24b4654..073ffd413d 100644 --- a/src/graph/DataLayerVisitor.cpp +++ b/src/graph/DataLayerVisitor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -131,14 +131,6 @@ void DataLayerVisitor::visit(FusedConvolutionBatchNormalizationNode &n) add_convolution_layer_method<FusedConvolutionBatchNormalizationNode>(_layer_data, n); } -void DataLayerVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) -{ - _layer_data.clear(); - add_generic_layer_data<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n); - add_convolution_layer_data<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n); - add_convolution_layer_method<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n); -} - void DataLayerVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) { _layer_data.clear(); diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp index e5b4adda26..70fe44e134 100644 --- a/src/graph/INode.cpp +++ b/src/graph/INode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018,2021 Arm Limited. + * Copyright (c) 2018,2021,2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,7 +37,6 @@ namespace graph INode::INode() : _graph(nullptr), _id(EmptyNodeID), _common_params({ "", Target::UNSPECIFIED}), _outputs(), _input_edges(), _output_edges(), _assigned_target(Target::UNSPECIFIED) - ,_post_op_info_list(std::list<std::unique_ptr<ConvPostOpInfo>> {}) { } // clang-format on @@ -200,15 +199,5 @@ Target INode::assigned_target() const { return _assigned_target; } - -const std::list<std::unique_ptr<ConvPostOpInfo>> &INode::post_op_info_list() const -{ - return _post_op_info_list; -} - -std::list<std::unique_ptr<ConvPostOpInfo>> &INode::post_op_info_list() -{ - return _post_op_info_list; -} } // namespace graph -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/INodeVisitor.cpp b/src/graph/INodeVisitor.cpp index f067d618bd..5369f6f539 100644 --- a/src/graph/INodeVisitor.cpp +++ b/src/graph/INodeVisitor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -85,14 +85,6 @@ void DefaultNodeVisitor::visit(FusedConvolutionBatchNormalizationNode &n) { default_visit(n); } -void DefaultNodeVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) -{ - default_visit(n); -} -void DefaultNodeVisitor::visit(FusedConvolutionWithPostOpNode &n) -{ - default_visit(n); -} void DefaultNodeVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) { default_visit(n); diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp index c67f6a538b..882810474e 100644 --- a/src/graph/backends/CL/CLFunctionsFactory.cpp +++ b/src/graph/backends/CL/CLFunctionsFactory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -274,8 +274,6 @@ std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext & return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx); case NodeType::FusedConvolutionBatchNormalizationLayer: return detail::create_fused_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx); - case NodeType::FusedConvolutionWithPostOp: - return detail::create_fused_convolution_with_post_op<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionWithPostOpNode *>(node), ctx); case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer: return detail::create_fused_depthwise_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx); case NodeType::GenerateProposalsLayer: @@ -318,8 +316,6 @@ std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext & return detail::create_stack_layer<CLStackLayer, CLTargetInfo>(*polymorphic_downcast<StackLayerNode *>(node)); case NodeType::StridedSliceLayer: return detail::create_strided_slice_layer<CLStridedSlice, CLTargetInfo>(*polymorphic_downcast<StridedSliceLayerNode *>(node)); - case NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer: - return detail::create_fused_convolution_batch_normalization_with_post_op<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationWithPostOpsNode *>(node), ctx); default: return nullptr; } diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp index c50782db48..8fd8c14f63 100644 --- a/src/graph/backends/CL/CLNodeValidator.cpp +++ b/src/graph/backends/CL/CLNodeValidator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -76,8 +76,6 @@ Status CLNodeValidator::validate(INode *node) CLDirectConvolutionLayer, CLGEMMConvolutionLayer, CLWinogradConvolutionLayer>(*polymorphic_downcast<ConvolutionLayerNode *>(node)); - case NodeType::FusedConvolutionWithPostOp: - return detail::validate_fused_convolution_with_post_op<CLGEMMConvolutionLayer>(*polymorphic_downcast<FusedConvolutionWithPostOpNode *>(node)); case NodeType::DepthToSpaceLayer: return detail::validate_depth_to_space_layer<CLDepthToSpaceLayer>(*polymorphic_downcast<DepthToSpaceLayerNode *>(node)); case NodeType::DepthwiseConvolutionLayer: diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp index 8eb3e4cb71..38284b93cf 100644 --- a/src/graph/mutators/NodeFusionMutator.cpp +++ b/src/graph/mutators/NodeFusionMutator.cpp @@ -29,8 +29,6 @@ #include "arm_compute/graph/Utils.h" #include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h" -#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h" -#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h" #include "arm_compute/graph/nodes/Nodes.h" #include "src/graph/mutators/MutatorUtils.h" @@ -333,441 +331,6 @@ void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse } } -/** Check valid combinations: - * - * | Main operator | Post operators | - * |:--------------|:---------------------------| - * |conv | add | - * |conv | act + add | - * |conv | add + act | - * |conv | act + add + act | - * -*/ -#define MAX_VALIDE_COMBINATION 4 -#define MAX_POST_OP_NUM 3 -NodeType valide_post_op_type[MAX_VALIDE_COMBINATION][MAX_POST_OP_NUM] = { { EltwiseLayerNode::node_type }, - { EltwiseLayerNode::node_type, ActivationLayerNode::node_type }, - { ActivationLayerNode::node_type, EltwiseLayerNode::node_type }, - { ActivationLayerNode::node_type, EltwiseLayerNode::node_type, ActivationLayerNode::node_type } -}; - -bool check_post_op_type(NodeType *post_op_type, int len) -{ - if(len > MAX_POST_OP_NUM || len <= 0) - { - return false; - } - - bool found = false; - for(int i = 0; i < MAX_VALIDE_COMBINATION; ++i) - { - for(int j = 0; j < len; ++j) - { - if(post_op_type[j] != valide_post_op_type[i][j]) - { - found = false; - break; - } - found = true; - } - if(found) - break; - } - - return found; -} - -void fuse_convolution_with_post_op(Graph &g, INode *fused_node, std::list<INode *> post_op_node_list, int prev_op_dst_pos) -{ - unsigned int op_idx = 0; - // Fuse post operators with conv - for(const auto &post_op : post_op_node_list) - { - switch(post_op->type()) - { - case EltwiseLayerNode::node_type: - { - auto *eltwise_node = arm_compute::utils::cast::polymorphic_downcast<EltwiseLayerNode *>(post_op); - ARM_COMPUTE_ERROR_ON(eltwise_node->output(0) == nullptr); - - fused_node->post_op_info_list().push_back(std::make_unique<ConvPostOpInfoEltwiseAdd>(prev_op_dst_pos, eltwise_node->convert_policy())); - ARM_COMPUTE_LOG_GRAPH_VERBOSE(" with Elementwise Layer node with ID : " << post_op->id()); - break; - } - case ActivationLayerNode::node_type: - { - auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(post_op); - ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr); - - fused_node->post_op_info_list().push_back(std::make_unique<ConvPostOpInfoActivation>(act_node->activation_info())); - ARM_COMPUTE_LOG_GRAPH_VERBOSE(" with Activation Layer node with ID : " << post_op->id()); - break; - } - 
default: - { - break; - } - } - - if(op_idx == post_op_node_list.size() - 1) // last fusable node - { - transfer_driving_nodes_and_remove_old_node(g, fused_node, post_op, true); - } - else - { - // Remove node - g.remove_node(post_op->id()); - } - op_idx++; - } -} - -std::list<INode *> get_post_op_list(Graph &g, int &eltwise_operand_id, int &prev_op_dst_pos, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations) -{ - std::list<INode *> post_op_node_list = {}; - NodeID prev_op_dst_id = conv_node_id; - NodeType post_op_type_list[3] = { NodeType::Dummy, NodeType::Dummy, NodeType::Dummy }; - int post_op_idx = 0; - - // Get list of the connected nodes - auto current_node = g.node(conv_node_id); - - while(post_op_node_list.size() < 3) - { - // This convolution node must have only one output edge, otherwise this function would not have been called - - auto current_output_edge_id = current_node->output_edges().begin(); - auto current_output_edge = g.edge(*current_output_edge_id); - auto post_op_node = current_output_edge->consumer(); - - bool fusable_post_op = false; - if(post_op_node != nullptr && post_op_node->output_edges().size() > 0) - { - switch(post_op_node->type()) - { - case EltwiseLayerNode::node_type: - { - auto *eltwise_node = arm_compute::utils::cast::polymorphic_downcast<EltwiseLayerNode *>(post_op_node); - ARM_COMPUTE_ERROR_ON(eltwise_node->output(0) == nullptr); - if(eltwise_node->output(0)->accessor() == nullptr) - { - post_op_node_list.push_back(post_op_node); - fusable_post_op = true; - post_op_type_list[post_op_idx++] = eltwise_node->type(); - - // Extract elementwise inputs - const auto eltwise_input_id_0 = eltwise_node->input_edge(0)->producer_id(); - const auto eltwise_input_id_1 = eltwise_node->input_edge(1)->producer_id(); - if(eltwise_input_id_0 == prev_op_dst_id) - { - eltwise_operand_id = eltwise_input_id_1; - prev_op_dst_pos = 0; - } - else if(eltwise_input_id_1 == prev_op_dst_id) - { - eltwise_operand_id = eltwise_input_id_0; - prev_op_dst_pos = 1; - } - } - else - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with elementwise due to the presence of an output accessor\n"); - } - break; - } - case ActivationLayerNode::node_type: - { - auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(post_op_node); - ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr); - // Check if activation is supported for fusion - if(supported_fused_activations.count(act_node->activation_info().activation()) == 0) - { - break; - } - if(act_node->output(0)->accessor() == nullptr) - { - post_op_node_list.push_back(post_op_node); - fusable_post_op = true; - post_op_type_list[post_op_idx++] = act_node->type(); - prev_op_dst_id = act_node->id(); - } - else - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n"); - } - break; - } - default: - { - break; - } - } - - // Check if the node is not a branching node and current node is fusable - if(post_op_node->output_edges().size() == 1 && fusable_post_op == true) - { - current_node = post_op_node; - } - else - { - break; - } - } - } - - // Check whether it's valid post op list - if(post_op_node_list.size() > 0) - { - bool fuse_with_post_op = check_post_op_type(post_op_type_list, post_op_node_list.size()); - if(!fuse_with_post_op) - { - post_op_node_list.clear(); - } - } - - return post_op_node_list; -} - -/** Fuse below operators: - * - * | Main operator | Post operators | - * 
|:--------------|:---------------------------| - * |conv | add | - * |conv | act + add | - * |conv | add + act | - * |conv | act + add + act | - * - * Notes: currently, only GEMM supports fusion with post operator -*/ -void fuse_convolution_with_post_ops(Graph &g, const Edge *output_edge, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations) -{ - ARM_COMPUTE_ERROR_ON(output_edge == nullptr); - - auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(output_edge->producer()); - ARM_COMPUTE_ERROR_ON(conv_node->output(0) == nullptr); - - const ConvolutionMethod conv_algorithm = conv_node->convolution_method(); - if(conv_algorithm != ConvolutionMethod::GEMM) - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n"); - return; - } - - // Prevent fusion if fused node has an output accessor - if(conv_node->output(0)->accessor() == nullptr) - { - // If data type is FP32/FP16, data layout is NHWC, and filter size is 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM. - const Edge *input_edge = conv_node->input_edge(1); - if(input_edge != nullptr && input_edge->tensor() != nullptr) - { - const DataLayout data_layout = input_edge->tensor()->desc().layout; - const DataType data_type = input_edge->tensor()->desc().data_type; - const TensorShape tensor_shape = input_edge->tensor()->desc().shape; - if((data_layout != DataLayout::NHWC) || (is_data_type_float(data_type) == false) || (tensor_shape.y() != 1) || (tensor_shape.z() != 1)) - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n"); - return; - } - } - else - { - return; - } - - // Get post op list - int eltwise_operand_id = 0; - int prev_op_dst_pos = 0; // Previous operator dst's postion in current operator - std::list<INode *> post_op_node_list = get_post_op_list(g, eltwise_operand_id, prev_op_dst_pos, conv_node_id, supported_fused_activations); - - if(post_op_node_list.size() == 0) - { - return; - } - else // Do convolution fusion with post op if there're one(elementwise), two or more operators - { - const Target assigned_target = conv_node->assigned_target(); - - // Extract conv inputs - const auto conv_input_id = conv_node->input_edge(0)->producer_id(); - const auto conv_weights_id = conv_node->input_edge(1)->producer_id(); - const auto conv_info = conv_node->convolution_info(); - const auto conv_method = conv_node->convolution_method(); - const auto num_groups = conv_node->num_groups(); - FastMathHint fast_math_hint = conv_node->fast_math_hint(); - - // Create the fused node - const NodeID fused_id = g.add_node<FusedConvolutionWithPostOpNode>(conv_info, num_groups, conv_method, fast_math_hint); - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " << conv_node->id()); - - // Add connections from the conv inputs to the fused node - g.add_connection(conv_input_id, 0, fused_id, 0); - g.add_connection(conv_weights_id, 0, fused_id, 1); - if(conv_node->input_edge(2) != nullptr) - { - auto conv_bias_id = conv_node->input_edge(2)->producer_id(); - g.add_connection(conv_bias_id, 0, fused_id, 2); - } - // Adding the Element wise operand in case the post op is element wise operation - auto it = std::find_if(post_op_node_list.begin(), - post_op_node_list.end(), - [&](const INode * nd) - { - return (nd->type() == graph::NodeType::EltwiseLayer); - }); - - if(it != post_op_node_list.end()) - { - g.add_connection(eltwise_operand_id, 0, 
fused_id, 3); - } - g.remove_node(conv_node->id()); - - // Update fused node outputs - auto fused_node = g.node(fused_id); - fused_node->set_assigned_target(assigned_target); - - // Fuse convolution with post op - fuse_convolution_with_post_op(g, fused_node, post_op_node_list, prev_op_dst_pos); - - post_op_node_list.clear(); - ARM_COMPUTE_LOG_GRAPH_VERBOSE(std::endl); - } - } - else - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n"); - } -} - -void fuse_convolution_batch_normalization_with_post_ops(Graph &g, const Edge *output_edge, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations) -{ - ARM_COMPUTE_ERROR_ON(output_edge == nullptr); - - auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(output_edge->producer()); - ARM_COMPUTE_ERROR_ON(conv_node->output(0) == nullptr); - const ConvolutionMethod conv_algorithm = conv_node->convolution_method(); - if(conv_algorithm != ConvolutionMethod::GEMM) - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n"); - return; - } - - // Prevent fusion if fused node has an output accessor - if(conv_node->output(0)->accessor() == nullptr) - { - // If data type is FP32/FP16, data layout is NHWC, and filter size is 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM. - const Edge *input_edge = conv_node->input_edge(1); - if(input_edge != nullptr && input_edge->tensor() != nullptr) - { - const DataLayout data_layout = input_edge->tensor()->desc().layout; - const DataType data_type = input_edge->tensor()->desc().data_type; - const TensorShape tensor_shape = input_edge->tensor()->desc().shape; - if((data_layout != DataLayout::NHWC) || (is_data_type_float(data_type) == false) || (tensor_shape.y() != 1) || (tensor_shape.z() != 1)) - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n"); - return; - } - } - else - { - return; - } - - // Get post op list - int eltwise_operand_id = 0; - int prev_op_dst_pos = 0; // Previous operator dst's postion in current operator - std::list<INode *> post_op_node_list = get_post_op_list(g, eltwise_operand_id, prev_op_dst_pos, conv_node_id, supported_fused_activations); - - if(post_op_node_list.size() == 0) - { - return; - } - else // Do convolution fusion with post op if there're one(elementwise), two or more operators - { - const Target assigned_target = conv_node->assigned_target(); - - // Extract conv inputs - const auto conv_input_id = conv_node->input_edge(0)->producer_id(); - const auto conv_weights_id = conv_node->input_edge(1)->producer_id(); - const auto bn_mean_id = conv_node->input_edge(3)->producer_id(); - const auto bn_var_id = conv_node->input_edge(4)->producer_id(); - const auto conv_info = conv_node->convolution_info(); - const auto conv_method = conv_node->convolution_method(); - const auto num_groups = conv_node->num_groups(); - FastMathHint fast_math_hint = conv_node->fast_math_hint(); - - // Create the fused node - - const float epsilon = conv_node->epsilon(); - const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationWithPostOpsNode>(epsilon, conv_info, num_groups, conv_method, fast_math_hint); - - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing FusedConvolutionBatchNormalization node with ID : " << conv_node->id()); - - // Add connections from the conv inputs to the fused node - 
g.add_connection(conv_input_id, 0, fused_id, 0); - g.add_connection(conv_weights_id, 0, fused_id, 1); - - if(conv_node->input_edge(2) != nullptr) - { - auto conv_bias_id = conv_node->input_edge(2)->producer_id(); - g.add_connection(conv_bias_id, 0, fused_id, 2); - } - g.add_connection(bn_mean_id, 0, fused_id, 3); - g.add_connection(bn_var_id, 0, fused_id, 4); - - // Move connections of old FusedConvolutionBatchNormalization to the fused node - if(conv_node->input_edge(5) != nullptr) - { - const auto bn_beta_id = conv_node->input_edge(5)->producer_id(); - g.add_connection(bn_beta_id, 0, fused_id, 5); - } - - if(conv_node->input_edge(6) != nullptr) - { - const auto bn_gamma_id = conv_node->input_edge(6)->producer_id(); - g.add_connection(bn_gamma_id, 0, fused_id, 6); - } - - // Adding the Element wise operand in case the post op is element wise operation - auto it = std::find_if(post_op_node_list.begin(), - post_op_node_list.end(), - [&](const INode * nd) - { - return (nd->type() == graph::NodeType::EltwiseLayer); - }); - - if(it != post_op_node_list.end()) - { - g.add_connection(eltwise_operand_id, 0, fused_id, 7); - } - - // Update fused node outputs - auto fused_node = g.node(fused_id); - fused_node->set_assigned_target(assigned_target); - - auto conv_node_name = conv_node->name(); - - // collect the post ops names - std::string post_ops_name = ""; - for(auto &post_op : post_op_node_list) - { - post_ops_name += post_op->name(); - } - fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + post_ops_name, assigned_target }); - - // Fuse convolution with post op - fuse_convolution_with_post_op(g, fused_node, post_op_node_list, prev_op_dst_pos); - - post_op_node_list.clear(); - g.remove_node(conv_node->id()); - ARM_COMPUTE_LOG_GRAPH_VERBOSE(std::endl); - } - } - else - { - ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n"); - } -} - template <typename N1, typename F, typename... Args> void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments) { @@ -839,10 +402,6 @@ void NodeFusionMutator::mutate(Graph &g) detail::fuse_layer<PadLayerNode, ConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<ConvolutionLayerNode>); detail::fuse_layer<PadLayerNode, DepthwiseConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<DepthwiseConvolutionLayerNode>); - // The fusion of PostOps to ConvolutionLayer: - // It must occur after the fusion of PadLayer into ConvolutionLayer - // It must occur before the fusion of normal ActivationLayer into ConvolutionLayer as it takes precedence - detail::fuse_layer<ConvolutionLayerNode>(g, cl_target_prec, detail::fuse_convolution_with_post_ops, supported_fused_activations); detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations); detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations); detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations); @@ -851,7 +410,6 @@ void NodeFusionMutator::mutate(Graph &g) // The fusion of BatchNormalizationLayer must occur after the fusion of ActivationLayer. 
Because FusedConvolutionBatchNormalizationNode assumes the BatchNormalization is already fused with activation, if any detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization); detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization); - detail::fuse_layer<FusedConvolutionBatchNormalizationNode>(g, cl_target_prec, detail::fuse_convolution_batch_normalization_with_post_ops, supported_fused_activations); } } // namespace graph } // namespace arm_compute diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp deleted file mode 100644 index af81f0369a..0000000000 --- a/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h" - -#include "arm_compute/core/Utils.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/graph/Utils.h" - -namespace arm_compute -{ -namespace graph -{ -FusedConvolutionBatchNormalizationWithPostOpsNode::FusedConvolutionBatchNormalizationWithPostOpsNode(float epsilon, PadStrideInfo info, - unsigned int num_groups, - ConvolutionMethod method, - FastMathHint fast_math_hint) - : _epsilon(epsilon), _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint) -{ - _input_edges.resize(8, EmptyEdgeID); - _outputs.resize(1, NullTensorID); -} - -void FusedConvolutionBatchNormalizationWithPostOpsNode::set_convolution_method(ConvolutionMethod method) -{ - _method = method; -} - -float FusedConvolutionBatchNormalizationWithPostOpsNode::epsilon() const -{ - return _epsilon; -} - -ConvolutionMethod FusedConvolutionBatchNormalizationWithPostOpsNode::convolution_method() const -{ - return _method; -} - -void FusedConvolutionBatchNormalizationWithPostOpsNode::set_fast_math_hint(FastMathHint hint) -{ - _fast_math_hint = hint; -} - -FastMathHint FusedConvolutionBatchNormalizationWithPostOpsNode::fast_math_hint() const -{ - return _fast_math_hint; -} - -PadStrideInfo FusedConvolutionBatchNormalizationWithPostOpsNode::convolution_info() const -{ - return _info; -} - -unsigned int FusedConvolutionBatchNormalizationWithPostOpsNode::num_groups() const -{ - return _num_groups; -} - -TensorDescriptor FusedConvolutionBatchNormalizationWithPostOpsNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, - const TensorDescriptor &weights_descriptor, - const PadStrideInfo &info) -{ - unsigned int output_width = 0; - unsigned int output_height = 0; - - const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH); - const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT); - const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); - const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); - - const DataLayout data_layout = input_descriptor.layout; - TensorDescriptor output_descriptor = input_descriptor; - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]); - - return output_descriptor; -} - -bool FusedConvolutionBatchNormalizationWithPostOpsNode::forward_descriptors() -{ - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) - { - Tensor *dst = output(0); - ARM_COMPUTE_ERROR_ON(dst == nullptr); - dst->desc() = configure_output(0); - return true; - } - return false; -} - -TensorDescriptor FusedConvolutionBatchNormalizationWithPostOpsNode::configure_output(size_t idx) const -{ - ARM_COMPUTE_UNUSED(idx); - const Tensor *src = input(0); - const Tensor *weights = input(1); - - ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr); - - TensorDescriptor output_info = compute_output_descriptor(src->desc(), 
weights->desc(), _info); - - return output_info; -} - -NodeType FusedConvolutionBatchNormalizationWithPostOpsNode::type() const -{ - return FusedConvolutionBatchNormalizationWithPostOpsNode::node_type; -} - -void FusedConvolutionBatchNormalizationWithPostOpsNode::accept(INodeVisitor &v) -{ - v.visit(*this); -} -} // namespace graph -} // namespace arm_compute diff --git a/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp b/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp deleted file mode 100644 index 63341e2760..0000000000 --- a/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h" - -#include "arm_compute/core/Utils.h" -#include "arm_compute/graph/Graph.h" -#include "arm_compute/graph/INodeVisitor.h" -#include "arm_compute/graph/Utils.h" - -namespace arm_compute -{ -namespace graph -{ -FusedConvolutionWithPostOpNode::FusedConvolutionWithPostOpNode(PadStrideInfo info, - unsigned int num_groups, - ConvolutionMethod method, - FastMathHint fast_math_hint, - QuantizationInfo out_quant_info) - : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(std::move(out_quant_info)), _fused_activation() -{ - _input_edges.resize(4, EmptyEdgeID); - _outputs.resize(1, NullTensorID); -} - -void FusedConvolutionWithPostOpNode::set_convolution_method(ConvolutionMethod method) -{ - _method = method; -} - -ConvolutionMethod FusedConvolutionWithPostOpNode::convolution_method() const -{ - return _method; -} - -void FusedConvolutionWithPostOpNode::set_fast_math_hint(FastMathHint hint) -{ - _fast_math_hint = hint; -} - -FastMathHint FusedConvolutionWithPostOpNode::fast_math_hint() const -{ - return _fast_math_hint; -} - -PadStrideInfo FusedConvolutionWithPostOpNode::convolution_info() const -{ - return _info; -} - -unsigned int FusedConvolutionWithPostOpNode::num_groups() const -{ - return _num_groups; -} - -ActivationLayerInfo FusedConvolutionWithPostOpNode::fused_activation() const -{ - return _fused_activation; -} - -void FusedConvolutionWithPostOpNode::set_fused_activation(ActivationLayerInfo fused_activation) -{ - _fused_activation = fused_activation; -} - -void FusedConvolutionWithPostOpNode::set_convolution_info(PadStrideInfo info) -{ - _info = info; -} - -TensorDescriptor FusedConvolutionWithPostOpNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, - const TensorDescriptor &weights_descriptor, - const PadStrideInfo &info) -{ - unsigned int output_width = 0; - unsigned int output_height = 0; - - const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH); - const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT); - const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH); - const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT); - - std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info); - - const DataLayout data_layout = input_descriptor.layout; - TensorDescriptor output_descriptor = input_descriptor; - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height); - output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]); - - return output_descriptor; -} - -bool FusedConvolutionWithPostOpNode::forward_descriptors() -{ - if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) - { - Tensor *dst = output(0); - ARM_COMPUTE_ERROR_ON(dst == nullptr); - dst->desc() = configure_output(0); - return true; - } - return false; -} - -TensorDescriptor FusedConvolutionWithPostOpNode::configure_output(size_t idx) const -{ - ARM_COMPUTE_UNUSED(idx); - const Tensor *src = input(0); - const Tensor *weights = input(1); - - ARM_COMPUTE_ERROR_ON(src == nullptr || 
weights == nullptr); - - TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info); - if(!_out_quant_info.empty()) - { - output_info.quant_info = _out_quant_info; - } - - return output_info; -} - -NodeType FusedConvolutionWithPostOpNode::type() const -{ - return FusedConvolutionWithPostOpNode::node_type; -} - -void FusedConvolutionWithPostOpNode::accept(INodeVisitor &v) -{ - v.visit(*this); -} -} // namespace graph -} // namespace arm_compute diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp index 1071d50197..9c7c4248bb 100644 --- a/src/graph/printers/DotGraphPrinter.cpp +++ b/src/graph/printers/DotGraphPrinter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -85,22 +85,6 @@ void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationNode &n) _info = ss.str(); } -void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) -{ - ARM_COMPUTE_UNUSED(n); - std::stringstream ss; - ss << "FusedConvolutionBatchNormalizationWithPostOpsNode"; - _info = ss.str(); -} - -void DotGraphVisitor::visit(FusedConvolutionWithPostOpNode &n) -{ - ARM_COMPUTE_UNUSED(n); - std::stringstream ss; - ss << "FusedConvolutionWithPostOpNode"; - _info = ss.str(); -} - void DotGraphVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) { ARM_COMPUTE_UNUSED(n); diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index 476bf27423..f3c05adb47 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -29,7 +29,6 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" #include "src/core/CL/ICLKernel.h" -#include "src/core/experimental/PostOpUtils.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClConv2d.h" @@ -61,26 +60,21 @@ CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_ma CLConvolutionLayer::~CLConvolutionLayer() = default; void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops) + const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups, post_ops); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); } void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops) + const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, 
output); ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups, post_ops); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); - // Convert post op arguments to ITensorInfo - auto transformed_post_ops = experimental::transform_post_op_list_arguments<ICLTensor *, ITensorInfo *>(post_ops, [](auto tensor) - { - return tensor->info(); - }); - const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups, transformed_post_ops); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); switch(opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info, weights_info, CLScheduler::get().target())) @@ -97,7 +91,6 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT } case ConvolutionMethod::FFT: { - ARM_COMPUTE_ERROR_ON_MSG(post_ops.size() > 0, "CLFFTConvolutionLayer does not support post ops"); auto f = std::make_unique<CLFFTConvolutionLayer>(_impl->memory_manager); f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math); _impl->func = std::move(f); @@ -110,31 +103,23 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT if(_impl->op) { - _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); - _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - size_t post_op_tensor_index = 0; - for(const auto &op : post_ops.get_list()) - { - for(auto &tensor : op->arguments()) - { - _impl->run_pack.add_const_tensor(experimental::get_post_op_arg_type(post_op_tensor_index++), *tensor); - } - } - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; + _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; + _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList<ITensorInfo *> &post_ops) + const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), 
"Grouping (num_groups != 1) with NHWC data layout is not supported"); const GPUTarget gpu_target = CLScheduler::get().target(); - const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups, post_ops); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups); switch(opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target)) { @@ -149,7 +134,6 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo case ConvolutionMethod::FFT: { // Validate FFT-based convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(post_ops.size() > 0, "CLFFTConvolutionLayer does not support post ops"); ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)); break; } diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index ad5bfd8dd2..c8c18f35db 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,7 +31,6 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/experimental/PostOpUtils.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClGemmConv2d.h" #include "support/Cast.h" @@ -69,24 +68,19 @@ CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> m CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default; void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops) + const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups, post_ops); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); } void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops) + const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - _impl->weights = weights; - _impl->op = std::make_unique<opencl::ClGemmConv2d>(); - // Convert post op arguments to ITensorInfo - auto transformed_post_ops = experimental::transform_post_op_list_arguments<ICLTensor *, ITensorInfo *>(post_ops, [](auto tensor) - { - return tensor->info(); - }); - const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups, transformed_post_ops); + _impl->weights = weights; + 
_impl->op = std::make_unique<opencl::ClGemmConv2d>(); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); _impl->run_pack = @@ -96,15 +90,6 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, { TensorType::ACL_SRC_2, biases }, { TensorType::ACL_DST, output } }; - // Add post op tensors - size_t post_op_tensor_index = 0; - for(const auto &op : post_ops.get_list()) - { - for(auto &tensor : op->arguments()) - { - _impl->run_pack.add_const_tensor(experimental::get_post_op_arg_type(post_op_tensor_index++), *tensor); - } - } _impl->prep_pack = { { TensorType::ACL_SRC_1, weights }, @@ -115,9 +100,9 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, } Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList<ITensorInfo *> &post_ops) + const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) { - const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups, post_ops); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info); } |
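
With the experimental PostOpList overloads removed, ActivationLayerInfo is the only fused-epilogue parameter left on CLConvolutionLayer, and elementwise-add post ops must remain separate operators. Below is a minimal sketch of the surviving configure/validate path, assuming an OpenCL-capable device; the tensor shapes, names, and the fused-ReLU choice are illustrative placeholders, not taken from this patch.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    // Initialise a default OpenCL context and queue for the scheduler.
    CLScheduler::get().default_init();

    // Illustrative 1x1 convolution: 16 input channels -> 32 output channels.
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(1U, 1U, 16U, 32U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 32U), 1, DataType::F32));

    const PadStrideInfo       conv_info(1, 1, 0, 0);
    const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);

    // Post-removal signature: no trailing PostOpList argument on validate()...
    const Status status = CLConvolutionLayer::validate(src.info(), weights.info(), biases.info(), dst.info(),
                                                       conv_info, WeightsInfo(), Size2D(1U, 1U), act_info,
                                                       false /* enable_fast_math */, 1 /* num_groups */);
    if(status.error_code() != ErrorCode::OK)
    {
        return 1;
    }

    // ...nor on configure(); the fused epilogue is expressed through act_info only.
    CLConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, conv_info, WeightsInfo(), Size2D(1U, 1U), act_info,
                   false /* enable_fast_math */, 1 /* num_groups */);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    conv.run();
    CLScheduler::get().sync();
    return 0;
}

The same applies at the graph level: conv + add (+ act) chains that the removed mutator used to collapse into FusedConvolutionWithPostOpNode now execute as distinct ConvolutionLayer, EltwiseLayer and ActivationLayer nodes, with only the ActivationLayer still eligible for fusion into the convolution.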