From 1af5416917268692fcd4b34b1d7ffebd3a2aea8a Mon Sep 17 00:00:00 2001
From: SiCongLi <sicong.li@arm.com>
Date: Wed, 6 Oct 2021 15:25:57 +0100
Subject: Add experimental PostOp interface to
 ClGemmMatrixMultiplyReshapedKernel Part 1

This interface supports the fusion of multiple elementwise operations

Partially resolves: COMPMID-4435

Change-Id: If68dd7dd98dcf239fde7cb1f0a4a6d4d1e899a6f
Signed-off-by: SiCongLi <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6483
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 Android.bp                                         |    4 +
 SConscript                                         |    6 +-
 arm_compute/core/KernelDescriptors.h               |   69 +-
 arm_compute/core/Types.h                           |    6 +-
 arm_compute/core/experimental/IPostOp.h            |  162 +++
 arm_compute/core/experimental/Types.h              |    5 +
 src/core/CL/CLUtils.cpp                            |   97 +-
 src/core/CL/CLUtils.h                              |   86 +-
 .../fp_post_ops_act_eltwise_op_act.h               |  101 ++
 .../act_eltwise_op_act/gemm_mm_reshaped.cl         | 1404 ++++++++++++++++++++
 .../fp_elementwise_op_helpers.h                    |  262 ++++
 .../fp_mixed_precision_helpers.h                   |  113 ++
 src/core/CL/cl_kernels/common/gemm.cl              |    6 +-
 src/core/experimental/PostOp.h                     |  171 +++
 src/gpu/cl/ClKernelLibrary.cpp                     |    8 +
 .../kernels/ClGemmMatrixMultiplyReshapedKernel.cpp |   42 +-
 .../kernels/ClGemmMatrixMultiplyReshapedKernel.h   |    1 +
 .../CL/functions/CLBatchNormalizationLayer.cpp     |    3 +-
 tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp |  354 ++++-
 tests/validation/fixtures/GEMMFixture.h            |  261 ++++
 tests/validation/reference/PostOps.cpp             |   76 ++
 tests/validation/reference/PostOps.h               |   47 +
 utils/TypePrinter.h                                |  163 ++-
 23 files changed, 3391 insertions(+), 56 deletions(-)
 create mode 100644 arm_compute/core/experimental/IPostOp.h
 create mode 100644 src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h
 create mode 100644 src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
 create mode 100644 src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h
 create mode 100644 src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h
 create mode 100644 src/core/experimental/PostOp.h
 create mode 100644 tests/validation/reference/PostOps.cpp
 create mode 100644 tests/validation/reference/PostOps.h

diff --git a/Android.bp b/Android.bp
index adcafa65d5..4ec0475605 100644
--- a/Android.bp
+++ b/Android.bp
@@ -27,6 +27,10 @@ opencl_srcs = [
         "src/core/CL/cl_kernels/common/elementwise_operation.cl",
         "src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl",
         "src/core/CL/cl_kernels/common/elementwise_unary.cl",
+        "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h",
+        "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl",
+        "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h",
+        "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h",
         "src/core/CL/cl_kernels/common/fft.cl",
         "src/core/CL/cl_kernels/common/fft_digit_reverse.cl",
         "src/core/CL/cl_kernels/common/fft_scale.cl",
diff --git a/SConscript b/SConscript
index 6c5839122a..468d7388cd 100644
--- a/SConscript
+++ b/SConscript
@@ -277,7 +277,10 @@ if env['opencl'] and env['embed_kernels']:
                         'src/core/CL/cl_kernels/repeat.h',
                         'src/core/CL/cl_kernels/tile_helpers.h',
                         'src/core/CL/cl_kernels/types.h',
-                        'src/core/CL/cl_kernels/warp_helpers.h'
+                        'src/core/CL/cl_kernels/warp_helpers.h',
+                        'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h',
+                        'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h',
+                        'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h',
                     ]
 
     # Common kernels
@@ -307,6 +310,7 @@ if env['opencl'] and env['embed_kernels']:
                        'src/core/CL/cl_kernels/common/floor.cl',
                        'src/core/CL/cl_kernels/common/gather.cl',
                        'src/core/CL/cl_kernels/common/gemm.cl',
+                       'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl',
                        'src/core/CL/cl_kernels/common/gemv.cl',
                        'src/core/CL/cl_kernels/common/gemmlowp.cl',
                        'src/core/CL/cl_kernels/common/generate_proposals.cl',
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
index a6e5c3372e..bc6ec1f6c5 100644
--- a/arm_compute/core/KernelDescriptors.h
+++ b/arm_compute/core/KernelDescriptors.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/IPostOp.h"
 
 namespace arm_compute
 {
@@ -52,48 +53,52 @@ struct FFTRadixStageKernelInfo
     bool         is_first_stage{ false }; /**< Flags if the FFT kernels is the first stage of a decomposed FFT. */
 };
 
+class ITensorInfo;
 /** Descriptor used by the GEMM kernels */
 struct GEMMKernelInfo
 {
     GEMMKernelInfo() = default;
     GEMMKernelInfo(
-        unsigned int        im,
-        unsigned int        in,
-        unsigned int        ik,
-        unsigned int        idepth_output_gemm3d,
-        bool                ireinterpret_input_as_3d,
-        bool                ibroadcast_bias,
-        bool                ifp_mixed_precision,
-        bool                ihas_pad_y,
-        ActivationLayerInfo iactivation_info,
-        int                 inmult_transpose1xW_width,
-        int                 imult_interleave4x4_height,
-        GEMMLHSMatrixInfo   ilhs_info,
-        GEMMRHSMatrixInfo   irhs_info,
-        int32_t             ina_offset,
-        int32_t             inb_offset)
+        unsigned int                                   im,
+        unsigned int                                   in,
+        unsigned int                                   ik,
+        unsigned int                                   idepth_output_gemm3d,
+        bool                                           ireinterpret_input_as_3d,
+        bool                                           ibroadcast_bias,
+        bool                                           ifp_mixed_precision,
+        bool                                           ihas_pad_y,
+        ActivationLayerInfo                            iactivation_info,
+        int                                            inmult_transpose1xW_width,
+        int                                            imult_interleave4x4_height,
+        GEMMLHSMatrixInfo                              ilhs_info,
+        GEMMRHSMatrixInfo                              irhs_info,
+        int32_t                                        ina_offset,
+        int32_t                                        inb_offset,
+        const experimental::PostOpList<ITensorInfo *> &ipost_ops = experimental::PostOpList<ITensorInfo *> {})
         : m(im), n(in), k(ik), depth_output_gemm3d(idepth_output_gemm3d), reinterpret_input_as_3d(ireinterpret_input_as_3d), broadcast_bias(ibroadcast_bias), fp_mixed_precision(ifp_mixed_precision),
           has_pad_y(ihas_pad_y), activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info),
-          rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset)
+          rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset), post_ops(ipost_ops)
     {
     }
 
-    unsigned int            m{ 0 };                           /**< Number of LHS rows*/
-    unsigned int            n{ 0 };                           /**< Number of RHS columns*/
-    unsigned int            k{ 0 };                           /**< Number of LHS columns or RHS rows */
-    unsigned int            depth_output_gemm3d{ 0 };         /**< Depth of the output tensor in case is reinterpreted as 3D */
-    bool                    reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */
-    bool                    broadcast_bias{ false };          /**< Flag used to broadcast the bias addition */
-    bool                    fp_mixed_precision{ false };      /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
-    bool                    has_pad_y{ false };               /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */
-    ActivationLayerInfo     activation_info{};                /**< Activation function to perform after the matrix multiplication */
-    int                     mult_transpose1xW_width{ 1 };     /**< Multiplication factor for the width of the 1xW transposed block */
-    int                     mult_interleave4x4_height{ 1 };   /**< Multiplication factor for the height of the 4x4 interleaved block */
-    GEMMLHSMatrixInfo       lhs_info{};                       /**< LHS matrix information used to retrieve the number of rows processed by each thread */
-    GEMMRHSMatrixInfo       rhs_info{};                       /**< RHS matrix information used for reshaping the RHS matrix */
-    int32_t                 a_offset{ 0 };                    /**< Offset to be added to each element of the matrix A */
-    int32_t                 b_offset{ 0 };                    /**< Offset to be added to each element of the matrix B */
-    GEMMLowpOutputStageInfo output_stage{};                   /**< GEMMLowp output stage information */
+    unsigned int                            m{ 0 };                           /**< Number of LHS rows*/
+    unsigned int                            n{ 0 };                           /**< Number of RHS columns*/
+    unsigned int                            k{ 0 };                           /**< Number of LHS columns or RHS rows */
+    unsigned int                            depth_output_gemm3d{ 0 };         /**< Depth of the output tensor in case is reinterpreted as 3D */
+    bool                                    reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */
+    bool                                    broadcast_bias{ false };          /**< Flag used to broadcast the bias addition */
+    bool                                    fp_mixed_precision{ false };      /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
+    bool                                    has_pad_y{ false };               /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */
+    ActivationLayerInfo                     activation_info{};                /**< Activation function to perform after the matrix multiplication */
+    int                                     mult_transpose1xW_width{ 1 };     /**< Multiplication factor for the width of the 1xW transposed block */
+    int                                     mult_interleave4x4_height{ 1 };   /**< Multiplication factor for the height of the 4x4 interleaved block */
+    GEMMLHSMatrixInfo                       lhs_info{};                       /**< LHS matrix information used to retrieve the number of rows processed by each thread */
+    GEMMRHSMatrixInfo                       rhs_info{};                       /**< RHS matrix information used for reshaping the RHS matrix */
+    int32_t                                 a_offset{ 0 };                    /**< Offset to be added to each element of the matrix A */
+    int32_t                                 b_offset{ 0 };                    /**< Offset to be added to each element of the matrix B */
+    GEMMLowpOutputStageInfo                 output_stage{};                   /**< GEMMLowp output stage information */
+    experimental::PostOpList<ITensorInfo *> post_ops{};                       /**< (EXPERIMENTAL_POST_OPS) Specifies a list of post ops to be fused after the main op. Note unsupported post ops would not be executed.
+                                                          *   If specified, automatically disable the @ref activation_info */
 };
 
 /** Compute descriptor used by the depthwise convolution native kernel */
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index b2b09825c1..bfe85ea937 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -382,7 +382,11 @@ struct BorderSize
 /** Container for 2D padding size */
 using PaddingSize = BorderSize;
 
-/** Policy to handle overflow */
+/** Policy to handle integer overflow
+ *  @note: This is ignored by floating point operations where the overflow behavior adheres to the IEEE-754 standard
+ *         which states that in case of overflow ±infinity is returned for the round-to-nearest modes (and follows the
+ *         rounding rules for the directed rounding modes) by default.
+ */
 enum class ConvertPolicy
 {
     WRAP,    /**< Wrap around */
diff --git a/arm_compute/core/experimental/IPostOp.h b/arm_compute/core/experimental/IPostOp.h
new file mode 100644
index 0000000000..cd6b8fc4cc
--- /dev/null
+++ b/arm_compute/core/experimental/IPostOp.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
+#define ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
+
+#include <memory>
+#include <numeric>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+/** Type of Post Op, returned by IPostOp::type() to identify the kind of a post op */
+enum class PostOpType
+{
+    Activation,  /**< Activation post op */
+    Eltwise_Add, /**< Elementwise addition post op */
+};
+/** An ordered sequence of type of Post Ops */
+using PostOpTypeSequence = std::vector<PostOpType>;
+/** An elementwise n-ary operation that can be appended to and fused with (at kernel-level) other operators
+ *  It contains:
+ *      1. The attributes of the original operator.
+ *      2. Any additional tensor argument.
+ *      3. The position of the previous op's dst tensor in its argument list ( @ref prev_dst_pos )
+ *
+ *  For example, a series of chained ops:
+ *
+ *          div(src1, relu(conv(src0, weights, bias, conv_info), act_info), div_info)
+ *
+ *      translates to
+ *
+ *          dst = conv(src0, weights, bias, conv_info)  // main op
+ *          dst = relu(dst, act_info)                   // previous dst is placed in the first (and only) argument
+ *          dst = div(src1, dst, div_info)              // previous dst is placed in the second argument
+ *
+ *      which in turn translates to:
+ *
+ *          main op: conv(src0, weights, bias, conv_info)
+ *          post op1: relu(act_info, prev_dst_pos = 0)
+ *          post op2: div(div_info, src1, prev_dst_pos = 1)
+ *
+ *  NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type
+ *  NOTE: If TensorRelatedT points to a resource, IPostOp assumes that resource is valid throughout its lifetime
+ *        and the lifetime of its copies. This is almost guaranteed as IPostOp is only meant to be used at configure time
+ *        after the ITensor or ITensorInfo objects are already constructed
+ */
+template <typename TensorRelatedT>
+struct IPostOp
+{
+    /** Get the arity of the post op, i.e. the number of argument tensors returned by @ref arguments
+     * NOTE: This is one fewer than the arity of the original op, because the previous op's dst tensor is passed
+     *       implicitly as one of the arguments
+     */
+    size_t arity() const
+    {
+        return arguments().size();
+    }
+    /** The position of the previous op's dst in the current op's argument list */
+    virtual int prev_dst_pos() const = 0;
+    /** The type of this post op */
+    virtual PostOpType type() const = 0;
+    /** Pointers to the argument tensors (the TensorRelatedT objects held by the post op)
+     * The order of the argument tensors is strictly preserved
+     */
+    virtual std::vector<TensorRelatedT *>       arguments()       = 0;
+    virtual std::vector<const TensorRelatedT *> arguments() const = 0;
+    /** Clone method used in cases where PostOps are owned by unique_ptr (e.g. when a PostOpList is copied)
+     * NOTE: This performs a shallow copy of the TensorRelatedT if TensorRelatedT points to a resource
+     */
+    virtual std::unique_ptr<IPostOp<TensorRelatedT>> clone() const = 0;
+    virtual ~IPostOp() // Virtual: PostOpList owns and destroys post ops through IPostOp pointers
+    {
+    }
+};
+
+/** A sequence of PostOps that can be appended to the end of other operators. The list owns its post ops via unique_ptr */
+template <typename TensorRelatedT>
+class PostOpList
+{
+public:
+    /** Constructor */
+    PostOpList() = default;
+    /** Destructor */
+    ~PostOpList() = default;
+    PostOpList(const PostOpList &other) // Copy: every post op of @p other is duplicated through IPostOp::clone()
+    {
+        for(const auto &op : other._post_ops)
+        {
+            this->_post_ops.push_back(op->clone());
+        }
+    }
+    PostOpList &operator=(const PostOpList &other) // Copy-assign: clone into a temporary, then swap it in
+    {
+        PostOpList tmp{ other };
+        std::swap(tmp, *this);
+        return *this;
+    }
+    PostOpList(PostOpList &&other) = default;
+    PostOpList &operator=(PostOpList &&other) = default;
+
+    /** Add a new post op of type OpT, constructed in place from @p args, at the end of the list */
+    template <typename OpT, typename... Args>
+    void push_back_op(Args &&... args)
+    {
+        _post_ops.push_back(std::make_unique<OpT>(std::forward<Args>(args)...));
+    }
+
+    /** Number of post ops */
+    size_t size() const
+    {
+        return _post_ops.size();
+    }
+
+    /** Total number of post op arguments, i.e. the sum of the arities of all post ops in the list */
+    size_t total_num_arguments() const
+    {
+        return std::accumulate(_post_ops.begin(), _post_ops.end(), size_t{ 0 }, [](size_t num_args, const auto & op)
+        {
+            return num_args + op->arity(); // The size_t init value keeps the running sum in size_t rather than int
+        });
+    }
+
+    /** Get the underlying post op list */
+    std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> &get_list()
+    {
+        return _post_ops;
+    }
+    const std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> &get_list() const
+    {
+        return _post_ops;
+    }
+
+private:
+    std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> _post_ops{};
+};
+
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
\ No newline at end of file
diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h
index a478513b1b..c8755dc26c 100644
--- a/arm_compute/core/experimental/Types.h
+++ b/arm_compute/core/experimental/Types.h
@@ -76,6 +76,11 @@ enum TensorType : int32_t
     ACL_VEC_COL_SUM = ACL_SRC_4,
     ACL_SHIFTS      = ACL_SRC_5,
     ACL_MULTIPLIERS = ACL_SRC_6,
+
+    // (EXPERIMENTAL_POST_OPS) Post ops arguments begin after everything else
+    EXPERIMENTAL_ACL_POST_OP_ARG       = 2048,
+    EXPERIMENTAL_ACL_POST_OP_ARG_FIRST = EXPERIMENTAL_ACL_POST_OP_ARG,
+    EXPERIMENTAL_ACL_POST_OP_ARG_LAST  = EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + 1024, // Max number of post op arguments
 };
 
 namespace experimental
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 67af240044..1da970e705 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,12 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "support/StringSupport.h"
 
 #include "src/core/CL/CLUtils.h"
+#include "src/core/experimental/PostOp.h"
 
-cl::Image2D arm_compute::create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch)
+namespace arm_compute
+{
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch)
 {
     cl_channel_type cl_data_type;
 
@@ -62,3 +68,90 @@ cl::Image2D arm_compute::create_image2d_from_buffer(const cl::Context &ctx, cons
 
     return cl::Image2D(cl_image);
 }
+
+namespace experimental
+{
+PostOpCLKernelUtils::PostOpCLKernelUtils(const Config &supported_config)
+    : _supported_config(supported_config)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(supported_config.empty(), "Empty PostOp CL kernel support configuration is not allowed");
+    for(auto it = _supported_config.begin(); it != _supported_config.end(); ++it) // Check every supported entry: one slot per post op
+    {
+        auto post_op_sequence = it->first;               // Map key: the sequence of post op types
+        auto post_op_slots    = std::get<1>(it->second); // Second tuple element: the slots assigned to that sequence
+        ARM_COMPUTE_ERROR_ON_MSG(post_op_sequence.size() != post_op_slots.size(), "The number of PostOps must be the same as that of the assigned slots");
+    }
+}
+
+bool PostOpCLKernelUtils::are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList<ITensorInfo *> &post_ops)
+{
+    // All post ops must be elementwise and must not alter the shape of the original dst tensor after broadcasting
+    for(const auto &op : post_ops.get_list())
+    {
+        for(const auto &tensor : op->arguments()) // Each element points to an ITensorInfo *, hence the (*tensor)-> below
+        {
+            const TensorShape &out_shape = TensorShape::broadcast_shape(dst->tensor_shape(), (*tensor)->tensor_shape());
+            if(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0))
+            {
+                return false; // Broadcasting dst with this argument does not give back the dst shape
+            }
+        }
+    }
+    return true; // Also reached when there are no post ops or no arguments to check
+}
+
+bool PostOpCLKernelUtils::is_post_op_sequence_supported(const PostOpList<ITensorInfo *> &post_ops) const
+{
+    if(post_ops.size() == 0)
+    {
+        return true; // Always support cases where no post op is specified
+    }
+    const auto post_op_sequence = get_post_op_sequence(post_ops); // Used as the lookup key into the supported configuration
+
+    return _supported_config.find(post_op_sequence) != _supported_config.end(); // Supported only if the exact sequence is a key
+}
+
+void PostOpCLKernelUtils::set_post_ops_cl_build_options(CLBuildOptions &build_opts, const PostOpList<ITensorInfo *> &post_ops) const
+{
+    const auto post_op_sequence = get_post_op_sequence(post_ops);
+    const auto slots            = std::get<1>(_supported_config.at(post_op_sequence)); // at() requires the sequence to be a key of the supported config
+    for(size_t post_op_id = 0; post_op_id < post_ops.size(); ++post_op_id) // Post ops of any type other than the two handled below add no options
+    {
+        const auto &post_op     = post_ops.get_list().at(post_op_id);
+        const auto  slot_prefix = "-DP" + support::cpp11::to_string(slots[post_op_id]); // E.g. "-DP1" for slot 1
+        if(post_op->type() == experimental::PostOpType::Activation)
+        {
+            const auto _post_op  = utils::cast::polymorphic_downcast<const experimental::PostOpAct<ITensorInfo *> *>(post_op.get());
+            const auto act_type  = slot_prefix + "_ACTIVATION_TYPE=" + lower_string(string_from_activation_func(_post_op->_act_info.activation()));
+            const auto act_a_val = slot_prefix + "_ACTIVATION_A_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.a());
+            const auto act_b_val = slot_prefix + "_ACTIVATION_B_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.b());
+            build_opts.add_option(act_type);
+            build_opts.add_option(act_a_val);
+            build_opts.add_option(act_b_val);
+        }
+        else if(post_op->type() == experimental::PostOpType::Eltwise_Add)
+        {
+            size_t     arg_id     = 1; // Argument ids in the option names start from 1
+            const auto eltwise_op = slot_prefix + "_ELTWISE_OP=ADD" + "_X_POS_" + support::cpp11::to_string(post_op->prev_dst_pos()); // The previous dst position is encoded in the op name
+            build_opts.add_option(eltwise_op);
+            for(const auto &tensor : post_op->arguments()) // One HEIGHT/WIDTH pair of options per argument tensor
+            {
+                const auto height = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_HEIGHT=" + support::cpp11::to_string((*tensor)->dimension(1));
+                const auto width  = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_WIDTH=" + support::cpp11::to_string((*tensor)->dimension(0));
+                build_opts.add_option(height);
+                build_opts.add_option(width);
+                ++arg_id;
+            }
+        }
+    }
+}
+
+void PostOpCLKernelUtils::set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList<ITensorInfo *> &post_ops) const
+{
+    const auto post_op_sequence = get_post_op_sequence(post_ops);
+    const auto postfix          = std::get<0>(_supported_config.at(post_op_sequence)); // First tuple element: the kernel name postfix; at() requires the sequence to be a key
+    kernel_name += postfix; // The postfix is appended to the name passed in by the caller
+}
+} // namespace experimental
+
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h
index b65d547756..d133e4fe6f 100644
--- a/src/core/CL/CLUtils.h
+++ b/src/core/CL/CLUtils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,10 +26,13 @@
 #define ARM_COMPUTE_CL_CLUTILS_H
 
 #include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/experimental/IPostOp.h"
 
 namespace arm_compute
 {
 class TensorShape;
+class CLBuildOptions;
+class ITensorInfo;
 
 /** Create a cl::Image2D object from an OpenCL buffer
  *
@@ -51,6 +54,87 @@ class TensorShape;
  */
 cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch);
 
+namespace experimental
+{
+/** @name (EXPERIMENTAL_POST_OPS)
+ * @{
+ */
+
+/** Manage validation, building and configurations of PostOp CL kernels */
+class PostOpCLKernelUtils final
+{
+public:
+    /** CL kernel name postfix for post ops */
+    using NamePostfix = std::string;
+    /** CL kernels that support post ops assign each post op to a 'slot', in accordance with the postfix
+     * For example, for a kernel with postfix '_act_prelu_eltwiseadd', there are 3 slots
+     * slot 1: (unary) activation, slot 2: pRelu, slot 3: elementwise addition
+     *
+     * Some kernels may allow some slots to be optional, to support multiple combinations of post op sequences.
+     * In such cases, we need to explicitly set up a mapping between each post op and the slots for that kernel.
+     * For example, suppose we have 2 kernels with postfixes: _eltwiseadd_prelu, _act_eltwiseadd_act_prelu, where the activations in the
+     * second kernel are optional. Say we want to support an eltwise addition, followed by a prelu (sequence { eltwiseadd, prelu }).
+     * Now we can choose which one of the 2 kernels to use, since they both support this post op sequence.
+     * We can either:
+     *  1. assign the elementwise addition to slot 1 and prelu to slot 2 of kernel 1
+     *  { { Eltwise_Add, PRelu } -> {"_eltwiseadd_prelu", {1, 2} } } or
+     *  2. assign the elementwise addition to slot 2 and prelu to slot 4 of kernel 2
+     *  { { Eltwise_Add, PRelu } -> {"_act_eltwiseadd_act_prelu", {2, 4} } }
+     */
+    using Slots  = std::vector<unsigned int>;
+    using Config = std::map<PostOpTypeSequence, std::tuple<NamePostfix, Slots>>; // NOTE(review): <map>, <tuple> and <string> are not included directly by this header - confirm they arrive transitively
+
+public:
+    explicit PostOpCLKernelUtils(const Config &config); // config: supported post op sequences, each mapped to a kernel name postfix and its slots
+
+    /** Check if post op argument tensor shapes are compliant
+     * All post ops must not alter the shape of the original dst tensor (even after broadcasting)
+     *
+     * @param[in] dst      Dst tensor to apply the post ops to
+     * @param[in] post_ops Post ops
+     *
+     * @return true if shapes are compliant and false otherwise
+     */
+    static bool are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList<ITensorInfo *> &post_ops);
+    /** Check if the post op sequence is supported in the current configuration
+     *
+     * @param[in] post_ops Post ops. An empty list is always reported as supported
+     *
+     * @return true if the post op sequence is supported and false otherwise
+     */
+    bool is_post_op_sequence_supported(const PostOpList<ITensorInfo *> &post_ops) const;
+    /** Helper function to set PostOp related build options
+     * @note Convention
+     *      1. Each post op "slot" is prefixed with "P<slot number>", followed by the usual parameters for that post op.
+     *      E.g. If the first slot is an activation, we need to pass 3 definitions in this way:
+     *          -DP1_ACTIVATION_TYPE=...  -DP1_ACTIVATION_A_VAL=...   -DP1_ACTIVATION_B_VAL=...
+     *
+     *      2. For multi-ary post ops, to pass the position of the previous op's dest tensor,
+     *         we append "_X_POS_<pos>" to the post op type.
+     *      E.g. for a single post op add(dst, x), where dst is the result of the main op.
+     *         In this case, the position of the previous op's dest is 0, so we pass
+     *         -DP1_ELTWISE_OP=ADD_X_POS_0
+     *
+     * @param[in, out] built_opts OpenCL kernel build options; the post op options are added to it
+     * @param[in]      post_ops   Post ops
+     *
+     */
+    void set_post_ops_cl_build_options(CLBuildOptions &built_opts, const PostOpList<ITensorInfo *> &post_ops) const;
+    /** Helper function to set PostOp kernel name
+     *
+     * @param[in, out] kernel_name OpenCL kernel name; the postfix of the post op sequence is appended to it
+     * @param[in]      post_ops    Post ops
+     *
+     */
+    void set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList<ITensorInfo *> &post_ops) const;
+
+private:
+    Config _supported_config{};
+};
+/** @} */ // end of group (EXPERIMENTAL_POST_OPS)
+
+} // namespace experimental
+
 } // arm_compute
 
 #endif /* ARM_COMPUTE_CL_CLUTILS_H */
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h
new file mode 100644
index 0000000000..fc9704f13b
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "fp_mixed_precision_helpers.h"
+
+/** (EXPERIMENTAL_POST_OPS) Post Op expansions for the post op sequence:
+ * act (optional): POST_OP1_ACTIVATION_OPTIONAL
+ * eltwise_op   : POST_OP2_ELTWISE_OP
+ * act (optional): POST_OP3_ACTIVATION_OPTIONAL
+ */
+
+/** Post Op 1: Activation Block (Optional)
+ * @name POST_OP1_ACTIVATION_OPTIONAL
+ * Enabled only when -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL and -DP1_ACTIVATION_B_VAL are all defined; expands to nothing otherwise
+ * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK
+ * @{
+ */
+#if defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
+#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \
+    MIXED_PRECISION_ACTIVATION_BLOCK(N, P1_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P1_ACTIVATION_A_VAL, P1_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR);
+#else                                                                                         // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
+#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // no-op: slot 1 activation not requested
+#endif                                                                                        // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
+/** @} */                                                                                     // end of group POST_OP1_ACTIVATION_OPTIONAL
+
+/** Post Op 2: Eltwise Op Block
+ * Handles both broadcasting and non-broadcasting cases; only defined when both P2_ELTWISE_ARG1_HEIGHT and P2_ELTWISE_ARG1_WIDTH are defined
+ * @name POST_OP2_ELTWISE_OP
+ *
+ * @param[in] P2_ELTWISE_ARG1_HEIGHT (Compile-time definition) Height (number of rows) of the @ref ELTWISE_OPERAND_NAME tensor. 1 selects broadcasting along Y
+ * @param[in] P2_ELTWISE_ARG1_WIDTH  (Compile-time definition) Width (number of columns) of the @ref ELTWISE_OPERAND_NAME tensor. 1 (with height 1) selects broadcasting along X too
+ * @param[in] OP                     The elementwise post op
+ * @param[in] M0                     The number of consecutive rows
+ * @param[in] N0                     The number of consecutive columns
+ * @param[in] BASENAME               The basename of the result variables
+ * @param[in] ELTWISE_OPERAND_NAME   The basename of the other operand variables
+ * @param[in] DATA_TYPE              Data type of the result variables
+ * @param[in] DATA_TYPE_ACCUMULATOR  Higher-precision accumulator data type in case of mixed-precision op
+ * @param[in] ZERO                   Zero vector for z offset
+ * @param[in] PARTIAL_LOAD_M0        The partial size in y, for partial blocks. Supported: [0, @p M0)
+ * @param[in] PARTIAL_LOAD_N0        The partial size in x, for partial blocks. Supported: [0, @p N0)
+ * @param[in] PARTIAL_COND_Y         Condition on the y axis to perform the partial load Y. True to use PARTIAL_LOAD_M0 rather than M0.
+ * @param[in] PARTIAL_COND_X         Condition on the x axis to perform the partial load X. True to use PARTIAL_LOAD_N0 rather than N0.
+ * @{
+ */
+#if defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
+#if P2_ELTWISE_ARG1_HEIGHT == 1
+#if P2_ELTWISE_ARG1_WIDTH == 1 // Case 1: Broadcasting in both X and Y; op2 arg tile shape[YxX] == [1x1]
+#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X)         \
+    __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
+    VEC_DATA_TYPE(DATA_TYPE, 1)                                                                                                                                                           \
+    ELTWISE_OPERAND_NAME##0 = VLOAD(1)(0, (__global DATA_TYPE *)ELTWISE_OPERAND_NAME##_addr);                                                                                             \
+    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, 1, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
+#else // not (P2_ELTWISE_ARG1_WIDTH == 1); Case 2: Broadcasting in only Y; op2 arg tile shape[YxX] == [1xN0]
+#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X)                                                             \
+    __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, 1, PARTIAL_LOAD_N0, false, PARTIAL_COND_X);                                                      \
+    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
+#endif // P2_ELTWISE_ARG1_WIDTH == 1
+#else  // not (P2_ELTWISE_ARG1_HEIGHT == 1); Case 3: No broadcasting; op2 arg tile shape[YxX] == [M0xN0]
+#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X)                                                                                                                               \
+    __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * ELTWISE_OPERAND_NAME##_stride_y) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X);                                                                                                \
+    MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
+#endif    // P2_ELTWISE_ARG1_HEIGHT == 1
+#endif    // defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
+/** @} */ // end of group POST_OP2_ELTWISE_OP
+
+/** Post Op 3: Activation Block (Optional)
+ * @name POST_OP3_ACTIVATION_OPTIONAL
+ * Enabled only when -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL and -DP3_ACTIVATION_B_VAL are all defined; expands to nothing otherwise
+ * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK
+ * @{
+ */
+#if defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
+#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \
+    MIXED_PRECISION_ACTIVATION_BLOCK(N, P3_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P3_ACTIVATION_A_VAL, P3_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR);
+#else                                                                                         // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
+#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // no-op: slot 3 activation not requested
+#endif                                                                                        // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
+/** @} */                                                                                     // end of group POST_OP3_ACTIVATION_OPTIONAL
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
new file mode 100644
index 0000000000..9404c5e6db
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
@@ -0,0 +1,1404 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "fp_post_ops_act_eltwise_op_act.h"
+#include "gemm_helpers.h"
+#include "repeat.h"
+
+/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped kernel */
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
+#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
+
+#if defined(MIXED_PRECISION) // Mixed precision: ARM_DOT_K0 accumulates with a separate multiply and add (c += a * b)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+    })
+#elif K0 == 3 // Dot product over 3 components
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+        c += a.s2 * b.s2;   \
+    })
+#elif K0 == 4 // Dot product over 4 components
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+        c += a.s2 * b.s2;   \
+        c += a.s3 * b.s3;   \
+    })
+#elif K0 == 8 // Dot product over 8 components
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+        c += a.s2 * b.s2;   \
+        c += a.s3 * b.s3;   \
+        c += a.s4 * b.s4;   \
+        c += a.s5 * b.s5;   \
+        c += a.s6 * b.s6;   \
+        c += a.s7 * b.s7;   \
+    })
+#elif K0 == 16 // Dot product over 16 components
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+        c += a.s2 * b.s2;   \
+        c += a.s3 * b.s3;   \
+        c += a.s4 * b.s4;   \
+        c += a.s5 * b.s5;   \
+        c += a.s6 * b.s6;   \
+        c += a.s7 * b.s7;   \
+        c += a.s8 * b.s8;   \
+        c += a.s9 * b.s9;   \
+        c += a.sA * b.sA;   \
+        c += a.sB * b.sB;   \
+        c += a.sC * b.sC;   \
+        c += a.sD * b.sD;   \
+        c += a.sE * b.sE;   \
+        c += a.sF * b.sF;   \
+    })
+#else // K0 not supported: only 2, 3, 4, 8 and 16 are handled above
+#error "K0 value not supported"
+#endif // K0 conditions
+#else  // not defined(MIXED_PRECISION): ARM_DOT_K0 accumulates with fma()
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+    })
+#elif K0 == 3 // Dot product over 3 components
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+    })
+#elif K0 == 4 // Dot product over 4 components
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+        c = fma(a.s3, b.s3, c); \
+    })
+#elif K0 == 8 // Dot product over 8 components
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+        c = fma(a.s3, b.s3, c); \
+        c = fma(a.s4, b.s4, c); \
+        c = fma(a.s5, b.s5, c); \
+        c = fma(a.s6, b.s6, c); \
+        c = fma(a.s7, b.s7, c); \
+    })
+#elif K0 == 16 // Dot product over 16 components
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+        c = fma(a.s3, b.s3, c); \
+        c = fma(a.s4, b.s4, c); \
+        c = fma(a.s5, b.s5, c); \
+        c = fma(a.s6, b.s6, c); \
+        c = fma(a.s7, b.s7, c); \
+        c = fma(a.s8, b.s8, c); \
+        c = fma(a.s9, b.s9, c); \
+        c = fma(a.sA, b.sA, c); \
+        c = fma(a.sB, b.sB, c); \
+        c = fma(a.sC, b.sC, c); \
+        c = fma(a.sD, b.sD, c); \
+        c = fma(a.sE, b.sE, c); \
+        c = fma(a.sF, b.sF, c); \
+    })
+#else // K0 not supported: only 2, 3, 4, 8 and 16 are handled above
+#error "K0 value not supported"
+#endif // K0 conditions
+#endif // defined(MIXED_PRECISION)
+
+#if defined(ARM_DOT_K0XN0) // Discard any previous definition so the N0-specific one below takes effect
+#undef ARM_DOT_K0XN0
+#endif // defined(ARM_DOT_K0XN0)
+
+#if N0 == 2 // Each variant issues one ARM_DOT_K0 per column: a against b<j>, accumulated into component j of c
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+    })
+#elif N0 == 3 // 3 output columns
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+    })
+#elif N0 == 4 // 4 output columns
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+    })
+#elif N0 == 8 // 8 output columns
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+        ARM_DOT_K0((a), (b##4), (c.s4)); \
+        ARM_DOT_K0((a), (b##5), (c.s5)); \
+        ARM_DOT_K0((a), (b##6), (c.s6)); \
+        ARM_DOT_K0((a), (b##7), (c.s7)); \
+    })
+#elif N0 == 16 // 16 output columns
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+        ARM_DOT_K0((a), (b##4), (c.s4)); \
+        ARM_DOT_K0((a), (b##5), (c.s5)); \
+        ARM_DOT_K0((a), (b##6), (c.s6)); \
+        ARM_DOT_K0((a), (b##7), (c.s7)); \
+        ARM_DOT_K0((a), (b##8), (c.s8)); \
+        ARM_DOT_K0((a), (b##9), (c.s9)); \
+        ARM_DOT_K0((a), (b##A), (c.sA)); \
+        ARM_DOT_K0((a), (b##B), (c.sB)); \
+        ARM_DOT_K0((a), (b##C), (c.sC)); \
+        ARM_DOT_K0((a), (b##D), (c.sD)); \
+        ARM_DOT_K0((a), (b##E), (c.sE)); \
+        ARM_DOT_K0((a), (b##F), (c.sF)); \
+    })
+#else // N0 not supported: only 2, 3, 4, 8 and 16 are handled above
+#error "N0 value not supported"
+#endif // N0 conditions
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
+ * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t, with these additions:
+ *
+ * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
+ * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
+ * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
+ * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
+                                                                    IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                                                                    IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                                    IMAGE_DECLARATION(dst),
+                                                                    // Post-Op arguments
+                                                                    IMAGE_DECLARATION(eltwise_operand),
+                                                                    uint k,
+                                                                    uint lhs_stride_z,
+                                                                    uint rhs_stride_z,
+#if defined(BETA)
+                                                                    uint bias_stride_z,
+#endif // defined(BETA)
+                                                                    uint dst_stride_z,
+                                                                    uint eltwise_operand_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                                    ,
+                                                                    uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                                   )
+{
+    // LHS block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+    // RHS block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
+                               (get_global_id(2) * lhs_stride_z);
+
+    // Compute RHS matrix address
+    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_addr += get_global_id(2) * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+    for(int i = 0; i < k; i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+        // Accumulate: one ARM_DOT_K0XN0 per LHS row a<i> into accumulator row c<i>
+        ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Scale the matrix-matrix product by ALPHA (the store happens after the post ops below)
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    // Post op 1 (optional): c = act(c)
+    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+    // Post op 2: combine c with eltwise_operand using P2_ELTWISE_OP (mixed-precision, broadcast, boundary aware)
+    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+    // Post op 3 (optional): c = act(c)
+    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+
+    // Store output block
+    MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
+}
+
+#if defined(OPENCL_IMAGE_SUPPORT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
+ * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t_texture, with these additions:
+ *
+ * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
+ * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
+ * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
+ * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
+                                                                            __read_only image2d_t rhs_img,
+#if defined(BETA)
+                                                                            IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                                            IMAGE_DECLARATION(dst),
+                                                                            // Post-Op arguments
+                                                                            IMAGE_DECLARATION(eltwise_operand),
+                                                                            uint k,
+                                                                            uint lhs_stride_z,
+                                                                            uint rhs_stride_z,
+#if defined(BETA)
+                                                                            uint bias_stride_z,
+#endif //defined(BETA)
+                                                                            uint dst_stride_z,
+                                                                            uint eltwise_operand_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                                            ,
+                                                                            uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                                           )
+{
+    // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
+
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X (PIXEL_UNIT * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X PIXEL_UNIT
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
+                               (get_global_id(2) * lhs_stride_z);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
+#else  // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = get_global_id(2);
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Compute RHS matrix coordinates
+    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+    for(int i = 0; i < K; i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
+
+        // Load values from RHS matrix stored in a cl_image
+        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+        // Accumulate
+        ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+
+        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    // c = act(c)
+    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+    // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
+    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+    // c = act(c)
+    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+
+    // Store output block
+    MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef PIXEL_UNIT
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT)
+
+#if defined(LHS_TRANSPOSE)
+
+#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE) // shorthand for VEC_DATA_TYPE
+
+#if defined(MIXED_PRECISION) // ARM_VFMA(N0, a, b, c): c += a * b, with a and b first converted to DATA_TYPE_ACCUMULATOR vectors of size N0
+
+#if(GPU_ARCH == GPU_ARCH_MIDGARD) // Midgard: explicit multiply and accumulate instead of fma()
+#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#else // defined(MIXED_PRECISION): a and b are used without any conversion
+
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) c += (a) * (b); // note: the expansion already ends with ';'
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#endif // defined(MIXED_PRECISION)
+
+#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
+    ({                                                 \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
+    })
+#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
+    ({                                                    \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
+    })
+#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
+    ({                                                    \
+        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
+    })
+#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
+    ({                                                    \
+        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
+    })
+#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
+    ({                                                    \
+        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
+    })
+
+// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication (K0 = 1):
+// row i of C is updated as C##i += (a.s<i> replicated N0 times) * b; dispatches on M0 to ARM_VVM_T_NT_<M0>xN0x1 (M0: 1, 2, 3, 4, 8)
+// a is the column-vector (transposed); its components a.s0, a.s1, ... are read, except for M0 = 1 where a is used as a whole
+// b is the row-vector (not transposed)
+// C is the name prefix of the output matrix rows (C##0, C##1, ...)
+// Lower case is a vector (a, b), upper case is a matrix (C)
+#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
+
+#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C)            \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
+    })
+
+// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
+// The dimensions for this matrix multiplication are defined through M0, N0 and K0
+// The dimensions supported are:
+// M0: 1, 2, 3, 4, 8
+// N0: 1, 2, 3, 4, 8, 16
+// K0: 1, 2, 3, 4, 8, 16
+// This macro calls the column-vector by row-vector macro (ARM_VVM_T_NT_M0xN0x1) K0 times, once per A##k / B##k pair (k = 0..9, A..F)
+// A, B and C are name prefixes of the matrices: the row variables A##k, B##k and C##i are formed by token pasting
+#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
+    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
+    (M0, N0, TYPE, A, B, C)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices (LHS transposed, RHS not transposed) plus 3 post ops:
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
+ * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt, with these additions:
+ *
+ * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
+ * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
+ * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
+ * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
+                                                                    IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                                                                    IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                                    IMAGE_DECLARATION(dst),
+                                                                    // Post-Op arguments
+                                                                    IMAGE_DECLARATION(eltwise_operand),
+                                                                    uint k,
+                                                                    uint lhs_stride_z,
+                                                                    uint rhs_stride_z,
+#if defined(BETA)
+                                                                    uint bias_stride_z,
+#endif //defined(BETA)
+                                                                    uint dst_stride_z,
+                                                                    uint eltwise_operand_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                                    ,
+                                                                    uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                                   )
+{
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#endif // defined(RHS_INTERLEAVE)
+
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint z = get_global_id(2);
+
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); // true when this work-item's M0 rows reach or pass row M
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); // true when this work-item's N0 columns reach or pass column N
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+    // Compute RHS matrix address
+    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+    __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
+
+    for(int i = 0; i < k; i += K0)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, M0)
+        a0;
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+#if K0 > 1
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4 // four more load/accumulate steps (5th to 8th)
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8 // eight more load/accumulate steps (9th to 16th)
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 8
+
+#ifndef LHS_INTERLEAVE
+        lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+#ifndef RHS_INTERLEAVE
+        rhs += (N0 * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    // c = act(c)
+    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+    // c = eltwise_op(c, eltwise_operand), with the op selected by P2_ELTWISE_OP (mix-precision, broadcast, boundary aware)
+    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+    // c = act(c)
+    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+
+    // Store output block
+    MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X // NOTE(review): LHS_STEP_LOOP, defined at the top of this kernel, is not #undef'd here
+}
+#if defined(OPENCL_IMAGE_SUPPORT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices (LHS transposed, RHS not transposed) plus 3 post ops. The RHS matrix is stored in OpenCL image object.
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
+ * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt_texture, with these additions:
+ *
+ * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
+ * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
+ * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
+ * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
+                                                                            __read_only image2d_t rhs_img,
+#if defined(BETA)
+                                                                            IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                                            IMAGE_DECLARATION(dst),
+                                                                            // Post-Op arguments
+                                                                            IMAGE_DECLARATION(eltwise_operand),
+                                                                            uint k,
+                                                                            uint lhs_stride_z,
+                                                                            uint rhs_stride_z,
+#if defined(BETA)
+                                                                            uint bias_stride_z,
+#endif //defined(BETA)
+                                                                            uint dst_stride_z,
+                                                                            uint eltwise_operand_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                                            ,
+                                                                            uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                                           )
+{
+    // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
+
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (PIXEL_UNIT)
+#endif // defined(RHS_INTERLEAVE)
+
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = (z % MATRIX_B_DEPTH);
+#else  // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Compute RHS matrix coordinates
+    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+
+    for(int i = 0; i < K; i += K0) // NOTE(review): bound is the K macro, not the kernel argument k used by the non-texture variant - confirm -DK is always passed
+    {
+        VEC_DATA_TYPE(DATA_TYPE, M0)
+        a0;
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+#if K0 > 1
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4 // four more load/accumulate steps (5th to 8th)
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8 // eight more load/accumulate steps (9th to 16th)
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 8
+
+#ifndef LHS_INTERLEAVE
+        lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+        x_rhs += K0 * RHS_STEP_X;
+#ifndef RHS_INTERLEAVE
+        x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); // true when this work-item's M0 rows reach or pass row M
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); // true when this work-item's N0 columns reach or pass column N
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); // c = c + bias
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    // c = act(c)
+    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+    // c = eltwise_op(c, eltwise_operand), with the op selected by P2_ELTWISE_OP (mix-precision, broadcast, boundary aware)
+    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+    // c = act(c)
+    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+
+    // Store output block
+    MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef PIXEL_UNIT
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP // NOTE(review): RHS_STEP_LOOP is never defined inside this kernel, so this #undef looks like a no-op
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT)
+
+#endif // defined(LHS_TRANSPOSE)
+#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h
new file mode 100644
index 0000000000..9ddf51a13c
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** (EXPERIMENTAL_POST_OPS) Macros for (binary) elementwise operations */
+
+/** List of (binary) elementwise operators, accounting for the argument position of argument X
+ * @note X_Pos denotes the position of argument X. e.g. X_POS_0 means X is in the first place whereas X_POS_1 means X is in the second place
+ * @name elementwise_post_ops
+ * @{
+ */
+#if defined(N0) && !defined(VEC_SIZE)
+#define VEC_SIZE N0
+#endif // defined(N0) && !defined(VEC_SIZE)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+
+#define ADD_X_POS_0(x, y) ((x) + (y))
+#define SUB_X_POS_0(x, y) ((x) - (y))
+#define MAX_X_POS_0(x, y) max(x, y)
+#define MIN_X_POS_0(x, y) min(x, y)
+#define SQUARED_DIFF_X_POS_0(x, y) (((x) - (y)) * ((x) - (y)))
+#define POWER_X_POS_0(x, y) pow(x, y)
+#if VEC_SIZE == 1
+#define PRELU_X_POS_0(x, y) ((x) > 0 ? (x) : (x) * (y))
+#else // VEC_SIZE == 1
+#define PRELU_X_POS_0(x, y) (select((y) * (x), (x), CONVERT(((x) > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
+#endif // VEC_SIZE == 1
+#define DIV_X_POS_0(x, y) ((x) / (y))
+#define AND_X_POS_0(x, y) (CONVERT(((x) && (y)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))
+#define OR_X_POS_0(x, y) (CONVERT(((x) || (y)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))
+// _X_POS_1 variants: the first macro argument becomes the second operand, so SUB/POWER/PRELU/DIV swap x and y; the others forward to _X_POS_0
+#define ADD_X_POS_1(x, y) ADD_X_POS_0(x, y)
+#define SUB_X_POS_1(x, y) ((y) - (x))
+#define MAX_X_POS_1(x, y) MAX_X_POS_0(x, y)
+#define MIN_X_POS_1(x, y) MIN_X_POS_0(x, y)
+#define SQUARED_DIFF_X_POS_1(x, y) SQUARED_DIFF_X_POS_0(x, y)
+#define POWER_X_POS_1(x, y) pow(y, x)
+#if VEC_SIZE == 1
+#define PRELU_X_POS_1(x, y) ((y) > 0 ? (y) : (y) * (x))
+#else // VEC_SIZE == 1
+#define PRELU_X_POS_1(x, y) (select((x) * (y), (y), CONVERT(((y) > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
+#endif // VEC_SIZE == 1
+#define DIV_X_POS_1(x, y) ((y) / (x))
+#define AND_X_POS_1(x, y) AND_X_POS_0(x, y)
+#define OR_X_POS_1(x, y) OR_X_POS_0(x, y)
+
+// By default use the order of the arguments as they are passed in, ie. _X_POS_0
+#define ADD(x, y) ADD_X_POS_0(x, y)
+#define SUB(x, y) SUB_X_POS_0(x, y)
+#define MAX(x, y) MAX_X_POS_0(x, y)
+#define MIN(x, y) MIN_X_POS_0(x, y)
+#define SQUARED_DIFF(x, y) SQUARED_DIFF_X_POS_0(x, y)
+#define POWER(x, y) POWER_X_POS_0(x, y)
+#define PRELU(x, y) PRELU_X_POS_0(x, y)
+#define DIV(x, y) DIV_X_POS_0(x, y)
+#define AND(x, y) AND_X_POS_0(x, y)
+#define OR(x, y) OR_X_POS_0(x, y)
+
+#endif    // defined(VEC_SIZE) && defined(DATA_TYPE)
+/** @} */ // end of group elementwise_post_ops
+
+/** Performs OPERAND1 = OP(OPERAND1, OPERAND2)
+ * @name ELTWISE_OP_ROW_n
+ *
+ * @param[in]      OP       The elementwise post op
+ * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables
+ * @param[in]      OPERAND2 The basename of the operand 2 variables
+ * @{
+ */
+#define ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \
+    OPERAND1##0 = OP(OPERAND1##0, OPERAND2##0);
+
+#define ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##1 = OP(OPERAND1##1, OPERAND2##1);
+
+#define ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##2 = OP(OPERAND1##2, OPERAND2##2);
+
+#define ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##3 = OP(OPERAND1##3, OPERAND2##3);
+
+#define ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##4 = OP(OPERAND1##4, OPERAND2##4);
+
+#define ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##5 = OP(OPERAND1##5, OPERAND2##5);
+
+#define ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##6 = OP(OPERAND1##6, OPERAND2##6);
+
+#define ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##7 = OP(OPERAND1##7, OPERAND2##7);
+
+#define ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##8 = OP(OPERAND1##8, OPERAND2##8);
+
+#define ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2)      \
+    OPERAND1##9 = OP(OPERAND1##9, OPERAND2##9);
+
+#define ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##A = OP(OPERAND1##A, OPERAND2##A);
+
+#define ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##B = OP(OPERAND1##B, OPERAND2##B);
+
+#define ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##C = OP(OPERAND1##C, OPERAND2##C);
+
+#define ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##D = OP(OPERAND1##D, OPERAND2##D);
+
+#define ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##E = OP(OPERAND1##E, OPERAND2##E);
+
+#define ELTWISE_OP_ROW_16(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##F = OP(OPERAND1##F, OPERAND2##F);
+
+/** @} */ // end of group ELTWISE_OP_ROW_n
+
+/** Performs OPERAND1 = OP(OPERAND1, OPERAND2)
+ * @name ELTWISE_OP_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] OP       The elementwise post op
+ * @param[in] N        The number of vectors in the block
+ * @param[in] OPERAND1 The basename of the destination and operand 1 variables
+ * @param[in] OPERAND2 The basename of the operand 2 variables
+ * @{
+ */
+#define ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_##N(OP, OPERAND1, OPERAND2)
+#define ELTWISE_OP_BLOCK(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2)
+/** @} */ // end of group ELTWISE_OP_BLOCK
+
+/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting
+ * @name ELTWISE_OP_ROW_BROADCAST_n
+ *
+ * @param[in]      OP       The elementwise post op
+ * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables
+ * @param[in]      OPERAND2 The basename of the broadcast operand 2 variables
+ * @{
+ */
+#define ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2) \
+    OPERAND1##0 = OP(OPERAND1##0, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##1 = OP(OPERAND1##1, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##2 = OP(OPERAND1##2, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##3 = OP(OPERAND1##3, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##4 = OP(OPERAND1##4, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##5 = OP(OPERAND1##5, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##6 = OP(OPERAND1##6, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##7 = OP(OPERAND1##7, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##8 = OP(OPERAND1##8, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2)      \
+    OPERAND1##9 = OP(OPERAND1##9, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##A = OP(OPERAND1##A, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##B = OP(OPERAND1##B, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##C = OP(OPERAND1##C, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##D = OP(OPERAND1##D, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##E = OP(OPERAND1##E, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_16(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##F = OP(OPERAND1##F, OPERAND2);
+
+/** @} */ // end of group ELTWISE_OP_ROW_BROADCAST_n
+
+/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting
+ * @name ELTWISE_OP_BLOCK_BROADCAST
+ * @note Only support:
+ *      case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX];
+ *      case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar);
+ *      Does NOT support broadcast in X dimension: Operand1 [YxX] + Operand2 [Yx1];
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] OP       The elementwise post op
+ * @param[in] N        The number of vectors in the block
+ * @param[in] OPERAND1 The basename of the destination and operand 1 variables
+ * @param[in] OPERAND2 The basename of the operand 2 variables
+ * @{
+ */
+#define ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_BROADCAST_##N(OP, OPERAND1, OPERAND2)
+#define ELTWISE_OP_BLOCK_BROADCAST(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2)
+/** @} */ // end of group ELTWISE_OP_BLOCK_BROADCAST
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h
new file mode 100644
index 0000000000..e8df121142
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "fp_elementwise_op_helpers.h"
+#include "gemm_helpers.h"
+#include "load_store_utility.h"
+
+/** (EXPERIMENTAL_POST_OPS) Convenience macros for automatically handling mixed precision (fp16 and fp32) operations
+ * -DMIXED_PRECISION toggles mixed precision mode
+ */
+
+/** Mixed-Precision-Aware Activation Block
+ * @name MIXED_PRECISION_ACTIVATION_BLOCK
+ * params N ... B_VAL: same as those in @ref ACTIVATION_BLOCK
+ *
+ * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op
+ * @{
+ */
+#if defined(MIXED_PRECISION)
+#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
+    ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
+    ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL);
+#endif    // defined(MIXED_PRECISION)
+/** @} */ // end of group MIXED_PRECISION_ACTIVATION_BLOCK
+
+/** Mixed-Precision-Aware Elementwise Op Block
+ * Performs OPERAND1 = OP(OPERAND1, OPERAND2)
+ * @name MIXED_PRECISION_ELTWISE_OP_BLOCK
+ *
+ * @param[in] OP                    The elementwise post op
+ * @param[in] M0                    The number of consecutive rows
+ * @param[in] N0                    The number of consecutive columns
+ * @param[in] OPERAND1              The basename of the first and result operand variables
+ * @param[in] OPERAND2              The basename of the second operand variables
+ * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op
+ * @param[in] CONVERTED_OPERAND2    The basename of the second operand variables converted to higher-precision in case of mixed-precision op
+ * @{
+ */
+#if defined(MIXED_PRECISION)
+#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
+    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2);                                     \
+    ELTWISE_OP_BLOCK(OP, M0, OPERAND1, CONVERTED_OPERAND2);
+#else // defined(MIXED_PRECISION)
+#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
+    ELTWISE_OP_BLOCK(OP, M0, OPERAND1, OPERAND2);
+#endif    // defined(MIXED_PRECISION)
+/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK
+
+/** Mixed-Precision-Aware Elementwise Op Broadcast Block
+ * Performs OPERAND1 = OP(OPERAND1, OPERAND2)
+ * @name MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST
+ * @note Only support:
+ *      case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX]; this means @p N0 > 1
+ *      case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar) ; this means @p N0 == 1
+ *      Does NOT support broadcast in X dimension: Operand1 [YxX] + Operand2 [Yx1]; this means @p M0 should never == 1
+ *
+ * @param[in] OP                    The elementwise post op
+ * @param[in] M0                    The number of consecutive rows, > 1
+ * @param[in] N0                    The number of consecutive columns, >= 1
+ * @param[in] OPERAND1              The basename of the first and result operand variables
+ * @param[in] OPERAND2              The basename of the second operand variables
+ * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op
+ * @param[in] CONVERTED_OPERAND2    The basename of the second operand variables converted to higher-precision in case of mixed-precision op
+ * @{
+ */
+#if defined(MIXED_PRECISION)
+#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
+    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2);                                                \
+    ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, CONVERTED_OPERAND2##0);
+#else // defined(MIXED_PRECISION)
+#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
+    ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, OPERAND2##0);
+#endif    // defined(MIXED_PRECISION)
+/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST
+
+/** Mixed-Precision-Aware Boundary-Aware Store Block
+ * @name MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE
+ * params M0 ... PARTIAL_COND_X, same as those in STORE_BLOCK_BOUNDARY_AWARE
+ *
+ * @param[in] BASENAME_LP The name of the low precision variables, converted from BASENAME, in case of mixed-precision op
+ * @{
+ */
+#if defined(MIXED_PRECISION)
+#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, BASENAME, BASENAME_LP);                                                                                                                       \
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME_LP, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
+#else // defined(MIXED_PRECISION)
+#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
+#endif    // defined(MIXED_PRECISION)
+/** @} */ // end of group MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
index 431c97becc..dd03147ad6 100644
--- a/src/core/CL/cl_kernels/common/gemm.cl
+++ b/src/core/CL/cl_kernels/common/gemm.cl
@@ -2524,6 +2524,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices.
  *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
  *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
  * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
@@ -2798,6 +2799,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
  *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
  *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
  *
  * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
  * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
@@ -3179,6 +3181,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices.
  *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
  *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
+ * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
  *
  * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
  * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
@@ -3560,6 +3563,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
  *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
  *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
+ * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
  *
  * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
  * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
@@ -3929,7 +3933,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
 
 #endif // defined(LHS_TRANSPOSE)
 
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
 
 #if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
 
diff --git a/src/core/experimental/PostOp.h b/src/core/experimental/PostOp.h
new file mode 100644
index 0000000000..64414d2050
--- /dev/null
+++ b/src/core/experimental/PostOp.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_POSTOP
+#define ARM_COMPUTE_EXPERIMENTAL_POSTOP
+
+#include "arm_compute/core/experimental/IPostOp.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "support/Cast.h"
+
+#include <vector>
+
+/** (EXPERIMENTAL_POST_OPS) */
+namespace arm_compute
+{
+namespace experimental
+{
+/** Post op of type PostOpType::Activation, described by an ActivationLayerInfo; it exposes no tensor arguments */
+template <typename TensorRelatedT>
+struct PostOpAct : public IPostOp<TensorRelatedT>
+{
+    PostOpAct(const ActivationLayerInfo &act_info)
+        : _act_info{ act_info }
+    {
+    }
+    // NOTE: a post op never owns what a pointer-like TensorRelatedT refers to, so shallow copies and moves are allowed
+    ~PostOpAct() override = default;
+    PostOpAct(const PostOpAct &) = default;
+    PostOpAct &operator=(const PostOpAct &) = default;
+    PostOpAct(PostOpAct &&) = default;
+    PostOpAct &operator=(PostOpAct &&) = default;
+
+    int prev_dst_pos() const override
+    {
+        return 0;
+    }
+    PostOpType type() const override
+    {
+        return PostOpType::Activation;
+    }
+    std::vector<TensorRelatedT *> arguments() override
+    {
+        return std::vector<TensorRelatedT *>{};
+    }
+    std::vector<const TensorRelatedT *> arguments() const override
+    {
+        return std::vector<const TensorRelatedT *>{};
+    }
+    std::unique_ptr<IPostOp<TensorRelatedT>> clone() const override
+    {
+        return std::make_unique<PostOpAct>(*this);
+    }
+    ActivationLayerInfo _act_info;
+};
+
+/** Post op of type PostOpType::Eltwise_Add: holds the addend argument, the position returned by prev_dst_pos() and a ConvertPolicy */
+template <typename TensorRelatedT>
+struct PostOpEltwiseAdd : public IPostOp<TensorRelatedT>
+{
+    PostOpEltwiseAdd(TensorRelatedT addend, int prev_op_arg_pos, ConvertPolicy policy)
+        : _addend{ addend }, _prev_op_arg_pos{ prev_op_arg_pos }, _policy{ policy }
+    {
+    }
+    // NOTE: a post op never owns what a pointer-like TensorRelatedT refers to, so shallow copies and moves are allowed
+    ~PostOpEltwiseAdd() override = default;
+    PostOpEltwiseAdd(const PostOpEltwiseAdd &) = default;
+    PostOpEltwiseAdd &operator=(const PostOpEltwiseAdd &) = default;
+    PostOpEltwiseAdd(PostOpEltwiseAdd &&) = default;
+    PostOpEltwiseAdd &operator=(PostOpEltwiseAdd &&) = default;
+
+    int prev_dst_pos() const override
+    {
+        return _prev_op_arg_pos;
+    }
+    PostOpType type() const override
+    {
+        return PostOpType::Eltwise_Add;
+    }
+    std::vector<TensorRelatedT *> arguments() override
+    {
+        return { &_addend };
+    }
+    std::vector<const TensorRelatedT *> arguments() const override
+    {
+        return { &_addend };
+    }
+    std::unique_ptr<IPostOp<TensorRelatedT>> clone() const override
+    {
+        return std::make_unique<PostOpEltwiseAdd>(*this);
+    }
+
+    TensorRelatedT _addend;
+    int            _prev_op_arg_pos;
+    ConvertPolicy  _policy;
+};
+
+/** Transform a PostOpList of type FromTensorT to one of type ToTensorT
+ *  The Eltwise_Add addend is mapped through @p transform_arg, all other members are copied as they are; any other PostOpType triggers ARM_COMPUTE_ERROR
+ */
+template <typename FromTensorT, typename ToTensorT>
+PostOpList<ToTensorT> transform_post_op_list_arguments(const PostOpList<FromTensorT> &post_ops, std::function<ToTensorT(FromTensorT)> transform_arg)
+{
+    PostOpList<ToTensorT> transformed_post_ops;
+    for(const auto &post_op : post_ops.get_list())
+    {
+        switch(post_op->type())
+        {
+            case PostOpType::Activation:
+            {
+                const auto _post_op = utils::cast::polymorphic_downcast<const PostOpAct<FromTensorT> *>(post_op.get());
+                transformed_post_ops.template push_back_op<PostOpAct<ToTensorT>>(_post_op->_act_info);
+                break;
+            }
+            case PostOpType::Eltwise_Add:
+            {
+                const auto _post_op = utils::cast::polymorphic_downcast<const PostOpEltwiseAdd<FromTensorT> *>(post_op.get());
+                transformed_post_ops.template push_back_op<PostOpEltwiseAdd<ToTensorT>>(transform_arg(_post_op->_addend), _post_op->_prev_op_arg_pos, _post_op->_policy);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Unsupported PostOpType");
+            }
+        }
+    }
+    return transformed_post_ops;
+}
+
+/** Get post op argument TensorType from post op argument index in a flattened, ordered post op argument list */
+inline TensorType get_post_op_arg_type(size_t index)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(static_cast<int>(index) > EXPERIMENTAL_ACL_POST_OP_ARG_LAST - EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, "Post Op argument index is out of range");
+    return static_cast<TensorType>(EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + static_cast<int>(index));
+}
+
+template <typename T>
+PostOpTypeSequence get_post_op_sequence(const PostOpList<T> &post_ops)
+{
+    PostOpTypeSequence post_op_sequence;
+    for(const auto &op : post_ops.get_list())
+    {
+        post_op_sequence.push_back(op->type());
+    }
+    return post_op_sequence;
+}
+
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_POSTOP
\ No newline at end of file
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index 9d524f936e..cbc4caf5f6 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -276,6 +276,10 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
     { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" },
     { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" },
     { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" },
+    { "gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
+    { "gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
+    { "gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
+    { "gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
     { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" },
     { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" },
     { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" },
@@ -579,6 +583,10 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
     {
         "common/gemm.cl",
 #include "./cl_kernels/common/gemm.clembed"
+    },
+    {
+        "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl",
+#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.clembed"
     },
     {
         "common/gemmlowp.cl",
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
index 3a39128c0a..4b28e2badc 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -34,6 +34,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/CL/CLUtils.h"
 #include "src/core/CL/CLValidate.h"
+#include "src/core/experimental/PostOp.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/helpers/float_ops.h"
@@ -51,6 +52,16 @@ namespace
 {
 using ElementsProcessed = Steps;
 
+const auto post_op_utils = experimental::PostOpCLKernelUtils(
+{
+    //  PostOp sequence                   -> {Kernel Postfix, PostOp Slots}
+    { {}, { "", {} } },
+    { { experimental::PostOpType::Activation }, { "", { 1 } } },
+    { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } },
+    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } },
+    { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
+    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }
+});
 Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
                           const GEMMRHSMatrixInfo &rhs_info,
                           const GEMMKernelInfo    &gemm_info)
@@ -74,6 +85,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
                                     "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type");
     ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported");
 
     const unsigned int m = gemm_info.m;
     const unsigned int n = gemm_info.n;
@@ -117,6 +129,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
         const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant");
     }
 
     return Status{};
@@ -180,6 +193,7 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
     _add_bias                 = src2 != nullptr;
     _export_to_cl_image       = rhs_info.export_to_cl_image;
     _k                        = gemm_info.k;
+    _num_post_op_args         = gemm_info.post_ops.total_num_arguments();
 
     // Check if we need to slide the matrix B
     const unsigned int num_dimensions_src0 = src0->num_dimensions();
@@ -222,9 +236,6 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
     build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE");
     build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
     build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION");
     build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
     build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
@@ -240,11 +251,23 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
     build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
     build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
     build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+    // If post_ops are used, then we disable the use of gemm_info.activation_info
+    if(gemm_info.post_ops.size() > 0)
+    {
+        post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops);
+    }
+    else
+    {
+        build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+        build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+        build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+    }
 
     std::string kernel_name("gemm_mm_reshaped_");
     kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
     kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
     kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
+    post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
 
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -360,6 +383,13 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
         // dst buffer
         add_2D_tensor_argument(idx, dst, slice);
 
+        // post op argument buffers
+        for(size_t i = 0; i < _num_post_op_args; ++i)
+        {
+            const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
+            add_2D_tensor_argument(idx, post_op_arg, slice);
+        }
+
         // K dimension (not used if _export_to_cl_image == true)
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
 
@@ -378,6 +408,12 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
         // dst stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
 
+        // post op argument stride_z
+        for(size_t i = 0; i < _num_post_op_args; ++i)
+        {
+            const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
+            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
+        }
         // Cross-plan padding (if _reinterpret_output_as_3d = true)
         if(_reinterpret_output_as_3d)
         {
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
index b320d318e9..09160ec0d1 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
@@ -106,6 +106,7 @@ private:
     bool         _add_bias{ false };
     bool         _export_to_cl_image{ false };
     unsigned int _k{ 1 };
+    unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index e8affc0853..234a0df2aa 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -29,11 +29,10 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
 
 #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLBatchNormalizationLayer::CLBatchNormalizationLayer()
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
index fd12dea4fe..b13c380470 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "src/core/experimental/PostOp.h"
 #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
 #include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
 #include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
@@ -61,11 +62,21 @@ using CLGEMMMatrixMultiplyReshaped = CLSynthetizeOperator<ClGemmMatrixMultiplyRe
 template <typename T>
 using CLGEMMMatrixMultiplyReshapedFixture = GEMMMatrixMultiplyReshapedValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped>;
 
+// Fixture for CLGEMMMatrixMultiplyReshaped with fused post ops, templated on the element type T (instantiated below with float and half)
+template <typename T>
+using CLGEMMMatrixMultiplyReshapedWithPostOpsFixture =
+    GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped>;
+
 // Fixture for CLGEMMMatrixMultiplyReshaped mixed precision
 template <typename T>
 using CLGEMMMatrixMultiplyReshapedMixedPrecisionFixture =
     GEMMMatrixMultiplyReshapedValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped, true>;
 
+// Fixture for CLGEMMMatrixMultiplyReshaped mixed precision with post ops: same fixture template as the non-mixed post ops alias, plus a trailing 'true' argument
+template <typename T>
+using CLGEMMMatrixMultiplyReshapedMixedPrecisionWithPostOpsFixture =
+    GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped, true>;
+
 // Fixture for CLGEMMMatrixMultiplyReshaped3D
 template <typename T>
 using CLGEMMMatrixMultiplyReshaped3DFixture = GEMMMatrixMultiplyReshaped3DValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped>;
@@ -172,6 +183,65 @@ const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", {
 /** LHS transposed values */
 const auto lhs_transpose_values = framework::dataset::make("lhs_transpose", { false, true } );
 
+/** Post Ops. In the helpers below a post op argument is described by a tuple of three flags: whether it is broadcast in dims 0, 1 and 2 */
+using PostOpArgBroadcast =  CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<float>::PostOpArgBroadcast;
+experimental::PostOpList<PostOpArgBroadcast> empty_post_ops() // Returns a default-constructed list, i.e. no post ops at all
+{
+    return experimental::PostOpList<PostOpArgBroadcast>{};
+}
+
+experimental::PostOpList<PostOpArgBroadcast> post_ops_1() // Sequence: Activation (LINEAR) -> Eltwise_Add -> Activation (RELU)
+{
+    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
+    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
+    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
+        std::make_tuple(true, true, false),   // Broadcast flags for dims 0, 1 and 2: broadcast in dims 0 and 1, not in dim 2
+        0, // NOTE(review): presumably the position of the previous op's result among the add operands - confirm against PostOpEltwiseAdd
+        ConvertPolicy::SATURATE);
+    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
+    return post_ops;
+}
+experimental::PostOpList<PostOpArgBroadcast> post_ops_2() // Sequence: Eltwise_Add -> Activation (RELU)
+{
+    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
+    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
+        std::make_tuple(false, true, true),   // Broadcast flags for dims 0, 1 and 2: not broadcast in dim 0, broadcast in dims 1 and 2
+        1, // NOTE(review): presumably the position of the previous op's result among the add operands - confirm against PostOpEltwiseAdd
+        ConvertPolicy::SATURATE);
+    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
+    return post_ops;
+}
+experimental::PostOpList<PostOpArgBroadcast> post_ops_3() // Sequence: Activation (RELU) -> Eltwise_Add
+{
+    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
+    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
+    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
+        std::make_tuple(false, false, true),  // Broadcast flags for dims 0, 1 and 2: broadcast in dim 2 only
+        1, // NOTE(review): presumably the position of the previous op's result among the add operands - confirm against PostOpEltwiseAdd
+        ConvertPolicy::SATURATE);
+    return post_ops;
+}
+experimental::PostOpList<PostOpArgBroadcast> invalid_post_ops_1() // Sequence: Eltwise_Add -> Eltwise_Add; ValidateFusedPosOps expects validate() to reject it
+{
+    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
+    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
+        std::make_tuple(true, true, false),   // Broadcast flags for dims 0, 1 and 2: broadcast in dims 0 and 1, not in dim 2
+        1, // NOTE(review): presumably the position of the previous op's result among the add operands - confirm against PostOpEltwiseAdd
+        ConvertPolicy::SATURATE);
+    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
+        std::make_tuple(false, true, false),   // Broadcast flags for dims 0, 1 and 2: broadcast in dim 1 only
+        0, // NOTE(review): same argument as above, here 0 - confirm its meaning against PostOpEltwiseAdd
+        ConvertPolicy::SATURATE);
+    return post_ops;
+}
+
+/** Post op lists fed to the FusedPostOps RunSmall fixture tests (the invalid and empty lists are only used by the validate test) */
+const auto post_op_lists = framework::dataset::make("post_op_lists", {
+    post_ops_1(),
+    post_ops_2(),
+    post_ops_3(),
+ } );
+
 } // namespace
 
 TEST_SUITE(CL)
@@ -328,7 +398,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                framework::dataset::make("Expected", { true, true, false, false, false, true, true,true})),
                     input0_info ,input1_info, input2_info, output_info, lhs_info, rhs_info, gemm_info, expected)
 {
-   ARM_COMPUTE_EXPECT(bool(ClGemmMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true),
+    ARM_COMPUTE_EXPECT(bool(ClGemmMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true),
                                                           &input1_info.clone()->set_is_resizable(true),
                                                           &input2_info.clone()->set_is_resizable(true),
                                                           &output_info.clone()->set_is_resizable(true),1.f,1.f,
@@ -336,6 +406,116 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                           rhs_info,
                                                           gemm_info)) == expected, framework::LogLevel::ERRORS);
 }
+DATA_TEST_CASE(ValidateFusedPosOps, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("Input0Info", { TensorInfo(TensorShape(64U, 5U, 2U), 1, DataType::F32),      // OK. Empty post ops
+                                                        TensorInfo(TensorShape(64U, 5U, 2U), 1, DataType::F32),      // Invalid post op sequences
+                                                        TensorInfo(TensorShape(64U, 5U, 2U), 1, DataType::F32),      // OK. Supported post ops
+
+                                                      }),
+               framework::dataset::make("Input1Info",{ TensorInfo(TensorShape(64U, 6U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(64U, 6U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(64U, 6U, 2U), 1, DataType::F32),
+
+                      })),
+               framework::dataset::make("Input2Info", { TensorInfo(TensorShape(21U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(21U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(21U), 1, DataType::F32),
+
+                                                      })),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U,17U,2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(21U,17U,2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(21U,17U,2U), 1, DataType::F32),
+
+                           })),
+               framework::dataset::make("LHSMInfo",{
+                                                          GEMMLHSMatrixInfo(4,4,1,false,true),
+                                                          GEMMLHSMatrixInfo(4,4,1,false,true),
+                                                          GEMMLHSMatrixInfo(4,4,1,false,true),
+
+                                })),
+               framework::dataset::make("RHSMInfo",{
+                                                          GEMMRHSMatrixInfo(4,4,1,true,true,false),
+                                                          GEMMRHSMatrixInfo(4,4,1,true,true,false),
+                                                          GEMMRHSMatrixInfo(4,4,1,true,true,false),
+
+
+                           })),
+
+
+               framework::dataset::make("GEMMInfo",{
+                                                            GEMMKernelInfo( 17 /**<M Number of LHS rows*/,
+                                                                            21 /**<N Number of RHS columns*/,
+                                                                            13 /**<K Number of LHS columns or RHS rows */, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
+                                                                     false /**< reinterpret the input as 3D */,
+                                                                     true  /**< Flag used to broadcast the bias addition */,
+                                                                     false /**< wider accumm */,
+                                                                     false /**< has pad y */,
+                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                                     1   /**< Multiplication factor for the width of the 1xW transposed block */,
+                                                                     1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
+                                                                     GEMMLHSMatrixInfo(4,4,1,false,true),
+                                                                     GEMMRHSMatrixInfo(4,4,1,true,true,false),
+                                                                     0  /**< Offset to be added to each element of the matrix A */,
+                                                                     0 /**< Offset to be added to each element of the matrix B */),
+
+                                                            GEMMKernelInfo( 17 /**<M Number of LHS rows*/,
+                                                                            21 /**<N Number of RHS columns*/,
+                                                                            13 /**<K Number of LHS columns or RHS rows */, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
+                                                                     false /**< reinterpret the input as 3D */,
+                                                                     true  /**< Flag used to broadcast the bias addition */,
+                                                                     false /**< wider accumm */,
+                                                                     false /**< has pad y */,
+                                                                   ActivationLayerInfo::ActivationFunction::IDENTITY,
+                                                                     1   /**< Multiplication factor for the width of the 1xW transposed block */,
+                                                                     1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
+                                                                     GEMMLHSMatrixInfo(4,4,1,false,true),
+                                                                     GEMMRHSMatrixInfo(4,4,1,true,true,false),
+                                                                     0  /**< Offset to be added to each element of the matrix A */,
+                                                                     0 /**< Offset to be added to each element of the matrix B */),
+                                                            GEMMKernelInfo( 17 /**<M Number of LHS rows*/,
+                                                                            21 /**<N Number of RHS columns*/,
+                                                                            13 /**<K Number of LHS columns or RHS rows */, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
+                                                                     false /**< reinterpret the input as 3D */,
+                                                                     true  /**< Flag used to broadcast the bias addition */,
+                                                                     false /**< wider accumm */,
+                                                                     false /**< has pad y */,
+                                                                   ActivationLayerInfo::ActivationFunction::IDENTITY,
+                                                                     1   /**< Multiplication factor for the width of the 1xW transposed block */,
+                                                                     1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
+                                                                     GEMMLHSMatrixInfo(4,4,1,false,true),
+                                                                     GEMMRHSMatrixInfo(4,4,1,true,true,false),
+                                                                     0  /**< Offset to be added to each element of the matrix A */,
+                                                                     0 /**< Offset to be added to each element of the matrix B */),
+                                                    })),
+               framework::dataset::make("PostOps",{
+                                                          empty_post_ops(),
+                                                          invalid_post_ops_1(),
+                                                          post_ops_1(),
+                           })),
+               framework::dataset::make("Expected", { true, false, true})),
+                    input0_info ,input1_info, input2_info, output_info, lhs_info, rhs_info, gemm_info, post_ops, expected)
+{
+    // Create one TensorInfo per post op argument. populated_post_ops keeps raw pointers into this vector, so reserve room for
+    // every argument up front (assumes one callback call per argument): a reallocation in emplace_back would leave them dangling.
+    std::vector<TensorInfo> post_op_tensor_infos;
+    post_op_tensor_infos.reserve(post_ops.total_num_arguments());
+    auto populated_post_ops = experimental::transform_post_op_list_arguments<PostOpArgBroadcast, ITensorInfo*>(post_ops,
+        [&output_info, &post_op_tensor_infos](auto broadcast){
+                post_op_tensor_infos.emplace_back(TensorShape{ // extent 1 in every broadcast dimension, the output extent otherwise
+                        std::get<0>(broadcast) ? 1 : output_info.dimension(0),
+                        std::get<1>(broadcast) ? 1 : output_info.dimension(1),
+                        std::get<2>(broadcast) ? 1 : output_info.dimension(2) }, 1, output_info.data_type());
+                return &post_op_tensor_infos.back();
+            });
+    GEMMKernelInfo gemm_info_with_post_ops(std::move(gemm_info));
+    gemm_info_with_post_ops.post_ops = populated_post_ops;
+    ARM_COMPUTE_EXPECT(bool(ClGemmMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true),
+                                                          &input1_info.clone()->set_is_resizable(true),
+                                                          &input2_info.clone()->set_is_resizable(true),
+                                                          &output_info.clone()->set_is_resizable(true),1.f,1.f,
+                                                          lhs_info, rhs_info,
+                                                          gemm_info_with_post_ops)) == expected, framework::LogLevel::ERRORS);
+}
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 
@@ -438,6 +618,37 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>,
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
+TEST_SUITE(FusedPostOps)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   v0_values_precommit),
+                                                                   h0_values_precommit),
+                                                                   framework::dataset::make("interleave_lhs", { false })),
+                                                                   framework::dataset::make("interleave_rhs", { false })),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values_precommit),
+                                                                   beta_values_precommit),
+                                                                   framework::dataset::make("broadcast_bias", { true } )),
+                                                                   lhs_transpose_values),
+                                                                   act_values),
+                                                                   post_op_lists)
+                                                                   )
+{
+    // Compare the fused post ops kernel output (_target) with the reference result (_reference) using the FP32 tolerances
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+TEST_SUITE_END() //  FusedPostOps
+
 TEST_SUITE(ExportToCLImage)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
                framework::dataset::make("Input0Info", { TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32),  // OK or incorrect if cl_khr_image2d_from_buffer not supported
@@ -704,6 +915,45 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>,
         framework::ARM_COMPUTE_PRINT_INFO();
     }
 }
+TEST_SUITE(FusedPostOps)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   v0_values_precommit),
+                                                                   h0_values_precommit),
+                                                                   framework::dataset::make("interleave_lhs", { false })),
+                                                                   framework::dataset::make("interleave_rhs", { false })),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values_precommit),
+                                                                   beta_values_precommit),
+                                                                   framework::dataset::make("broadcast_bias", { true } )),
+                                                                   lhs_transpose_values),
+                                                                   act_values),
+                                                                   post_op_lists)
+                                                                   )
+{
+    // Compare against the reference only when validate_result is set; otherwise only log that the test was skipped
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+TEST_SUITE_END() //  FusedPostOps
+
 TEST_SUITE_END() // ExportToCLImage
 TEST_SUITE_END() // FP32
 
@@ -809,6 +1059,37 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<half>,
     validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
 }
 
+TEST_SUITE(FusedPostOps)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<half>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   v0_values_precommit),
+                                                                   h0_values_precommit),
+                                                                   framework::dataset::make("interleave_lhs", { false })),
+                                                                   framework::dataset::make("interleave_rhs", { false })),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
+                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                   a_values_precommit),
+                                                                   beta_values_precommit),
+                                                                   framework::dataset::make("broadcast_bias", { true } )),
+                                                                   lhs_transpose_values),
+                                                                   act_values),
+                                                                   post_op_lists)
+                                                                   )
+{
+    // Compare the fused post ops kernel output (_target) with the reference result (_reference) using the FP16 tolerances
+    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+TEST_SUITE_END() //  FusedPostOps
+
 TEST_SUITE(ExportToCLImage)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
                framework::dataset::make("Input0Info", { TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16),  // OK or incorrect if cl_khr_image2d_from_buffer not supported
@@ -1075,6 +1356,45 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<half>,
         framework::ARM_COMPUTE_PRINT_INFO();
     }
 }
+TEST_SUITE(FusedPostOps)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<half>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   v0_values_precommit),
+                                                                   h0_values_precommit),
+                                                                   framework::dataset::make("interleave_lhs", { false })),
+                                                                   framework::dataset::make("interleave_rhs", { false })),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
+                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                   a_values_precommit),
+                                                                   beta_values_precommit),
+                                                                   framework::dataset::make("broadcast_bias", { true } )),
+                                                                   lhs_transpose_values),
+                                                                   act_values),
+                                                                   post_op_lists)
+                                                                   )
+{
+    // Validate output only if the fixture ran the kernels: validate_result is false when the RHS reshape validate() rejected the cl_image export
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+TEST_SUITE_END() //  FusedPostOps
+
 TEST_SUITE_END() // ExportToCLImage
 TEST_SUITE_END() // FP16
 
@@ -1179,6 +1499,46 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionF
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision);
 }
+
+TEST_SUITE(FusedPostOps)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedMixedPrecisionWithPostOpsFixture<half>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   v0_values_precommit),
+                                                                   h0_values_precommit),
+                                                                   framework::dataset::make("interleave_lhs", { false })),
+                                                                   framework::dataset::make("interleave_rhs", { false })),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", { true, false })),
+                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                   a_values_precommit),
+                                                                   beta_values_precommit),
+                                                                   framework::dataset::make("broadcast_bias", { true } )),
+                                                                   lhs_transpose_values),
+                                                                   act_values),
+                                                                   post_op_lists)
+                                                                   )
+{
+    // This dataset also exercises export_to_cl_image_rhs = true. When the RHS reshape validate() rejects the
+    // cl_image export, the fixture computes neither _target nor _reference, so validate only if validate_result is set.
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+TEST_SUITE_END() // FusedPostOps
+
 TEST_SUITE_END() // MixedPrecision
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // GEMMMatrixMultiplyReshaped
diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h
index 5f5fa3b653..e1191587d5 100644
--- a/tests/validation/fixtures/GEMMFixture.h
+++ b/tests/validation/fixtures/GEMMFixture.h
@@ -27,6 +27,8 @@
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/IPostOp.h"
+#include "src/core/experimental/PostOp.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/IAccessor.h"
@@ -34,7 +36,9 @@
 #include "tests/framework/Fixture.h"
 #include "tests/validation/Helpers.h"
 #include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
 #include "tests/validation/reference/GEMM.h"
+#include "tests/validation/reference/PostOps.h"
 
 #include <random>
 
@@ -915,6 +919,289 @@ protected:
     SimpleTensor<T> _reference{};
 };
 
+/** (EXPERIMENTAL_POST_OPS) Validation fixture for the reshaped GEMM matrix multiply operator with a list of fused post ops.
+ *
+ * The target runs the LHS reshape, RHS reshape and GEMM operators with the post ops attached to the GEMMKernelInfo.
+ * The reference runs reference::gemm (or reference::gemm_mixed_precision) followed by reference::post_ops.
+ */
+template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSOperatorType, typename ReshapeRHSOperatorType, typename GEMMOperatorType, bool fp_mixed_precision = false>
+class GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture : public framework::Fixture
+{
+public:
+    using PostOpArgBroadcast = std::tuple<bool, bool, bool>; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument
+public:
+    /** Translate the dataset values into kernel descriptors and tensor shapes, then compute target and reference.
+     *
+     * Every PostOpArgBroadcast of @p post_ops becomes a TensorShape (n, m, batch_size) whose flagged dimensions are set to 1.
+     * The reference is only computed if validate_result is still true after compute_target.
+     */
+    template <typename...>
+    void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, bool interleave_lhs,
+               bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, bool lhs_transpose, const ActivationLayerInfo &act_info,
+               const experimental::PostOpList<PostOpArgBroadcast> &post_ops)
+    {
+        GEMMLHSMatrixInfo lhs_info;
+        lhs_info.m0         = m0;
+        lhs_info.k0         = k0;
+        lhs_info.v0         = v0;
+        lhs_info.interleave = interleave_lhs;
+        lhs_info.transpose  = lhs_transpose;
+
+        GEMMRHSMatrixInfo rhs_info;
+        rhs_info.n0                 = n0;
+        rhs_info.k0                 = k0;
+        rhs_info.h0                 = h0;
+        rhs_info.interleave         = interleave_rhs;
+        rhs_info.transpose          = !lhs_transpose;
+        rhs_info.export_to_cl_image = export_to_cl_image;
+
+        // Set the tensor shapes for LHS and RHS matrices
+        const TensorShape lhs_shape(k, m, batch_size);
+        const TensorShape rhs_shape(n, k, batch_size);
+        const TensorShape bias_shape(n,
+                                     broadcast_bias ? 1 : m,
+                                     broadcast_bias ? 1 : batch_size);
+        auto post_ops_with_shapes = experimental::transform_post_op_list_arguments<PostOpArgBroadcast, TensorShape>(post_ops,
+                                                                                                                    [ = ](auto broadcast)
+        {
+            return TensorShape
+            {
+                std::get<0>(broadcast) ? 1 : n,
+                std::get<1>(broadcast) ? 1 : m,
+                std::get<2>(broadcast) ? 1 : batch_size,
+            };
+        });
+
+        _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
+        if(validate_result)
+        {
+            _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
+        }
+    }
+
+protected:
+    /** Fill @p tensor from a uniform distribution built with bounds -1 and 1, passing @p i through to the library; borders are filled with infinity. */
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
+        using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
+
+        DistributionType distribution{ T(-1.0f), T(1.0f) };
+        library->fill(tensor, distribution, i);
+
+        // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
+        DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
+        library->fill_borders_with_garbage(tensor, distribution_inf, i);
+    }
+
+    /** Run the LHS reshape, RHS reshape and GEMM (with post ops) operators and return the GEMM output tensor.
+     *
+     * Updates validate_result. When the RHS reshape validate() fails while rhs_info.export_to_cl_image is set,
+     * nothing is configured or run and a TensorType constructed from nullptr is returned.
+     */
+    TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                              DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
+    {
+        // Create tensors
+        TensorType lhs  = create_tensor<TensorType>(lhs_shape, data_type, 1);
+        TensorType rhs  = create_tensor<TensorType>(rhs_shape, data_type, 1);
+        TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
+
+        // Create post op tensors and populate post op with them.
+        // The post ops store the ITensorInfo pointers returned by the lambda below. info() presumably points into the tensor
+        // object itself, so the holder must not reallocate (and move its tensors) while it is filled: reserve it up front.
+        std::vector<TensorType> post_op_tensors_holder{};
+        size_t                  num_post_op_args = 0;
+        for(const auto &op : post_ops.get_list())
+        {
+            num_post_op_args += op->arguments().size();
+        }
+        post_op_tensors_holder.reserve(num_post_op_args);
+        auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, ITensorInfo *>(post_ops,
+                                                                                                             [&post_op_tensors_holder, &data_type](auto shape)
+        {
+            auto t = create_tensor<TensorType>(shape, data_type, 1);
+            post_op_tensors_holder.push_back(std::move(t));
+            return post_op_tensors_holder.back().info();
+        });
+        TensorType lhs_reshaped;
+        TensorType rhs_reshaped;
+        TensorType dst;
+
+        const unsigned int M = lhs_shape[1];
+        const unsigned int N = rhs_shape[0];
+        const unsigned int K = lhs_shape[0];
+        GEMMKernelInfo     kernel_info;
+        kernel_info.m                       = M;
+        kernel_info.n                       = N;
+        kernel_info.k                       = K;
+        kernel_info.depth_output_gemm3d     = 0;
+        kernel_info.reinterpret_input_as_3d = false;
+        kernel_info.broadcast_bias          = broadcast_bias;
+        kernel_info.activation_info         = act_info;
+        kernel_info.fp_mixed_precision      = fp_mixed_precision;
+        kernel_info.post_ops                = populated_post_ops;
+
+        // The output tensor will be auto-initialized within the function
+
+        // Create and configure function
+        ReshapeLHSOperatorType reshape_lhs;
+        ReshapeRHSOperatorType reshape_rhs;
+        GEMMOperatorType       gemm;
+
+        // A failing RHS reshape validation only turns into a skipped run when the cl_image export was requested
+        validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info));
+        validate_result = validate_result || !rhs_info.export_to_cl_image;
+        if(!validate_result)
+        {
+            return nullptr;
+        }
+
+        reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+        gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
+
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+        for(const auto &tensor : post_op_tensors_holder)
+        {
+            ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
+        }
+
+        // We do not pad when using image as it needs to comply to strict pitch alignment restrictions
+        if(!rhs_info.export_to_cl_image)
+        {
+            add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &bias, &dst });
+            for(auto &tensor : post_op_tensors_holder)
+            {
+                add_padding_x({ &tensor });
+            }
+        }
+
+        // Allocate tensors
+        lhs.allocator()->allocate();
+        rhs.allocator()->allocate();
+        lhs_reshaped.allocator()->allocate();
+        rhs_reshaped.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+        for(auto &tensor : post_op_tensors_holder)
+        {
+            tensor.allocator()->allocate();
+        }
+
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+        for(const auto &tensor : post_op_tensors_holder)
+        {
+            ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
+        }
+
+        // Fill tensors
+        // lhs, rhs and bias use indices 0, 1, 2; post op tensors continue from 3, as compute_reference does
+        fill(AccessorType(lhs), 0);
+        fill(AccessorType(rhs), 1);
+        fill(AccessorType(bias), 2);
+        for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
+        {
+            fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i);
+        }
+
+        // Compute GEMM
+        ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } };
+        reshape_lhs.run(reshape_lhs_pack);
+        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+        reshape_rhs.run(reshape_rhs_pack);
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
+            { ACL_SRC_1, &rhs_reshaped },
+            { ACL_SRC_2, &bias },
+            { ACL_DST, &dst }
+        });
+        for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
+        {
+            gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i));
+        }
+        gemm.run(gemm_pack);
+
+        return dst;
+    }
+
+    /** Compute the expected output: reference GEMM followed by reference::post_ops, or by the activation layer when the post op list is empty. */
+    SimpleTensor<T> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias,
+                                      const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
+    {
+        TensorShape dst_shape = lhs_shape;
+        dst_shape[0]          = rhs_shape[0];
+        dst_shape[1]          = lhs_shape[1];
+
+        // Create reference
+        SimpleTensor<T> lhs{ lhs_shape, data_type, 1 };
+        SimpleTensor<T> rhs{ rhs_shape, data_type, 1 };
+        SimpleTensor<T> bias{ dst_shape, data_type, 1 };
+        // Create post op tensors and populate post op with them
+        auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, SimpleTensor<T>>(post_ops, [&data_type](auto shape)
+        {
+            return SimpleTensor<T> { shape, data_type, 1 };
+        });
+
+        const int n          = rhs_shape[0];
+        const int m          = lhs_shape[1];
+        const int batch_size = lhs_shape[2];
+
+        // Fill reference
+        int tensor_idx = 0;
+        fill(lhs, tensor_idx++);
+        fill(rhs, tensor_idx++);
+        fill(bias, tensor_idx++);
+        for(auto &op : populated_post_ops.get_list())
+        {
+            for(auto tensor : op->arguments())
+            {
+                fill(*tensor, tensor_idx++);
+            }
+        }
+
+        if(broadcast_bias)
+        {
+            // In case of broadcast, we need simply copy the first into the following "M" ones
+            for(int i = 1; i < m * batch_size; i++)
+            {
+                memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
+            }
+        }
+
+        SimpleTensor<T> out;
+        if(fp_mixed_precision)
+        {
+            out = reference::gemm_mixed_precision<T>(lhs, rhs, bias, alpha, beta);
+        }
+        else
+        {
+            out = reference::gemm<T>(lhs, rhs, bias, alpha, beta);
+        }
+        // Ignore activation info if post ops are used instead
+        if(populated_post_ops.size() > 0)
+        {
+            out = reference::post_ops<T>(out, populated_post_ops);
+        }
+        else
+        {
+            out = reference::activation_layer(out, act_info);
+        }
+        return out;
+    }
+
+    bool            validate_result = true; // Set to false by compute_target when the run is skipped (RHS reshape validation failed with cl_image export)
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
 template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSOperatorType, typename ReshapeRHSOperatorType, typename GEMMOperatorType, bool fp_mixed_precision = false>
 class GEMMMatrixMultiplyReshaped3DValidationFixture : public framework::Fixture
 {
diff --git a/tests/validation/reference/PostOps.cpp b/tests/validation/reference/PostOps.cpp
new file mode 100644
index 0000000000..1a8fb990c8
--- /dev/null
+++ b/tests/validation/reference/PostOps.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PostOps.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/experimental/PostOp.h"
+#include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+SimpleTensor<T> post_ops(const SimpleTensor<T> &a, experimental::PostOpList<SimpleTensor<T>> post_ops)
+{
+    // Start from a copy of the input; every post op below, taken in list order, overwrites dst with its result
+    SimpleTensor<T> dst{ a };
+
+    for(auto &post_op : post_ops.get_list())
+    {
+        switch(post_op->type())
+        {
+            case experimental::PostOpType::Activation:
+            {
+                const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpAct<SimpleTensor<T>> *>(post_op.get());
+                dst                 = reference::activation_layer(dst, _post_op->_act_info);
+                break;
+            }
+            case experimental::PostOpType::Eltwise_Add:
+            {
+                const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpEltwiseAdd<SimpleTensor<T>> *>(post_op.get());
+                dst                 = reference::arithmetic_operation(ArithmeticOperation::ADD, dst, _post_op->_addend, dst, _post_op->_policy);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Unsupported PostOpType");
+            }
+        }
+    }
+    return dst;
+}
+
+template SimpleTensor<float> post_ops(const SimpleTensor<float> &a, experimental::PostOpList<SimpleTensor<float>> post_ops);
+template SimpleTensor<half> post_ops(const SimpleTensor<half> &a, experimental::PostOpList<SimpleTensor<half>> post_ops);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/reference/PostOps.h b/tests/validation/reference/PostOps.h
new file mode 100644
index 0000000000..5fe0fe71f5
--- /dev/null
+++ b/tests/validation/reference/PostOps.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_POSTOPS_H
+#define ARM_COMPUTE_TEST_POSTOPS_H
+
+#include "arm_compute/core/experimental/IPostOp.h"
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+/** (EXPERIMENTAL_POST_OPS) Reference post ops: returns the tensor obtained by applying every op of @p post_ops, in list order, to a copy of @p a */
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
+SimpleTensor<T> post_ops(const SimpleTensor<T> &a, experimental::PostOpList<SimpleTensor<T>> post_ops);
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_POSTOPS_H */
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 220c3ac189..64694f0e7c 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -36,12 +36,13 @@
 #include "arm_compute/core/Strides.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/IPostOp.h"
 #include "arm_compute/runtime/CL/CLTunerTypes.h"
 #include "arm_compute/runtime/CL/CLTypes.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
 #include "arm_compute/runtime/common/LSTMParams.h"
+#include "src/core/experimental/PostOp.h"
 #include "support/StringSupport.h"
-
 #include <ostream>
 #include <sstream>
 #include <string>
@@ -135,6 +136,133 @@ std::string to_string(const std::vector<T> &args)
     return str.str();
 }
 
+/** @name (EXPERIMENTAL_POST_OPS)
+ * @{
+ */
+/** Formatted output of the @ref experimental::PostOpType type, written as "type=" followed by the enumerator name
+ *
+ * @param[out] os           Output stream.
+ * @param[in]  post_op_type Type to output.
+ *
+ * @return Modified output stream.
+ */
+inline ::std::ostream &operator<<(::std::ostream &os, experimental::PostOpType post_op_type)
+{
+    os << "type=";
+    switch(post_op_type)
+    {
+        case experimental::PostOpType::Activation:
+        {
+            os << "Activation";
+            break;
+        }
+        case experimental::PostOpType::Eltwise_Add:
+        {
+            os << "Eltwise_Add";
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Unsupported PostOpType");
+            break;
+        }
+    }
+    return os;
+}
+/** Build the textual form of a @ref experimental::PostOpType by streaming it through operator<<
+ *
+ * @param[in] post_op_type PostOpType value to be converted
+ *
+ * @return String holding the streamed representation of @p post_op_type
+ */
+inline std::string to_string(experimental::PostOpType post_op_type)
+{
+    std::stringstream buffer;
+    buffer << post_op_type;
+    return buffer.str();
+}
+/** Formatted output of the @ref experimental::IPostOp type: "<type,prev_op_arg_pos=...," plus the op-specific fields, then ">".
+ *
+ * @param[out] os      Output stream.
+ * @param[in]  post_op Post op to output; it is downcast according to its type() to reach the op-specific fields.
+ *
+ * @return Modified output stream.
+ */
+template <typename T>
+inline ::std::ostream &operator<<(::std::ostream &os, const experimental::IPostOp<T> &post_op)
+{
+    os << "<";
+    os << post_op.type() << ",";
+    os << "prev_op_arg_pos=" << post_op.prev_dst_pos() << ",";
+    switch(post_op.type())
+    {
+        case experimental::PostOpType::Activation:
+        {
+            const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpAct<T> *>(&post_op);
+            os << "act_info=" << &(_post_op->_act_info);
+            break;
+        }
+        case experimental::PostOpType::Eltwise_Add:
+        {
+            const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpEltwiseAdd<T> *>(&post_op);
+            os << "convert_policy=" << _post_op->_policy;
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Unsupported PostOpType");
+            break;
+        }
+    }
+    os << ">";
+    return os;
+}
+/** Build the textual form of an @ref experimental::IPostOp by streaming it through operator<<
+ *
+ * @param[in] post_op IPostOp value to be converted
+ *
+ * @return String holding the streamed representation of @p post_op
+ */
+template <typename T>
+inline std::string to_string(const experimental::IPostOp<T> &post_op)
+{
+    std::stringstream buffer;
+    buffer << post_op;
+    return buffer.str();
+}
+/** Formatted output of the @ref experimental::PostOpList type.
+ *
+ * @param[out] os       Output stream.
+ * @param[in]  post_ops List to output: its post ops are streamed in order between "[" and "]", each followed by a comma.
+ *
+ * @return Modified output stream.
+ */
+template <typename T>
+inline ::std::ostream &operator<<(::std::ostream &os, const experimental::PostOpList<T> &post_ops)
+{
+    os << "[";
+    for(const auto &op : post_ops.get_list())
+    {
+        os << *op << ",";
+    }
+    os << "]";
+    return os;
+}
+/** Build the textual form of a @ref experimental::PostOpList by streaming it through operator<<
+ *
+ * @param[in] post_ops PostOpList value to be converted
+ *
+ * @return String holding the streamed representation of @p post_ops
+ */
+template <typename T>
+inline std::string to_string(const experimental::PostOpList<T> &post_ops)
+{
+    std::stringstream buffer;
+    buffer << post_ops;
+    return buffer.str();
+}
+/** @} */ // end of group (EXPERIMENTAL_POST_OPS)
+
 /** Formatted output of the Dimensions type.
  *
  * @param[out] os         Output stream.
@@ -244,8 +372,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMKernelInfo &gemm
     os << " fp_mixed_precision= " << gemm_info.fp_mixed_precision;
     os << " mult_transpose1xW_width= " << gemm_info.mult_transpose1xW_width;
     os << " mult_interleave4x4_height= " << gemm_info.mult_interleave4x4_height;
-    os << " a_offset = " << gemm_info.a_offset;
-    os << " b_offset = " << gemm_info.b_offset;
+    os << " a_offset= " << gemm_info.a_offset;
+    os << " b_offset= " << gemm_info.b_offset;
+    os << " post_ops= " << gemm_info.post_ops;
     os << ")";
     return os;
 }
@@ -487,7 +616,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ActivationLayerInfo:
 
 /** Formatted output of the activation function info type.
  *
- * @param[in] info Type to output.
+ * @param[in] info ActivationLayerInfo to output.
  *
  * @return Formatted string.
  */
@@ -501,25 +630,35 @@ inline std::string to_string(const arm_compute::ActivationLayerInfo &info)
     return str.str();
 }
 
-/** Formatted output of the activation function info type.
+/** Formatted output of the activation function info.
  *
- * @param[in] info Type to output.
+ * @param[out] os   Output stream.
+ * @param[in]  info ActivationLayerInfo to output.
  *
- * @return Formatted string.
+ * @return Modified output stream.
  */
-inline std::string to_string(const arm_compute::ActivationLayerInfo *info)
+inline ::std::ostream &operator<<(::std::ostream &os, const ActivationLayerInfo *info)
 {
-    std::string ret_str = "nullptr";
     if(info != nullptr)
     {
-        std::stringstream str;
         if(info->enabled())
         {
-            str << info->activation();
+            os << info->activation();
+            os << "(";
+            os << "VAL_A=" << info->a() << ",";
+            os << "VAL_B=" << info->b();
+            os << ")";
+        }
+        else
+        {
+            os << "disabled";
         }
-        ret_str = str.str();
     }
-    return ret_str;
+    else
+    {
+        os << "nullptr";
+    }
+    return os;
 }
 
 /** Formatted output of the activation function type.
-- 
cgit v1.2.1