From 94abde4f4e98f6f1adb5c46b194527f34a8ea07d Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Thu, 25 May 2023 16:48:43 +0100 Subject: Add Fused Activation to OpenCL MatMul - Added fused activation to MatMul function interface - Added fused activation to CL backend - Includes tests for supported Activation Functions in MatMul Resolves: [COMPMID-6192] Signed-off-by: Mohammed Suhail Munshi Change-Id: Ie103212b600b60699eaf6a6394d609e6e1f5aba6 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/522465 Comments-Addressed: bsgcomp Reviewed-by: Viet-Hoa Do Tested-by: bsgcomp Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9714 Comments-Addressed: Arm Jenkins Reviewed-by: Jakub Sujak Tested-by: Arm Jenkins Benchmark: Arm Jenkins --- arm_compute/core/MatMulInfo.h | 12 -- arm_compute/runtime/CL/functions/CLMatMul.h | 22 ++- arm_compute/runtime/NEON/functions/NEMatMul.h | 17 +- src/core/CL/cl_kernels/common/mat_mul.cl | 27 ++- src/core/CL/cl_kernels/common/mat_mul_quantized.cl | 15 +- src/cpu/operators/CpuMatMul.cpp | 14 +- src/cpu/operators/CpuMatMul.h | 6 +- src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp | 50 +++--- src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h | 21 ++- src/gpu/cl/kernels/ClMatMulNativeKernel.cpp | 17 +- src/gpu/cl/kernels/ClMatMulNativeKernel.h | 20 ++- src/gpu/cl/operators/ClMatMul.cpp | 16 +- src/gpu/cl/operators/ClMatMul.h | 18 +- src/runtime/CL/functions/CLMatMul.cpp | 13 +- src/runtime/NEON/functions/NEMatMul.cpp | 10 +- tests/validation/CL/MatMul.cpp | 182 ++++++++++++++------- tests/validation/fixtures/MatMulFixture.h | 60 +++++-- utils/TypePrinter.h | 6 +- 18 files changed, 335 insertions(+), 191 deletions(-) diff --git a/arm_compute/core/MatMulInfo.h b/arm_compute/core/MatMulInfo.h index 62d782215b..01b9b47761 100644 --- a/arm_compute/core/MatMulInfo.h +++ b/arm_compute/core/MatMulInfo.h @@ -58,11 +58,6 @@ public: { return _adj_rhs; } - /* Get Fused Activation Layer Info */ - ActivationLayerInfo fused_activation() const - { - return _fused_act; - } /* Set Adjoint LHS flag */ MatMulInfo &adj_lhs(bool adj_lhs) { @@ -75,17 +70,10 @@ public: _adj_rhs = adj_rhs; return *this; } - /* Set Fused Activation Layer Info */ - MatMulInfo &fused_activation(const ActivationLayerInfo &act_info) - { - _fused_act = act_info; - return *this; - } private: bool _adj_lhs{ false }; bool _adj_rhs{ false }; - ActivationLayerInfo _fused_act{}; // disabled by default }; } // namespace arm_compute #endif /* ARM_COMPUTE_MATMULINFO_H */ diff --git a/arm_compute/runtime/CL/functions/CLMatMul.h b/arm_compute/runtime/CL/functions/CLMatMul.h index 2af9a4a9a6..a11c1ed6a2 100644 --- a/arm_compute/runtime/CL/functions/CLMatMul.h +++ b/arm_compute/runtime/CL/functions/CLMatMul.h @@ -24,6 +24,8 @@ #ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL #define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL +#include "arm_compute/core/ActivationLayerInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include @@ -83,21 +85,29 @@ public: * @param[in] rhs Right-hand side tensor info containing the input weights as Matrix B. Data types supported: same as @p lhs. * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs. * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. 
- * @param[in] settings Class containing flags for function level settings + * @param[in] settings Contains flags for function level settings + * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. */ - void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}); + void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}, const + ActivationLayerInfo &act_info = ActivationLayerInfo{}); /** Initialise the kernel's inputs and output * * Similar to @ref CLMatMul::configure() */ - void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}); + void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}, const ActivationLayerInfo &act_info = + ActivationLayerInfo{}); /** Static function to check if given info will lead to a valid configuration of @ref CLMatMul. * - * Similar to @ref CLMatMul::configure() * - * @return a status + * @note All tensors must have the same data type. + * + * @param[in] lhs Left-hand side (Matrix A) tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. + * @param[in] rhs Right-hand side (Matrix B) tensor info. Data types supported: same as @p lhs. + * @param[out] output Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs. + * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. + * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo{}); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h index a331c55a98..81fec19f86 100644 --- a/arm_compute/runtime/NEON/functions/NEMatMul.h +++ b/arm_compute/runtime/NEON/functions/NEMatMul.h @@ -24,6 +24,8 @@ #ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL #define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL +#include "arm_compute/core/ActivationLayerInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include @@ -91,16 +93,23 @@ public: * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs. * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs. * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. - * @param[in] settings Class containing flags for function level settings i.e fast math + * @param[in] settings Contains flags for function level settings i.e fast math + * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. 
*/ - void configure(ITensor *lhs, ITensor *rhs, ITensor *dst, const MatMulInfo &info, const CpuMatMulSettings &settings); + void configure(ITensor *lhs, ITensor *rhs, ITensor *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEMatMul * - * Parameters are similar to @ref NEMatMul::configure() + * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. + * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs. + * @param[out] dst Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs. + * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. + * @param[in] settings Contains flags for function level settings i.e fast math + * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. * * @return Status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden void run() override; diff --git a/src/core/CL/cl_kernels/common/mat_mul.cl b/src/core/CL/cl_kernels/common/mat_mul.cl index 90d485e815..9656a59728 100644 --- a/src/core/CL/cl_kernels/common/mat_mul.cl +++ b/src/core/CL/cl_kernels/common/mat_mul.cl @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include "activation_float_helpers.h" #include "helpers.h" #include "tile_helpers.h" @@ -31,6 +32,7 @@ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions. * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. 
-DRHS_TENSOR_TYPE=BUFFER) @@ -86,7 +88,7 @@ __kernel void mat_mul_native_nt_nt( }) const int rhs_z = z * rhs_h; - int k; + int k; for(k = 0; k <= K - K0; k += K0) { TILE(DATA_TYPE, M0, K0, a); @@ -111,7 +113,7 @@ __kernel void mat_mul_native_nt_nt( lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE); } -#ifdef K % K0 != 0 +#if K % K0 != 0 /* Leftover Loop */ for(; k < K; ++k) { @@ -147,6 +149,8 @@ __kernel void mat_mul_native_nt_nt( indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond)); }); + T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer); } #endif // defined(MAT_MUL_NATIVE_NT_NT) @@ -158,6 +162,7 @@ __kernel void mat_mul_native_nt_nt( * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions. * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER) @@ -213,7 +218,7 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER), }) const int rhs_z = z * rhs_h; - int k; + int k; for(k = 0; k <= K - K0; k += K0) { TILE(DATA_TYPE, M0, K0, a); @@ -301,6 +306,8 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER), indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond)); }); + T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer); } #endif // defined(MAT_MUL_NATIVE_NT_T) @@ -312,6 +319,7 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER), * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions. * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. 
-DRHS_TENSOR_TYPE=BUFFER) @@ -367,7 +375,7 @@ __kernel void mat_mul_native_t_nt( }) const int rhs_z = z * rhs_h; - int k; + int k; for(k = 0; k <= K - K0; k += K0) { TILE(DATA_TYPE, K0, M0, a); @@ -405,7 +413,7 @@ __kernel void mat_mul_native_t_nt( lhs_offset_first_element_in_bytes += K0 * lhs_stride_y; } -#ifdef K % K0 != 0 +#if K % K0 != 0 /* Leftover Loop */ for(; k < K; ++k) { @@ -451,6 +459,8 @@ __kernel void mat_mul_native_t_nt( indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond)); }); + T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer); } #endif // defined(MAT_MUL_NATIVE_T_NT) @@ -462,6 +472,7 @@ __kernel void mat_mul_native_t_nt( * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions. * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER) @@ -517,7 +528,7 @@ __kernel void mat_mul_native_t_t( }) const int rhs_z = z * rhs_h; - int k; + int k; for(k = 0; k <= K - K0; k += K0) { TILE(DATA_TYPE, K0, M0, a); @@ -565,7 +576,7 @@ __kernel void mat_mul_native_t_t( lhs_offset_first_element_in_bytes += K0 * lhs_stride_y; } -#ifdef K % K0 != 0 +#if K % K0 != 0 /* Leftover Loop */ for(; k < K; ++k) { @@ -619,6 +630,8 @@ __kernel void mat_mul_native_t_t( indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond)); }); + T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer); } #endif // defined(MAT_MUL_NATIVE_T_T) diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl index 0c3cbca9a6..bd415bb4a7 100644 --- a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl +++ b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl @@ -23,6 +23,7 @@ */ #include "helpers.h" #include "tile_helpers.h" +#include "activation_float_helpers.h" #if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT) /** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only @@ -32,6 +33,7 @@ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. 
-DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations. * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT) * @note Only the following configurations of M0, N0 and K0 are currently supported: @@ -194,6 +196,8 @@ __kernel void mat_mul_native_quantized_nt_nt( const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0; const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0; + T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + // Quantize the tile TILE(DATA_TYPE, M0, N0, accq); T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq); @@ -216,6 +220,7 @@ __kernel void mat_mul_native_quantized_nt_nt( * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions. * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T) * @note Only the following configurations of M0, N0 and K0 are currently supported: @@ -315,7 +320,7 @@ __kernel void mat_mul_native_quantized_nt_t( rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE); } -#if ((K % K0) != 0) +#if((K % K0) != 0) // Leftover loop for(; k < K; ++k) { @@ -370,6 +375,8 @@ __kernel void mat_mul_native_quantized_nt_t( const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0; const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0; + T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + // Quantize the tile TILE(DATA_TYPE, M0, N0, accq); T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq); @@ -392,6 +399,7 @@ __kernel void mat_mul_native_quantized_nt_t( * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations. * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The kernel name in uppercase must be passed at compile time (e.g. 
-DMAT_MUL_NATIVE_QUANTIZED_T_NT) * @note Only the following configurations of M0, N0 and K0 are currently supported: @@ -548,6 +556,8 @@ __kernel void mat_mul_native_quantized_t_nt( const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0; const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0; + T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + // Quantize the tile TILE(DATA_TYPE, M0, N0, accq); T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq); @@ -570,6 +580,7 @@ __kernel void mat_mul_native_quantized_t_nt( * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations. * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T) * @note Only the following configurations of M0, N0 and K0 are currently supported: @@ -731,6 +742,8 @@ __kernel void mat_mul_native_quantized_t_t( const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0; // Quantize the tile + T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + TILE(DATA_TYPE, M0, N0, accq); T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq); diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp index 87cb6c6b54..515b511044 100644 --- a/src/cpu/operators/CpuMatMul.cpp +++ b/src/cpu/operators/CpuMatMul.cpp @@ -25,9 +25,9 @@ #include "src/cpu/operators/CpuMatMul.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEMatMul.h" #include "src/common/utils/Log.h" @@ -45,7 +45,6 @@ namespace cpu { namespace { - Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act, GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) { @@ -74,15 +73,14 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo return Status{}; } - -} +} // namespace CpuMatMul::CpuMatMul() : _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape() { } -Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings) +Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); @@ -100,7 +98,7 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const TensorInfo rhs_transposed{}; auto gemm_info = AsmGemmInfo(); - gemm_info.activation_info = info.fused_activation(); + gemm_info.activation_info = act_info; gemm_info.fast_mode = settings.fast_math(); // Validate and then permute a/b @@ -139,7 +137,7 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const return Status{}; } -void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings) +void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings); @@ -189,7 +187,7 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, // ----------------------------------------------------- // Use transposed tensors if the corresponding transpose flags are set // Fill AsmGemmInfo class object before configuration - _gemm_info.activation_info = info.fused_activation(); + _gemm_info.activation_info = act_info; _gemm_info.fast_mode = settings.fast_math(); _gemm_info.negated_offsets = false; diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h index 9f5833b24f..475c019fd0 100644 --- a/src/cpu/operators/CpuMatMul.h +++ b/src/cpu/operators/CpuMatMul.h @@ -64,15 +64,17 @@ public: * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs. * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. * @param[in] settings The settings for matmul operation (i.e fast math) + * @param[in] act_info Class containing information about fused activation function. 
*/ - void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings); + void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuMatMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp index d5ecdf7dd2..9bbec908a3 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp @@ -98,34 +98,36 @@ ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info) +Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, output); + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - if(output->total_size() != 0) + if(dst->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, output); + const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } return Status{}; } -void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info) +void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output, &compile_context, &matmul_kernel_info); - ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_kernel_info); - ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_kernel_info)); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, 
dst, &compile_context, &matmul_kernel_info); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info); + ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_kernel_info)); // output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); - const int m = output->dimension(1); - const int n = output->dimension(0); + const int m = dst->dimension(1); + const int n = dst->dimension(0); const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); const bool adj_lhs = matmul_kernel_info.adj_lhs; @@ -133,7 +135,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context int n0 = adjust_vec_size(matmul_kernel_info.n0, n); // Configure kernel window - Window win = calculate_max_window(*output, Steps(n0, m0)); + Window win = calculate_max_window(*dst, Steps(n0, m0)); win = win.collapse(win, Window::DimZ); IClKernel::configure_internal(win); @@ -152,7 +154,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform(); const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform(); - const UniformQuantizationInfo dqinfo = output->quantization_info().uniform(); + const UniformQuantizationInfo dqinfo = dst->quantization_info().uniform(); float multiplier = lqinfo.scale * rqinfo.scale / dqinfo.scale; int output_multiplier = 0; @@ -166,6 +168,10 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(-rqinfo.offset)); // Note this is passed as negative to maintain similarity with CLDirectConv2D build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset)); // Passed as positive (unlike the above two) + build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()))); + build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()))); + build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + std::string kernel_name("mat_mul_native_quantized"); kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt"; kernel_name += matmul_kernel_info.adj_rhs ? 
"_t" : "_nt"; @@ -177,7 +183,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set config_id for enabling LWS tuning - const size_t number_of_batches = output->tensor_shape().total_size() / (m * n); + const size_t number_of_batches = dst->tensor_shape().total_size() / (m * n); _config_id = kernel_name; _config_id += "_"; @@ -203,18 +209,18 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - ICLTensor *output = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); - ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output); + const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst); unsigned int idx = 0; Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ); add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - add_3d_tensor_nhw_argument(idx, output); + add_3d_tensor_nhw_argument(idx, dst); enqueue(queue, *this, window_collapsed, lws_hint()); } diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h index d70ff30b91..67d1a6601f 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h @@ -24,6 +24,7 @@ #ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL #define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL +#include "arm_compute/core/ActivationLayerInfo.h" #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -43,22 +44,24 @@ public: ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulLowpNativeKernel); /** Initialise the kernel's input and output. * - * @param[in] compile_context The compile context to be used. - * @param[in] lhs Input tensor for the LHS matrix. Data type supported: QASYMM8_SIGNED/QASYMM8. - * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. - * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs. - * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. - * @param[out] dst Output tensor info. Data type supported: same as @p lhs - * @param[in] matmul_info Attributes for Batch MatMul Kernel + * @param[in] compile_context The compile context to be used. + * @param[in] lhs Input tensor for the LHS matrix. Data type supported: QASYMM8_SIGNED/QASYMM8. + * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. + * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs. + * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. + * @param[out] dst Output tensor info. 
Data type supported: same as @p lhs + * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel + * @param[in] act_info Class containing information about fused activation function. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMatMulLowpNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp index 8f53c1998f..205396a639 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp @@ -112,7 +112,7 @@ Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInf ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration"); } - return Status {}; + return Status{}; } } ClMatMulNativeKernel::ClMatMulNativeKernel() @@ -120,8 +120,9 @@ ClMatMulNativeKernel::ClMatMulNativeKernel() _type = CLKernelType::GEMM; } -Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) +Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { + ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); @@ -138,7 +139,8 @@ Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo return Status{}; } -void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) +void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info); @@ -176,6 +178,11 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT build_opts.add_option("-DK=" + support::cpp11::to_string(k)); build_opts.add_option_if_else(_export_rhs_to_cl_image, "-DRHS_TENSOR_TYPE=IMAGE", "-DRHS_TENSOR_TYPE=BUFFER"); + // Define values for activation function + build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()))); + build_opts.add_option(("-DB_VAL=" + 
float_to_string_with_full_precision(act_info.b()))); + build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + std::string kernel_name("mat_mul_native"); kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt"; kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt"; @@ -218,8 +225,8 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst); diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h index f706256e31..02d8ac3067 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h @@ -42,22 +42,24 @@ public: ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulNativeKernel); /** Initialise the kernel's input and output. * - * @param[in] compile_context The compile context to be used. - * @param[in] lhs Input tensor for the LHS matrix. Data type supported: F32/F16. - * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. - * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs. - * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. - * @param[out] dst Output tensor info. Data type supported: same as @p lhs - * @param[in] matmul_info Attributes for Batch MatMul Kernel + * @param[in] compile_context The compile context to be used. + * @param[in] lhs Input tensor for the LHS matrix. Data type supported: F32/F16. + * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. + * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs. + * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. + * @param[out] dst Output tensor info. Data type supported: same as @p lhs + * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel + * @param[in] act_info Specifies activation function to use after Matrix multiplication. Default is Identity function. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMatMulNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp index 3822c16aa1..c453761a8e 100644 --- a/src/gpu/cl/operators/ClMatMul.cpp +++ b/src/gpu/cl/operators/ClMatMul.cpp @@ -47,7 +47,7 @@ ClMatMul::ClMatMul() { } -Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info) +Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); @@ -57,15 +57,15 @@ Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const std::unique_ptr t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); + const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); - bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); + const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); - return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info) : - ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info); + return is_quantized ? 
ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info, act_info) : + ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info, act_info); } -void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info) +void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info); @@ -86,14 +86,14 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l _matmul_lowp_native_kernel->set_target(gpu_target); // Configure the low-precision native matrix multiply kernel - _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info); + _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info, act_info); } else { _matmul_native_kernel->set_target(gpu_target); // Configure the native matrix multiply kernel - _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info); + _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info, act_info); } } diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h index 6aba801301..9dce5288e6 100644 --- a/src/gpu/cl/operators/ClMatMul.h +++ b/src/gpu/cl/operators/ClMatMul.h @@ -21,14 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL -#define ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL +#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL #include "arm_compute/core/ActivationLayerInfo.h" #include "arm_compute/core/MatMulInfo.h" #include "src/gpu/cl/IClOperator.h" -#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include @@ -71,24 +71,26 @@ public: * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs. * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs. * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. + * @param[in] act_info Class containing information about fused activation function. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info); + void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMatMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; private: - std::unique_ptr _matmul_native_kernel{nullptr}; - std::unique_ptr _matmul_lowp_native_kernel{nullptr}; + std::unique_ptr _matmul_native_kernel{ nullptr }; + std::unique_ptr _matmul_lowp_native_kernel{ nullptr }; bool _is_quantized{ false }; }; } // namespace opencl } // namespace arm_compute -#endif /* ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL */ +#endif /* ACL_SRC_GPU_CL_OPERATORS_CLMATMUL */ diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp index ae5a01f679..bef422fca1 100644 --- a/src/runtime/CL/functions/CLMatMul.cpp +++ b/src/runtime/CL/functions/CLMatMul.cpp @@ -42,25 +42,26 @@ CLMatMul::CLMatMul() CLMatMul::~CLMatMul() = default; -void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings) +void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(settings); - configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info); + configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info); } -void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings) +void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); ARM_COMPUTE_UNUSED(settings); _impl->op = std::make_unique(); - _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info); + _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info); _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } }; } -Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info) +Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) { - return OperatorType::validate(lhs, rhs, output, matmul_info); + return OperatorType::validate(lhs, rhs, output, matmul_info, act_info); } void CLMatMul::run() diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp index 0c46516f1e..58640f40ea 100644 
--- a/src/runtime/NEON/functions/NEMatMul.cpp +++ b/src/runtime/NEON/functions/NEMatMul.cpp @@ -25,9 +25,9 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuMatMul.h" -#include "arm_compute/runtime/Tensor.h" namespace arm_compute { @@ -49,7 +49,7 @@ NEMatMul::NEMatMul() NEMatMul::~NEMatMul() = default; -void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings) +void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { _impl->lhs = lhs; _impl->rhs = rhs; @@ -57,14 +57,14 @@ void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatM ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output); _impl->op = std::make_unique(); - _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings); + _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info); _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } }; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings) +Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { - return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings); + return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info); } void NEMatMul::run() diff --git a/tests/validation/CL/MatMul.cpp b/tests/validation/CL/MatMul.cpp index 6364b16200..5a262a8e78 100644 --- a/tests/validation/CL/MatMul.cpp +++ b/tests/validation/CL/MatMul.cpp @@ -26,6 +26,7 @@ #include "arm_compute/runtime/CL/functions/CLMatMul.h" #include "tests/CL/CLAccessor.h" +#include "tests/datasets/ActivationFunctionsDataset.h" #include "tests/framework/DatasetModes.h" #include "tests/framework/Macros.h" #include "tests/framework/TestCase.h" @@ -44,11 +45,13 @@ namespace validation { namespace { -RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */ -constexpr float abs_tolerance_f32(0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp32 data type in case using relative tolerance fails because of small values */ -constexpr float abs_tolerance_f16(0.001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data type in case using relative tolerance fails because of small values */ -RelativeTolerance tolerance_f16(half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */ -constexpr AbsoluteTolerance tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ +RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */ +constexpr float abs_tolerance_f32( + 0.0001f); /**< Absolute tolerance 
value for comparing reference's output against implementation's output for fp32 data type in case using relative tolerance fails because of small values */ +constexpr float abs_tolerance_f16( + 0.001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data type in case using relative tolerance fails because of small values */ +RelativeTolerance tolerance_f16(half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */ +constexpr AbsoluteTolerance tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ } // namespace template @@ -57,25 +60,71 @@ using CLMatMulFixture = MatMulValidationFixture using CLQuantizedMatMulFixture = QuantizedMatMulValidationFixture; +template +using CLMatMulActivationFixture = MatMulValidationWithActivationFixture; + +template +using CLMatMulActivationAlphaBetaFixture = MatMulValidationWithActivationAlphaBetaFixture; + +template +using CLQuantizedMatMulActivationFixture = QuantizedMatMulValidationWithActivationFixture; + +/* The main act functions matmul is expected to support */ +const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo", +{ + ActivationLayerInfo(), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.75f, 0.25f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH) +}); + +const auto ActivationFunctionsQuantizedDataset = concat(concat(concat( + framework::dataset::make("ActivationInfo", ActivationLayerInfo()), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f))), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.75f, 0.25f))); + +/* Larger activation functions dataset, used during some nightly tests. 
*/ +const auto AllActivationsDataset = combine(datasets::ActivationFunctions(), framework::dataset::make("AlphaBeta", { 0.5f, 1.f })); + +const auto AllQuantizedActivationsDataset = combine(concat(datasets::ActivationFunctionsQuantized(), + framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::HARD_SWISH, + ActivationLayerInfo::ActivationFunction::LEAKY_RELU + })), + framework::dataset::make("AlphaBeta", { 0.5f, 1.f })); + TEST_SUITE(CL) TEST_SUITE(MatMul) TEST_SUITE(Float) TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallMatMulDataset(), +FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulActivationFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::F32)), + ActivationFunctionsDataset)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulActivationFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeMatMulDataset(), framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F32))) + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::F32)), + ActivationFunctionsDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F32))) +FIXTURE_DATA_TEST_CASE(RunAllActivations, CLMatMulActivationAlphaBetaFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SmallerMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + framework::dataset::make("DataType", DataType::F32)), + AllActivationsDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); @@ -85,19 +134,21 @@ TEST_SUITE_END() // FP32 TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F16))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulActivationFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::F16)), + ActivationFunctionsDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - 
framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F16))) +FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulActivationFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::F16)), + ActivationFunctionsDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16); @@ -110,32 +161,30 @@ TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine( - datasets::SmallMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), - framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), - framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })), - framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })), - framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })) -) + datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::QASYMM8)), + ActivationFunctionsQuantizedDataset), + framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), + framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })), + framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })), + framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); } FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizedMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine( - datasets::LargeMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), - framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), - framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })), - framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })), - framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })) -) + datasets::LargeMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::QASYMM8)), + ActivationFunctionsQuantizedDataset), + framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), + framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })), + framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })), + framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); @@ -146,32 +195,45 @@ TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture, framework::DatasetMode::ALL, 
@@ -146,32 +195,45 @@
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(
-                       datasets::SmallMatMulDataset(),
-                       framework::dataset::make("TransposeA", { false, true })),
-                       framework::dataset::make("TransposeB", { false, true })),
-                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                       framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
-                       framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
-                       framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })),
-                       framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })),
-                       framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))
-)
+                           datasets::SmallMatMulDataset(),
+                           framework::dataset::make("TransposeA", { false, true })),
+                           framework::dataset::make("TransposeB", { false, true })),
+                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                           ActivationFunctionsQuantizedDataset),
+                           framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+                           framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })),
+                           framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })),
+                           framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_quant);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizedMatMulFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(
-                       datasets::LargeMatMulDataset(),
-                       framework::dataset::make("TransposeA", { false, true })),
-                       framework::dataset::make("TransposeB", { false, true })),
-                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                       framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
-                       framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
-                       framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
-                       framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
-                       framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))
-)
+                           datasets::LargeMatMulDataset(),
+                           framework::dataset::make("TransposeA", { false, true })),
+                           framework::dataset::make("TransposeB", { false, true })),
+                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                           ActivationFunctionsQuantizedDataset),
+                           framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+                           framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
+                           framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
+                           framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunAllActivations, CLQuantizedMatMulActivationFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(
+                           datasets::LargeMatMulDataset(),
+                           framework::dataset::make("TransposeA", { false })),
+                           framework::dataset::make("TransposeB", { true })),
+                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                           AllQuantizedActivationsDataset),
+                           framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+                           framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
+                           framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
+                           framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_quant);
 }
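For readers tracking the API change rather than the test matrix: the new entry point these tests exercise is the extra act_info parameter on CLMatMul::configure(). A usage sketch under stated assumptions: the shapes are made up, the ReLU choice is illustrative, and an initialised CL backend is assumed.

// Usage sketch (not part of the patch): an F32 batched matmul with a fused
// ReLU through the new act_info parameter. ACL shapes are (width, height, ...),
// so the lhs MxK = 4x8 matrix is TensorShape(8U, 4U).
#include "arm_compute/core/MatMulInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLMatMul.h"

using namespace arm_compute;

void matmul_with_fused_relu()
{
    CLScheduler::get().default_init(); // set up CL context and queue

    CLTensor lhs, rhs, dst;
    lhs.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));  // K=8, M=4
    rhs.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32)); // N=16, K=8
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // N=16, M=4

    CLMatMul matmul;
    matmul.configure(&lhs, &rhs, &dst, MatMulInfo(), GpuMatMulSettings(),
                     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill lhs/rhs, then:
    matmul.run();
}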
diff --git a/tests/validation/fixtures/MatMulFixture.h b/tests/validation/fixtures/MatMulFixture.h
index 2f94c1f9d2..3e4cac5e34 100644
--- a/tests/validation/fixtures/MatMulFixture.h
+++ b/tests/validation/fixtures/MatMulFixture.h
@@ -112,14 +112,14 @@ protected:
 
         // Configure MatMulInfo class
         MatMulInfo mm_info;
-        mm_info.adj_lhs(transpose_a).adj_rhs(transpose_b).fused_activation(act_info);
+        mm_info.adj_lhs(transpose_a).adj_rhs(transpose_b);
 
         // Ensure values are dynamic
         a.info()->set_are_values_constant(false);
         b.info()->set_are_values_constant(false);
 
         // Configure operator
-        matmul.configure(&a, &b, &dst, mm_info, settings);
+        matmul.configure(&a, &b, &dst, mm_info, settings, act_info);
 
         // Assertions
         ARM_COMPUTE_ASSERT(a.info()->is_resizable());
@@ -162,8 +162,8 @@
     }
 
     template <typename TT = T>
-    typename std::enable_if<!std::is_integral<TT>::value, SimpleTensor<TT>>::type
-    compute_reference_gemm(const SimpleTensor<TT> &a, const SimpleTensor<TT> &b, const SimpleTensor<TT> &c, float alpha, float beta, const QuantizationInfo &o_qinfo)
+    typename std::enable_if < !std::is_integral<TT>::value, SimpleTensor<TT >>::type
+    compute_reference_gemm(const SimpleTensor<TT> &a, const SimpleTensor<TT> &b, const SimpleTensor<TT> &c, float alpha, float beta, const QuantizationInfo &o_qinfo)
     {
         ARM_COMPUTE_UNUSED(o_qinfo);
 
@@ -172,7 +172,7 @@
 
     template <typename TT = T>
     typename std::enable_if<std::is_integral<TT>::value, SimpleTensor<TT>>::type
-    compute_reference_gemm(const SimpleTensor<TT> &a, const SimpleTensor<TT> &b, const SimpleTensor<TT> &c, float alpha, float beta, const QuantizationInfo &o_qinfo)
+    compute_reference_gemm(const SimpleTensor<TT> &a, const SimpleTensor<TT> &b, const SimpleTensor<TT> &c, float alpha, float beta, const QuantizationInfo &o_qinfo)
     {
         ARM_COMPUTE_UNUSED(alpha, beta);
 
@@ -183,18 +183,18 @@
         const auto multiplier = aq.scale * bq.scale / oq.scale;
 
         int32_t output_multiplier = 0;
-        int32_t output_shift = 0;
+        int32_t output_shift      = 0;
         quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
         std::vector<int32_t> output_multipliers{ output_multiplier };
         std::vector<int32_t> output_shifts{ output_shift };
 
         //The lhs and rhs offsets are negated here to keep the reference aligned with the function implementation where the lhs and rhs offsets are also negated.
         const auto tmp = reference::gemmlowp_matrix_multiply_core<int32_t>(
-                             a, b, c.shape(), -aq.offset, -bq.offset);
+            a, b, c.shape(), -aq.offset, -bq.offset);
 
         auto output = reference::gemmlowp_quantize_down_scale_by_fixedpoint<int32_t, TT>(
-                          tmp, output_multipliers, output_shifts, oq.offset,
-                          std::numeric_limits<TT>::lowest(), std::numeric_limits<TT>::max());
+            tmp, output_multipliers, output_shifts, oq.offset,
+            std::numeric_limits<TT>::lowest(), std::numeric_limits<TT>::max());
         output.quantization_info(o_qinfo);
 
         return output;
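Numerically, the quantized reference above multiplies each int32 accumulator by the decomposed multiplier, adds the destination offset, and saturates to the output type. For a single accumulator value this behaves like the float sketch below; this is illustrative only, since the real reference stays in fixed point via the Q31 multiplier/shift pair.

// Per-element view of the requantization above (illustrative float model).
#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t requantize_qasymm8_signed(int32_t acc, float real_multiplier, int32_t dst_offset)
{
    // real_multiplier = lhs_scale * rhs_scale / dst_scale, as computed above
    const int32_t q = static_cast<int32_t>(std::lround(acc * real_multiplier)) + dst_offset;
    return static_cast<int8_t>(std::clamp(q, -128, 127)); // saturate to int8 range
}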
@@ -279,6 +279,30 @@ public:
     }
 };
 
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class MatMulValidationWithDynamicTensorsFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo act_info, int num_extra_runs)
+    {
+        MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings());
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class QuantizedMatMulValidationFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo act_info, int num_extra_runs,
+               QuantizationInfo a_qinfo, QuantizationInfo b_qinfo, QuantizationInfo o_qinfo)
+    {
+        MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings(),
+                                                                                                   a_qinfo, b_qinfo, o_qinfo);
+    }
+};
+
 template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
 class MatMulValidationWithActivationFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
 {
@@ -291,24 +315,30 @@ public:
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
-class MatMulValidationWithDynamicTensorsFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+class MatMulValidationWithActivationAlphaBetaFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
 {
 public:
     template <typename...>
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo act_info, int num_extra_runs)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo::ActivationFunction function,
+               float alpha_beta)
     {
-        MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings());
+        ActivationLayerInfo act_info(function, alpha_beta, alpha_beta);
+        MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, 0, Settings());
    }
 };
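The CL test file binds these fixtures to the OpenCL backend through aliases that fall outside the hunks shown here; presumably they look along these lines (the binding is inferred from the fixture's template parameters and from the fixture names used in tests/validation/CL/MatMul.cpp, so treat it as a sketch rather than the patch's exact text):

// Presumed backend binding (not visible in this diff):
template <typename T>
using CLMatMulActivationAlphaBetaFixture =
    MatMulValidationWithActivationAlphaBetaFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;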
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
-class QuantizedMatMulValidationFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+class QuantizedMatMulValidationWithActivationFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
 {
 public:
     template <typename...>
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo act_info, int num_extra_runs, QuantizationInfo a_qinfo, QuantizationInfo b_qinfo, QuantizationInfo o_qinfo)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo::ActivationFunction function,
+               float alpha_beta, int num_extra_runs,
+               QuantizationInfo a_qinfo, QuantizationInfo b_qinfo, QuantizationInfo o_qinfo)
     {
-        MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings(), a_qinfo, b_qinfo, o_qinfo);
+        ActivationLayerInfo act_info(function, alpha_beta, alpha_beta);
+        MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings(),
+                                                                                                   a_qinfo, b_qinfo, o_qinfo);
     }
 };
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 0df320d7e0..de8a960e41 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -35,11 +35,11 @@
 #include "arm_compute/core/GEMMInfo.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/MatMulInfo.h"
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Strides.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/MatMulInfo.h"
 #include "arm_compute/core/experimental/IPostOp.h"
 #include "arm_compute/core/experimental/PostOps.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
@@ -3691,9 +3691,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const arm_compute::MatMulInfo &matmul_info)
     os << "MatMulKernelInfo="
        << "["
        << "adj_lhs=" << matmul_info.adj_lhs() << ", "
-       << "adj_rhs=" << matmul_info.adj_rhs() << ", "
-       << "fused_activation=" << matmul_info.fused_activation().activation() << "]";
-
+       << "adj_rhs=" << matmul_info.adj_rhs() << "]";
     return os;
 }
 /** Formatted output of the arm_compute::MatMulInfo type.
-- 
cgit v1.2.1
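With fused_activation() removed from MatMulInfo, its printed form loses the activation field. A small sketch of what the streaming operator now emits; note that the pre-existing "MatMulKernelInfo=" label is left untouched by this patch, so it still appears in the output.

// Sketch: printing a MatMulInfo after this change.
#include <iostream>
#include "arm_compute/core/MatMulInfo.h"
#include "utils/TypePrinter.h"

int main()
{
    arm_compute::MatMulInfo info;
    info.adj_lhs(true).adj_rhs(false);
    std::cout << info << std::endl; // e.g. MatMulKernelInfo=[adj_lhs=1, adj_rhs=0]
    return 0;
}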