author     Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>   2023-05-25 16:48:43 +0100
committer  Mohmun02 <MohammedSuhail.Munshi@arm.com>                 2023-06-16 15:38:39 +0000
commit     94abde4f4e98f6f1adb5c46b194527f34a8ea07d (patch)
tree       d6d717031788850d970fb44ff3f41de311cc5fc0 /src
parent     dd8d7f4102653ef55d872c71ae5d5f2ca2ead0c1 (diff)
Add Fused Activation to OpenCL MatMul
- Added fused activation to MatMul function interface
- Added fused activation to CL backend
- Includes tests for supported Activation Functions in MatMul

Resolves: [COMPMID-6192]

Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: Ie103212b600b60699eaf6a6394d609e6e1f5aba6
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/522465
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9714
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul.cl            27
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul_quantized.cl  15
-rw-r--r--  src/cpu/operators/CpuMatMul.cpp                     14
-rw-r--r--  src/cpu/operators/CpuMatMul.h                        6
-rw-r--r--  src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp     50
-rw-r--r--  src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h       21
-rw-r--r--  src/gpu/cl/kernels/ClMatMulNativeKernel.cpp         17
-rw-r--r--  src/gpu/cl/kernels/ClMatMulNativeKernel.h           20
-rw-r--r--  src/gpu/cl/operators/ClMatMul.cpp                   16
-rw-r--r--  src/gpu/cl/operators/ClMatMul.h                     18
-rw-r--r--  src/runtime/CL/functions/CLMatMul.cpp               13
-rw-r--r--  src/runtime/NEON/functions/NEMatMul.cpp             10
12 files changed, 137 insertions, 90 deletions
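
At the API level, the patch threads an ActivationLayerInfo through every MatMul entry point, defaulting to no activation. As a hedged sketch (constructor signature per arm_compute/core/ActivationLayerInfo.h, which this patch includes; the bound values are illustrative), the objects passed as the new act_info argument look like this:

#include "arm_compute/core/ActivationLayerInfo.h"

using AF = arm_compute::ActivationLayerInfo::ActivationFunction;

// Default-constructed info: activation is disabled, kernels leave the output as-is.
const arm_compute::ActivationLayerInfo no_act{};
// ReLU: max(0, x); a and b are unused.
const arm_compute::ActivationLayerInfo relu(AF::RELU);
// Bounded ReLU: min(a, max(0, x)).
const arm_compute::ActivationLayerInfo brelu(AF::BOUNDED_RELU, 6.f);
// Lower/upper bounded ReLU: min(a, max(b, x)).
const arm_compute::ActivationLayerInfo lu_brelu(AF::LU_BOUNDED_RELU, 6.f, -6.f);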
diff --git a/src/core/CL/cl_kernels/common/mat_mul.cl b/src/core/CL/cl_kernels/common/mat_mul.cl
index 90d485e815..9656a59728 100644
--- a/src/core/CL/cl_kernels/common/mat_mul.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul.cl
@@ -21,6 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "activation_float_helpers.h"
#include "helpers.h"
#include "tile_helpers.h"
@@ -31,6 +32,7 @@
* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for bounded activation functions.
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
@@ -86,7 +88,7 @@ __kernel void mat_mul_native_nt_nt(
})
const int rhs_z = z * rhs_h;
- int k;
+ int k;
for(k = 0; k <= K - K0; k += K0)
{
TILE(DATA_TYPE, M0, K0, a);
@@ -111,7 +113,7 @@ __kernel void mat_mul_native_nt_nt(
lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
}
-#ifdef K % K0 != 0
+#if K % K0 != 0
/* Leftover Loop */
for(; k < K; ++k)
{
@@ -147,6 +149,8 @@ __kernel void mat_mul_native_nt_nt(
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_NT_NT)
@@ -158,6 +162,7 @@ __kernel void mat_mul_native_nt_nt(
* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for bounded activation functions.
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
@@ -213,7 +218,7 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
})
const int rhs_z = z * rhs_h;
- int k;
+ int k;
for(k = 0; k <= K - K0; k += K0)
{
TILE(DATA_TYPE, M0, K0, a);
@@ -301,6 +306,8 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_NT_T)
@@ -312,6 +319,7 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for bounded activation functions.
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
@@ -367,7 +375,7 @@ __kernel void mat_mul_native_t_nt(
})
const int rhs_z = z * rhs_h;
- int k;
+ int k;
for(k = 0; k <= K - K0; k += K0)
{
TILE(DATA_TYPE, K0, M0, a);
@@ -405,7 +413,7 @@ __kernel void mat_mul_native_t_nt(
lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
}
-#ifdef K % K0 != 0
+#if K % K0 != 0
/* Leftover Loop */
for(; k < K; ++k)
{
@@ -451,6 +459,8 @@ __kernel void mat_mul_native_t_nt(
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_T_NT)
@@ -462,6 +472,7 @@ __kernel void mat_mul_native_t_nt(
* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for bounded activation functions.
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
@@ -517,7 +528,7 @@ __kernel void mat_mul_native_t_t(
})
const int rhs_z = z * rhs_h;
- int k;
+ int k;
for(k = 0; k <= K - K0; k += K0)
{
TILE(DATA_TYPE, K0, M0, a);
@@ -565,7 +576,7 @@ __kernel void mat_mul_native_t_t(
lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
}
-#ifdef K % K0 != 0
+#if K % K0 != 0
/* Leftover Loop */
for(; k < K; ++k)
{
@@ -619,6 +630,8 @@ __kernel void mat_mul_native_t_t(
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_T_T)
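
The T_ACTIVATION calls added above consume the new ACTIVATION_TYPE, A_VAL and B_VAL defines. As a minimal sketch of the elementwise semantics applied to an M0 x N0 accumulator tile (the helper below is hypothetical and host-side, for illustration only; the real work is done by the OpenCL macros in activation_float_helpers.h):

#include <algorithm>

// Hypothetical model of T_ACTIVATION for LU_BOUNDED_RELU: clamp every
// accumulator to [B_VAL, A_VAL]. Plain RELU is max(0, x), ignoring A_VAL.
inline void apply_lu_bounded_relu(float *tile, int m0, int n0, float a_val, float b_val)
{
    for(int i = 0; i < m0 * n0; ++i)
    {
        tile[i] = std::min(a_val, std::max(b_val, tile[i]));
    }
}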
diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
index 0c3cbca9a6..bd415bb4a7 100644
--- a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
@@ -23,6 +23,7 @@
*/
#include "helpers.h"
#include "tile_helpers.h"
+#include "activation_float_helpers.h"
#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
@@ -32,6 +33,7 @@
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for the relu and bounded relu operations.
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
@@ -194,6 +196,8 @@ __kernel void mat_mul_native_quantized_nt_nt(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+ T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
@@ -216,6 +220,7 @@ __kernel void mat_mul_native_quantized_nt_nt(
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for bounded activation functions.
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
@@ -315,7 +320,7 @@ __kernel void mat_mul_native_quantized_nt_t(
rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
}
-#if ((K % K0) != 0)
+#if((K % K0) != 0)
// Leftover loop
for(; k < K; ++k)
{
@@ -370,6 +375,8 @@ __kernel void mat_mul_native_quantized_nt_t(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+ T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
@@ -392,6 +399,7 @@ __kernel void mat_mul_native_quantized_nt_t(
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for the relu and bounded relu operations.
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
@@ -548,6 +556,8 @@ __kernel void mat_mul_native_quantized_t_nt(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+ T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
@@ -570,6 +580,7 @@ __kernel void mat_mul_native_quantized_t_nt(
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for the relu and bounded relu operations.
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
@@ -731,6 +742,8 @@ __kernel void mat_mul_native_quantized_t_t(
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+ T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
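
Note the ordering in the quantized kernels: the activation is applied to the int32 accumulator tile first, and only then does T_QUANTIZE8_ASYMMETRIC requantize it to the 8-bit output. A simplified, hedged C++ model of that epilogue (the real macro uses the fixed-point DST_MULTIPLIER/DST_SHIFT pair rather than a float rescale; all names below are hypothetical):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t activate_then_quantize(int32_t acc, int32_t a_val, int32_t b_val,
                                      float rescale /* lhs_scale * rhs_scale / dst_scale */,
                                      int32_t dst_offset)
{
    acc = std::min(a_val, std::max(b_val, acc)); // fused activation on the int accumulator
    const int32_t q = static_cast<int32_t>(std::lround(acc * rescale)) + dst_offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q))); // saturate to the QASYMM8 range
}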
diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp
index 87cb6c6b54..515b511044 100644
--- a/src/cpu/operators/CpuMatMul.cpp
+++ b/src/cpu/operators/CpuMatMul.cpp
@@ -25,9 +25,9 @@
#include "src/cpu/operators/CpuMatMul.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
#include "src/common/utils/Log.h"
@@ -45,7 +45,6 @@ namespace cpu
{
namespace
{
-
Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
{
@@ -74,15 +73,14 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
return Status{};
}
-
-}
+} // namespace
CpuMatMul::CpuMatMul()
: _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape()
{
}
-Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings)
+Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
@@ -100,7 +98,7 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
TensorInfo rhs_transposed{};
auto gemm_info = AsmGemmInfo();
- gemm_info.activation_info = info.fused_activation();
+ gemm_info.activation_info = act_info;
gemm_info.fast_mode = settings.fast_math();
// Validate and then permute a/b
@@ -139,7 +137,7 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
return Status{};
}
-void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings)
+void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings);
@@ -189,7 +187,7 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst,
// -----------------------------------------------------
// Use transposed tensors if the corresponding transpose flags are set
// Fill AsmGemmInfo class object before configuration
- _gemm_info.activation_info = info.fused_activation();
+ _gemm_info.activation_info = act_info;
_gemm_info.fast_mode = settings.fast_math();
_gemm_info.negated_offsets = false;
diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h
index 9f5833b24f..475c019fd0 100644
--- a/src/cpu/operators/CpuMatMul.h
+++ b/src/cpu/operators/CpuMatMul.h
@@ -64,15 +64,17 @@ public:
* @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
* @param[in] info Contains MatMul operation information described in @ref MatMulInfo.
* @param[in] settings The settings for matmul operation (i.e fast math)
+ * @param[in] act_info Information about the fused activation function. Defaults to no activation.
*/
- void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings);
+ void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuMatMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
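
A hedged sketch of exercising the extended CpuMatMul interface at the operator level (this is an internal API; the shapes are illustrative and written innermost-first, so an M=4, K=8, N=16 product uses the infos below):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h" // CpuMatMulSettings
#include "src/cpu/operators/CpuMatMul.h"

using namespace arm_compute;

TensorInfo lhs_info(TensorShape(8U, 4U), 1, DataType::F32);  // (K, M)
TensorInfo rhs_info(TensorShape(16U, 8U), 1, DataType::F32); // (N, K)
TensorInfo dst_info(TensorShape(16U, 4U), 1, DataType::F32); // (N, M)

const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);
const Status status = cpu::CpuMatMul::validate(&lhs_info, &rhs_info, &dst_info,
                                               MatMulInfo{}, CpuMatMulSettings{}, act);
// status.error_code() == ErrorCode::OK means configure() with the same
// arguments (including the fused activation) is expected to succeed.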
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
index d5ecdf7dd2..9bbec908a3 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
@@ -98,34 +98,36 @@ ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel()
{
_type = CLKernelType::GEMM;
}
-Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info)
+Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, output);
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
- if(output->total_size() != 0)
+ if(dst->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, output);
+ const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
}
return Status{};
}
-void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info)
+void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output, &compile_context, &matmul_kernel_info);
- ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_kernel_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_kernel_info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_kernel_info));
// output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
- const int m = output->dimension(1);
- const int n = output->dimension(0);
+ const int m = dst->dimension(1);
+ const int n = dst->dimension(0);
const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
const bool adj_lhs = matmul_kernel_info.adj_lhs;
@@ -133,7 +135,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
// Configure kernel window
- Window win = calculate_max_window(*output, Steps(n0, m0));
+ Window win = calculate_max_window(*dst, Steps(n0, m0));
win = win.collapse(win, Window::DimZ);
IClKernel::configure_internal(win);
@@ -152,7 +154,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform();
const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform();
- const UniformQuantizationInfo dqinfo = output->quantization_info().uniform();
+ const UniformQuantizationInfo dqinfo = dst->quantization_info().uniform();
float multiplier = lqinfo.scale * rqinfo.scale / dqinfo.scale;
int output_multiplier = 0;
@@ -166,6 +168,10 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(-rqinfo.offset)); // Note this is passed as negative to maintain similarity with CLDirectConv2D
build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset)); // Passed as positive (unlike the above two)
+ build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+
std::string kernel_name("mat_mul_native_quantized");
kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
@@ -177,7 +183,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set config_id for enabling LWS tuning
- const size_t number_of_batches = output->tensor_shape().total_size() / (m * n);
+ const size_t number_of_batches = dst->tensor_shape().total_size() / (m * n);
_config_id = kernel_name;
_config_id += "_";
@@ -203,18 +209,18 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- ICLTensor *output = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
- ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output);
+ const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst);
unsigned int idx = 0;
Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
add_3d_tensor_nhw_argument(idx, lhs);
add_3d_tensor_nhw_argument(idx, rhs);
- add_3d_tensor_nhw_argument(idx, output);
+ add_3d_tensor_nhw_argument(idx, dst);
enqueue(queue, *this, window_collapsed, lws_hint());
}
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
index d70ff30b91..67d1a6601f 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
@@ -24,6 +24,7 @@
#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL
#define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL
+#include "arm_compute/core/ActivationLayerInfo.h"
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -43,22 +44,24 @@ public:
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulLowpNativeKernel);
/** Initialise the kernel's input and output.
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] lhs Input tensor for the LHS matrix. Data type supported: QASYMM8_SIGNED/QASYMM8.
- * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
- * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[out] dst Output tensor info. Data type supported: same as @p lhs
- * @param[in] matmul_info Attributes for Batch MatMul Kernel
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] lhs Input tensor for the LHS matrix. Data type supported: QASYMM8_SIGNED/QASYMM8.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[out] dst Output tensor info. Data type supported: same as @p lhs
+ * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel
+ * @param[in] act_info           Information about the fused activation function. Defaults to no activation.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMulLowpNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
index 8f53c1998f..205396a639 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
@@ -112,7 +112,7 @@ Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInf
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration");
}
- return Status {};
+ return Status{};
}
}
ClMatMulNativeKernel::ClMatMulNativeKernel()
@@ -120,8 +120,9 @@ ClMatMulNativeKernel::ClMatMulNativeKernel()
_type = CLKernelType::GEMM;
}
-Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
+Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info)
{
+ ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
@@ -138,7 +139,8 @@ Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo
return Status{};
}
-void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
+void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info);
@@ -176,6 +178,11 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT
build_opts.add_option("-DK=" + support::cpp11::to_string(k));
build_opts.add_option_if_else(_export_rhs_to_cl_image, "-DRHS_TENSOR_TYPE=IMAGE", "-DRHS_TENSOR_TYPE=BUFFER");
+ // Define values for activation function
+ build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+
std::string kernel_name("mat_mul_native");
kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
@@ -218,8 +225,8 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst);
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
index f706256e31..02d8ac3067 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
@@ -42,22 +42,24 @@ public:
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulNativeKernel);
/** Initialise the kernel's input and output.
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] lhs Input tensor for the LHS matrix. Data type supported: F32/F16.
- * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
- * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[out] dst Output tensor info. Data type supported: same as @p lhs
- * @param[in] matmul_info Attributes for Batch MatMul Kernel
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] lhs Input tensor for the LHS matrix. Data type supported: F32/F16.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[out] dst Output tensor info. Data type supported: same as @p lhs
+ * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel
+ * @param[in] act_info           Specifies the activation function to use after the matrix multiplication. Defaults to the identity function.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMulNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp
index 3822c16aa1..c453761a8e 100644
--- a/src/gpu/cl/operators/ClMatMul.cpp
+++ b/src/gpu/cl/operators/ClMatMul.cpp
@@ -47,7 +47,7 @@ ClMatMul::ClMatMul()
{
}
-Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info)
+Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
@@ -57,15 +57,15 @@ Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
std::unique_ptr<IClMatMulNativeKernelConfig> t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
- MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
+ const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
- bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
+ const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
- return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info) :
- ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info);
+ return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info, act_info) :
+ ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info, act_info);
}
-void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info)
+void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info);
@@ -86,14 +86,14 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l
_matmul_lowp_native_kernel->set_target(gpu_target);
// Configure the low-precision native matrix multiply kernel
- _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info);
+ _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info, act_info);
}
else
{
_matmul_native_kernel->set_target(gpu_target);
// Configure the native matrix multiply kernel
- _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info);
+ _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info, act_info);
}
}
diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h
index 6aba801301..9dce5288e6 100644
--- a/src/gpu/cl/operators/ClMatMul.h
+++ b/src/gpu/cl/operators/ClMatMul.h
@@ -21,14 +21,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL
-#define ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL
+#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL
#include "arm_compute/core/ActivationLayerInfo.h"
#include "arm_compute/core/MatMulInfo.h"
#include "src/gpu/cl/IClOperator.h"
-#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include <memory>
@@ -71,24 +71,26 @@ public:
* @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs.
* @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
* @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] act_info    Information about the fused activation function. Defaults to no activation.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info);
+ void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
private:
- std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{nullptr};
- std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{nullptr};
+ std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{ nullptr };
+ std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{ nullptr };
bool _is_quantized{ false };
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL */
+#endif /* ACL_SRC_GPU_CL_OPERATORS_CLMATMUL */
diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp
index ae5a01f679..bef422fca1 100644
--- a/src/runtime/CL/functions/CLMatMul.cpp
+++ b/src/runtime/CL/functions/CLMatMul.cpp
@@ -42,25 +42,26 @@ CLMatMul::CLMatMul()
CLMatMul::~CLMatMul() = default;
-void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings)
+void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(settings);
- configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info);
+ configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info);
}
-void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings)
+void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
ARM_COMPUTE_UNUSED(settings);
_impl->op = std::make_unique<OperatorType>();
- _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info);
+ _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info);
_impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } };
}
-Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info)
+Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
{
- return OperatorType::validate(lhs, rhs, output, matmul_info);
+ return OperatorType::validate(lhs, rhs, output, matmul_info, act_info);
}
void CLMatMul::run()
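
An end-to-end hedged sketch of the updated public CLMatMul entry point (scheduler setup and allocation follow the usual library pattern; the shapes and the bounded-ReLU bound are illustrative):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLMatMul.h"

using namespace arm_compute;

void run_cl_matmul_with_fused_relu()
{
    CLScheduler::get().default_init();

    CLTensor lhs, rhs, dst;
    lhs.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));  // (K, M)
    rhs.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32)); // (N, K)
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // (N, M)

    CLMatMul matmul;
    matmul.configure(&lhs, &rhs, &dst, MatMulInfo{}, GpuMatMulSettings{},
                     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));

    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    dst.allocator()->allocate();
    // ... map and fill lhs and rhs ...

    matmul.run(); // matmul followed by the fused bounded ReLU
}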
diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp
index 0c46516f1e..58640f40ea 100644
--- a/src/runtime/NEON/functions/NEMatMul.cpp
+++ b/src/runtime/NEON/functions/NEMatMul.cpp
@@ -25,9 +25,9 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuMatMul.h"
-#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
{
@@ -49,7 +49,7 @@ NEMatMul::NEMatMul()
NEMatMul::~NEMatMul() = default;
-void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings)
+void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
_impl->lhs = lhs;
_impl->rhs = rhs;
@@ -57,14 +57,14 @@ void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatM
ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output);
_impl->op = std::make_unique<cpu::CpuMatMul>();
- _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings);
+ _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info);
_impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } };
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
-Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings)
+Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
- return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings);
+ return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info);
}
void NEMatMul::run()
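
And the matching hedged sketch for the NEON path through the new NEMatMul overloads (same illustrative shapes; the fast_math setter is shown only to exercise the settings object):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_ne_matmul_with_fused_relu()
{
    Tensor lhs, rhs, dst;
    lhs.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));  // (K, M)
    rhs.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32)); // (N, K)
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // (N, M)

    NEMatMul matmul;
    matmul.configure(&lhs, &rhs, &dst, MatMulInfo{}, CpuMatMulSettings{}.fast_math(true),
                     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill lhs and rhs ...

    matmul.run(); // matmul followed by the fused ReLU
}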