author     Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>   2023-05-25 16:48:43 +0100
committer  Mohmun02 <MohammedSuhail.Munshi@arm.com>                 2023-06-16 15:38:39 +0000
commit     94abde4f4e98f6f1adb5c46b194527f34a8ea07d (patch)
tree       d6d717031788850d970fb44ff3f41de311cc5fc0 /src
parent     dd8d7f4102653ef55d872c71ae5d5f2ca2ead0c1 (diff)
Add Fused Activation to OpenCL MatMul
- Added fused activation to MatMul function interface
- Added fused activation to CL backend
- Includes tests for supported Activation Functions in MatMul

Resolves: [COMPMID-6192]

Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: Ie103212b600b60699eaf6a6394d609e6e1f5aba6
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/522465
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9714
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul.cl            27
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul_quantized.cl  15
-rw-r--r--  src/cpu/operators/CpuMatMul.cpp                     14
-rw-r--r--  src/cpu/operators/CpuMatMul.h                        6
-rw-r--r--  src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp     50
-rw-r--r--  src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h       21
-rw-r--r--  src/gpu/cl/kernels/ClMatMulNativeKernel.cpp         17
-rw-r--r--  src/gpu/cl/kernels/ClMatMulNativeKernel.h           20
-rw-r--r--  src/gpu/cl/operators/ClMatMul.cpp                   16
-rw-r--r--  src/gpu/cl/operators/ClMatMul.h                     18
-rw-r--r--  src/runtime/CL/functions/CLMatMul.cpp               13
-rw-r--r--  src/runtime/NEON/functions/NEMatMul.cpp             10
12 files changed, 137 insertions, 90 deletions
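
At the API level, the patch threads an ActivationLayerInfo through every MatMul entry point, defaulting to no activation. As a hedged sketch (constructor signature per arm_compute/core/ActivationLayerInfo.h, which this patch includes; the bound values are illustrative), the objects passed as the new act_info argument look like this:

#include "arm_compute/core/ActivationLayerInfo.h"

using AF = arm_compute::ActivationLayerInfo::ActivationFunction;

// Default-constructed info: activation is disabled, kernels leave the output as-is.
const arm_compute::ActivationLayerInfo no_act{};
// ReLU: max(0, x); a and b are unused.
const arm_compute::ActivationLayerInfo relu(AF::RELU);
// Bounded ReLU: min(a, max(0, x)).
const arm_compute::ActivationLayerInfo brelu(AF::BOUNDED_RELU, 6.f);
// Lower/upper bounded ReLU: min(a, max(b, x)).
const arm_compute::ActivationLayerInfo lu_brelu(AF::LU_BOUNDED_RELU, 6.f, -6.f);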
diff --git a/src/core/CL/cl_kernels/common/mat_mul.cl b/src/core/CL/cl_kernels/common/mat_mul.cl
index 90d485e815..9656a59728 100644
--- a/src/core/CL/cl_kernels/common/mat_mul.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul.cl
@@ -21,6 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "activation_float_helpers.h"
#include "helpers.h"
#include "tile_helpers.h"
@@ -31,6 +32,7 @@
* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for bounded activation functions.
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
@@ -86,7 +88,7 @@ __kernel void mat_mul_native_nt_nt(
})
const int rhs_z = z * rhs_h;
- int k;
+ int k;
for(k = 0; k <= K - K0; k += K0)
{
TILE(DATA_TYPE, M0, K0, a);
@@ -111,7 +113,7 @@ __kernel void mat_mul_native_nt_nt(
lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
}
-#ifdef K % K0 != 0
+#if K % K0 != 0
/* Leftover Loop */
for(; k < K; ++k)
{
@@ -147,6 +149,8 @@ __kernel void mat_mul_native_nt_nt(
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_NT_NT)
@@ -158,6 +162,7 @@ __kernel void mat_mul_native_nt_nt(
* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for bounded activation functions.
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
@@ -213,7 +218,7 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
})
const int rhs_z = z * rhs_h;
- int k;
+ int k;
for(k = 0; k <= K - K0; k += K0)
{
TILE(DATA_TYPE, M0, K0, a);
@@ -301,6 +306,8 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_NT_T)
@@ -312,6 +319,7 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for bounded activation functions.
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
@@ -367,7 +375,7 @@ __kernel void mat_mul_native_t_nt(
})
const int rhs_z = z * rhs_h;
- int k;
+ int k;
for(k = 0; k <= K - K0; k += K0)
{
TILE(DATA_TYPE, K0, M0, a);
@@ -405,7 +413,7 @@ __kernel void mat_mul_native_t_nt(
lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
}
-#ifdef K % K0 != 0
+#if K % K0 != 0
/* Leftover Loop */
for(; k < K; ++k)
{
@@ -451,6 +459,8 @@ __kernel void mat_mul_native_t_nt(
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_T_NT)
@@ -462,6 +472,7 @@ __kernel void mat_mul_native_t_nt(
* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for bounded activation functions.
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
@@ -517,7 +528,7 @@ __kernel void mat_mul_native_t_t(
})
const int rhs_z = z * rhs_h;
- int k;
+ int k;
for(k = 0; k <= K - K0; k += K0)
{
TILE(DATA_TYPE, K0, M0, a);
@@ -565,7 +576,7 @@ __kernel void mat_mul_native_t_t(
lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
}
-#ifdef K % K0 != 0
+#if K % K0 != 0
/* Leftover Loop */
for(; k < K; ++k)
{
@@ -619,6 +630,8 @@ __kernel void mat_mul_native_t_t(
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_T_T)
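
The T_ACTIVATION calls added above consume the new ACTIVATION_TYPE, A_VAL and B_VAL defines. As a minimal sketch of the elementwise semantics applied to an M0 x N0 accumulator tile (the helper below is hypothetical and host-side, for illustration only; the real work is done by the OpenCL macros in activation_float_helpers.h):

#include <algorithm>

// Hypothetical model of T_ACTIVATION for LU_BOUNDED_RELU: clamp every
// accumulator to [B_VAL, A_VAL]. Plain RELU is max(0, x), ignoring A_VAL.
inline void apply_lu_bounded_relu(float *tile, int m0, int n0, float a_val, float b_val)
{
    for(int i = 0; i < m0 * n0; ++i)
    {
        tile[i] = std::min(a_val, std::max(b_val, tile[i]));
    }
}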
diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
index 0c3cbca9a6..bd415bb4a7 100644
--- a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
@@ -23,6 +23,7 @@
*/
#include "helpers.h"
#include "tile_helpers.h"
+#include "activation_float_helpers.h"
#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
@@ -32,6 +33,7 @@
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for the relu and bounded relu operations.
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
@@ -194,6 +196,8 @@ __kernel void mat_mul_native_quantized_nt_nt(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+ T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
@@ -216,6 +220,7 @@ __kernel void mat_mul_native_quantized_nt_nt(
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for bounded activation functions.
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
@@ -315,7 +320,7 @@ __kernel void mat_mul_native_quantized_nt_t(
rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
}
-#if ((K % K0) != 0)
+#if((K % K0) != 0)
// Leftover loop
for(; k < K; ++k)
{
@@ -370,6 +375,8 @@ __kernel void mat_mul_native_quantized_nt_t(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+ T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
@@ -392,6 +399,7 @@ __kernel void mat_mul_native_quantized_nt_t(
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for the relu and bounded relu operations.
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
@@ -548,6 +556,8 @@ __kernel void mat_mul_native_quantized_t_nt(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+ T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
@@ -570,6 +580,7 @@ __kernel void mat_mul_native_quantized_t_nt(
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max bounds for the relu and bounded relu operations.
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
@@ -731,6 +742,8 @@ __kernel void mat_mul_native_quantized_t_t(
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+ T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
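
Note the ordering in the quantized kernels: the activation is applied to the int32 accumulator tile first, and only then does T_QUANTIZE8_ASYMMETRIC requantize it to the 8-bit output. A simplified, hedged C++ model of that epilogue (the real macro uses the fixed-point DST_MULTIPLIER/DST_SHIFT pair rather than a float rescale; all names below are hypothetical):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t activate_then_quantize(int32_t acc, int32_t a_val, int32_t b_val,
                                      float rescale /* lhs_scale * rhs_scale / dst_scale */,
                                      int32_t dst_offset)
{
    acc = std::min(a_val, std::max(b_val, acc)); // fused activation on the int accumulator
    const int32_t q = static_cast<int32_t>(std::lround(acc * rescale)) + dst_offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q))); // saturate to the QASYMM8 range
}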
diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp
index 87cb6c6b54..515b511044 100644
--- a/src/cpu/operators/CpuMatMul.cpp
+++ b/src/cpu/operators/CpuMatMul.cpp
@@ -25,9 +25,9 @@
#include "src/cpu/operators/CpuMatMul.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
#include "src/common/utils/Log.h"
@@ -45,7 +45,6 @@ namespace cpu
{
namespace
{
-
Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
{
@@ -74,15 +73,14 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
return Status{};
}
-
-}
+} // namespace
CpuMatMul::CpuMatMul()
: _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape()
{
}
-Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings)
+Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
@@ -100,7 +98,7 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
TensorInfo rhs_transposed{};
auto gemm_info = AsmGemmInfo();
- gemm_info.activation_info = info.fused_activation();
+ gemm_info.activation_info = act_info;
gemm_info.fast_mode = settings.fast_math();
// Validate and then permute a/b
@@ -139,7 +137,7 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
return Status{};
}
-void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings)
+void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings);
@@ -189,7 +187,7 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst,
// -----------------------------------------------------
// Use transposed tensors if the corresponding transpose flags are set
// Fill AsmGemmInfo class object before configuration
- _gemm_info.activation_info = info.fused_activation();
+ _gemm_info.activation_info = act_info;
_gemm_info.fast_mode = settings.fast_math();
_gemm_info.negated_offsets = false;
diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h
index 9f5833b24f..475c019fd0 100644
--- a/src/cpu/operators/CpuMatMul.h
+++ b/src/cpu/operators/CpuMatMul.h
@@ -64,15 +64,17 @@ public:
* @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
* @param[in] info Contains MatMul operation information described in @ref MatMulInfo.
* @param[in] settings The settings for matmul operation (i.e fast math)
+ * @param[in] act_info Information about the fused activation function. Defaults to no activation.
*/
- void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings);
+ void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuMatMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
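
A hedged sketch of exercising the extended CpuMatMul interface at the operator level (this is an internal API; the shapes are illustrative and written innermost-first, so an M=4, K=8, N=16 product uses the infos below):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h" // CpuMatMulSettings
#include "src/cpu/operators/CpuMatMul.h"

using namespace arm_compute;

TensorInfo lhs_info(TensorShape(8U, 4U), 1, DataType::F32);  // (K, M)
TensorInfo rhs_info(TensorShape(16U, 8U), 1, DataType::F32); // (N, K)
TensorInfo dst_info(TensorShape(16U, 4U), 1, DataType::F32); // (N, M)

const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);
const Status status = cpu::CpuMatMul::validate(&lhs_info, &rhs_info, &dst_info,
                                               MatMulInfo{}, CpuMatMulSettings{}, act);
// status.error_code() == ErrorCode::OK means configure() with the same
// arguments (including the fused activation) is expected to succeed.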
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
index d5ecdf7dd2..9bbec908a3 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
@@ -98,34 +98,36 @@ ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel()
{
_type = CLKernelType::GEMM;
}
-Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info)
+Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, output);
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
- if(output->total_size() != 0)
+ if(dst->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, output);
+ const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
}
return Status{};
}
-void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info)
+void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output, &compile_context, &matmul_kernel_info);
- ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_kernel_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_kernel_info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_kernel_info));
// output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
- const int m = output->dimension(1);
- const int n = output->dimension(0);
+ const int m = dst->dimension(1);
+ const int n = dst->dimension(0);
const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
const bool adj_lhs = matmul_kernel_info.adj_lhs;
@@ -133,7 +135,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
// Configure kernel window
- Window win = calculate_max_window(*output, Steps(n0, m0));
+ Window win = calculate_max_window(*dst, Steps(n0, m0));
win = win.collapse(win, Window::DimZ);
IClKernel::configure_internal(win);
@@ -152,7 +154,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform();
const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform();
- const UniformQuantizationInfo dqinfo = output->quantization_info().uniform();
+ const UniformQuantizationInfo dqinfo = dst->quantization_info().uniform();
float multiplier = lqinfo.scale * rqinfo.scale / dqinfo.scale;
int output_multiplier = 0;
@@ -166,6 +168,10 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(-rqinfo.offset)); // Note this is passed as negative to maintain similarity with CLDirectConv2D
build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset)); // Passed as positive (unlike the above two)
+ build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+
std::string kernel_name("mat_mul_native_quantized");
kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
@@ -177,7 +183,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set config_id for enabling LWS tuning
- const size_t number_of_batches = output->tensor_shape().total_size() / (m * n);
+ const size_t number_of_batches = dst->tensor_shape().total_size() / (m * n);
_config_id = kernel_name;
_config_id += "_";
@@ -203,18 +209,18 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- ICLTensor *output = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
- ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output);
+ const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst);
unsigned int idx = 0;
Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
add_3d_tensor_nhw_argument(idx, lhs);
add_3d_tensor_nhw_argument(idx, rhs);
- add_3d_tensor_nhw_argument(idx, output);
+ add_3d_tensor_nhw_argument(idx, dst);
enqueue(queue, *this, window_collapsed, lws_hint());
}
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
index d70ff30b91..67d1a6601f 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
@@ -24,6 +24,7 @@
#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL
#define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL
+#include "arm_compute/core/ActivationLayerInfo.h"
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -43,22 +44,24 @@ public:
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulLowpNativeKernel);
/** Initialise the kernel's input and output.
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] lhs Input tensor for the LHS matrix. Data type supported: QASYMM8_SIGNED/QASYMM8.
- * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
- * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[out] dst Output tensor info. Data type supported: same as @p lhs
- * @param[in] matmul_info Attributes for Batch MatMul Kernel
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] lhs Input tensor for the LHS matrix. Data type supported: QASYMM8_SIGNED/QASYMM8.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[out] dst Output tensor info. Data type supported: same as @p lhs
+ * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel
+ * @param[in] act_info           Information about the fused activation function. Defaults to no activation.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMulLowpNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
index 8f53c1998f..205396a639 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
@@ -112,7 +112,7 @@ Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInf
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration");
}
- return Status {};
+ return Status{};
}
}
ClMatMulNativeKernel::ClMatMulNativeKernel()
@@ -120,8 +120,9 @@ ClMatMulNativeKernel::ClMatMulNativeKernel()
_type = CLKernelType::GEMM;
}
-Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
+Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info)
{
+ ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
@@ -138,7 +139,8 @@ Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo
return Status{};
}
-void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
+void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info);
@@ -176,6 +178,11 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT
build_opts.add_option("-DK=" + support::cpp11::to_string(k));
build_opts.add_option_if_else(_export_rhs_to_cl_image, "-DRHS_TENSOR_TYPE=IMAGE", "-DRHS_TENSOR_TYPE=BUFFER");
+ // Define values for activation function
+ build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+
std::string kernel_name("mat_mul_native");
kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
@@ -218,8 +225,8 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst);
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
index f706256e31..02d8ac3067 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
@@ -42,22 +42,24 @@ public:
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulNativeKernel);
/** Initialise the kernel's input and output.
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] lhs Input tensor for the LHS matrix. Data type supported: F32/F16.
- * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
- * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[out] dst Output tensor info. Data type supported: same as @p lhs
- * @param[in] matmul_info Attributes for Batch MatMul Kernel
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] lhs Input tensor for the LHS matrix. Data type supported: F32/F16.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[out] dst Output tensor info. Data type supported: same as @p lhs
+ * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel
+ * @param[in] act_info           Specifies the activation function to use after the matrix multiplication. Defaults to the identity function.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMulNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp
index 3822c16aa1..c453761a8e 100644
--- a/src/gpu/cl/operators/ClMatMul.cpp
+++ b/src/gpu/cl/operators/ClMatMul.cpp
@@ -47,7 +47,7 @@ ClMatMul::ClMatMul()
{
}
-Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info)
+Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
@@ -57,15 +57,15 @@ Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
std::unique_ptr<IClMatMulNativeKernelConfig> t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
- MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
+ const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
- bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
+ const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
- return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info) :
- ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info);
+ return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info, act_info) :
+ ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info, act_info);
}
-void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info)
+void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info);
@@ -86,14 +86,14 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l
_matmul_lowp_native_kernel->set_target(gpu_target);
// Configure the low-precision native matrix multiply kernel
- _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info);
+ _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info, act_info);
}
else
{
_matmul_native_kernel->set_target(gpu_target);
// Configure the native matrix multiply kernel
- _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info);
+ _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info, act_info);
}
}
diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h
index 6aba801301..9dce5288e6 100644
--- a/src/gpu/cl/operators/ClMatMul.h
+++ b/src/gpu/cl/operators/ClMatMul.h
@@ -21,14 +21,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL
-#define ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL
+#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL
#include "arm_compute/core/ActivationLayerInfo.h"
#include "arm_compute/core/MatMulInfo.h"
#include "src/gpu/cl/IClOperator.h"
-#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include <memory>
@@ -71,24 +71,26 @@ public:
* @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs.
* @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
* @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] act_info    Information about the fused activation function. Defaults to no activation.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info);
+ void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
private:
- std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{nullptr};
- std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{nullptr};
+ std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{ nullptr };
+ std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{ nullptr };
bool _is_quantized{ false };
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL */
+#endif /* ACL_SRC_GPU_CL_OPERATORS_CLMATMUL */
diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp
index ae5a01f679..bef422fca1 100644
--- a/src/runtime/CL/functions/CLMatMul.cpp
+++ b/src/runtime/CL/functions/CLMatMul.cpp
@@ -42,25 +42,26 @@ CLMatMul::CLMatMul()
CLMatMul::~CLMatMul() = default;
-void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings)
+void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(settings);
- configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info);
+ configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info);
}
-void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings)
+void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
ARM_COMPUTE_UNUSED(settings);
_impl->op = std::make_unique<OperatorType>();
- _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info);
+ _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info);
_impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } };
}
-Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info)
+Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
{
- return OperatorType::validate(lhs, rhs, output, matmul_info);
+ return OperatorType::validate(lhs, rhs, output, matmul_info, act_info);
}
void CLMatMul::run()
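
An end-to-end hedged sketch of the updated public CLMatMul entry point (scheduler setup and allocation follow the usual library pattern; the shapes and the bounded-ReLU bound are illustrative):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLMatMul.h"

using namespace arm_compute;

void run_cl_matmul_with_fused_relu()
{
    CLScheduler::get().default_init();

    CLTensor lhs, rhs, dst;
    lhs.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));  // (K, M)
    rhs.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32)); // (N, K)
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // (N, M)

    CLMatMul matmul;
    matmul.configure(&lhs, &rhs, &dst, MatMulInfo{}, GpuMatMulSettings{},
                     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));

    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    dst.allocator()->allocate();
    // ... map and fill lhs and rhs ...

    matmul.run(); // matmul followed by the fused bounded ReLU
}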
diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp
index 0c46516f1e..58640f40ea 100644
--- a/src/runtime/NEON/functions/NEMatMul.cpp
+++ b/src/runtime/NEON/functions/NEMatMul.cpp
@@ -25,9 +25,9 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuMatMul.h"
-#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
{
@@ -49,7 +49,7 @@ NEMatMul::NEMatMul()
NEMatMul::~NEMatMul() = default;
-void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings)
+void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
_impl->lhs = lhs;
_impl->rhs = rhs;
@@ -57,14 +57,14 @@ void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatM
ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output);
_impl->op = std::make_unique<cpu::CpuMatMul>();
- _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings);
+ _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info);
_impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } };
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
-Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings)
+Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
- return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings);
+ return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info);
}
void NEMatMul::run()
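
And the matching hedged sketch for the NEON path through the new NEMatMul overloads (same illustrative shapes; the fast_math setter is shown only to exercise the settings object):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_ne_matmul_with_fused_relu()
{
    Tensor lhs, rhs, dst;
    lhs.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));  // (K, M)
    rhs.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32)); // (N, K)
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // (N, M)

    NEMatMul matmul;
    matmul.configure(&lhs, &rhs, &dst, MatMulInfo{}, CpuMatMulSettings{}.fast_math(true),
                     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill lhs and rhs ...

    matmul.run(); // matmul followed by the fused ReLU
}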