author    Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>  2023-05-25 16:48:43 +0100
committer Mohmun02 <MohammedSuhail.Munshi@arm.com>                2023-06-16 15:38:39 +0000
commit    94abde4f4e98f6f1adb5c46b194527f34a8ea07d (patch)
tree      d6d717031788850d970fb44ff3f41de311cc5fc0
parent    dd8d7f4102653ef55d872c71ae5d5f2ca2ead0c1 (diff)
download  ComputeLibrary-94abde4f4e98f6f1adb5c46b194527f34a8ea07d.tar.gz
Add Fused Activation to OpenCL MatMul
- Added fused activation to MatMul function interface
- Added fused activation to CL backend
- Includes tests for supported Activation Functions in MatMul

Resolves: [COMPMID-6192]

Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: Ie103212b600b60699eaf6a6394d609e6e1f5aba6
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/522465
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9714
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/core/MatMulInfo.h                       |  12
-rw-r--r--  arm_compute/runtime/CL/functions/CLMatMul.h         |  22
-rw-r--r--  arm_compute/runtime/NEON/functions/NEMatMul.h       |  17
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul.cl            |  27
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul_quantized.cl  |  15
-rw-r--r--  src/cpu/operators/CpuMatMul.cpp                     |  14
-rw-r--r--  src/cpu/operators/CpuMatMul.h                       |   6
-rw-r--r--  src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp     |  50
-rw-r--r--  src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h       |  21
-rw-r--r--  src/gpu/cl/kernels/ClMatMulNativeKernel.cpp         |  17
-rw-r--r--  src/gpu/cl/kernels/ClMatMulNativeKernel.h           |  20
-rw-r--r--  src/gpu/cl/operators/ClMatMul.cpp                   |  16
-rw-r--r--  src/gpu/cl/operators/ClMatMul.h                     |  18
-rw-r--r--  src/runtime/CL/functions/CLMatMul.cpp               |  13
-rw-r--r--  src/runtime/NEON/functions/NEMatMul.cpp             |  10
-rw-r--r--  tests/validation/CL/MatMul.cpp                      | 182
-rw-r--r--  tests/validation/fixtures/MatMulFixture.h           |  60
-rw-r--r--  utils/TypePrinter.h                                 |   6
18 files changed, 335 insertions, 191 deletions
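
For orientation, a minimal usage sketch of the public interface after this patch (not part of the commit; tensor shapes, allocation and CL scheduler setup are assumed, and only the configure()/validate() signatures are taken from the headers changed below):

#include "arm_compute/core/ActivationLayerInfo.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/MatMulInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLMatMul.h"

using namespace arm_compute;

void run_matmul_with_fused_relu(CLTensor &lhs, CLTensor &rhs, CLTensor &dst)
{
    const MatMulInfo          matmul_info{}; // adj_lhs/adj_rhs left at their defaults
    const GpuMatMulSettings   settings{};
    const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU); // fused activation, new in this patch

    // validate() and configure() now take the activation info as a trailing, defaulted argument
    ARM_COMPUTE_ERROR_THROW_ON(CLMatMul::validate(lhs.info(), rhs.info(), dst.info(), matmul_info, act_info));

    CLMatMul matmul;
    matmul.configure(&lhs, &rhs, &dst, matmul_info, settings, act_info);
    matmul.run();
}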
diff --git a/arm_compute/core/MatMulInfo.h b/arm_compute/core/MatMulInfo.h
index 62d782215b..01b9b47761 100644
--- a/arm_compute/core/MatMulInfo.h
+++ b/arm_compute/core/MatMulInfo.h
@@ -58,11 +58,6 @@ public:
{
return _adj_rhs;
}
- /* Get Fused Activation Layer Info */
- ActivationLayerInfo fused_activation() const
- {
- return _fused_act;
- }
/* Set Adjoint LHS flag */
MatMulInfo &adj_lhs(bool adj_lhs)
{
@@ -75,17 +70,10 @@ public:
_adj_rhs = adj_rhs;
return *this;
}
- /* Set Fused Activation Layer Info */
- MatMulInfo &fused_activation(const ActivationLayerInfo &act_info)
- {
- _fused_act = act_info;
- return *this;
- }
private:
bool _adj_lhs{ false };
bool _adj_rhs{ false };
- ActivationLayerInfo _fused_act{}; // disabled by default
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_MATMULINFO_H */
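
Migration sketch implied by the hunk above (illustrative only; the operator and lhs/rhs/dst are assumed to be set up elsewhere): the activation no longer rides inside MatMulInfo but is passed alongside it.

#include "arm_compute/core/ActivationLayerInfo.h"
#include "arm_compute/core/MatMulInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLMatMul.h"

using namespace arm_compute;

void migrate_fused_activation(CLMatMul &matmul, CLTensor &lhs, CLTensor &rhs, CLTensor &dst)
{
    const ActivationLayerInfo relu(ActivationLayerInfo::ActivationFunction::RELU);

    // Before this patch: matmul.configure(&lhs, &rhs, &dst, MatMulInfo().fused_activation(relu), GpuMatMulSettings{});
    // After this patch the activation is its own (defaulted) parameter:
    matmul.configure(&lhs, &rhs, &dst, MatMulInfo{}, GpuMatMulSettings{}, relu);
}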
diff --git a/arm_compute/runtime/CL/functions/CLMatMul.h b/arm_compute/runtime/CL/functions/CLMatMul.h
index 2af9a4a9a6..a11c1ed6a2 100644
--- a/arm_compute/runtime/CL/functions/CLMatMul.h
+++ b/arm_compute/runtime/CL/functions/CLMatMul.h
@@ -24,6 +24,8 @@
#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
+#include "arm_compute/core/ActivationLayerInfo.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -83,21 +85,29 @@ public:
* @param[in] rhs Right-hand side tensor info containing the input weights as Matrix B. Data types supported: same as @p lhs.
* @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
* @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
- * @param[in] settings Class containing flags for function level settings
+ * @param[in] settings Contains flags for function level settings
+ * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{});
+ void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}, const
+ ActivationLayerInfo &act_info = ActivationLayerInfo{});
/** Initialise the kernel's inputs and output
*
* Similar to @ref CLMatMul::configure()
*/
- void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{});
+ void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}, const ActivationLayerInfo &act_info =
+ ActivationLayerInfo{});
/** Static function to check if given info will lead to a valid configuration of @ref CLMatMul.
*
- * Similar to @ref CLMatMul::configure()
*
- * @return a status
+ * @note All tensors must have the same data type.
+ *
+ * @param[in] lhs Left-hand side (Matrix A) tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+ * @param[in] rhs Right-hand side (Matrix B) tensor info. Data types supported: same as @p lhs.
+ * @param[out] output Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
+ * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo{});
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h
index a331c55a98..81fec19f86 100644
--- a/arm_compute/runtime/NEON/functions/NEMatMul.h
+++ b/arm_compute/runtime/NEON/functions/NEMatMul.h
@@ -24,6 +24,8 @@
#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL
#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL
+#include "arm_compute/core/ActivationLayerInfo.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -91,16 +93,23 @@ public:
* @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs.
* @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
* @param[in] info Contains MatMul operation information described in @ref MatMulInfo.
- * @param[in] settings Class containing flags for function level settings i.e fast math
+ * @param[in] settings Contains flags for function level settings i.e fast math
+ * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
*/
- void configure(ITensor *lhs, ITensor *rhs, ITensor *dst, const MatMulInfo &info, const CpuMatMulSettings &settings);
+ void configure(ITensor *lhs, ITensor *rhs, ITensor *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEMatMul
*
- * Parameters are similar to @ref NEMatMul::configure()
+ * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+ * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs.
+ * @param[out] dst Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
+ * @param[in] info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] settings Contains flags for function level settings i.e fast math
+ * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
*
* @return Status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden
void run() override;
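
The CPU-side function gains the same trailing parameter; a parallel sketch (illustrative, tensor setup assumed; the fast-math flag mirrors the settings class documented above):

#include "arm_compute/core/ActivationLayerInfo.h"
#include "arm_compute/core/MatMulInfo.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_nematmul_with_fused_relu(Tensor &lhs, Tensor &rhs, Tensor &dst)
{
    const CpuMatMulSettings   settings = CpuMatMulSettings().fast_math(true); // optional fast math, per the settings flag above
    const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);

    NEMatMul matmul;
    matmul.configure(&lhs, &rhs, &dst, MatMulInfo{}, settings, act_info);
    matmul.run();
}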
diff --git a/src/core/CL/cl_kernels/common/mat_mul.cl b/src/core/CL/cl_kernels/common/mat_mul.cl
index 90d485e815..9656a59728 100644
--- a/src/core/CL/cl_kernels/common/mat_mul.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul.cl
@@ -21,6 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "activation_float_helpers.h"
#include "helpers.h"
#include "tile_helpers.h"
@@ -31,6 +32,7 @@
* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
@@ -86,7 +88,7 @@ __kernel void mat_mul_native_nt_nt(
})
const int rhs_z = z * rhs_h;
- int k;
+ int k;
for(k = 0; k <= K - K0; k += K0)
{
TILE(DATA_TYPE, M0, K0, a);
@@ -111,7 +113,7 @@ __kernel void mat_mul_native_nt_nt(
lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
}
-#ifdef K % K0 != 0
+#if K % K0 != 0
/* Leftover Loop */
for(; k < K; ++k)
{
@@ -147,6 +149,8 @@ __kernel void mat_mul_native_nt_nt(
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_NT_NT)
@@ -158,6 +162,7 @@ __kernel void mat_mul_native_nt_nt(
* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
@@ -213,7 +218,7 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
})
const int rhs_z = z * rhs_h;
- int k;
+ int k;
for(k = 0; k <= K - K0; k += K0)
{
TILE(DATA_TYPE, M0, K0, a);
@@ -301,6 +306,8 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_NT_T)
@@ -312,6 +319,7 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
@@ -367,7 +375,7 @@ __kernel void mat_mul_native_t_nt(
})
const int rhs_z = z * rhs_h;
- int k;
+ int k;
for(k = 0; k <= K - K0; k += K0)
{
TILE(DATA_TYPE, K0, M0, a);
@@ -405,7 +413,7 @@ __kernel void mat_mul_native_t_nt(
lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
}
-#ifdef K % K0 != 0
+#if K % K0 != 0
/* Leftover Loop */
for(; k < K; ++k)
{
@@ -451,6 +459,8 @@ __kernel void mat_mul_native_t_nt(
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_T_NT)
@@ -462,6 +472,7 @@ __kernel void mat_mul_native_t_nt(
* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
@@ -517,7 +528,7 @@ __kernel void mat_mul_native_t_t(
})
const int rhs_z = z * rhs_h;
- int k;
+ int k;
for(k = 0; k <= K - K0; k += K0)
{
TILE(DATA_TYPE, K0, M0, a);
@@ -565,7 +576,7 @@ __kernel void mat_mul_native_t_t(
lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
}
-#ifdef K % K0 != 0
+#if K % K0 != 0
/* Leftover Loop */
for(; k < K; ++k)
{
@@ -619,6 +630,8 @@ __kernel void mat_mul_native_t_t(
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_T_T)
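
For reference, the -DACTIVATION_TYPE / -DA_VAL / -DB_VAL options described in the @note lines above are generated on the host side later in this patch (see ClMatMulNativeKernel::configure and ClMatMulLowpNativeKernel::configure). As a rough, illustrative example, a plain ReLU lowers to something like -DACTIVATION_TYPE=relu with A_VAL/B_VAL unused, while a LU_BOUNDED_RELU passes its upper and lower bounds through A_VAL and B_VAL; the exact strings come from lower_string(string_from_activation_func(...)) and float_to_string_with_full_precision(...).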
diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
index 0c3cbca9a6..bd415bb4a7 100644
--- a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
@@ -23,6 +23,7 @@
*/
#include "helpers.h"
#include "tile_helpers.h"
+#include "activation_float_helpers.h"
#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
@@ -32,6 +33,7 @@
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
@@ -194,6 +196,8 @@ __kernel void mat_mul_native_quantized_nt_nt(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+ T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
@@ -216,6 +220,7 @@ __kernel void mat_mul_native_quantized_nt_nt(
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
@@ -315,7 +320,7 @@ __kernel void mat_mul_native_quantized_nt_t(
rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
}
-#if ((K % K0) != 0)
+#if((K % K0) != 0)
// Leftover loop
for(; k < K; ++k)
{
@@ -370,6 +375,8 @@ __kernel void mat_mul_native_quantized_nt_t(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+ T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
@@ -392,6 +399,7 @@ __kernel void mat_mul_native_quantized_nt_t(
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
@@ -548,6 +556,8 @@ __kernel void mat_mul_native_quantized_t_nt(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+ T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
@@ -570,6 +580,7 @@ __kernel void mat_mul_native_quantized_t_nt(
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
@@ -731,6 +742,8 @@ __kernel void mat_mul_native_quantized_t_t(
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
// Quantize the tile
+ T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp
index 87cb6c6b54..515b511044 100644
--- a/src/cpu/operators/CpuMatMul.cpp
+++ b/src/cpu/operators/CpuMatMul.cpp
@@ -25,9 +25,9 @@
#include "src/cpu/operators/CpuMatMul.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
#include "src/common/utils/Log.h"
@@ -45,7 +45,6 @@ namespace cpu
{
namespace
{
-
Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
{
@@ -74,15 +73,14 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
return Status{};
}
-
-}
+} // namespace
CpuMatMul::CpuMatMul()
: _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape()
{
}
-Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings)
+Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
@@ -100,7 +98,7 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
TensorInfo rhs_transposed{};
auto gemm_info = AsmGemmInfo();
- gemm_info.activation_info = info.fused_activation();
+ gemm_info.activation_info = act_info;
gemm_info.fast_mode = settings.fast_math();
// Validate and then permute a/b
@@ -139,7 +137,7 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
return Status{};
}
-void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings)
+void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings);
@@ -189,7 +187,7 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst,
// -----------------------------------------------------
// Use transposed tensors if the corresponding transpose flags are set
// Fill AsmGemmInfo class object before configuration
- _gemm_info.activation_info = info.fused_activation();
+ _gemm_info.activation_info = act_info;
_gemm_info.fast_mode = settings.fast_math();
_gemm_info.negated_offsets = false;
diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h
index 9f5833b24f..475c019fd0 100644
--- a/src/cpu/operators/CpuMatMul.h
+++ b/src/cpu/operators/CpuMatMul.h
@@ -64,15 +64,17 @@ public:
* @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
* @param[in] info Contains MatMul operation information described in @ref MatMulInfo.
* @param[in] settings The settings for matmul operation (i.e fast math)
+ * @param[in] act_info Class containing information about fused activation function.
*/
- void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings);
+ void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuMatMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
index d5ecdf7dd2..9bbec908a3 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
@@ -98,34 +98,36 @@ ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel()
{
_type = CLKernelType::GEMM;
}
-Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info)
+Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, output);
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
- if(output->total_size() != 0)
+ if(dst->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, output);
+ const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
}
return Status{};
}
-void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info)
+void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output, &compile_context, &matmul_kernel_info);
- ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_kernel_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_kernel_info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_kernel_info));
// output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
- const int m = output->dimension(1);
- const int n = output->dimension(0);
+ const int m = dst->dimension(1);
+ const int n = dst->dimension(0);
const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
const bool adj_lhs = matmul_kernel_info.adj_lhs;
@@ -133,7 +135,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
// Configure kernel window
- Window win = calculate_max_window(*output, Steps(n0, m0));
+ Window win = calculate_max_window(*dst, Steps(n0, m0));
win = win.collapse(win, Window::DimZ);
IClKernel::configure_internal(win);
@@ -152,7 +154,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform();
const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform();
- const UniformQuantizationInfo dqinfo = output->quantization_info().uniform();
+ const UniformQuantizationInfo dqinfo = dst->quantization_info().uniform();
float multiplier = lqinfo.scale * rqinfo.scale / dqinfo.scale;
int output_multiplier = 0;
@@ -166,6 +168,10 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(-rqinfo.offset)); // Note this is passed as negative to maintain similarity with CLDirectConv2D
build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset)); // Passed as positive (unlike the above two)
+ build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())));
+ build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())));
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+
std::string kernel_name("mat_mul_native_quantized");
kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
@@ -177,7 +183,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set config_id for enabling LWS tuning
- const size_t number_of_batches = output->tensor_shape().total_size() / (m * n);
+ const size_t number_of_batches = dst->tensor_shape().total_size() / (m * n);
_config_id = kernel_name;
_config_id += "_";
@@ -203,18 +209,18 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- ICLTensor *output = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
- ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output);
+ const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst);
unsigned int idx = 0;
Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
add_3d_tensor_nhw_argument(idx, lhs);
add_3d_tensor_nhw_argument(idx, rhs);
- add_3d_tensor_nhw_argument(idx, output);
+ add_3d_tensor_nhw_argument(idx, dst);
enqueue(queue, *this, window_collapsed, lws_hint());
}
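
For reference, a worked instance of the requantization scale set up in the hunk above (illustrative values, not from the patch): with lqinfo.scale = 0.5, rqinfo.scale = 0.25 and dqinfo.scale = 0.125, multiplier = 0.5 * 0.25 / 0.125 = 1.0, i.e. the int32 accumulator, after the fused T_ACTIVATION has been applied, is rescaled 1:1 by the asymmetric 8-bit quantization step that follows.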
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
index d70ff30b91..67d1a6601f 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
@@ -24,6 +24,7 @@
#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL
#define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL
+#include "arm_compute/core/ActivationLayerInfo.h"
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -43,22 +44,24 @@ public:
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulLowpNativeKernel);
/** Initialise the kernel's input and output.
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] lhs Input tensor for the LHS matrix. Data type supported: QASYMM8_SIGNED/QASYMM8.
- * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
- * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[out] dst Output tensor info. Data type supported: same as @p lhs
- * @param[in] matmul_info Attributes for Batch MatMul Kernel
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] lhs Input tensor for the LHS matrix. Data type supported: QASYMM8_SIGNED/QASYMM8.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[out] dst Output tensor info. Data type supported: same as @p lhs
+ * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel
+ * @param[in] act_info Class containing information about fused activation function.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMulLowpNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
index 8f53c1998f..205396a639 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
@@ -112,7 +112,7 @@ Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInf
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration");
}
- return Status {};
+ return Status{};
}
}
ClMatMulNativeKernel::ClMatMulNativeKernel()
@@ -120,8 +120,9 @@ ClMatMulNativeKernel::ClMatMulNativeKernel()
_type = CLKernelType::GEMM;
}
-Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
+Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info)
{
+ ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
@@ -138,7 +139,8 @@ Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo
return Status{};
}
-void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
+void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info);
@@ -176,6 +178,11 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT
build_opts.add_option("-DK=" + support::cpp11::to_string(k));
build_opts.add_option_if_else(_export_rhs_to_cl_image, "-DRHS_TENSOR_TYPE=IMAGE", "-DRHS_TENSOR_TYPE=BUFFER");
+ // Define values for activation function
+ build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())));
+ build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())));
+ build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+
std::string kernel_name("mat_mul_native");
kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
@@ -218,8 +225,8 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst);
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
index f706256e31..02d8ac3067 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
@@ -42,22 +42,24 @@ public:
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulNativeKernel);
/** Initialise the kernel's input and output.
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] lhs Input tensor for the LHS matrix. Data type supported: F32/F16.
- * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
- * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[out] dst Output tensor info. Data type supported: same as @p lhs
- * @param[in] matmul_info Attributes for Batch MatMul Kernel
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] lhs Input tensor for the LHS matrix. Data type supported: F32/F16.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
+ * Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[out] dst Output tensor info. Data type supported: same as @p lhs
+ * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel
+ * @param[in] act_info Specifies activation function to use after Matrix multiplication. Default is Identity function.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMulNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp
index 3822c16aa1..c453761a8e 100644
--- a/src/gpu/cl/operators/ClMatMul.cpp
+++ b/src/gpu/cl/operators/ClMatMul.cpp
@@ -47,7 +47,7 @@ ClMatMul::ClMatMul()
{
}
-Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info)
+Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
@@ -57,15 +57,15 @@ Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
std::unique_ptr<IClMatMulNativeKernelConfig> t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
- MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
+ const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
- bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
+ const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
- return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info) :
- ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info);
+ return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info, act_info) :
+ ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info, act_info);
}
-void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info)
+void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info);
@@ -86,14 +86,14 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l
_matmul_lowp_native_kernel->set_target(gpu_target);
// Configure the low-precision native matrix multiply kernel
- _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info);
+ _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info, act_info);
}
else
{
_matmul_native_kernel->set_target(gpu_target);
// Configure the native matrix multiply kernel
- _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info);
+ _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info, act_info);
}
}
diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h
index 6aba801301..9dce5288e6 100644
--- a/src/gpu/cl/operators/ClMatMul.h
+++ b/src/gpu/cl/operators/ClMatMul.h
@@ -21,14 +21,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL
-#define ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL
+#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL
#include "arm_compute/core/ActivationLayerInfo.h"
#include "arm_compute/core/MatMulInfo.h"
#include "src/gpu/cl/IClOperator.h"
-#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include <memory>
@@ -71,24 +71,26 @@ public:
* @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs.
* @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
* @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] act_info Class containing information about fused activation function.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info);
+ void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
private:
- std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{nullptr};
- std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{nullptr};
+ std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{ nullptr };
+ std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{ nullptr };
bool _is_quantized{ false };
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL */
+#endif /* ACL_SRC_GPU_CL_OPERATORS_CLMATMUL */
diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp
index ae5a01f679..bef422fca1 100644
--- a/src/runtime/CL/functions/CLMatMul.cpp
+++ b/src/runtime/CL/functions/CLMatMul.cpp
@@ -42,25 +42,26 @@ CLMatMul::CLMatMul()
CLMatMul::~CLMatMul() = default;
-void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings)
+void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(settings);
- configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info);
+ configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info);
}
-void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings)
+void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
ARM_COMPUTE_UNUSED(settings);
_impl->op = std::make_unique<OperatorType>();
- _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info);
+ _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info);
_impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } };
}
-Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info)
+Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
{
- return OperatorType::validate(lhs, rhs, output, matmul_info);
+ return OperatorType::validate(lhs, rhs, output, matmul_info, act_info);
}
void CLMatMul::run()
diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp
index 0c46516f1e..58640f40ea 100644
--- a/src/runtime/NEON/functions/NEMatMul.cpp
+++ b/src/runtime/NEON/functions/NEMatMul.cpp
@@ -25,9 +25,9 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuMatMul.h"
-#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
{
@@ -49,7 +49,7 @@ NEMatMul::NEMatMul()
NEMatMul::~NEMatMul() = default;
-void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings)
+void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
_impl->lhs = lhs;
_impl->rhs = rhs;
@@ -57,14 +57,14 @@ void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatM
ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output);
_impl->op = std::make_unique<cpu::CpuMatMul>();
- _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings);
+ _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info);
_impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } };
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
-Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings)
+Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
{
- return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings);
+ return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info);
}
void NEMatMul::run()
diff --git a/tests/validation/CL/MatMul.cpp b/tests/validation/CL/MatMul.cpp
index 6364b16200..5a262a8e78 100644
--- a/tests/validation/CL/MatMul.cpp
+++ b/tests/validation/CL/MatMul.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/runtime/CL/functions/CLMatMul.h"
#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ActivationFunctionsDataset.h"
#include "tests/framework/DatasetModes.h"
#include "tests/framework/Macros.h"
#include "tests/framework/TestCase.h"
@@ -44,11 +45,13 @@ namespace validation
{
namespace
{
-RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */
-constexpr float abs_tolerance_f32(0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp32 data type in case using relative tolerance fails because of small values */
-constexpr float abs_tolerance_f16(0.001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data type in case using relative tolerance fails because of small values */
-RelativeTolerance<half_float::half> tolerance_f16(half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */
-constexpr AbsoluteTolerance<uint8_t> tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */
+constexpr float abs_tolerance_f32(
+ 0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp32 data type in case using relative tolerance fails because of small values */
+constexpr float abs_tolerance_f16(
+ 0.001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data type in case using relative tolerance fails because of small values */
+RelativeTolerance<half_float::half> tolerance_f16(half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */
+constexpr AbsoluteTolerance<uint8_t> tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
} // namespace
template <typename T>
@@ -57,25 +60,71 @@ using CLMatMulFixture = MatMulValidationFixture<CLTensor, CLAccessor, CLMatMul,
template <typename T>
using CLQuantizedMatMulFixture = QuantizedMatMulValidationFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;
+template <typename T>
+using CLMatMulActivationFixture = MatMulValidationWithActivationFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;
+
+template <typename T>
+using CLMatMulActivationAlphaBetaFixture = MatMulValidationWithActivationAlphaBetaFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;
+
+template <typename T>
+using CLQuantizedMatMulActivationFixture = QuantizedMatMulValidationWithActivationFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;
+
+/* The main act functions matmul is expected to support */
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+ ActivationLayerInfo(),
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.75f, 0.25f),
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH)
+});
+
+const auto ActivationFunctionsQuantizedDataset = concat(concat(concat(
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo()),
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))),
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f))),
+ framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.75f, 0.25f)));
+
+/* Larger activation functions dataset, used during some nightly tests. */
+const auto AllActivationsDataset = combine(datasets::ActivationFunctions(), framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
+
+const auto AllQuantizedActivationsDataset = combine(concat(datasets::ActivationFunctionsQuantized(),
+ framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::HARD_SWISH,
+ ActivationLayerInfo::ActivationFunction::LEAKY_RELU
+ })),
+ framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
+
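// Editorial note, not part of the patch: in the validation framework, concat() flattens its
// arguments into a single list while combine() forms the cartesian product. So
// ActivationFunctionsQuantizedDataset is a flat list of four ActivationLayerInfo entries,
// whereas AllActivationsDataset pairs every entry of datasets::ActivationFunctions() with
// AlphaBeta values 0.5f and 1.f; the alpha/beta fixtures in MatMulFixture.h below turn each
// (function, alpha_beta) pair into ActivationLayerInfo(function, alpha_beta, alpha_beta).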
TEST_SUITE(CL)
TEST_SUITE(MatMul)
TEST_SUITE(Float)
TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallMatMulDataset(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulActivationFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+ framework::dataset::make("TransposeA", { false, true })),
+ framework::dataset::make("TransposeB", { false, true })),
+ framework::dataset::make("DataType", DataType::F32)),
+ ActivationFunctionsDataset))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulActivationFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeMatMulDataset(),
framework::dataset::make("TransposeA", { false, true })),
- framework::dataset::make("TransposeB", { false, true })),
- framework::dataset::make("DataType", DataType::F32)))
+ framework::dataset::make("TransposeB", { false, true })),
+ framework::dataset::make("DataType", DataType::F32)),
+ ActivationFunctionsDataset))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(),
- framework::dataset::make("TransposeA", { false, true })),
- framework::dataset::make("TransposeB", { false, true })),
- framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunAllActivations, CLMatMulActivationAlphaBetaFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SmallerMatMulDataset(),
+ framework::dataset::make("TransposeA", { false })),
+ framework::dataset::make("TransposeB", { true })),
+ framework::dataset::make("DataType", DataType::F32)),
+ AllActivationsDataset))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
@@ -85,19 +134,21 @@ TEST_SUITE_END() // FP32
TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallMatMulDataset(),
- framework::dataset::make("TransposeA", { false, true })),
- framework::dataset::make("TransposeB", { false, true })),
- framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulActivationFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+ framework::dataset::make("TransposeA", { false, true })),
+ framework::dataset::make("TransposeB", { false, true })),
+ framework::dataset::make("DataType", DataType::F16)),
+ ActivationFunctionsDataset))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(),
- framework::dataset::make("TransposeA", { false, true })),
- framework::dataset::make("TransposeB", { false, true })),
- framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulActivationFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+ framework::dataset::make("TransposeA", { false, true })),
+ framework::dataset::make("TransposeB", { false, true })),
+ framework::dataset::make("DataType", DataType::F16)),
+ ActivationFunctionsDataset))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
@@ -110,32 +161,30 @@ TEST_SUITE(Quantized)
TEST_SUITE(QASYMM8)
FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(
- datasets::SmallMatMulDataset(),
- framework::dataset::make("TransposeA", { false, true })),
- framework::dataset::make("TransposeB", { false, true })),
- framework::dataset::make("DataType", DataType::QASYMM8)),
- framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
- framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
- framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })),
- framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })),
- framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))
-)
+ datasets::SmallMatMulDataset(),
+ framework::dataset::make("TransposeA", { false, true })),
+ framework::dataset::make("TransposeB", { false, true })),
+ framework::dataset::make("DataType", DataType::QASYMM8)),
+ ActivationFunctionsQuantizedDataset),
+ framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+ framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })),
+ framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })),
+ framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_quant);
}
FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizedMatMulFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(
- datasets::LargeMatMulDataset(),
- framework::dataset::make("TransposeA", { false, true })),
- framework::dataset::make("TransposeB", { false, true })),
- framework::dataset::make("DataType", DataType::QASYMM8)),
- framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
- framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
- framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
- framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
- framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))
-)
+ datasets::LargeMatMulDataset(),
+ framework::dataset::make("TransposeA", { false, true })),
+ framework::dataset::make("TransposeB", { false, true })),
+ framework::dataset::make("DataType", DataType::QASYMM8)),
+ ActivationFunctionsQuantizedDataset),
+ framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+ framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
+ framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
+ framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_quant);
@@ -146,32 +195,45 @@ TEST_SUITE_END() // QASYMM8
TEST_SUITE(QASYMM8_SIGNED)
FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(
- datasets::SmallMatMulDataset(),
- framework::dataset::make("TransposeA", { false, true })),
- framework::dataset::make("TransposeB", { false, true })),
- framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
- framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
- framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
- framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })),
- framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })),
- framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))
-)
+ datasets::SmallMatMulDataset(),
+ framework::dataset::make("TransposeA", { false, true })),
+ framework::dataset::make("TransposeB", { false, true })),
+ framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+ ActivationFunctionsQuantizedDataset),
+ framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+ framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })),
+ framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })),
+ framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_quant);
}
FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizedMatMulFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(
- datasets::LargeMatMulDataset(),
- framework::dataset::make("TransposeA", { false, true })),
- framework::dataset::make("TransposeB", { false, true })),
- framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
- framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
- framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
- framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
- framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
- framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))
-)
+ datasets::LargeMatMulDataset(),
+ framework::dataset::make("TransposeA", { false, true })),
+ framework::dataset::make("TransposeB", { false, true })),
+ framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+ ActivationFunctionsQuantizedDataset),
+ framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+ framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
+ framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
+ framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunAllActivations, CLQuantizedMatMulActivationFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(
+ datasets::LargeMatMulDataset(),
+ framework::dataset::make("TransposeA", { false })),
+ framework::dataset::make("TransposeB", { true })),
+ framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+ AllQuantizedActivationsDataset),
+ framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+ framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
+ framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
+ framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_quant);
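Editorial note, not part of the patch: with the fused activation moved out of MatMulInfo and onto the function interface, a caller passes the ActivationLayerInfo directly to configure(), as the fixture change below does. The following is a minimal, hypothetical usage sketch; the configure() argument order mirrors the fixture, while the tensor shapes, allocation and scheduler boilerplate are assumed from standard Compute Library usage rather than taken from this commit.

#include "arm_compute/core/MatMulInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLMatMul.h"

using namespace arm_compute;

void run_matmul_with_fused_relu()
{
    CLScheduler::get().default_init();

    // Illustrative shapes: lhs is [K, M] = [8, 4], rhs is [N, K] = [16, 8], dst is [N, M] = [16, 4]
    CLTensor lhs, rhs, dst;
    lhs.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    rhs.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

    // MatMulInfo now only carries the adjoint flags; the activation is no longer set here
    MatMulInfo mm_info;
    mm_info.adj_lhs(false).adj_rhs(false);

    CLMatMul matmul;
    matmul.configure(&lhs, &rhs, &dst, mm_info, GpuMatMulSettings(),
                     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    dst.allocator()->allocate();

    matmul.run();
    CLScheduler::get().sync();
}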
diff --git a/tests/validation/fixtures/MatMulFixture.h b/tests/validation/fixtures/MatMulFixture.h
index 2f94c1f9d2..3e4cac5e34 100644
--- a/tests/validation/fixtures/MatMulFixture.h
+++ b/tests/validation/fixtures/MatMulFixture.h
@@ -112,14 +112,14 @@ protected:
// Configure MatMulInfo class
MatMulInfo mm_info;
- mm_info.adj_lhs(transpose_a).adj_rhs(transpose_b).fused_activation(act_info);
+ mm_info.adj_lhs(transpose_a).adj_rhs(transpose_b);
// Ensure values are dynamic
a.info()->set_are_values_constant(false);
b.info()->set_are_values_constant(false);
// Configure operator
- matmul.configure(&a, &b, &dst, mm_info, settings);
+ matmul.configure(&a, &b, &dst, mm_info, settings, act_info);
// Assertions
ARM_COMPUTE_ASSERT(a.info()->is_resizable());
@@ -162,8 +162,8 @@ protected:
}
template <typename TT>
- typename std::enable_if<!std::is_integral<TT>::value, SimpleTensor<TT>>::type
- compute_reference_gemm(const SimpleTensor<TT> &a, const SimpleTensor<TT> &b, const SimpleTensor<TT> &c, float alpha, float beta, const QuantizationInfo &o_qinfo)
+ typename std::enable_if < !std::is_integral<TT>::value, SimpleTensor<TT >>::type
+ compute_reference_gemm(const SimpleTensor<TT> &a, const SimpleTensor<TT> &b, const SimpleTensor<TT> &c, float alpha, float beta, const QuantizationInfo &o_qinfo)
{
ARM_COMPUTE_UNUSED(o_qinfo);
@@ -172,7 +172,7 @@ protected:
template <typename TT>
typename std::enable_if<std::is_integral<TT>::value, SimpleTensor<TT>>::type
- compute_reference_gemm(const SimpleTensor<TT> &a, const SimpleTensor<TT> &b, const SimpleTensor<TT> &c, float alpha, float beta, const QuantizationInfo &o_qinfo)
+ compute_reference_gemm(const SimpleTensor<TT> &a, const SimpleTensor<TT> &b, const SimpleTensor<TT> &c, float alpha, float beta, const QuantizationInfo &o_qinfo)
{
ARM_COMPUTE_UNUSED(alpha, beta);
@@ -183,18 +183,18 @@ protected:
const auto multiplier = aq.scale * bq.scale / oq.scale;
int32_t output_multiplier = 0;
- int32_t output_shift = 0;
+ int32_t output_shift = 0;
quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
std::vector<int32_t> output_multipliers{ output_multiplier };
std::vector<int32_t> output_shifts{ output_shift };
//The lhs and rhs offsets are negated here to keep the reference aligned with the function implementation where the lhs and rhs offsets are also negated.
const auto tmp = reference::gemmlowp_matrix_multiply_core<int32_t>(
- a, b, c.shape(), -aq.offset, -bq.offset);
+ a, b, c.shape(), -aq.offset, -bq.offset);
auto output = reference::gemmlowp_quantize_down_scale_by_fixedpoint<int32_t, TT>(
- tmp, output_multipliers, output_shifts, oq.offset,
- std::numeric_limits<int32_t>::lowest(), std::numeric_limits<int32_t>::max());
+ tmp, output_multipliers, output_shifts, oq.offset,
+ std::numeric_limits<int32_t>::lowest(), std::numeric_limits<int32_t>::max());
output.quantization_info(o_qinfo);
return output;
@@ -280,6 +280,30 @@ public:
};
template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class MatMulValidationWithDynamicTensorsFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+ template <typename...>
+ void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo act_info, int num_extra_runs)
+ {
+ MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings());
+ }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class QuantizedMatMulValidationFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+ template <typename...>
+ void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo act_info, int num_extra_runs,
+ QuantizationInfo a_qinfo, QuantizationInfo b_qinfo, QuantizationInfo o_qinfo)
+ {
+ MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings(),
+ a_qinfo, b_qinfo, o_qinfo);
+ }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
class MatMulValidationWithActivationFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
{
public:
@@ -291,24 +315,30 @@ public:
};
template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
-class MatMulValidationWithDynamicTensorsFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+class MatMulValidationWithActivationAlphaBetaFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
{
public:
template <typename...>
- void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo act_info, int num_extra_runs)
+ void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo::ActivationFunction function,
+ float alpha_beta)
{
- MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings());
+ ActivationLayerInfo act_info(function, alpha_beta, alpha_beta);
+ MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, 0, Settings());
}
};
template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
-class QuantizedMatMulValidationFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+class QuantizedMatMulValidationWithActivationFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
{
public:
template <typename...>
- void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo act_info, int num_extra_runs, QuantizationInfo a_qinfo, QuantizationInfo b_qinfo, QuantizationInfo o_qinfo)
+ void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo::ActivationFunction function,
+ float alpha_beta, int num_extra_runs,
+ QuantizationInfo a_qinfo, QuantizationInfo b_qinfo, QuantizationInfo o_qinfo)
{
- MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings(), a_qinfo, b_qinfo, o_qinfo);
+ ActivationLayerInfo act_info(function, alpha_beta, alpha_beta);
+ MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings(),
+ a_qinfo, b_qinfo, o_qinfo);
}
};
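Editorial note, not part of the patch: the alpha/beta fixtures above build ActivationLayerInfo(function, alpha_beta, alpha_beta), so alpha and beta are deliberately set to the same value for each dataset entry. As a reading aid for the expected reference behaviour, here is a hypothetical element-wise helper covering the functions exercised by the float datasets, assuming the usual Compute Library definitions of the bounded ReLU and TANH variants; it is illustrative only and the include path is assumed for this library version.

#include <algorithm>
#include <cmath>

#include "arm_compute/core/ActivationLayerInfo.h" // include path assumed for this library version

using namespace arm_compute;

// Hypothetical helper: applies one activation to a single value, as the reference is expected to.
inline float apply_activation(float x, ActivationLayerInfo::ActivationFunction f, float alpha, float beta)
{
    using AF = ActivationLayerInfo::ActivationFunction;
    switch(f)
    {
        case AF::RELU:
            return std::max(0.f, x);
        case AF::BOUNDED_RELU: // clamp to [0, alpha]
            return std::min(alpha, std::max(0.f, x));
        case AF::LU_BOUNDED_RELU: // clamp to [beta, alpha]
            return std::min(alpha, std::max(beta, x));
        case AF::TANH: // alpha * tanh(beta * x)
            return alpha * std::tanh(beta * x);
        default: // identity / activation disabled
            return x;
    }
}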
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 0df320d7e0..de8a960e41 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -35,11 +35,11 @@
#include "arm_compute/core/GEMMInfo.h"
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/MatMulInfo.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Strides.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/MatMulInfo.h"
#include "arm_compute/core/experimental/IPostOp.h"
#include "arm_compute/core/experimental/PostOps.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
@@ -3691,9 +3691,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const arm_compute::MatMulI
os << "MatMulKernelInfo="
<< "["
<< "adj_lhs=" << matmul_info.adj_lhs() << ", "
- << "adj_rhs=" << matmul_info.adj_rhs() << ", "
- << "fused_activation=" << matmul_info.fused_activation().activation() << "]";
-
+ << "adj_rhs=" << matmul_info.adj_rhs() << "] ";
return os;
}
/** Formatted output of the arm_compute::MatMulInfo type.