From 94abde4f4e98f6f1adb5c46b194527f34a8ea07d Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Thu, 25 May 2023 16:48:43 +0100 Subject: Add Fused Activation to OpenCL MatMul - Added fused activation to MatMul function interface - Added fused activation to CL backend - Includes tests for supported Activation Functions in MatMul Resolves: [COMPMID-6192] Signed-off-by: Mohammed Suhail Munshi Change-Id: Ie103212b600b60699eaf6a6394d609e6e1f5aba6 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/522465 Comments-Addressed: bsgcomp Reviewed-by: Viet-Hoa Do Tested-by: bsgcomp Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9714 Comments-Addressed: Arm Jenkins Reviewed-by: Jakub Sujak Tested-by: Arm Jenkins Benchmark: Arm Jenkins --- arm_compute/core/MatMulInfo.h | 12 -- arm_compute/runtime/CL/functions/CLMatMul.h | 22 ++- arm_compute/runtime/NEON/functions/NEMatMul.h | 17 +- src/core/CL/cl_kernels/common/mat_mul.cl | 27 ++- src/core/CL/cl_kernels/common/mat_mul_quantized.cl | 15 +- src/cpu/operators/CpuMatMul.cpp | 14 +- src/cpu/operators/CpuMatMul.h | 6 +- src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp | 50 +++--- src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h | 21 ++- src/gpu/cl/kernels/ClMatMulNativeKernel.cpp | 17 +- src/gpu/cl/kernels/ClMatMulNativeKernel.h | 20 ++- src/gpu/cl/operators/ClMatMul.cpp | 16 +- src/gpu/cl/operators/ClMatMul.h | 18 +- src/runtime/CL/functions/CLMatMul.cpp | 13 +- src/runtime/NEON/functions/NEMatMul.cpp | 10 +- tests/validation/CL/MatMul.cpp | 182 ++++++++++++++------- tests/validation/fixtures/MatMulFixture.h | 60 +++++-- utils/TypePrinter.h | 6 +- 18 files changed, 335 insertions(+), 191 deletions(-) diff --git a/arm_compute/core/MatMulInfo.h b/arm_compute/core/MatMulInfo.h index 62d782215b..01b9b47761 100644 --- a/arm_compute/core/MatMulInfo.h +++ b/arm_compute/core/MatMulInfo.h @@ -58,11 +58,6 @@ public: { return _adj_rhs; } - /* Get Fused Activation Layer Info */ - ActivationLayerInfo fused_activation() const - { - return _fused_act; - } /* Set Adjoint LHS flag */ MatMulInfo &adj_lhs(bool adj_lhs) { @@ -75,17 +70,10 @@ public: _adj_rhs = adj_rhs; return *this; } - /* Set Fused Activation Layer Info */ - MatMulInfo &fused_activation(const ActivationLayerInfo &act_info) - { - _fused_act = act_info; - return *this; - } private: bool _adj_lhs{ false }; bool _adj_rhs{ false }; - ActivationLayerInfo _fused_act{}; // disabled by default }; } // namespace arm_compute #endif /* ARM_COMPUTE_MATMULINFO_H */ diff --git a/arm_compute/runtime/CL/functions/CLMatMul.h b/arm_compute/runtime/CL/functions/CLMatMul.h index 2af9a4a9a6..a11c1ed6a2 100644 --- a/arm_compute/runtime/CL/functions/CLMatMul.h +++ b/arm_compute/runtime/CL/functions/CLMatMul.h @@ -24,6 +24,8 @@ #ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL #define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL +#include "arm_compute/core/ActivationLayerInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include @@ -83,21 +85,29 @@ public: * @param[in] rhs Right-hand side tensor info containing the input weights as Matrix B. Data types supported: same as @p lhs. * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs. * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. 
- * @param[in] settings Class containing flags for function level settings + * @param[in] settings Contains flags for function level settings + * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. */ - void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}); + void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}, const + ActivationLayerInfo &act_info = ActivationLayerInfo{}); /** Initialise the kernel's inputs and output * * Similar to @ref CLMatMul::configure() */ - void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}); + void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}, const ActivationLayerInfo &act_info = + ActivationLayerInfo{}); /** Static function to check if given info will lead to a valid configuration of @ref CLMatMul. * - * Similar to @ref CLMatMul::configure() * - * @return a status + * @note All tensors must have the same data type. + * + * @param[in] lhs Left-hand side (Matrix A) tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. + * @param[in] rhs Right-hand side (Matrix B) tensor info. Data types supported: same as @p lhs. + * @param[out] output Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs. + * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. + * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo{}); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h index a331c55a98..81fec19f86 100644 --- a/arm_compute/runtime/NEON/functions/NEMatMul.h +++ b/arm_compute/runtime/NEON/functions/NEMatMul.h @@ -24,6 +24,8 @@ #ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL #define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL +#include "arm_compute/core/ActivationLayerInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include @@ -91,16 +93,23 @@ public: * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs. * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs. * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. - * @param[in] settings Class containing flags for function level settings i.e fast math + * @param[in] settings Contains flags for function level settings i.e fast math + * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. 
*/ - void configure(ITensor *lhs, ITensor *rhs, ITensor *dst, const MatMulInfo &info, const CpuMatMulSettings &settings); + void configure(ITensor *lhs, ITensor *rhs, ITensor *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEMatMul * - * Parameters are similar to @ref NEMatMul::configure() + * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. + * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs. + * @param[out] dst Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs. + * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. + * @param[in] settings Contains flags for function level settings i.e fast math + * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. * * @return Status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden void run() override; diff --git a/src/core/CL/cl_kernels/common/mat_mul.cl b/src/core/CL/cl_kernels/common/mat_mul.cl index 90d485e815..9656a59728 100644 --- a/src/core/CL/cl_kernels/common/mat_mul.cl +++ b/src/core/CL/cl_kernels/common/mat_mul.cl @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include "activation_float_helpers.h" #include "helpers.h" #include "tile_helpers.h" @@ -31,6 +32,7 @@ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions. * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. 
-DRHS_TENSOR_TYPE=BUFFER) @@ -86,7 +88,7 @@ __kernel void mat_mul_native_nt_nt( }) const int rhs_z = z * rhs_h; - int k; + int k; for(k = 0; k <= K - K0; k += K0) { TILE(DATA_TYPE, M0, K0, a); @@ -111,7 +113,7 @@ __kernel void mat_mul_native_nt_nt( lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE); } -#ifdef K % K0 != 0 +#if K % K0 != 0 /* Leftover Loop */ for(; k < K; ++k) { @@ -147,6 +149,8 @@ __kernel void mat_mul_native_nt_nt( indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond)); }); + T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer); } #endif // defined(MAT_MUL_NATIVE_NT_NT) @@ -158,6 +162,7 @@ __kernel void mat_mul_native_nt_nt( * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions. * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER) @@ -213,7 +218,7 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER), }) const int rhs_z = z * rhs_h; - int k; + int k; for(k = 0; k <= K - K0; k += K0) { TILE(DATA_TYPE, M0, K0, a); @@ -301,6 +306,8 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER), indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond)); }); + T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer); } #endif // defined(MAT_MUL_NATIVE_NT_T) @@ -312,6 +319,7 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER), * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions. * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. 
-DRHS_TENSOR_TYPE=BUFFER) @@ -367,7 +375,7 @@ __kernel void mat_mul_native_t_nt( }) const int rhs_z = z * rhs_h; - int k; + int k; for(k = 0; k <= K - K0; k += K0) { TILE(DATA_TYPE, K0, M0, a); @@ -405,7 +413,7 @@ __kernel void mat_mul_native_t_nt( lhs_offset_first_element_in_bytes += K0 * lhs_stride_y; } -#ifdef K % K0 != 0 +#if K % K0 != 0 /* Leftover Loop */ for(; k < K; ++k) { @@ -451,6 +459,8 @@ __kernel void mat_mul_native_t_nt( indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond)); }); + T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer); } #endif // defined(MAT_MUL_NATIVE_T_NT) @@ -462,6 +472,7 @@ __kernel void mat_mul_native_t_nt( * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions. * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER) @@ -517,7 +528,7 @@ __kernel void mat_mul_native_t_t( }) const int rhs_z = z * rhs_h; - int k; + int k; for(k = 0; k <= K - K0; k += K0) { TILE(DATA_TYPE, K0, M0, a); @@ -565,7 +576,7 @@ __kernel void mat_mul_native_t_t( lhs_offset_first_element_in_bytes += K0 * lhs_stride_y; } -#ifdef K % K0 != 0 +#if K % K0 != 0 /* Leftover Loop */ for(; k < K; ++k) { @@ -619,6 +630,8 @@ __kernel void mat_mul_native_t_t( indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond)); }); + T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer); } #endif // defined(MAT_MUL_NATIVE_T_T) diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl index 0c3cbca9a6..bd415bb4a7 100644 --- a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl +++ b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl @@ -23,6 +23,7 @@ */ #include "helpers.h" #include "tile_helpers.h" +#include "activation_float_helpers.h" #if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT) /** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only @@ -32,6 +33,7 @@ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. 
-DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations. * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT) * @note Only the following configurations of M0, N0 and K0 are currently supported: @@ -194,6 +196,8 @@ __kernel void mat_mul_native_quantized_nt_nt( const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0; const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0; + T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + // Quantize the tile TILE(DATA_TYPE, M0, N0, accq); T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq); @@ -216,6 +220,7 @@ __kernel void mat_mul_native_quantized_nt_nt( * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions. * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T) * @note Only the following configurations of M0, N0 and K0 are currently supported: @@ -315,7 +320,7 @@ __kernel void mat_mul_native_quantized_nt_t( rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE); } -#if ((K % K0) != 0) +#if((K % K0) != 0) // Leftover loop for(; k < K; ++k) { @@ -370,6 +375,8 @@ __kernel void mat_mul_native_quantized_nt_t( const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0; const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0; + T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + // Quantize the tile TILE(DATA_TYPE, M0, N0, accq); T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq); @@ -392,6 +399,7 @@ __kernel void mat_mul_native_quantized_nt_t( * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations. * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The kernel name in uppercase must be passed at compile time (e.g. 
-DMAT_MUL_NATIVE_QUANTIZED_T_NT) * @note Only the following configurations of M0, N0 and K0 are currently supported: @@ -548,6 +556,8 @@ __kernel void mat_mul_native_quantized_t_nt( const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0; const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0; + T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + // Quantize the tile TILE(DATA_TYPE, M0, N0, accq); T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq); @@ -570,6 +580,7 @@ __kernel void mat_mul_native_quantized_t_nt( * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar) * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) + * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations. * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T) * @note Only the following configurations of M0, N0 and K0 are currently supported: @@ -731,6 +742,8 @@ __kernel void mat_mul_native_quantized_t_t( const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0; // Quantize the tile + T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); + TILE(DATA_TYPE, M0, N0, accq); T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq); diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp index 87cb6c6b54..515b511044 100644 --- a/src/cpu/operators/CpuMatMul.cpp +++ b/src/cpu/operators/CpuMatMul.cpp @@ -25,9 +25,9 @@ #include "src/cpu/operators/CpuMatMul.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEMatMul.h" #include "src/common/utils/Log.h" @@ -45,7 +45,6 @@ namespace cpu { namespace { - Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act, GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) { @@ -74,15 +73,14 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo return Status{}; } - -} +} // namespace CpuMatMul::CpuMatMul() : _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape() { } -Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings) +Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); @@ -100,7 +98,7 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const TensorInfo rhs_transposed{}; auto gemm_info = AsmGemmInfo(); - gemm_info.activation_info = info.fused_activation(); + gemm_info.activation_info = act_info; gemm_info.fast_mode = settings.fast_math(); // Validate and then permute a/b @@ -139,7 +137,7 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const return Status{}; } -void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings) +void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings); @@ -189,7 +187,7 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, // ----------------------------------------------------- // Use transposed tensors if the corresponding transpose flags are set // Fill AsmGemmInfo class object before configuration - _gemm_info.activation_info = info.fused_activation(); + _gemm_info.activation_info = act_info; _gemm_info.fast_mode = settings.fast_math(); _gemm_info.negated_offsets = false; diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h index 9f5833b24f..475c019fd0 100644 --- a/src/cpu/operators/CpuMatMul.h +++ b/src/cpu/operators/CpuMatMul.h @@ -64,15 +64,17 @@ public: * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs. * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. * @param[in] settings The settings for matmul operation (i.e fast math) + * @param[in] act_info Class containing information about fused activation function. 
*/ - void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings); + void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuMatMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp index d5ecdf7dd2..9bbec908a3 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp @@ -98,34 +98,36 @@ ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info) +Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, output); + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - if(output->total_size() != 0) + if(dst->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, output); + const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } return Status{}; } -void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info) +void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output, &compile_context, &matmul_kernel_info); - ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_kernel_info); - ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_kernel_info)); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, 
dst, &compile_context, &matmul_kernel_info); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info); + ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_kernel_info)); // output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); - const int m = output->dimension(1); - const int n = output->dimension(0); + const int m = dst->dimension(1); + const int n = dst->dimension(0); const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); const bool adj_lhs = matmul_kernel_info.adj_lhs; @@ -133,7 +135,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context int n0 = adjust_vec_size(matmul_kernel_info.n0, n); // Configure kernel window - Window win = calculate_max_window(*output, Steps(n0, m0)); + Window win = calculate_max_window(*dst, Steps(n0, m0)); win = win.collapse(win, Window::DimZ); IClKernel::configure_internal(win); @@ -152,7 +154,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform(); const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform(); - const UniformQuantizationInfo dqinfo = output->quantization_info().uniform(); + const UniformQuantizationInfo dqinfo = dst->quantization_info().uniform(); float multiplier = lqinfo.scale * rqinfo.scale / dqinfo.scale; int output_multiplier = 0; @@ -166,6 +168,10 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(-rqinfo.offset)); // Note this is passed as negative to maintain similarity with CLDirectConv2D build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset)); // Passed as positive (unlike the above two) + build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()))); + build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()))); + build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + std::string kernel_name("mat_mul_native_quantized"); kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt"; kernel_name += matmul_kernel_info.adj_rhs ? 
"_t" : "_nt"; @@ -177,7 +183,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set config_id for enabling LWS tuning - const size_t number_of_batches = output->tensor_shape().total_size() / (m * n); + const size_t number_of_batches = dst->tensor_shape().total_size() / (m * n); _config_id = kernel_name; _config_id += "_"; @@ -203,18 +209,18 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - ICLTensor *output = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); - ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output); + const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst); unsigned int idx = 0; Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ); add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - add_3d_tensor_nhw_argument(idx, output); + add_3d_tensor_nhw_argument(idx, dst); enqueue(queue, *this, window_collapsed, lws_hint()); } diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h index d70ff30b91..67d1a6601f 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h @@ -24,6 +24,7 @@ #ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL #define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL +#include "arm_compute/core/ActivationLayerInfo.h" #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -43,22 +44,24 @@ public: ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulLowpNativeKernel); /** Initialise the kernel's input and output. * - * @param[in] compile_context The compile context to be used. - * @param[in] lhs Input tensor for the LHS matrix. Data type supported: QASYMM8_SIGNED/QASYMM8. - * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. - * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs. - * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. - * @param[out] dst Output tensor info. Data type supported: same as @p lhs - * @param[in] matmul_info Attributes for Batch MatMul Kernel + * @param[in] compile_context The compile context to be used. + * @param[in] lhs Input tensor for the LHS matrix. Data type supported: QASYMM8_SIGNED/QASYMM8. + * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. + * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs. + * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. + * @param[out] dst Output tensor info. 
Data type supported: same as @p lhs + * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel + * @param[in] act_info Class containing information about fused activation function. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMatMulLowpNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp index 8f53c1998f..205396a639 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp @@ -112,7 +112,7 @@ Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInf ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration"); } - return Status {}; + return Status{}; } } ClMatMulNativeKernel::ClMatMulNativeKernel() @@ -120,8 +120,9 @@ ClMatMulNativeKernel::ClMatMulNativeKernel() _type = CLKernelType::GEMM; } -Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) +Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { + ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); @@ -138,7 +139,8 @@ Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo return Status{}; } -void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) +void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info); @@ -176,6 +178,11 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT build_opts.add_option("-DK=" + support::cpp11::to_string(k)); build_opts.add_option_if_else(_export_rhs_to_cl_image, "-DRHS_TENSOR_TYPE=IMAGE", "-DRHS_TENSOR_TYPE=BUFFER"); + // Define values for activation function + build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()))); + build_opts.add_option(("-DB_VAL=" + 
float_to_string_with_full_precision(act_info.b()))); + build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + std::string kernel_name("mat_mul_native"); kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt"; kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt"; @@ -218,8 +225,8 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst); diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h index f706256e31..02d8ac3067 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h @@ -42,22 +42,24 @@ public: ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulNativeKernel); /** Initialise the kernel's input and output. * - * @param[in] compile_context The compile context to be used. - * @param[in] lhs Input tensor for the LHS matrix. Data type supported: F32/F16. - * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. - * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs. - * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. - * @param[out] dst Output tensor info. Data type supported: same as @p lhs - * @param[in] matmul_info Attributes for Batch MatMul Kernel + * @param[in] compile_context The compile context to be used. + * @param[in] lhs Input tensor for the LHS matrix. Data type supported: F32/F16. + * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. + * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs. + * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. + * @param[out] dst Output tensor info. Data type supported: same as @p lhs + * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel + * @param[in] act_info Specifies activation function to use after Matrix multiplication. Default is Identity function. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMatMulNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp index 3822c16aa1..c453761a8e 100644 --- a/src/gpu/cl/operators/ClMatMul.cpp +++ b/src/gpu/cl/operators/ClMatMul.cpp @@ -47,7 +47,7 @@ ClMatMul::ClMatMul() { } -Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info) +Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); @@ -57,15 +57,15 @@ Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const std::unique_ptr t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); + const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); - bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); + const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); - return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info) : - ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info); + return is_quantized ? 
ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info, act_info) : + ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info, act_info); } -void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info) +void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info); @@ -86,14 +86,14 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l _matmul_lowp_native_kernel->set_target(gpu_target); // Configure the low-precision native matrix multiply kernel - _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info); + _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info, act_info); } else { _matmul_native_kernel->set_target(gpu_target); // Configure the native matrix multiply kernel - _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info); + _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info, act_info); } } diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h index 6aba801301..9dce5288e6 100644 --- a/src/gpu/cl/operators/ClMatMul.h +++ b/src/gpu/cl/operators/ClMatMul.h @@ -21,14 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL -#define ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL +#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL #include "arm_compute/core/ActivationLayerInfo.h" #include "arm_compute/core/MatMulInfo.h" #include "src/gpu/cl/IClOperator.h" -#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include @@ -71,24 +71,26 @@ public: * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs. * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs. * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. + * @param[in] act_info Class containing information about fused activation function. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info); + void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMatMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; private: - std::unique_ptr _matmul_native_kernel{nullptr}; - std::unique_ptr _matmul_lowp_native_kernel{nullptr}; + std::unique_ptr _matmul_native_kernel{ nullptr }; + std::unique_ptr _matmul_lowp_native_kernel{ nullptr }; bool _is_quantized{ false }; }; } // namespace opencl } // namespace arm_compute -#endif /* ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL */ +#endif /* ACL_SRC_GPU_CL_OPERATORS_CLMATMUL */ diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp index ae5a01f679..bef422fca1 100644 --- a/src/runtime/CL/functions/CLMatMul.cpp +++ b/src/runtime/CL/functions/CLMatMul.cpp @@ -42,25 +42,26 @@ CLMatMul::CLMatMul() CLMatMul::~CLMatMul() = default; -void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings) +void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(settings); - configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info); + configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info); } -void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings) +void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); ARM_COMPUTE_UNUSED(settings); _impl->op = std::make_unique(); - _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info); + _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info); _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } }; } -Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info) +Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) { - return OperatorType::validate(lhs, rhs, output, matmul_info); + return OperatorType::validate(lhs, rhs, output, matmul_info, act_info); } void CLMatMul::run() diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp index 0c46516f1e..58640f40ea 100644 
--- a/src/runtime/NEON/functions/NEMatMul.cpp +++ b/src/runtime/NEON/functions/NEMatMul.cpp @@ -25,9 +25,9 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuMatMul.h" -#include "arm_compute/runtime/Tensor.h" namespace arm_compute { @@ -49,7 +49,7 @@ NEMatMul::NEMatMul() NEMatMul::~NEMatMul() = default; -void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings) +void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { _impl->lhs = lhs; _impl->rhs = rhs; @@ -57,14 +57,14 @@ void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatM ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output); _impl->op = std::make_unique(); - _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings); + _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info); _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } }; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings) +Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) { - return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings); + return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info); } void NEMatMul::run() diff --git a/tests/validation/CL/MatMul.cpp b/tests/validation/CL/MatMul.cpp index 6364b16200..5a262a8e78 100644 --- a/tests/validation/CL/MatMul.cpp +++ b/tests/validation/CL/MatMul.cpp @@ -26,6 +26,7 @@ #include "arm_compute/runtime/CL/functions/CLMatMul.h" #include "tests/CL/CLAccessor.h" +#include "tests/datasets/ActivationFunctionsDataset.h" #include "tests/framework/DatasetModes.h" #include "tests/framework/Macros.h" #include "tests/framework/TestCase.h" @@ -44,11 +45,13 @@ namespace validation { namespace { -RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */ -constexpr float abs_tolerance_f32(0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp32 data type in case using relative tolerance fails because of small values */ -constexpr float abs_tolerance_f16(0.001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data type in case using relative tolerance fails because of small values */ -RelativeTolerance tolerance_f16(half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */ -constexpr AbsoluteTolerance tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ +RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */ +constexpr float abs_tolerance_f32( + 0.0001f); /**< Absolute tolerance 
value for comparing reference's output against implementation's output for fp32 data type in case using relative tolerance fails because of small values */ +constexpr float abs_tolerance_f16( + 0.001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data type in case using relative tolerance fails because of small values */ +RelativeTolerance tolerance_f16(half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */ +constexpr AbsoluteTolerance tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ } // namespace template @@ -57,25 +60,71 @@ using CLMatMulFixture = MatMulValidationFixture using CLQuantizedMatMulFixture = QuantizedMatMulValidationFixture; +template +using CLMatMulActivationFixture = MatMulValidationWithActivationFixture; + +template +using CLMatMulActivationAlphaBetaFixture = MatMulValidationWithActivationAlphaBetaFixture; + +template +using CLQuantizedMatMulActivationFixture = QuantizedMatMulValidationWithActivationFixture; + +/* The main act functions matmul is expected to support */ +const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo", +{ + ActivationLayerInfo(), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.75f, 0.25f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH) +}); + +const auto ActivationFunctionsQuantizedDataset = concat(concat(concat( + framework::dataset::make("ActivationInfo", ActivationLayerInfo()), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f))), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.75f, 0.25f))); + +/* Larger activation functions dataset, used during some nightly tests. 
*/ +const auto AllActivationsDataset = combine(datasets::ActivationFunctions(), framework::dataset::make("AlphaBeta", { 0.5f, 1.f })); + +const auto AllQuantizedActivationsDataset = combine(concat(datasets::ActivationFunctionsQuantized(), + framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::HARD_SWISH, + ActivationLayerInfo::ActivationFunction::LEAKY_RELU + })), + framework::dataset::make("AlphaBeta", { 0.5f, 1.f })); + TEST_SUITE(CL) TEST_SUITE(MatMul) TEST_SUITE(Float) TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallMatMulDataset(), +FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulActivationFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::F32)), + ActivationFunctionsDataset)) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulActivationFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeMatMulDataset(), framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F32))) + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::F32)), + ActivationFunctionsDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F32))) +FIXTURE_DATA_TEST_CASE(RunAllActivations, CLMatMulActivationAlphaBetaFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SmallerMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + framework::dataset::make("DataType", DataType::F32)), + AllActivationsDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); @@ -85,19 +134,21 @@ TEST_SUITE_END() // FP32 TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F16))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulActivationFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::F16)), + ActivationFunctionsDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - 
framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::F16))) +FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulActivationFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::F16)), + ActivationFunctionsDataset)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16); @@ -110,32 +161,30 @@ TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine( - datasets::SmallMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), - framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), - framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })), - framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })), - framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })) -) + datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::QASYMM8)), + ActivationFunctionsQuantizedDataset), + framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), + framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })), + framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })), + framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); } FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizedMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine( - datasets::LargeMatMulDataset(), - framework::dataset::make("TransposeA", { false, true })), - framework::dataset::make("TransposeB", { false, true })), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), - framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), - framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })), - framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })), - framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })) -) + datasets::LargeMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::QASYMM8)), + ActivationFunctionsQuantizedDataset), + framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), + framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })), + framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })), + framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); @@ -146,32 +195,45 @@ TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture, framework::DatasetMode::ALL, 
@@ -146,32 +195,45 @@
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(
-                       datasets::SmallMatMulDataset(),
-                       framework::dataset::make("TransposeA", { false, true })),
-                       framework::dataset::make("TransposeB", { false, true })),
-                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                       framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
-                       framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
-                       framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })),
-                       framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })),
-                       framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))
-)
+                           datasets::SmallMatMulDataset(),
+                           framework::dataset::make("TransposeA", { false, true })),
+                           framework::dataset::make("TransposeB", { false, true })),
+                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                           ActivationFunctionsQuantizedDataset),
+                           framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+                           framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })),
+                           framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })),
+                           framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_quant);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizedMatMulFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(
-                       datasets::LargeMatMulDataset(),
-                       framework::dataset::make("TransposeA", { false, true })),
-                       framework::dataset::make("TransposeB", { false, true })),
-                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                       framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
-                       framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
-                       framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
-                       framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
-                       framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))
-)
+                           datasets::LargeMatMulDataset(),
+                           framework::dataset::make("TransposeA", { false, true })),
+                           framework::dataset::make("TransposeB", { false, true })),
+                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                           ActivationFunctionsQuantizedDataset),
+                           framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+                           framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
+                           framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
+                           framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunAllActivations, CLQuantizedMatMulActivationFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(
+                           datasets::LargeMatMulDataset(),
+                           framework::dataset::make("TransposeA", { false })),
+                           framework::dataset::make("TransposeB", { true })),
+                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                           AllQuantizedActivationsDataset),
+                           framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+                           framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
+                           framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
+                           framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_quant);
 }
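For readers tracking the API change rather than the test matrix: the new entry point these tests exercise is the extra act_info parameter on CLMatMul::configure(). A usage sketch under stated assumptions: the shapes are made up, the ReLU choice is illustrative, and an initialised CL backend is assumed.

// Usage sketch (not part of the patch): an F32 batched matmul with a fused
// ReLU through the new act_info parameter. ACL shapes are (width, height, ...),
// so the lhs MxK = 4x8 matrix is TensorShape(8U, 4U).
#include "arm_compute/core/MatMulInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLMatMul.h"

using namespace arm_compute;

void matmul_with_fused_relu()
{
    CLScheduler::get().default_init(); // set up CL context and queue

    CLTensor lhs, rhs, dst;
    lhs.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));  // K=8, M=4
    rhs.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32)); // N=16, K=8
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // N=16, M=4

    CLMatMul matmul;
    matmul.configure(&lhs, &rhs, &dst, MatMulInfo(), GpuMatMulSettings(),
                     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill lhs/rhs, then:
    matmul.run();
}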
diff --git a/tests/validation/fixtures/MatMulFixture.h b/tests/validation/fixtures/MatMulFixture.h
index 2f94c1f9d2..3e4cac5e34 100644
--- a/tests/validation/fixtures/MatMulFixture.h
+++ b/tests/validation/fixtures/MatMulFixture.h
@@ -112,14 +112,14 @@ protected:
 
         // Configure MatMulInfo class
         MatMulInfo mm_info;
-        mm_info.adj_lhs(transpose_a).adj_rhs(transpose_b).fused_activation(act_info);
+        mm_info.adj_lhs(transpose_a).adj_rhs(transpose_b);
 
         // Ensure values are dynamic
         a.info()->set_are_values_constant(false);
         b.info()->set_are_values_constant(false);
 
         // Configure operator
-        matmul.configure(&a, &b, &dst, mm_info, settings);
+        matmul.configure(&a, &b, &dst, mm_info, settings, act_info);
 
         // Assertions
         ARM_COMPUTE_ASSERT(a.info()->is_resizable());
@@ -162,8 +162,8 @@
     }
 
     template <typename TT = T>
-    typename std::enable_if<!std::is_integral<TT>::value, SimpleTensor<TT>>::type
-    compute_reference_gemm(const SimpleTensor<TT> &a, const SimpleTensor<TT> &b, const SimpleTensor<TT> &c, float alpha, float beta, const QuantizationInfo &o_qinfo)
+    typename std::enable_if < !std::is_integral<TT>::value, SimpleTensor<TT >>::type
+    compute_reference_gemm(const SimpleTensor<TT> &a, const SimpleTensor<TT> &b, const SimpleTensor<TT> &c, float alpha, float beta, const QuantizationInfo &o_qinfo)
     {
         ARM_COMPUTE_UNUSED(o_qinfo);
 
@@ -172,7 +172,7 @@
 
     template <typename TT = T>
     typename std::enable_if<std::is_integral<TT>::value, SimpleTensor<TT>>::type
-    compute_reference_gemm(const SimpleTensor<TT> &a, const SimpleTensor<TT> &b, const SimpleTensor<TT> &c, float alpha, float beta, const QuantizationInfo &o_qinfo)
+    compute_reference_gemm(const SimpleTensor<TT> &a, const SimpleTensor<TT> &b, const SimpleTensor<TT> &c, float alpha, float beta, const QuantizationInfo &o_qinfo)
     {
         ARM_COMPUTE_UNUSED(alpha, beta);
 
@@ -183,18 +183,18 @@
         const auto multiplier = aq.scale * bq.scale / oq.scale;
 
         int32_t output_multiplier = 0;
-        int32_t output_shift = 0;
+        int32_t output_shift      = 0;
         quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
         std::vector<int32_t> output_multipliers{ output_multiplier };
         std::vector<int32_t> output_shifts{ output_shift };
 
         //The lhs and rhs offsets are negated here to keep the reference aligned with the function implementation where the lhs and rhs offsets are also negated.
         const auto tmp = reference::gemmlowp_matrix_multiply_core<int32_t>(
-                             a, b, c.shape(), -aq.offset, -bq.offset);
+            a, b, c.shape(), -aq.offset, -bq.offset);
 
         auto output = reference::gemmlowp_quantize_down_scale_by_fixedpoint<int32_t, TT>(
-                          tmp, output_multipliers, output_shifts, oq.offset,
-                          std::numeric_limits<TT>::lowest(), std::numeric_limits<TT>::max());
+            tmp, output_multipliers, output_shifts, oq.offset,
+            std::numeric_limits<TT>::lowest(), std::numeric_limits<TT>::max());
         output.quantization_info(o_qinfo);
 
         return output;
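Numerically, the quantized reference above multiplies each int32 accumulator by the decomposed multiplier, adds the destination offset, and saturates to the output type. For a single accumulator value this behaves like the float sketch below; this is illustrative only, since the real reference stays in fixed point via the Q31 multiplier/shift pair.

// Per-element view of the requantization above (illustrative float model).
#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t requantize_qasymm8_signed(int32_t acc, float real_multiplier, int32_t dst_offset)
{
    // real_multiplier = lhs_scale * rhs_scale / dst_scale, as computed above
    const int32_t q = static_cast<int32_t>(std::lround(acc * real_multiplier)) + dst_offset;
    return static_cast<int8_t>(std::clamp(q, -128, 127)); // saturate to int8 range
}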
@@ -279,6 +279,30 @@ public:
     }
 };
 
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class MatMulValidationWithDynamicTensorsFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo act_info, int num_extra_runs)
+    {
+        MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings());
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class QuantizedMatMulValidationFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo act_info, int num_extra_runs,
+               QuantizationInfo a_qinfo, QuantizationInfo b_qinfo, QuantizationInfo o_qinfo)
+    {
+        MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings(),
+                                                                                                   a_qinfo, b_qinfo, o_qinfo);
+    }
+};
+
 template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
 class MatMulValidationWithActivationFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
 {
@@ -291,24 +315,30 @@ public:
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
-class MatMulValidationWithDynamicTensorsFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+class MatMulValidationWithActivationAlphaBetaFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
 {
 public:
     template <typename...>
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo act_info, int num_extra_runs)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo::ActivationFunction function,
+               float alpha_beta)
     {
-        MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings());
+        ActivationLayerInfo act_info(function, alpha_beta, alpha_beta);
+        MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, 0, Settings());
    }
 };
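The CL test file binds these fixtures to the OpenCL backend through aliases that fall outside the hunks shown here; presumably they look along these lines (the binding is inferred from the fixture's template parameters and from the fixture names used in tests/validation/CL/MatMul.cpp, so treat it as a sketch rather than the patch's exact text):

// Presumed backend binding (not visible in this diff):
template <typename T>
using CLMatMulActivationAlphaBetaFixture =
    MatMulValidationWithActivationAlphaBetaFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;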
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
-class QuantizedMatMulValidationFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+class QuantizedMatMulValidationWithActivationFixture : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
 {
 public:
     template <typename...>
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo act_info, int num_extra_runs, QuantizationInfo a_qinfo, QuantizationInfo b_qinfo, QuantizationInfo o_qinfo)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool transpose_a, bool transpose_b, DataType data_type, ActivationLayerInfo::ActivationFunction function,
+               float alpha_beta, int num_extra_runs,
+               QuantizationInfo a_qinfo, QuantizationInfo b_qinfo, QuantizationInfo o_qinfo)
     {
-        MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings(), a_qinfo, b_qinfo, o_qinfo);
+        ActivationLayerInfo act_info(function, alpha_beta, alpha_beta);
+        MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info, num_extra_runs, Settings(),
+                                                                                                   a_qinfo, b_qinfo, o_qinfo);
     }
 };
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 0df320d7e0..de8a960e41 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -35,11 +35,11 @@
 #include "arm_compute/core/GEMMInfo.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/MatMulInfo.h"
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Strides.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/MatMulInfo.h"
 #include "arm_compute/core/experimental/IPostOp.h"
 #include "arm_compute/core/experimental/PostOps.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
@@ -3691,9 +3691,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const arm_compute::MatMulInfo &matmul_info)
     os << "MatMulKernelInfo="
        << "["
        << "adj_lhs=" << matmul_info.adj_lhs() << ", "
-       << "adj_rhs=" << matmul_info.adj_rhs() << ", "
-       << "fused_activation=" << matmul_info.fused_activation().activation() << "]";
-
+       << "adj_rhs=" << matmul_info.adj_rhs() << "]";
     return os;
 }
 /** Formatted output of the arm_compute::MatMulInfo type.
-- 
cgit v1.2.1
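With fused_activation() removed from MatMulInfo, its printed form loses the activation field. A small sketch of what the streaming operator now emits; note that the pre-existing "MatMulKernelInfo=" label is left untouched by this patch, so it still appears in the output.

// Sketch: printing a MatMulInfo after this change.
#include <iostream>
#include "arm_compute/core/MatMulInfo.h"
#include "utils/TypePrinter.h"

int main()
{
    arm_compute::MatMulInfo info;
    info.adj_lhs(true).adj_rhs(false);
    std::cout << info << std::endl; // e.g. MatMulKernelInfo=[adj_lhs=1, adj_rhs=0]
    return 0;
}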