From c9eeee5c84ad817360a1719c538c6e6c0812ec13 Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Fri, 30 Jun 2023 15:43:29 +0100 Subject: Fix nightly failures in MatMulLowpNativeKernel when using bounded activation functions - Added checks for supported activation functions in MatMulLowpKernel validate - Replaced incorrect float activation macro with quantized implementation in mat_mul_quantized Resolves: [COMPMID-6339] Signed-off-by: Mohammed Suhail Munshi Change-Id: I15661f14877f1d3305644e6473feb5482a67e773 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/532858 Tested-by: bsgcomp Reviewed-by: Pablo Tello Comments-Addressed: bsgcomp Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9855 Comments-Addressed: Arm Jenkins Reviewed-by: SiCong Li Tested-by: Arm Jenkins Benchmark: Arm Jenkins --- src/core/CL/cl_kernels/common/mat_mul_quantized.cl | 20 ++++++++++------- src/core/CL/cl_kernels/tile_helpers.h | 26 ++++++++++++---------- 2 files changed, 26 insertions(+), 20 deletions(-) (limited to 'src/core/CL') diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl index 8cf857dd84..7029af2188 100644 --- a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl +++ b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl @@ -34,6 +34,7 @@ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations. + * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT) * @note Only the following configurations of M0, N0 and K0 are currently supported: @@ -196,12 +197,12 @@ __kernel void mat_mul_native_quantized_nt_nt( const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0; const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0; - T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); - // Quantize the tile TILE(DATA_TYPE, M0, N0, accq); T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq); + T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq); + TILE(int, M0, 1, indirect_buffer); LOOP_UNROLLING(int, _i, 0, 1, M0, { @@ -221,6 +222,7 @@ __kernel void mat_mul_native_quantized_nt_nt( * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions. + * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T) * @note Only the following configurations of M0, N0 and K0 are currently supported: @@ -375,12 +377,12 @@ __kernel void mat_mul_native_quantized_nt_t( const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0; const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0; - T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); - // Quantize the tile TILE(DATA_TYPE, M0, N0, accq); T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq); + T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq); + TILE(int, M0, 1, indirect_buffer); LOOP_UNROLLING(int, _i, 0, 1, M0, { @@ -400,6 +402,7 @@ __kernel void mat_mul_native_quantized_nt_t( * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations. + * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT) * @note Only the following configurations of M0, N0 and K0 are currently supported: @@ -556,12 +559,12 @@ __kernel void mat_mul_native_quantized_t_nt( const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0; const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0; - T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); - // Quantize the tile TILE(DATA_TYPE, M0, N0, accq); T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq); + T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq); + TILE(int, M0, 1, indirect_buffer); LOOP_UNROLLING(int, _i, 0, 1, M0, { @@ -581,6 +584,7 @@ __kernel void mat_mul_native_quantized_t_nt( * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4). * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3) * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations. + * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6) * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T) * @note Only the following configurations of M0, N0 and K0 are currently supported: @@ -742,11 +746,11 @@ __kernel void mat_mul_native_quantized_t_t( const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0; // Quantize the tile - T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); - TILE(DATA_TYPE, M0, N0, accq); T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq); + T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq); + TILE(int, M0, 1, indirect_buffer); LOOP_UNROLLING(int, _i, 0, 1, M0, { diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index 85bd59afd4..8129606277 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -1144,19 +1144,21 @@ }) \ }) + +// NOTE : A_VAL and B_VAL should be quantized values (using same quantization info as x) // RELU Activation -#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x)) +#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_POINT, x)) // Bounded RELU Activation -#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x))) +#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_POINT, x))) // Lower Upper Bounded RELU Activation -#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) +#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) // Hard Swish Activation -#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f)) +#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f)) // Identity Activation -#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x) +#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (x) -#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) -#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) +#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) +#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) #define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL)) #define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL)) @@ -1171,17 +1173,17 @@ * @param[in] M0 Number of SRC/DST rows * @param[in] N0 Number of SRC/DST columns * @param[in] ACTIVATION_TYPE Activation type - * @param[in] ZERO_VALUE The zero value to consider in the computation - * @param[in] A_VAL A value used for the activation (e.g. tanh_op, brelu,..) - * @param[in] B_VAL B value used for the activation (e.g. tanh_op, brelu,..) + * @param[in] ZERO_POINT The zero value to consider in the computation + * @param[in] A_VAL Quantized A value used for the activation (e.g. tanh_op, brelu,..) + * @param[in] B_VAL Quantized B value used for the activation (e.g. tanh_op, brelu,..) * @param[out] src SRC tile * @param[out] dst DST tile */ -#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst) \ +#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, src, dst) \ ({ \ LOOP_UNROLLING(int, _m0, 0, 1, M0, \ { \ - dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \ + dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_POINT, A_VAL, B_VAL, src[_m0].v); \ }) \ }) -- cgit v1.2.1