From 1f841a52f9a7f52948d676bc3807461bbed6f70a Mon Sep 17 00:00:00 2001 From: Gunes Bayir Date: Tue, 19 Sep 2023 17:57:29 +0100 Subject: Optimize the main loop in mat_mul_native_quantized_mmul_nt_nt Switch the loop unrolling order and reuse the pre-computed vectors Signed-off-by: Gunes Bayir Change-Id: I636c0530d6b21dae4dbb371c57d18b1f7c7246a8 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10355 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice Comments-Addressed: Arm Jenkins --- .../CL/cl_kernels/common/mat_mul_quantized_mmul.cl | 26 +++++++++------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'src/core/CL') diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl index 9123e5bc95..5b29a3117c 100644 --- a/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl +++ b/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl @@ -269,14 +269,21 @@ __kernel void mat_mul_native_quantized_mmul_nt_nt( T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a); T_LOAD(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b); - LOOP_UNROLLING(int, m0, 0, 1, M0, + LOOP_UNROLLING(int, n0, 0, 1, N0, { - LOOP_UNROLLING(int, n0, 0, 1, N0, + VEC_DATA_TYPE(DATA_TYPE, K0) + vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]); + + LOOP_UNROLLING(int, m0, 0, 1, M0, { - VEC_DATA_TYPE(DATA_TYPE, K0) - vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]); c[m0].s[n0] = arm_matrix_multiply(a[m0].v, vec_b, c[m0].s[n0]); }) + +#if LHS_OFFSET != 0 + // Column Sum of B: Calculate the sum of columns by multiplying B + // with a matrix of 1's from Left + b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]); +#endif // LHS_OFFSET != 0 }) #if RHS_OFFSET != 0 @@ -288,17 +295,6 @@ __kernel void mat_mul_native_quantized_mmul_nt_nt( }) #endif // RHS_OFFSET != 0 -#if 
LHS_OFFSET != 0 - // Column Sum of B: Calculate the sum of columns by multiplying B - // with a matrix of 1's from Left - LOOP_UNROLLING(int, n0, 0, 1, N0, - { - VEC_DATA_TYPE(DATA_TYPE, K0) - vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]); - b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]); - }) -#endif // LHS_OFFSET != 0 - lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE); rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y; } -- cgit v1.2.1