about | summary | refs | log | tree | commit | diff
path: root/src/core
diff options
context:
space:
mode:
author Gunes Bayir <gunes.bayir@arm.com> 2023-09-19 17:57:29 +0100
committer Gunes Bayir <gunes.bayir@arm.com> 2023-09-21 13:24:18 +0000
commit 1f841a52f9a7f52948d676bc3807461bbed6f70a (patch)
tree f6fa1ca12e76a12b3a90dd6d170cd27160d461d6 /src/core
parent e9fd8b4f14f64aa23ec8554b619a4aa49d5e3183 (diff)
download ComputeLibrary-1f841a52f9a7f52948d676bc3807461bbed6f70a.tar.gz
Optimize the main loop in mat_mul_native_quantized_mmul_nt_nt
Switch the loop unrolling order and reuse the pre-computed vectors.

Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Change-Id: I636c0530d6b21dae4dbb371c57d18b1f7c7246a8
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10355
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core')
-rw-r--r-- src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl 26
1 file changed, 11 insertions, 15 deletions
diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl
index 9123e5bc95..5b29a3117c 100644
--- a/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl
@@ -269,14 +269,21 @@ __kernel void mat_mul_native_quantized_mmul_nt_nt(
T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
T_LOAD(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
- LOOP_UNROLLING(int, m0, 0, 1, M0,
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
{
- LOOP_UNROLLING(int, n0, 0, 1, N0,
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
{
- VEC_DATA_TYPE(DATA_TYPE, K0)
- vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
c[m0].s[n0] = arm_matrix_multiply(a[m0].v, vec_b, c[m0].s[n0]);
})
+
+#if LHS_OFFSET != 0
+ // Column Sum of B: Calculate the sum of columns by multiplying B
+ // with a matrix of 1's from Left
+ b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]);
+#endif // LHS_OFFSET != 0
})
#if RHS_OFFSET != 0
@@ -288,17 +295,6 @@ __kernel void mat_mul_native_quantized_mmul_nt_nt(
})
#endif // RHS_OFFSET != 0
-#if LHS_OFFSET != 0
- // Column Sum of B: Calculate the sum of columns by multiplying B
- // with a matrix of 1's from Left
- LOOP_UNROLLING(int, n0, 0, 1, N0,
- {
- VEC_DATA_TYPE(DATA_TYPE, K0)
- vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
- b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]);
- })
-#endif // LHS_OFFSET != 0
-
lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
}