aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8
diff options
context:
space:
mode:
author Georgios Pinitas <georgios.pinitas@arm.com> 2019-01-09 18:35:17 +0000
committer Georgios Pinitas <georgios.pinitas@arm.com> 2019-01-18 13:41:40 +0000
commit 7cd26d4a1b14bc4bf7c61496803416ab3d84791f (patch)
tree 12cc4a27d7ecebc69a43e96b1f46c7eb05437978 /src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8
parent 3ac2f3a1d9297220d1b0ce920dd13fdd4edcc187 (diff)
download ComputeLibrary-7cd26d4a1b14bc4bf7c61496803416ab3d84791f.tar.gz
COMPMID-1867: Add NEON/SVE GEMM Hybrid kernels.
Change-Id: Ib40a9921e7f9a6a8be6c38872d6b3a0f24ed0cd3
Reviewed-on: https://review.mlplatform.org/515
Reviewed-by: Anthony Barbier <Anthony.barbier@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp 46
1 files changed, 20 insertions, 26 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
index 2e994a13f3..d679c211ef 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,22 +49,22 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"mov z8.s, #0\n"
"ptrue p0.b\n"
"mov z9.s, #0\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
"mov z10.s, #0\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
"mov z11.s, #0\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
"mov z12.s, #0\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
"mov z13.s, #0\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr]]\n"
"mov z14.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
"mov z15.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z16.s, #0\n"
+ "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
"mov z17.s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z18.s, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
"mov z19.s, #0\n"
"mov z20.s, #0\n"
"mov z21.s, #0\n"
@@ -205,37 +205,31 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"sdot z31.s, z6.b, z3.b[3]\n"
"ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"sdot z8.s, z4.b, z0.b[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z9.s, z4.b, z0.b[1]\n"
"sdot z10.s, z4.b, z0.b[2]\n"
"sdot z11.s, z4.b, z0.b[3]\n"
"sdot z20.s, z4.b, z1.b[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z21.s, z4.b, z1.b[1]\n"
"sdot z22.s, z4.b, z1.b[2]\n"
"sdot z23.s, z4.b, z1.b[3]\n"
"sdot z12.s, z5.b, z0.b[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z13.s, z5.b, z0.b[1]\n"
"sdot z14.s, z5.b, z0.b[2]\n"
"sdot z15.s, z5.b, z0.b[3]\n"
"sdot z24.s, z5.b, z1.b[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z25.s, z5.b, z1.b[1]\n"
"sdot z26.s, z5.b, z1.b[2]\n"
"sdot z27.s, z5.b, z1.b[3]\n"
"sdot z16.s, z6.b, z0.b[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z17.s, z6.b, z0.b[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"sdot z18.s, z6.b, z0.b[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"sdot z19.s, z6.b, z0.b[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"sdot z28.s, z6.b, z1.b[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z29.s, z6.b, z1.b[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"sdot z30.s, z6.b, z1.b[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"sdot z31.s, z6.b, z1.b[3]\n"
"b 4f\n"
"3:\n"
@@ -270,39 +264,39 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"sdot z31.s, z6.b, z1.b[3]\n"
"ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"sdot z8.s, z4.b, z2.b[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z9.s, z4.b, z2.b[1]\n"
"sdot z10.s, z4.b, z2.b[2]\n"
"sdot z11.s, z4.b, z2.b[3]\n"
"sdot z20.s, z4.b, z3.b[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z21.s, z4.b, z3.b[1]\n"
"sdot z22.s, z4.b, z3.b[2]\n"
"sdot z23.s, z4.b, z3.b[3]\n"
"sdot z12.s, z5.b, z2.b[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z13.s, z5.b, z2.b[1]\n"
"sdot z14.s, z5.b, z2.b[2]\n"
"sdot z15.s, z5.b, z2.b[3]\n"
"sdot z24.s, z5.b, z3.b[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z25.s, z5.b, z3.b[1]\n"
"sdot z26.s, z5.b, z3.b[2]\n"
"sdot z27.s, z5.b, z3.b[3]\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z17.s, z6.b, z2.b[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"sdot z19.s, z6.b, z2.b[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"sdot z28.s, z6.b, z3.b[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z29.s, z6.b, z3.b[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"sdot z30.s, z6.b, z3.b[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"sdot z31.s, z6.b, z3.b[3]\n"
"4:\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
"st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
"st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
"st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"