From 7d9a626aaba9837cb82d189a9c4f0bcef58825bb Mon Sep 17 00:00:00 2001 From: Michael Tyler Date: Wed, 1 Feb 2023 16:37:07 +0000 Subject: Update CPU kernels to remove x19 and w19 Resolves: COMPMID-5805 Change-Id: Idf720bbb136474810086f5089c5ed23b3f79835a Signed-off-by: Michael Tyler Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9081 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Reviewed-by: Gunes Bayir Reviewed-by: Viet-Hoa Do --- .../a64_interleave4_block16_s8_s8.hpp | 170 ++++---- .../a64_interleave4_block16_s8_s8_summing.hpp | 196 ++++----- .../a64_interleave4_block16_u8_u8_summing.hpp | 196 ++++----- .../a64_interleave8_block1_bf16_fp32.hpp | 240 +++++------ .../a64_interleave8_block1_fp16_fp16.hpp | 276 ++++++------ .../a64_interleave8_block1_fp16_fp32.hpp | 230 +++++----- .../a64_interleave8_block1_fp32_fp32.hpp | 212 ++++----- .../a64_interleave8_block1_s16_s16.hpp | 276 ++++++------ .../a64_interleave8_block1_s16_s16_summing.hpp | 288 ++++++------- .../a64_interleave8_block1_s8_s16.hpp | 336 +++++++-------- .../a64_interleave8_block1_s8_s16_summing.hpp | 332 +++++++------- .../a64_interleave8_block1_u16_u16_summing.hpp | 288 ++++++------- .../a64_interleave8_block1_u8_u16.hpp | 336 +++++++-------- .../a64_interleave8_block1_u8_u16_summing.hpp | 332 +++++++------- .../a64_interleave8_block2_bf16_bf16.hpp | 290 ++++++------- .../a64_interleave8_block2_fp32_fp32.hpp | 186 ++++---- .../a64_interleave8_block4_bf16_bf16.hpp | 254 +++++------ .../a64_interleave8_block4_fp32_bf16.hpp | 148 +++---- .../a64_interleave8_block4_s8_s8.hpp | 426 +++++++++--------- .../a64_interleave8_block4_s8_s8_summing.hpp | 478 ++++++++++----------- .../a64_interleave8_block4_u8_u8_summing.hpp | 478 ++++++++++----------- .../a64_interleave8_block8_s8_s8.hpp | 390 ++++++++--------- .../a64_interleave8_block8_s8_s8_summing.hpp | 440 +++++++++---------- .../a64_interleave8_block8_u8_u8_summing.hpp | 440 +++++++++---------- .../sme2_interleave1VL_block2_fp32_bf16.hpp | 136 +++--- .../sme2_interleave2VL_block2_fp32_bf16.hpp | 162 +++---- .../sme2_interleave4VL_block2_fp32_bf16.hpp | 138 +++--- .../sme_interleave1VL_bf16_bf16.hpp | 226 +++++----- .../sme_interleave1VL_block2_bf16_bf16.hpp | 286 ++++++------ .../sme_interleave1VL_block4_s8_s8.hpp | 242 +++++------ .../sme_interleave1VL_block4_s8_s8_summing.hpp | 294 ++++++------- .../sme_interleave1VL_block4_u8_u8.hpp | 242 +++++------ .../sme_interleave1VL_block4_u8_u8_summing.hpp | 294 ++++++------- .../sme_interleave1VL_fp16_fp16.hpp | 226 +++++----- .../sme_interleave1VL_fp32_fp32.hpp | 268 ++++++------ .../sme_interleave2VL_bf16_bf16.hpp | 84 ++-- .../sme_interleave2VL_block2_bf16_bf16.hpp | 448 +++++++++---------- .../sme_interleave2VL_block2_fp16_fp16.hpp | 448 +++++++++---------- .../sme_interleave2VL_block4_s8_s8.hpp | 440 +++++++++---------- .../sme_interleave2VL_block4_s8_s8_summing.hpp | 468 ++++++++++---------- .../sme_interleave2VL_block4_u8_u8.hpp | 440 +++++++++---------- .../sme_interleave2VL_block4_u8_u8_summing.hpp | 448 +++++++++---------- .../sme_interleave2VL_fp16_fp16.hpp | 84 ++-- .../sme_interleave2VL_fp32_fp32.hpp | 430 +++++++++--------- .../sme_interleave4VL_block2_bf16_bf16.hpp | 134 +++--- .../sme_interleave4VL_block4_s8_s8.hpp | 116 ++--- .../sme_interleave4VL_block4_s8_s8_summing.hpp | 194 ++++----- .../sme_interleave4VL_block4_u8_u8.hpp | 116 ++--- .../sme_interleave4VL_block4_u8_u8_summing.hpp | 190 ++++---- .../sme_interleave4VL_fp32_fp32.hpp | 128 +++--- 50 files changed, 6960 insertions(+), 6960 deletions(-) (limited to 'src/core/NEON/kernels/arm_gemm/indirect-interleaves') diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp index 6a8caf6ce6..4dfe46446e 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,44 @@ void interleave_block<4, 16, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x22, [%x[in], #0x0]\n" + "ldr x23, [%x[in], #0x0]\n" + "ldr x22, [%x[in], #0x8]\n" "cmp %x[height], #0x4\n" - "ldr x21, [%x[in], #0x8]\n" + "add x23, x23, %x[row_offset]\n" + "ldr x21, [%x[in], #0x10]\n" + "ldr x20, [%x[in], #0x18]\n" "add x22, x22, %x[row_offset]\n" - "ldr x20, [%x[in], #0x10]\n" - "ldr x19, [%x[in], #0x18]\n" "add x21, x21, %x[row_offset]\n" "add x20, x20, %x[row_offset]\n" - "add x19, x19, %x[row_offset]\n" "beq 1f\n" - "mov x19, x22\n" "cmp %x[height], #0x2\n" - "csel x21, x21, x22, GE\n" - "csel x20, x20, x22, GT\n" + "mov x20, x23\n" + "csel x22, x22, x23, GE\n" + "csel x21, x21, x23, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x22, #0x0]\n" "cmp %x[width], #0x10\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" "prfm pldl1keep, [x20, #0x0]\n" - "prfm pldl1keep, [x19, #0x0]\n" + "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" "prfm pldl1keep, [x20, #0x40]\n" - "prfm pldl1keep, [x19, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q19, [x22], #0x10\n" + "ldr q19, [x23], #0x10\n" + "ldr q18, [x22], #0x10\n" "subs %x[width], %x[width], #0x10\n" - "ldr q18, [x21], #0x10\n" "cmp %x[width], #0x10\n" - "ldr q17, [x20], #0x10\n" - "ldr q16, [x19], #0x10\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q19, [%x[out_ptr], #0x0]\n" + "prfm pldl1keep, [x23, #0x70]\n" "prfm pldl1keep, [x22, #0x70]\n" + "str q18, [%x[out_ptr], #0x10]\n" "prfm pldl1keep, [x21, #0x70]\n" "prfm pldl1keep, [x20, #0x70]\n" - "prfm pldl1keep, [x19, #0x70]\n" - "str q19, [%x[out_ptr], #0x0]\n" - "str q18, [%x[out_ptr], #0x10]\n" "str q17, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" @@ -76,93 +76,93 @@ void interleave_block<4, 16, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 12f\n" "tbz %x[width], #3, 7f\n" - "ldr d19, [x22], #0x8\n" - "ldr d18, [x21], #0x8\n" - "ldr d17, [x20], #0x8\n" - "ldr d16, [x19], #0x8\n" + "ldr d19, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d17, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" "tbz %x[width], #2, 5f\n" - "ld1 { v19.s }[2], [x22], #0x4\n" - "ld1 { v18.s }[2], [x21], #0x4\n" - "ld1 { v17.s }[2], [x20], #0x4\n" - "ld1 { v16.s }[2], [x19], #0x4\n" + "ld1 { v19.s }[2], [x23], #0x4\n" + "ld1 { v18.s }[2], [x22], #0x4\n" + "ld1 { v17.s }[2], [x21], #0x4\n" + "ld1 { v16.s }[2], [x20], #0x4\n" "tbz %x[width], #1, 4f\n" - "ld1 { v19.h }[6], [x22], #0x2\n" - "ld1 { v18.h }[6], [x21], #0x2\n" - "ld1 { v17.h }[6], [x20], #0x2\n" - "ld1 { v16.h }[6], [x19], #0x2\n" + "ld1 { v19.h }[6], [x23], #0x2\n" + "ld1 { v18.h }[6], [x22], #0x2\n" + "ld1 { v17.h }[6], [x21], #0x2\n" + "ld1 { v16.h }[6], [x20], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v19.b }[14], [x22]\n" - "ld1 { v18.b }[14], [x21]\n" - "ld1 { v17.b }[14], [x20]\n" - "ld1 { v16.b }[14], [x19]\n" + "ld1 { v19.b }[14], [x23]\n" + "ld1 { v18.b }[14], [x22]\n" + "ld1 { v17.b }[14], [x21]\n" + "ld1 { v16.b }[14], [x20]\n" "b 11f\n" "4:" // odd_loads_1_12 "tbz %x[width], #0, 11f\n" - "ld1 { v19.b }[12], [x22]\n" - "ld1 { v18.b }[12], [x21]\n" - "ld1 { v17.b }[12], [x20]\n" - "ld1 { v16.b }[12], [x19]\n" + "ld1 { v19.b }[12], [x23]\n" + "ld1 { v18.b }[12], [x22]\n" + "ld1 { v17.b }[12], [x21]\n" + "ld1 { v16.b }[12], [x20]\n" "b 11f\n" "5:" // odd_loads_2_8 "tbz %x[width], #1, 6f\n" - "ld1 { v19.h }[4], [x22], #0x2\n" - "ld1 { v18.h }[4], [x21], #0x2\n" - "ld1 { v17.h }[4], [x20], #0x2\n" - "ld1 { v16.h }[4], [x19], #0x2\n" + "ld1 { v19.h }[4], [x23], #0x2\n" + "ld1 { v18.h }[4], [x22], #0x2\n" + "ld1 { v17.h }[4], [x21], #0x2\n" + "ld1 { v16.h }[4], [x20], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v19.b }[10], [x22]\n" - "ld1 { v18.b }[10], [x21]\n" - "ld1 { v17.b }[10], [x20]\n" - "ld1 { v16.b }[10], [x19]\n" + "ld1 { v19.b }[10], [x23]\n" + "ld1 { v18.b }[10], [x22]\n" + "ld1 { v17.b }[10], [x21]\n" + "ld1 { v16.b }[10], [x20]\n" "b 11f\n" "6:" // odd_loads_1_8 "tbz %x[width], #0, 11f\n" - "ld1 { v19.b }[8], [x22]\n" - "ld1 { v18.b }[8], [x21]\n" - "ld1 { v17.b }[8], [x20]\n" - "ld1 { v16.b }[8], [x19]\n" + "ld1 { v19.b }[8], [x23]\n" + "ld1 { v18.b }[8], [x22]\n" + "ld1 { v17.b }[8], [x21]\n" + "ld1 { v16.b }[8], [x20]\n" "b 11f\n" "7:" // odd_loads_4_0 "tbz %x[width], #2, 9f\n" - "ldr s19, [x22], #0x4\n" - "ldr s18, [x21], #0x4\n" - "ldr s17, [x20], #0x4\n" - "ldr s16, [x19], #0x4\n" + "ldr s19, [x23], #0x4\n" + "ldr s18, [x22], #0x4\n" + "ldr s17, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" "tbz %x[width], #1, 8f\n" - "ld1 { v19.h }[2], [x22], #0x2\n" - "ld1 { v18.h }[2], [x21], #0x2\n" - "ld1 { v17.h }[2], [x20], #0x2\n" - "ld1 { v16.h }[2], [x19], #0x2\n" + "ld1 { v19.h }[2], [x23], #0x2\n" + "ld1 { v18.h }[2], [x22], #0x2\n" + "ld1 { v17.h }[2], [x21], #0x2\n" + "ld1 { v16.h }[2], [x20], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v19.b }[6], [x22]\n" - "ld1 { v18.b }[6], [x21]\n" - "ld1 { v17.b }[6], [x20]\n" - "ld1 { v16.b }[6], [x19]\n" + "ld1 { v19.b }[6], [x23]\n" + "ld1 { v18.b }[6], [x22]\n" + "ld1 { v17.b }[6], [x21]\n" + "ld1 { v16.b }[6], [x20]\n" "b 11f\n" "8:" // odd_loads_1_4 "tbz %x[width], #0, 11f\n" - "ld1 { v19.b }[4], [x22]\n" - "ld1 { v18.b }[4], [x21]\n" - "ld1 { v17.b }[4], [x20]\n" - "ld1 { v16.b }[4], [x19]\n" + "ld1 { v19.b }[4], [x23]\n" + "ld1 { v18.b }[4], [x22]\n" + "ld1 { v17.b }[4], [x21]\n" + "ld1 { v16.b }[4], [x20]\n" "b 11f\n" "9:" // odd_loads_2_0 "tbz %x[width], #1, 10f\n" - "ldr h19, [x22], #0x2\n" - "ldr h18, [x21], #0x2\n" - "ldr h17, [x20], #0x2\n" - "ldr h16, [x19], #0x2\n" + "ldr h19, [x23], #0x2\n" + "ldr h18, [x22], #0x2\n" + "ldr h17, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v19.b }[2], [x22]\n" - "ld1 { v18.b }[2], [x21]\n" - "ld1 { v17.b }[2], [x20]\n" - "ld1 { v16.b }[2], [x19]\n" + "ld1 { v19.b }[2], [x23]\n" + "ld1 { v18.b }[2], [x22]\n" + "ld1 { v17.b }[2], [x21]\n" + "ld1 { v16.b }[2], [x20]\n" "b 11f\n" "10:" // odd_loads_1_0 - "ldr b19, [x22, #0x0]\n" - "ldr b18, [x21, #0x0]\n" - "ldr b17, [x20, #0x0]\n" - "ldr b16, [x19, #0x0]\n" + "ldr b19, [x23, #0x0]\n" + "ldr b18, [x22, #0x0]\n" + "ldr b17, [x21, #0x0]\n" + "ldr b16, [x20, #0x0]\n" "11:" // Odd load end "str q19, [%x[out_ptr], #0x0]\n" "str q18, [%x[out_ptr], #0x10]\n" @@ -173,7 +173,7 @@ void interleave_block<4, 16, VLType::None, false>( : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "x19", "x20", "x21", "x22" + : "cc", "memory", "v16", "v17", "v18", "v19", "x20", "x21", "x22", "x23" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp index 954a86656e..56ca49a36e 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,39 +31,39 @@ void interleave_block<4, 16, VLType::None, true>( ) { __asm__ __volatile__( - "movi v28.8h, #0x0\n" - "ldr x23, [%x[in], #0x0]\n" + "ldr x24, [%x[in], #0x0]\n" + "ldr x23, [%x[in], #0x8]\n" + "cmp %x[height], #0x4\n" "mov x22, #0x0\n" + "ldr x21, [%x[in], #0x10]\n" + "ldr x20, [%x[in], #0x18]\n" + "movi v28.8h, #0x0\n" "movi v27.8h, #0x0\n" - "ldr x21, [%x[in], #0x8]\n" - "cmp %x[height], #0x4\n" "movi v26.8h, #0x0\n" - "ldr x20, [%x[in], #0x10]\n" - "add x23, x23, %x[row_offset]\n" "movi v25.8h, #0x0\n" - "ldr x19, [%x[in], #0x18]\n" + "add x24, x24, %x[row_offset]\n" + "add x23, x23, %x[row_offset]\n" "movi v24.4s, #0x0\n" - "add x21, x21, %x[row_offset]\n" "movi v23.4s, #0x0\n" + "add x21, x21, %x[row_offset]\n" "add x20, x20, %x[row_offset]\n" "movi v22.4s, #0x0\n" - "add x19, x19, %x[row_offset]\n" "movi v21.4s, #0x0\n" "beq 1f\n" - "mov x19, x23\n" "cmp %x[height], #0x2\n" - "csel x21, x21, x23, GE\n" - "csel x20, x20, x23, GT\n" + "mov x20, x24\n" + "csel x23, x23, x24, GE\n" + "csel x21, x21, x24, GT\n" "1:" // no_pointer_adj - "movi v20.4s, #0x0\n" + "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" + "movi v20.4s, #0x0\n" "prfm pldl1keep, [x21, #0x0]\n" "prfm pldl1keep, [x20, #0x0]\n" - "prfm pldl1keep, [x19, #0x0]\n" + "prfm pldl1keep, [x24, #0x40]\n" "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" "prfm pldl1keep, [x20, #0x40]\n" - "prfm pldl1keep, [x19, #0x40]\n" "cbnz %w[first], 2f\n" "sub %x[out_ptr], %x[out_ptr], #0x10\n" "ld1 { v20.4s }, [%x[out_ptr]]\n" @@ -75,141 +75,141 @@ void interleave_block<4, 16, VLType::None, true>( "ble 4f\n" "sadalp v24.4s, v28.8h\n" "movi v28.8h, #0x0\n" + "mov x22, #0x0\n" "sadalp v23.4s, v27.8h\n" "movi v27.8h, #0x0\n" "sadalp v22.4s, v26.8h\n" "movi v26.8h, #0x0\n" "sadalp v21.4s, v25.8h\n" "movi v25.8h, #0x0\n" - "mov x22, #0x0\n" "4:" // no_accumulate_16 - "ldr q19, [x23], #0x10\n" - "add x22, x22, #0x1\n" - "ldr q18, [x21], #0x10\n" + "ldr q19, [x24], #0x10\n" + "ldr q18, [x23], #0x10\n" "subs %x[width], %x[width], #0x10\n" - "ldr q17, [x20], #0x10\n" "cmp %x[width], #0x10\n" - "ldr q16, [x19], #0x10\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q19, [%x[out_ptr], #0x0]\n" "sadalp v28.8h, v19.16b\n" + "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" - "prfm pldl1keep, [x21, #0x70]\n" + "str q18, [%x[out_ptr], #0x10]\n" "sadalp v27.8h, v18.16b\n" + "prfm pldl1keep, [x21, #0x70]\n" "prfm pldl1keep, [x20, #0x70]\n" - "sadalp v26.8h, v17.16b\n" - "prfm pldl1keep, [x19, #0x70]\n" - "sadalp v25.8h, v16.16b\n" - "str q19, [%x[out_ptr], #0x0]\n" - "str q18, [%x[out_ptr], #0x10]\n" "str q17, [%x[out_ptr], #0x20]\n" + "sadalp v26.8h, v17.16b\n" "str q16, [%x[out_ptr], #0x30]\n" + "sadalp v25.8h, v16.16b\n" + "add x22, x22, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "bge 3b\n" "5:" // Main loop skip "cbz %x[width], 14f\n" "tbz %x[width], #3, 9f\n" - "ldr d19, [x23], #0x8\n" - "ldr d18, [x21], #0x8\n" - "ldr d17, [x20], #0x8\n" - "ldr d16, [x19], #0x8\n" + "ldr d19, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "ldr d17, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" "tbz %x[width], #2, 7f\n" - "ld1 { v19.s }[2], [x23], #0x4\n" - "ld1 { v18.s }[2], [x21], #0x4\n" - "ld1 { v17.s }[2], [x20], #0x4\n" - "ld1 { v16.s }[2], [x19], #0x4\n" + "ld1 { v19.s }[2], [x24], #0x4\n" + "ld1 { v18.s }[2], [x23], #0x4\n" + "ld1 { v17.s }[2], [x21], #0x4\n" + "ld1 { v16.s }[2], [x20], #0x4\n" "tbz %x[width], #1, 6f\n" - "ld1 { v19.h }[6], [x23], #0x2\n" - "ld1 { v18.h }[6], [x21], #0x2\n" - "ld1 { v17.h }[6], [x20], #0x2\n" - "ld1 { v16.h }[6], [x19], #0x2\n" + "ld1 { v19.h }[6], [x24], #0x2\n" + "ld1 { v18.h }[6], [x23], #0x2\n" + "ld1 { v17.h }[6], [x21], #0x2\n" + "ld1 { v16.h }[6], [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[14], [x23]\n" - "ld1 { v18.b }[14], [x21]\n" - "ld1 { v17.b }[14], [x20]\n" - "ld1 { v16.b }[14], [x19]\n" + "ld1 { v19.b }[14], [x24]\n" + "ld1 { v18.b }[14], [x23]\n" + "ld1 { v17.b }[14], [x21]\n" + "ld1 { v16.b }[14], [x20]\n" "b 13f\n" "6:" // odd_loads_1_12 "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[12], [x23]\n" - "ld1 { v18.b }[12], [x21]\n" - "ld1 { v17.b }[12], [x20]\n" - "ld1 { v16.b }[12], [x19]\n" + "ld1 { v19.b }[12], [x24]\n" + "ld1 { v18.b }[12], [x23]\n" + "ld1 { v17.b }[12], [x21]\n" + "ld1 { v16.b }[12], [x20]\n" "b 13f\n" "7:" // odd_loads_2_8 "tbz %x[width], #1, 8f\n" - "ld1 { v19.h }[4], [x23], #0x2\n" - "ld1 { v18.h }[4], [x21], #0x2\n" - "ld1 { v17.h }[4], [x20], #0x2\n" - "ld1 { v16.h }[4], [x19], #0x2\n" + "ld1 { v19.h }[4], [x24], #0x2\n" + "ld1 { v18.h }[4], [x23], #0x2\n" + "ld1 { v17.h }[4], [x21], #0x2\n" + "ld1 { v16.h }[4], [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[10], [x23]\n" - "ld1 { v18.b }[10], [x21]\n" - "ld1 { v17.b }[10], [x20]\n" - "ld1 { v16.b }[10], [x19]\n" + "ld1 { v19.b }[10], [x24]\n" + "ld1 { v18.b }[10], [x23]\n" + "ld1 { v17.b }[10], [x21]\n" + "ld1 { v16.b }[10], [x20]\n" "b 13f\n" "8:" // odd_loads_1_8 "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[8], [x23]\n" - "ld1 { v18.b }[8], [x21]\n" - "ld1 { v17.b }[8], [x20]\n" - "ld1 { v16.b }[8], [x19]\n" + "ld1 { v19.b }[8], [x24]\n" + "ld1 { v18.b }[8], [x23]\n" + "ld1 { v17.b }[8], [x21]\n" + "ld1 { v16.b }[8], [x20]\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" - "ldr s19, [x23], #0x4\n" - "ldr s18, [x21], #0x4\n" - "ldr s17, [x20], #0x4\n" - "ldr s16, [x19], #0x4\n" + "ldr s19, [x24], #0x4\n" + "ldr s18, [x23], #0x4\n" + "ldr s17, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" "tbz %x[width], #1, 10f\n" - "ld1 { v19.h }[2], [x23], #0x2\n" - "ld1 { v18.h }[2], [x21], #0x2\n" - "ld1 { v17.h }[2], [x20], #0x2\n" - "ld1 { v16.h }[2], [x19], #0x2\n" + "ld1 { v19.h }[2], [x24], #0x2\n" + "ld1 { v18.h }[2], [x23], #0x2\n" + "ld1 { v17.h }[2], [x21], #0x2\n" + "ld1 { v16.h }[2], [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[6], [x23]\n" - "ld1 { v18.b }[6], [x21]\n" - "ld1 { v17.b }[6], [x20]\n" - "ld1 { v16.b }[6], [x19]\n" + "ld1 { v19.b }[6], [x24]\n" + "ld1 { v18.b }[6], [x23]\n" + "ld1 { v17.b }[6], [x21]\n" + "ld1 { v16.b }[6], [x20]\n" "b 13f\n" "10:" // odd_loads_1_4 "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[4], [x23]\n" - "ld1 { v18.b }[4], [x21]\n" - "ld1 { v17.b }[4], [x20]\n" - "ld1 { v16.b }[4], [x19]\n" + "ld1 { v19.b }[4], [x24]\n" + "ld1 { v18.b }[4], [x23]\n" + "ld1 { v17.b }[4], [x21]\n" + "ld1 { v16.b }[4], [x20]\n" "b 13f\n" "11:" // odd_loads_2_0 "tbz %x[width], #1, 12f\n" - "ldr h19, [x23], #0x2\n" - "ldr h18, [x21], #0x2\n" - "ldr h17, [x20], #0x2\n" - "ldr h16, [x19], #0x2\n" + "ldr h19, [x24], #0x2\n" + "ldr h18, [x23], #0x2\n" + "ldr h17, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[2], [x23]\n" - "ld1 { v18.b }[2], [x21]\n" - "ld1 { v17.b }[2], [x20]\n" - "ld1 { v16.b }[2], [x19]\n" + "ld1 { v19.b }[2], [x24]\n" + "ld1 { v18.b }[2], [x23]\n" + "ld1 { v17.b }[2], [x21]\n" + "ld1 { v16.b }[2], [x20]\n" "b 13f\n" "12:" // odd_loads_1_0 - "ldr b19, [x23, #0x0]\n" - "ldr b18, [x21, #0x0]\n" - "ldr b17, [x20, #0x0]\n" - "ldr b16, [x19, #0x0]\n" + "ldr b19, [x24, #0x0]\n" + "ldr b18, [x23, #0x0]\n" + "ldr b17, [x21, #0x0]\n" + "ldr b16, [x20, #0x0]\n" "13:" // Odd load end "str q19, [%x[out_ptr], #0x0]\n" "sadalp v28.8h, v19.16b\n" - "str q18, [%x[out_ptr], #0x10]\n" "sadalp v27.8h, v18.16b\n" - "str q17, [%x[out_ptr], #0x20]\n" + "str q18, [%x[out_ptr], #0x10]\n" "sadalp v26.8h, v17.16b\n" - "str q16, [%x[out_ptr], #0x30]\n" "sadalp v25.8h, v16.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "14:" // Odds skip "sadalp v24.4s, v28.8h\n" "sadalp v23.4s, v27.8h\n" - "addp v24.4s, v24.4s, v23.4s\n" "sadalp v22.4s, v26.8h\n" "sadalp v21.4s, v25.8h\n" + "addp v24.4s, v24.4s, v23.4s\n" "addp v23.4s, v22.4s, v21.4s\n" "addp v24.4s, v24.4s, v23.4s\n" "add v24.4s, v24.4s, v20.4s\n" @@ -217,7 +217,7 @@ void interleave_block<4, 16, VLType::None, true>( "add %x[out_ptr], %x[out_ptr], #0x10\n" : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp index c81146212c..4c7bb71fb2 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,39 +31,39 @@ void interleave_block<4, 16, VLType::None, true>( ) { __asm__ __volatile__( - "movi v28.8h, #0x0\n" - "ldr x23, [%x[in], #0x0]\n" + "ldr x24, [%x[in], #0x0]\n" + "ldr x23, [%x[in], #0x8]\n" + "cmp %x[height], #0x4\n" "mov x22, #0x0\n" + "ldr x21, [%x[in], #0x10]\n" + "ldr x20, [%x[in], #0x18]\n" + "movi v28.8h, #0x0\n" "movi v27.8h, #0x0\n" - "ldr x21, [%x[in], #0x8]\n" - "cmp %x[height], #0x4\n" "movi v26.8h, #0x0\n" - "ldr x20, [%x[in], #0x10]\n" - "add x23, x23, %x[row_offset]\n" "movi v25.8h, #0x0\n" - "ldr x19, [%x[in], #0x18]\n" + "add x24, x24, %x[row_offset]\n" + "add x23, x23, %x[row_offset]\n" "movi v24.4s, #0x0\n" - "add x21, x21, %x[row_offset]\n" "movi v23.4s, #0x0\n" + "add x21, x21, %x[row_offset]\n" "add x20, x20, %x[row_offset]\n" "movi v22.4s, #0x0\n" - "add x19, x19, %x[row_offset]\n" "movi v21.4s, #0x0\n" "beq 1f\n" - "mov x19, x23\n" "cmp %x[height], #0x2\n" - "csel x21, x21, x23, GE\n" - "csel x20, x20, x23, GT\n" + "mov x20, x24\n" + "csel x23, x23, x24, GE\n" + "csel x21, x21, x24, GT\n" "1:" // no_pointer_adj - "movi v20.4s, #0x0\n" + "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" + "movi v20.4s, #0x0\n" "prfm pldl1keep, [x21, #0x0]\n" "prfm pldl1keep, [x20, #0x0]\n" - "prfm pldl1keep, [x19, #0x0]\n" + "prfm pldl1keep, [x24, #0x40]\n" "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" "prfm pldl1keep, [x20, #0x40]\n" - "prfm pldl1keep, [x19, #0x40]\n" "cbnz %w[first], 2f\n" "sub %x[out_ptr], %x[out_ptr], #0x10\n" "ld1 { v20.4s }, [%x[out_ptr]]\n" @@ -75,141 +75,141 @@ void interleave_block<4, 16, VLType::None, true>( "ble 4f\n" "uadalp v24.4s, v28.8h\n" "movi v28.8h, #0x0\n" + "mov x22, #0x0\n" "uadalp v23.4s, v27.8h\n" "movi v27.8h, #0x0\n" "uadalp v22.4s, v26.8h\n" "movi v26.8h, #0x0\n" "uadalp v21.4s, v25.8h\n" "movi v25.8h, #0x0\n" - "mov x22, #0x0\n" "4:" // no_accumulate_16 - "ldr q19, [x23], #0x10\n" - "add x22, x22, #0x1\n" - "ldr q18, [x21], #0x10\n" + "ldr q19, [x24], #0x10\n" + "ldr q18, [x23], #0x10\n" "subs %x[width], %x[width], #0x10\n" - "ldr q17, [x20], #0x10\n" "cmp %x[width], #0x10\n" - "ldr q16, [x19], #0x10\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q19, [%x[out_ptr], #0x0]\n" "uadalp v28.8h, v19.16b\n" + "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" - "prfm pldl1keep, [x21, #0x70]\n" + "str q18, [%x[out_ptr], #0x10]\n" "uadalp v27.8h, v18.16b\n" + "prfm pldl1keep, [x21, #0x70]\n" "prfm pldl1keep, [x20, #0x70]\n" - "uadalp v26.8h, v17.16b\n" - "prfm pldl1keep, [x19, #0x70]\n" - "uadalp v25.8h, v16.16b\n" - "str q19, [%x[out_ptr], #0x0]\n" - "str q18, [%x[out_ptr], #0x10]\n" "str q17, [%x[out_ptr], #0x20]\n" + "uadalp v26.8h, v17.16b\n" "str q16, [%x[out_ptr], #0x30]\n" + "uadalp v25.8h, v16.16b\n" + "add x22, x22, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "bge 3b\n" "5:" // Main loop skip "cbz %x[width], 14f\n" "tbz %x[width], #3, 9f\n" - "ldr d19, [x23], #0x8\n" - "ldr d18, [x21], #0x8\n" - "ldr d17, [x20], #0x8\n" - "ldr d16, [x19], #0x8\n" + "ldr d19, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "ldr d17, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" "tbz %x[width], #2, 7f\n" - "ld1 { v19.s }[2], [x23], #0x4\n" - "ld1 { v18.s }[2], [x21], #0x4\n" - "ld1 { v17.s }[2], [x20], #0x4\n" - "ld1 { v16.s }[2], [x19], #0x4\n" + "ld1 { v19.s }[2], [x24], #0x4\n" + "ld1 { v18.s }[2], [x23], #0x4\n" + "ld1 { v17.s }[2], [x21], #0x4\n" + "ld1 { v16.s }[2], [x20], #0x4\n" "tbz %x[width], #1, 6f\n" - "ld1 { v19.h }[6], [x23], #0x2\n" - "ld1 { v18.h }[6], [x21], #0x2\n" - "ld1 { v17.h }[6], [x20], #0x2\n" - "ld1 { v16.h }[6], [x19], #0x2\n" + "ld1 { v19.h }[6], [x24], #0x2\n" + "ld1 { v18.h }[6], [x23], #0x2\n" + "ld1 { v17.h }[6], [x21], #0x2\n" + "ld1 { v16.h }[6], [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[14], [x23]\n" - "ld1 { v18.b }[14], [x21]\n" - "ld1 { v17.b }[14], [x20]\n" - "ld1 { v16.b }[14], [x19]\n" + "ld1 { v19.b }[14], [x24]\n" + "ld1 { v18.b }[14], [x23]\n" + "ld1 { v17.b }[14], [x21]\n" + "ld1 { v16.b }[14], [x20]\n" "b 13f\n" "6:" // odd_loads_1_12 "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[12], [x23]\n" - "ld1 { v18.b }[12], [x21]\n" - "ld1 { v17.b }[12], [x20]\n" - "ld1 { v16.b }[12], [x19]\n" + "ld1 { v19.b }[12], [x24]\n" + "ld1 { v18.b }[12], [x23]\n" + "ld1 { v17.b }[12], [x21]\n" + "ld1 { v16.b }[12], [x20]\n" "b 13f\n" "7:" // odd_loads_2_8 "tbz %x[width], #1, 8f\n" - "ld1 { v19.h }[4], [x23], #0x2\n" - "ld1 { v18.h }[4], [x21], #0x2\n" - "ld1 { v17.h }[4], [x20], #0x2\n" - "ld1 { v16.h }[4], [x19], #0x2\n" + "ld1 { v19.h }[4], [x24], #0x2\n" + "ld1 { v18.h }[4], [x23], #0x2\n" + "ld1 { v17.h }[4], [x21], #0x2\n" + "ld1 { v16.h }[4], [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[10], [x23]\n" - "ld1 { v18.b }[10], [x21]\n" - "ld1 { v17.b }[10], [x20]\n" - "ld1 { v16.b }[10], [x19]\n" + "ld1 { v19.b }[10], [x24]\n" + "ld1 { v18.b }[10], [x23]\n" + "ld1 { v17.b }[10], [x21]\n" + "ld1 { v16.b }[10], [x20]\n" "b 13f\n" "8:" // odd_loads_1_8 "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[8], [x23]\n" - "ld1 { v18.b }[8], [x21]\n" - "ld1 { v17.b }[8], [x20]\n" - "ld1 { v16.b }[8], [x19]\n" + "ld1 { v19.b }[8], [x24]\n" + "ld1 { v18.b }[8], [x23]\n" + "ld1 { v17.b }[8], [x21]\n" + "ld1 { v16.b }[8], [x20]\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" - "ldr s19, [x23], #0x4\n" - "ldr s18, [x21], #0x4\n" - "ldr s17, [x20], #0x4\n" - "ldr s16, [x19], #0x4\n" + "ldr s19, [x24], #0x4\n" + "ldr s18, [x23], #0x4\n" + "ldr s17, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" "tbz %x[width], #1, 10f\n" - "ld1 { v19.h }[2], [x23], #0x2\n" - "ld1 { v18.h }[2], [x21], #0x2\n" - "ld1 { v17.h }[2], [x20], #0x2\n" - "ld1 { v16.h }[2], [x19], #0x2\n" + "ld1 { v19.h }[2], [x24], #0x2\n" + "ld1 { v18.h }[2], [x23], #0x2\n" + "ld1 { v17.h }[2], [x21], #0x2\n" + "ld1 { v16.h }[2], [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[6], [x23]\n" - "ld1 { v18.b }[6], [x21]\n" - "ld1 { v17.b }[6], [x20]\n" - "ld1 { v16.b }[6], [x19]\n" + "ld1 { v19.b }[6], [x24]\n" + "ld1 { v18.b }[6], [x23]\n" + "ld1 { v17.b }[6], [x21]\n" + "ld1 { v16.b }[6], [x20]\n" "b 13f\n" "10:" // odd_loads_1_4 "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[4], [x23]\n" - "ld1 { v18.b }[4], [x21]\n" - "ld1 { v17.b }[4], [x20]\n" - "ld1 { v16.b }[4], [x19]\n" + "ld1 { v19.b }[4], [x24]\n" + "ld1 { v18.b }[4], [x23]\n" + "ld1 { v17.b }[4], [x21]\n" + "ld1 { v16.b }[4], [x20]\n" "b 13f\n" "11:" // odd_loads_2_0 "tbz %x[width], #1, 12f\n" - "ldr h19, [x23], #0x2\n" - "ldr h18, [x21], #0x2\n" - "ldr h17, [x20], #0x2\n" - "ldr h16, [x19], #0x2\n" + "ldr h19, [x24], #0x2\n" + "ldr h18, [x23], #0x2\n" + "ldr h17, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v19.b }[2], [x23]\n" - "ld1 { v18.b }[2], [x21]\n" - "ld1 { v17.b }[2], [x20]\n" - "ld1 { v16.b }[2], [x19]\n" + "ld1 { v19.b }[2], [x24]\n" + "ld1 { v18.b }[2], [x23]\n" + "ld1 { v17.b }[2], [x21]\n" + "ld1 { v16.b }[2], [x20]\n" "b 13f\n" "12:" // odd_loads_1_0 - "ldr b19, [x23, #0x0]\n" - "ldr b18, [x21, #0x0]\n" - "ldr b17, [x20, #0x0]\n" - "ldr b16, [x19, #0x0]\n" + "ldr b19, [x24, #0x0]\n" + "ldr b18, [x23, #0x0]\n" + "ldr b17, [x21, #0x0]\n" + "ldr b16, [x20, #0x0]\n" "13:" // Odd load end "str q19, [%x[out_ptr], #0x0]\n" "uadalp v28.8h, v19.16b\n" - "str q18, [%x[out_ptr], #0x10]\n" "uadalp v27.8h, v18.16b\n" - "str q17, [%x[out_ptr], #0x20]\n" + "str q18, [%x[out_ptr], #0x10]\n" "uadalp v26.8h, v17.16b\n" - "str q16, [%x[out_ptr], #0x30]\n" "uadalp v25.8h, v16.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "14:" // Odds skip "uadalp v24.4s, v28.8h\n" "uadalp v23.4s, v27.8h\n" - "addp v24.4s, v24.4s, v23.4s\n" "uadalp v22.4s, v26.8h\n" "uadalp v21.4s, v25.8h\n" + "addp v24.4s, v24.4s, v23.4s\n" "addp v23.4s, v22.4s, v21.4s\n" "addp v24.4s, v24.4s, v23.4s\n" "add v24.4s, v24.4s, v20.4s\n" @@ -217,7 +217,7 @@ void interleave_block<4, 16, VLType::None, true>( "add %x[out_ptr], %x[out_ptr], #0x10\n" : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp index 42574295f1..2ba2aa854a 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,45 +31,46 @@ void interleave_block<8, 1, VLType::None, false>( ) { __asm__ __volatile__( - "movi v30.8h, #0x0\n" - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "movi v16.8h, #0x0\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" + "add x28, x28, %x[row_offset], LSL #1\n" "add x27, x27, %x[row_offset], LSL #1\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x26, x26, %x[row_offset], LSL #1\n" - "ldr x23, [%x[in], #0x20]\n" "add x25, x25, %x[row_offset], LSL #1\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x24, x24, %x[row_offset], LSL #1\n" - "ldr x20, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset], LSL #1\n" "add x22, x22, %x[row_offset], LSL #1\n" "add x21, x21, %x[row_offset], LSL #1\n" - "add x20, x20, %x[row_offset], LSL #1\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -77,135 +78,134 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr d29, [x27], #0x8\n" - "zip1 v29.8h, v30.8h, v29.8h\n" - "ldr d28, [x26], #0x8\n" + "ldr d28, [x28], #0x8\n" + "ldr d27, [x27], #0x8\n" + "shll v28.4s, v28.4h, #0x10\n" + "shll v27.4s, v27.4h, #0x10\n" + "ldr d22, [x26], #0x8\n" + "ldr d21, [x25], #0x8\n" + "shll v22.4s, v22.4h, #0x10\n" + "shll v21.4s, v21.4h, #0x10\n" + "ldr d26, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "shll v26.4s, v26.4h, #0x10\n" + "shll v25.4s, v25.4h, #0x10\n" + "ldr d20, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "shll v20.4s, v20.4h, #0x10\n" + "shll v19.4s, v19.4h, #0x10\n" + "zip1 v24.4s, v28.4s, v22.4s\n" + "zip1 v23.4s, v27.4s, v21.4s\n" "subs %x[width], %x[width], #0x4\n" - "zip1 v28.8h, v30.8h, v28.8h\n" - "ldr d24, [x25], #0x8\n" "cmp %x[width], #0x4\n" - "zip1 v24.8h, v30.8h, v24.8h\n" - "ldr d27, [x24], #0x8\n" - "ldr d26, [x23], #0x8\n" - "zip1 v25.4s, v29.4s, v24.4s\n" - "zip2 v24.4s, v29.4s, v24.4s\n" - "ldr d23, [x22], #0x8\n" - "ldr d22, [x21], #0x8\n" - "zip1 v27.8h, v30.8h, v27.8h\n" - "ldr d21, [x20], #0x8\n" - "zip1 v26.8h, v30.8h, v26.8h\n" + "zip1 v18.4s, v26.4s, v20.4s\n" + "zip1 v17.4s, v25.4s, v19.4s\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip1 v20.4s, v28.4s, v27.4s\n" + "zip2 v22.4s, v28.4s, v22.4s\n" + "zip2 v21.4s, v27.4s, v21.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip1 v23.8h, v30.8h, v23.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v22.8h, v30.8h, v22.8h\n" + "zip2 v20.4s, v26.4s, v20.4s\n" + "zip2 v19.4s, v25.4s, v19.4s\n" "prfm pldl1keep, [x24, #0x70]\n" - "zip1 v21.8h, v30.8h, v21.8h\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v17.4s, v25.4s, v20.4s\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip1 v19.4s, v26.4s, v22.4s\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip1 v18.4s, v23.4s, v21.4s\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip1 v16.4s, v19.4s, v18.4s\n" - "str q17, [%x[out_ptr], #0x0]\n" - "zip2 v17.4s, v25.4s, v20.4s\n" + "zip1 v16.4s, v24.4s, v23.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" - "zip2 v16.4s, v19.4s, v18.4s\n" - "str q17, [%x[out_ptr], #0x20]\n" - "zip2 v19.4s, v28.4s, v27.4s\n" - "str q16, [%x[out_ptr], #0x30]\n" - "zip1 v16.4s, v24.4s, v19.4s\n" + "zip2 v16.4s, v24.4s, v23.4s\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v17.4s, v18.4s, v17.4s\n" + "zip1 v16.4s, v22.4s, v21.4s\n" + "str q17, [%x[out_ptr], #0x30]\n" + "zip1 v18.4s, v20.4s, v19.4s\n" + "zip2 v17.4s, v22.4s, v21.4s\n" "str q16, [%x[out_ptr], #0x40]\n" - "zip2 v18.4s, v26.4s, v22.4s\n" - "zip2 v17.4s, v23.4s, v21.4s\n" - "zip1 v16.4s, v18.4s, v17.4s\n" - "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v16.4s, v24.4s, v19.4s\n" - "str q16, [%x[out_ptr], #0x60]\n" - "zip2 v16.4s, v18.4s, v17.4s\n" + "zip2 v16.4s, v20.4s, v19.4s\n" + "str q18, [%x[out_ptr], #0x50]\n" + "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 6f\n" "tbz %x[width], #1, 4f\n" - "ldr s29, [x27], #0x4\n" - "ldr s28, [x26], #0x4\n" - "mov x19, #0x2\n" - "ldr s24, [x25], #0x4\n" - "ldr s27, [x24], #0x4\n" - "ldr s26, [x23], #0x4\n" - "ldr s23, [x22], #0x4\n" - "ldr s22, [x21], #0x4\n" - "ldr s21, [x20], #0x4\n" + "ldr s28, [x28], #0x4\n" + "ldr s27, [x27], #0x4\n" + "mov x20, #0x2\n" + "ldr s22, [x26], #0x4\n" + "ldr s21, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s20, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" "tbz %x[width], #0, 5f\n" - "ld1 { v29.h }[2], [x27]\n" - "mov x19, #0x3\n" - "ld1 { v28.h }[2], [x26]\n" - "ld1 { v24.h }[2], [x25]\n" - "ld1 { v27.h }[2], [x24]\n" - "ld1 { v26.h }[2], [x23]\n" - "ld1 { v23.h }[2], [x22]\n" - "ld1 { v22.h }[2], [x21]\n" - "ld1 { v21.h }[2], [x20]\n" + "ld1 { v28.h }[2], [x28]\n" + "ld1 { v27.h }[2], [x27]\n" + "mov x20, #0x3\n" + "ld1 { v22.h }[2], [x26]\n" + "ld1 { v21.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x24]\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v20.h }[2], [x22]\n" + "ld1 { v19.h }[2], [x21]\n" "b 5f\n" "4:" // odd_loads_1_0 - "ldr h29, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr h28, [x26, #0x0]\n" - "ldr h24, [x25, #0x0]\n" - "ldr h27, [x24, #0x0]\n" - "ldr h26, [x23, #0x0]\n" - "ldr h23, [x22, #0x0]\n" - "ldr h22, [x21, #0x0]\n" - "ldr h21, [x20, #0x0]\n" + "ldr h28, [x28, #0x0]\n" + "ldr h27, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr h22, [x26, #0x0]\n" + "ldr h21, [x25, #0x0]\n" + "ldr h26, [x24, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "ldr h20, [x22, #0x0]\n" + "ldr h19, [x21, #0x0]\n" "5:" // Odd load end - "zip1 v29.8h, v30.8h, v29.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v28.8h, v30.8h, v28.8h\n" - "zip1 v24.8h, v30.8h, v24.8h\n" - "zip1 v27.8h, v30.8h, v27.8h\n" - "zip1 v26.8h, v30.8h, v26.8h\n" - "zip1 v23.8h, v30.8h, v23.8h\n" - "zip1 v22.8h, v30.8h, v22.8h\n" - "zip1 v21.8h, v30.8h, v21.8h\n" - "zip1 v25.4s, v29.4s, v24.4s\n" - "zip1 v20.4s, v28.4s, v27.4s\n" - "zip1 v17.4s, v25.4s, v20.4s\n" - "str q17, [%x[out_ptr], #0x0]\n" - "zip1 v19.4s, v26.4s, v22.4s\n" - "zip1 v18.4s, v23.4s, v21.4s\n" - "zip1 v16.4s, v19.4s, v18.4s\n" + "shll v28.4s, v28.4h, #0x10\n" + "shll v27.4s, v27.4h, #0x10\n" + "subs x20, x20, #0x1\n" + "shll v22.4s, v22.4h, #0x10\n" + "shll v21.4s, v21.4h, #0x10\n" + "shll v26.4s, v26.4h, #0x10\n" + "shll v25.4s, v25.4h, #0x10\n" + "shll v20.4s, v20.4h, #0x10\n" + "shll v19.4s, v19.4h, #0x10\n" + "zip1 v24.4s, v28.4s, v22.4s\n" + "zip1 v23.4s, v27.4s, v21.4s\n" + "zip1 v18.4s, v26.4s, v20.4s\n" + "zip1 v17.4s, v25.4s, v19.4s\n" + "zip1 v16.4s, v24.4s, v23.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v17.4s, v25.4s, v20.4s\n" - "str q17, [%x[out_ptr], #0x0]\n" - "zip2 v16.4s, v19.4s, v18.4s\n" - "subs x19, x19, #0x1\n" - "str q16, [%x[out_ptr], #0x10]\n" + "subs x20, x20, #0x1\n" + "zip2 v16.4s, v24.4s, v23.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v17.4s, v18.4s, v17.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v24.4s, v29.4s, v24.4s\n" - "zip2 v19.4s, v28.4s, v27.4s\n" - "zip1 v16.4s, v24.4s, v19.4s\n" + "zip2 v22.4s, v28.4s, v22.4s\n" + "zip2 v21.4s, v27.4s, v21.4s\n" + "zip2 v20.4s, v26.4s, v20.4s\n" + "zip2 v19.4s, v25.4s, v19.4s\n" + "zip1 v16.4s, v22.4s, v21.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v18.4s, v26.4s, v22.4s\n" - "zip2 v17.4s, v23.4s, v21.4s\n" - "zip1 v16.4s, v18.4s, v17.4s\n" - "str q16, [%x[out_ptr], #0x10]\n" + "zip1 v18.4s, v20.4s, v19.4s\n" + "str q18, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "6:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp index 62d1657a9a..f55c2be4a4 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,45 @@ void interleave_block<8, 1, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "add x28, x28, %x[row_offset], LSL #1\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "add x27, x27, %x[row_offset], LSL #1\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" "add x26, x26, %x[row_offset], LSL #1\n" - "ldr x23, [%x[in], #0x20]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset], LSL #1\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset], LSL #1\n" - "ldr x20, [%x[in], #0x38]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset], LSL #1\n" "add x22, x22, %x[row_offset], LSL #1\n" "add x21, x21, %x[row_offset], LSL #1\n" - "add x20, x20, %x[row_offset], LSL #1\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -76,193 +77,192 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head + "ldr q25, [x28], #0x10\n" "ldr q30, [x27], #0x10\n" "subs %x[width], %x[width], #0x8\n" - "ldr q29, [x26], #0x10\n" "cmp %x[width], #0x8\n" + "ldr q29, [x26], #0x10\n" "ldr q28, [x25], #0x10\n" - "ldr q27, [x24], #0x10\n" - "ldr q25, [x23], #0x10\n" - "zip1 v26.8h, v30.8h, v25.8h\n" - "ldr q21, [x22], #0x10\n" - "zip2 v25.8h, v30.8h, v25.8h\n" - "ldr q24, [x21], #0x10\n" - "ldr q23, [x20], #0x10\n" - "zip1 v22.8h, v29.8h, v21.8h\n" + "ldr q21, [x24], #0x10\n" + "ldr q27, [x23], #0x10\n" + "zip1 v23.8h, v25.8h, v21.8h\n" + "zip1 v26.8h, v30.8h, v27.8h\n" + "ldr q20, [x22], #0x10\n" + "ldr q22, [x21], #0x10\n" + "zip1 v19.8h, v29.8h, v20.8h\n" + "zip1 v18.8h, v28.8h, v22.8h\n" + "zip2 v25.8h, v25.8h, v21.8h\n" + "zip2 v21.8h, v29.8h, v20.8h\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v21.8h, v29.8h, v21.8h\n" + "zip2 v20.8h, v30.8h, v27.8h\n" + "zip2 v16.8h, v28.8h, v22.8h\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip1 v20.8h, v28.8h, v24.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v18.8h, v26.8h, v20.8h\n" + "zip1 v24.8h, v23.8h, v19.8h\n" + "zip1 v17.8h, v26.8h, v18.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "zip1 v19.8h, v27.8h, v23.8h\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v16.8h, v22.8h, v19.8h\n" + "zip2 v23.8h, v23.8h, v19.8h\n" + "zip2 v19.8h, v26.8h, v18.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip1 v17.8h, v18.8h, v16.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip2 v16.8h, v18.8h, v16.8h\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip2 v18.8h, v26.8h, v20.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" - "zip2 v17.8h, v22.8h, v19.8h\n" + "zip1 v22.8h, v25.8h, v21.8h\n" + "zip1 v18.8h, v20.8h, v16.8h\n" + "zip2 v21.8h, v25.8h, v21.8h\n" + "zip2 v20.8h, v20.8h, v16.8h\n" + "zip1 v16.8h, v24.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.8h, v24.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x10]\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x20]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip1 v17.8h, v23.8h, v19.8h\n" + "zip2 v16.8h, v23.8h, v19.8h\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip1 v19.8h, v22.8h, v18.8h\n" + "zip2 v18.8h, v22.8h, v18.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v20.8h, v28.8h, v24.8h\n" - "zip1 v18.8h, v25.8h, v20.8h\n" - "zip2 v19.8h, v27.8h, v23.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x40]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v18.8h, v25.8h, v20.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x60]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip1 v17.8h, v21.8h, v20.8h\n" + "zip2 v16.8h, v21.8h, v20.8h\n" + "str q19, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" + "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" + "ldr d25, [x28], #0x8\n" "ldr d30, [x27], #0x8\n" "ldr d29, [x26], #0x8\n" "ldr d28, [x25], #0x8\n" - "ldr d27, [x24], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d21, [x22], #0x8\n" - "ldr d24, [x21], #0x8\n" - "ldr d23, [x20], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" "tbz %x[width], #1, 4f\n" + "ld1 { v25.s }[2], [x28], #0x4\n" "ld1 { v30.s }[2], [x27], #0x4\n" - "mov x19, #0x6\n" + "mov x20, #0x6\n" "ld1 { v29.s }[2], [x26], #0x4\n" "ld1 { v28.s }[2], [x25], #0x4\n" - "ld1 { v27.s }[2], [x24], #0x4\n" - "ld1 { v25.s }[2], [x23], #0x4\n" - "ld1 { v21.s }[2], [x22], #0x4\n" - "ld1 { v24.s }[2], [x21], #0x4\n" - "ld1 { v23.s }[2], [x20], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v27.s }[2], [x23], #0x4\n" + "ld1 { v20.s }[2], [x22], #0x4\n" + "ld1 { v22.s }[2], [x21], #0x4\n" "tbz %x[width], #0, 7f\n" + "ld1 { v25.h }[6], [x28]\n" "ld1 { v30.h }[6], [x27]\n" - "mov x19, #0x7\n" + "mov x20, #0x7\n" "ld1 { v29.h }[6], [x26]\n" "ld1 { v28.h }[6], [x25]\n" - "ld1 { v27.h }[6], [x24]\n" - "ld1 { v25.h }[6], [x23]\n" - "ld1 { v21.h }[6], [x22]\n" - "ld1 { v24.h }[6], [x21]\n" - "ld1 { v23.h }[6], [x20]\n" + "ld1 { v21.h }[6], [x24]\n" + "ld1 { v27.h }[6], [x23]\n" + "ld1 { v20.h }[6], [x22]\n" + "ld1 { v22.h }[6], [x21]\n" "b 7f\n" "4:" // odd_loads_1_4 - "mov x19, #0x4\n" + "mov x20, #0x4\n" "tbz %x[width], #0, 7f\n" + "ld1 { v25.h }[4], [x28]\n" "ld1 { v30.h }[4], [x27]\n" + "mov x20, #0x5\n" "ld1 { v29.h }[4], [x26]\n" - "mov x19, #0x5\n" "ld1 { v28.h }[4], [x25]\n" - "ld1 { v27.h }[4], [x24]\n" - "ld1 { v25.h }[4], [x23]\n" - "ld1 { v21.h }[4], [x22]\n" - "ld1 { v24.h }[4], [x21]\n" - "ld1 { v23.h }[4], [x20]\n" + "ld1 { v21.h }[4], [x24]\n" + "ld1 { v27.h }[4], [x23]\n" + "ld1 { v20.h }[4], [x22]\n" + "ld1 { v22.h }[4], [x21]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" + "ldr s25, [x28], #0x4\n" "ldr s30, [x27], #0x4\n" + "mov x20, #0x2\n" "ldr s29, [x26], #0x4\n" - "mov x19, #0x2\n" "ldr s28, [x25], #0x4\n" - "ldr s27, [x24], #0x4\n" - "ldr s25, [x23], #0x4\n" - "ldr s21, [x22], #0x4\n" - "ldr s24, [x21], #0x4\n" - "ldr s23, [x20], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s27, [x23], #0x4\n" + "ldr s20, [x22], #0x4\n" + "ldr s22, [x21], #0x4\n" "tbz %x[width], #0, 7f\n" + "ld1 { v25.h }[2], [x28]\n" "ld1 { v30.h }[2], [x27]\n" - "mov x19, #0x3\n" + "mov x20, #0x3\n" "ld1 { v29.h }[2], [x26]\n" "ld1 { v28.h }[2], [x25]\n" - "ld1 { v27.h }[2], [x24]\n" - "ld1 { v25.h }[2], [x23]\n" - "ld1 { v21.h }[2], [x22]\n" - "ld1 { v24.h }[2], [x21]\n" - "ld1 { v23.h }[2], [x20]\n" + "ld1 { v21.h }[2], [x24]\n" + "ld1 { v27.h }[2], [x23]\n" + "ld1 { v20.h }[2], [x22]\n" + "ld1 { v22.h }[2], [x21]\n" "b 7f\n" "6:" // odd_loads_1_0 + "ldr h25, [x28, #0x0]\n" "ldr h30, [x27, #0x0]\n" - "mov x19, #0x1\n" + "mov x20, #0x1\n" "ldr h29, [x26, #0x0]\n" "ldr h28, [x25, #0x0]\n" - "ldr h27, [x24, #0x0]\n" - "ldr h25, [x23, #0x0]\n" - "ldr h21, [x22, #0x0]\n" - "ldr h24, [x21, #0x0]\n" - "ldr h23, [x20, #0x0]\n" + "ldr h21, [x24, #0x0]\n" + "ldr h27, [x23, #0x0]\n" + "ldr h20, [x22, #0x0]\n" + "ldr h22, [x21, #0x0]\n" "7:" // Odd load end - "zip1 v26.8h, v30.8h, v25.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v20.8h, v28.8h, v24.8h\n" - "zip1 v18.8h, v26.8h, v20.8h\n" - "zip1 v22.8h, v29.8h, v21.8h\n" - "zip1 v19.8h, v27.8h, v23.8h\n" - "zip1 v16.8h, v22.8h, v19.8h\n" - "zip1 v17.8h, v18.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip1 v23.8h, v25.8h, v21.8h\n" + "zip1 v19.8h, v29.8h, v20.8h\n" + "subs x20, x20, #0x1\n" + "zip1 v26.8h, v30.8h, v27.8h\n" + "zip1 v18.8h, v28.8h, v22.8h\n" + "zip1 v24.8h, v23.8h, v19.8h\n" + "zip1 v17.8h, v26.8h, v18.8h\n" + "zip1 v16.8h, v24.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v18.8h, v16.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v16.8h, v24.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v18.8h, v26.8h, v20.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v23.8h, v23.8h, v19.8h\n" + "zip2 v19.8h, v26.8h, v18.8h\n" + "subs x20, x20, #0x1\n" + "zip1 v17.8h, v23.8h, v19.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v16.8h, v23.8h, v19.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v25.8h, v30.8h, v25.8h\n" - "zip2 v20.8h, v28.8h, v24.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v18.8h, v25.8h, v20.8h\n" - "zip2 v21.8h, v29.8h, v21.8h\n" - "zip2 v19.8h, v27.8h, v23.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v25.8h, v25.8h, v21.8h\n" + "zip2 v21.8h, v29.8h, v20.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v20.8h, v30.8h, v27.8h\n" + "zip2 v16.8h, v28.8h, v22.8h\n" + "zip1 v22.8h, v25.8h, v21.8h\n" + "zip1 v18.8h, v20.8h, v16.8h\n" + "zip1 v19.8h, v22.8h, v18.8h\n" + "str q19, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" + "subs x20, x20, #0x1\n" + "zip2 v18.8h, v22.8h, v18.8h\n" + "str q18, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v18.8h, v25.8h, v20.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v21.8h, v25.8h, v21.8h\n" + "zip2 v20.8h, v20.8h, v16.8h\n" + "zip1 v17.8h, v21.8h, v20.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "8:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp index b67840b280..f64db0b476 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,45 @@ void interleave_block<8, 1, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "add x28, x28, %x[row_offset], LSL #1\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "add x27, x27, %x[row_offset], LSL #1\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" "add x26, x26, %x[row_offset], LSL #1\n" - "ldr x23, [%x[in], #0x20]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset], LSL #1\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset], LSL #1\n" - "ldr x20, [%x[in], #0x38]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset], LSL #1\n" "add x22, x22, %x[row_offset], LSL #1\n" "add x21, x21, %x[row_offset], LSL #1\n" - "add x20, x20, %x[row_offset], LSL #1\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -76,135 +77,134 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr d30, [x27], #0x8\n" - "subs %x[width], %x[width], #0x4\n" - "ldr d29, [x26], #0x8\n" - "cmp %x[width], #0x4\n" - "ldr d28, [x25], #0x8\n" - "fcvtl v30.4s, v30.4h\n" - "ldr d21, [x24], #0x8\n" - "ldr d27, [x23], #0x8\n" - "fcvtl v29.4s, v29.4h\n" - "ldr d26, [x22], #0x8\n" + "ldr d28, [x28], #0x8\n" + "ldr d27, [x27], #0x8\n" "fcvtl v28.4s, v28.4h\n" - "zip1 v20.4s, v30.4s, v28.4s\n" - "ldr d25, [x21], #0x8\n" - "fcvtl v21.4s, v21.4h\n" - "zip2 v17.4s, v30.4s, v28.4s\n" - "ldr d24, [x20], #0x8\n" "fcvtl v27.4s, v27.4h\n" - "zip1 v18.4s, v29.4s, v21.4s\n" - "prfm pldl1keep, [x27, #0x70]\n" + "ldr d22, [x26], #0x8\n" + "ldr d21, [x25], #0x8\n" + "fcvtl v22.4s, v22.4h\n" + "fcvtl v21.4s, v21.4h\n" + "ldr d26, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" "fcvtl v26.4s, v26.4h\n" - "zip1 v23.4s, v20.4s, v18.4s\n" - "prfm pldl1keep, [x26, #0x70]\n" "fcvtl v25.4s, v25.4h\n" - "zip2 v22.4s, v20.4s, v18.4s\n" + "ldr d20, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "fcvtl v20.4s, v20.4h\n" + "fcvtl v19.4s, v19.4h\n" + "zip1 v24.4s, v28.4s, v22.4s\n" + "zip1 v23.4s, v27.4s, v21.4s\n" + "subs %x[width], %x[width], #0x4\n" + "cmp %x[width], #0x4\n" + "zip1 v18.4s, v26.4s, v20.4s\n" + "zip1 v17.4s, v25.4s, v19.4s\n" + "prfm pldl1keep, [x28, #0x70]\n" + "prfm pldl1keep, [x27, #0x70]\n" + "zip2 v22.4s, v28.4s, v22.4s\n" + "zip2 v21.4s, v27.4s, v21.4s\n" + "prfm pldl1keep, [x26, #0x70]\n" "prfm pldl1keep, [x25, #0x70]\n" - "fcvtl v24.4s, v24.4h\n" - "zip2 v16.4s, v29.4s, v21.4s\n" + "zip2 v20.4s, v26.4s, v20.4s\n" + "zip2 v19.4s, v25.4s, v19.4s\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v21.4s, v17.4s, v16.4s\n" - "zip2 v20.4s, v17.4s, v16.4s\n" "prfm pldl1keep, [x22, #0x70]\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip1 v19.4s, v27.4s, v25.4s\n" - "zip2 v18.4s, v27.4s, v25.4s\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip1 v17.4s, v26.4s, v24.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "zip1 v16.4s, v19.4s, v17.4s\n" + "zip1 v16.4s, v24.4s, v23.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" - "zip2 v17.4s, v19.4s, v17.4s\n" - "str q22, [%x[out_ptr], #0x20]\n" - "zip2 v16.4s, v26.4s, v24.4s\n" + "zip2 v16.4s, v24.4s, v23.4s\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v17.4s, v18.4s, v17.4s\n" + "zip1 v16.4s, v22.4s, v21.4s\n" "str q17, [%x[out_ptr], #0x30]\n" - "zip1 v17.4s, v18.4s, v16.4s\n" - "str q21, [%x[out_ptr], #0x40]\n" - "zip2 v16.4s, v18.4s, v16.4s\n" - "str q17, [%x[out_ptr], #0x50]\n" - "str q20, [%x[out_ptr], #0x60]\n" + "zip1 v18.4s, v20.4s, v19.4s\n" + "zip2 v17.4s, v22.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x40]\n" + "zip2 v16.4s, v20.4s, v19.4s\n" + "str q18, [%x[out_ptr], #0x50]\n" + "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 6f\n" "tbz %x[width], #1, 4f\n" - "ldr s30, [x27], #0x4\n" - "ldr s29, [x26], #0x4\n" - "mov x19, #0x2\n" - "ldr s28, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s27, [x23], #0x4\n" - "ldr s26, [x22], #0x4\n" - "ldr s25, [x21], #0x4\n" - "ldr s24, [x20], #0x4\n" + "ldr s28, [x28], #0x4\n" + "ldr s27, [x27], #0x4\n" + "mov x20, #0x2\n" + "ldr s22, [x26], #0x4\n" + "ldr s21, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s20, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" "tbz %x[width], #0, 5f\n" - "ld1 { v30.h }[2], [x27]\n" - "mov x19, #0x3\n" - "ld1 { v29.h }[2], [x26]\n" - "ld1 { v28.h }[2], [x25]\n" - "ld1 { v21.h }[2], [x24]\n" - "ld1 { v27.h }[2], [x23]\n" - "ld1 { v26.h }[2], [x22]\n" - "ld1 { v25.h }[2], [x21]\n" - "ld1 { v24.h }[2], [x20]\n" + "ld1 { v28.h }[2], [x28]\n" + "ld1 { v27.h }[2], [x27]\n" + "mov x20, #0x3\n" + "ld1 { v22.h }[2], [x26]\n" + "ld1 { v21.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x24]\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v20.h }[2], [x22]\n" + "ld1 { v19.h }[2], [x21]\n" "b 5f\n" "4:" // odd_loads_1_0 - "ldr h30, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr h29, [x26, #0x0]\n" - "ldr h28, [x25, #0x0]\n" - "ldr h21, [x24, #0x0]\n" - "ldr h27, [x23, #0x0]\n" - "ldr h26, [x22, #0x0]\n" - "ldr h25, [x21, #0x0]\n" - "ldr h24, [x20, #0x0]\n" + "ldr h28, [x28, #0x0]\n" + "ldr h27, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr h22, [x26, #0x0]\n" + "ldr h21, [x25, #0x0]\n" + "ldr h26, [x24, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "ldr h20, [x22, #0x0]\n" + "ldr h19, [x21, #0x0]\n" "5:" // Odd load end - "fcvtl v30.4s, v30.4h\n" - "fcvtl v29.4s, v29.4h\n" "fcvtl v28.4s, v28.4h\n" - "zip1 v20.4s, v30.4s, v28.4s\n" - "fcvtl v21.4s, v21.4h\n" "fcvtl v27.4s, v27.4h\n" - "zip1 v18.4s, v29.4s, v21.4s\n" + "subs x20, x20, #0x1\n" + "fcvtl v22.4s, v22.4h\n" + "fcvtl v21.4s, v21.4h\n" "fcvtl v26.4s, v26.4h\n" "fcvtl v25.4s, v25.4h\n" - "zip1 v23.4s, v20.4s, v18.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "zip1 v19.4s, v27.4s, v25.4s\n" - "fcvtl v24.4s, v24.4h\n" - "subs x19, x19, #0x1\n" - "zip1 v17.4s, v26.4s, v24.4s\n" - "zip1 v16.4s, v19.4s, v17.4s\n" + "fcvtl v20.4s, v20.4h\n" + "fcvtl v19.4s, v19.4h\n" + "zip1 v24.4s, v28.4s, v22.4s\n" + "zip1 v23.4s, v27.4s, v21.4s\n" + "zip1 v18.4s, v26.4s, v20.4s\n" + "zip1 v17.4s, v25.4s, v19.4s\n" + "zip1 v16.4s, v24.4s, v23.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v22.4s, v20.4s, v18.4s\n" - "str q22, [%x[out_ptr], #0x0]\n" - "zip2 v17.4s, v19.4s, v17.4s\n" - "subs x19, x19, #0x1\n" + "subs x20, x20, #0x1\n" + "zip2 v16.4s, v24.4s, v23.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v17.4s, v18.4s, v17.4s\n" "str q17, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v17.4s, v30.4s, v28.4s\n" - "zip2 v16.4s, v29.4s, v21.4s\n" - "zip1 v21.4s, v17.4s, v16.4s\n" - "str q21, [%x[out_ptr], #0x0]\n" - "zip2 v18.4s, v27.4s, v25.4s\n" - "zip2 v16.4s, v26.4s, v24.4s\n" - "zip1 v17.4s, v18.4s, v16.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v22.4s, v28.4s, v22.4s\n" + "zip2 v21.4s, v27.4s, v21.4s\n" + "zip2 v20.4s, v26.4s, v20.4s\n" + "zip2 v19.4s, v25.4s, v19.4s\n" + "zip1 v16.4s, v22.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v20.4s, v19.4s\n" + "str q18, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "6:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp index eefb8549ea..6c009b34b8 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,45 @@ void interleave_block<8, 1, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "add x28, x28, %x[row_offset], LSL #2\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "add x27, x27, %x[row_offset], LSL #2\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" "add x26, x26, %x[row_offset], LSL #2\n" - "ldr x23, [%x[in], #0x20]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset], LSL #2\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset], LSL #2\n" - "ldr x20, [%x[in], #0x38]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset], LSL #2\n" "add x22, x22, %x[row_offset], LSL #2\n" "add x21, x21, %x[row_offset], LSL #2\n" - "add x20, x20, %x[row_offset], LSL #2\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -76,49 +77,48 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q28, [x27], #0x10\n" + "ldr q28, [x28], #0x10\n" + "ldr q27, [x27], #0x10\n" "subs %x[width], %x[width], #0x4\n" - "ldr q29, [x26], #0x10\n" "cmp %x[width], #0x4\n" - "ldr q25, [x25], #0x10\n" - "zip1 v22.4s, v28.4s, v25.4s\n" - "ldr q21, [x24], #0x10\n" - "zip2 v28.4s, v28.4s, v25.4s\n" - "ldr q27, [x23], #0x10\n" - "ldr q26, [x22], #0x10\n" - "zip1 v20.4s, v29.4s, v21.4s\n" - "ldr q19, [x21], #0x10\n" - "zip2 v25.4s, v29.4s, v21.4s\n" - "ldr q24, [x20], #0x10\n" - "zip1 v23.4s, v22.4s, v20.4s\n" + "ldr q22, [x26], #0x10\n" + "ldr q21, [x25], #0x10\n" + "zip1 v26.4s, v28.4s, v22.4s\n" + "zip1 v25.4s, v27.4s, v21.4s\n" + "ldr q24, [x24], #0x10\n" + "ldr q23, [x23], #0x10\n" + "zip2 v22.4s, v28.4s, v22.4s\n" + "zip2 v21.4s, v27.4s, v21.4s\n" + "ldr q19, [x22], #0x10\n" + "ldr q18, [x21], #0x10\n" + "zip1 v20.4s, v24.4s, v19.4s\n" + "zip1 v17.4s, v23.4s, v18.4s\n" + "zip2 v19.4s, v24.4s, v19.4s\n" + "zip2 v18.4s, v23.4s, v18.4s\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v22.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip1 v21.4s, v28.4s, v25.4s\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v18.4s, v27.4s, v19.4s\n" + "zip1 v16.4s, v26.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" "prfm pldl1keep, [x24, #0x70]\n" - "zip1 v16.4s, v26.4s, v24.4s\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v17.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v20.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip2 v20.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip2 v19.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip2 v16.4s, v26.4s, v24.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v19.4s, v16.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" - "zip2 v17.4s, v28.4s, v25.4s\n" - "str q22, [%x[out_ptr], #0x20]\n" - "zip2 v16.4s, v19.4s, v16.4s\n" - "str q20, [%x[out_ptr], #0x30]\n" - "str q21, [%x[out_ptr], #0x40]\n" - "str q18, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v26.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v16.4s, v20.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip1 v16.4s, v22.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x40]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "zip2 v17.4s, v22.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v19.4s, v18.4s\n" "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" @@ -126,69 +126,69 @@ void interleave_block<8, 1, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 6f\n" "tbz %x[width], #1, 4f\n" - "ldr d28, [x27], #0x8\n" - "ldr d29, [x26], #0x8\n" - "mov x19, #0x2\n" - "ldr d25, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d26, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" - "ldr d24, [x20], #0x8\n" + "ldr d28, [x28], #0x8\n" + "ldr d27, [x27], #0x8\n" + "mov x20, #0x2\n" + "ldr d22, [x26], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d24, [x24], #0x8\n" + "ldr d23, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d18, [x21], #0x8\n" "tbz %x[width], #0, 5f\n" - "ld1 { v28.s }[2], [x27]\n" - "mov x19, #0x3\n" - "ld1 { v29.s }[2], [x26]\n" - "ld1 { v25.s }[2], [x25]\n" - "ld1 { v21.s }[2], [x24]\n" - "ld1 { v27.s }[2], [x23]\n" - "ld1 { v26.s }[2], [x22]\n" - "ld1 { v19.s }[2], [x21]\n" - "ld1 { v24.s }[2], [x20]\n" + "ld1 { v28.s }[2], [x28]\n" + "ld1 { v27.s }[2], [x27]\n" + "mov x20, #0x3\n" + "ld1 { v22.s }[2], [x26]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x24]\n" + "ld1 { v23.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v18.s }[2], [x21]\n" "b 5f\n" "4:" // odd_loads_1_0 - "ldr s28, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr s29, [x26, #0x0]\n" - "ldr s25, [x25, #0x0]\n" - "ldr s21, [x24, #0x0]\n" - "ldr s27, [x23, #0x0]\n" - "ldr s26, [x22, #0x0]\n" - "ldr s19, [x21, #0x0]\n" - "ldr s24, [x20, #0x0]\n" + "ldr s28, [x28, #0x0]\n" + "ldr s27, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr s22, [x26, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s24, [x24, #0x0]\n" + "ldr s23, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s18, [x21, #0x0]\n" "5:" // Odd load end - "zip1 v22.4s, v28.4s, v25.4s\n" - "subs x19, x19, #0x1\n" - "zip1 v20.4s, v29.4s, v21.4s\n" - "zip1 v23.4s, v22.4s, v20.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v27.4s, v19.4s\n" - "zip1 v16.4s, v26.4s, v24.4s\n" - "zip1 v17.4s, v18.4s, v16.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" + "zip1 v26.4s, v28.4s, v22.4s\n" + "zip1 v25.4s, v27.4s, v21.4s\n" + "subs x20, x20, #0x1\n" + "zip1 v20.4s, v24.4s, v19.4s\n" + "zip1 v17.4s, v23.4s, v18.4s\n" + "zip1 v16.4s, v26.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.4s, v20.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v22.4s, v22.4s, v20.4s\n" - "str q22, [%x[out_ptr], #0x0]\n" - "zip2 v20.4s, v18.4s, v16.4s\n" - "subs x19, x19, #0x1\n" - "str q20, [%x[out_ptr], #0x10]\n" + "subs x20, x20, #0x1\n" + "zip2 v16.4s, v26.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.4s, v20.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v28.4s, v28.4s, v25.4s\n" - "zip2 v25.4s, v29.4s, v21.4s\n" - "zip1 v21.4s, v28.4s, v25.4s\n" - "str q21, [%x[out_ptr], #0x0]\n" - "zip2 v19.4s, v27.4s, v19.4s\n" - "zip2 v16.4s, v26.4s, v24.4s\n" - "zip1 v18.4s, v19.4s, v16.4s\n" - "str q18, [%x[out_ptr], #0x10]\n" + "zip2 v22.4s, v28.4s, v22.4s\n" + "zip2 v21.4s, v27.4s, v21.4s\n" + "zip2 v19.4s, v24.4s, v19.4s\n" + "zip2 v18.4s, v23.4s, v18.4s\n" + "zip1 v16.4s, v22.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "6:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp index b0523b96ce..767d468ad1 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,45 @@ void interleave_block<8, 1, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "add x28, x28, %x[row_offset], LSL #1\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "add x27, x27, %x[row_offset], LSL #1\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" "add x26, x26, %x[row_offset], LSL #1\n" - "ldr x23, [%x[in], #0x20]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset], LSL #1\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset], LSL #1\n" - "ldr x20, [%x[in], #0x38]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset], LSL #1\n" "add x22, x22, %x[row_offset], LSL #1\n" "add x21, x21, %x[row_offset], LSL #1\n" - "add x20, x20, %x[row_offset], LSL #1\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -76,193 +77,192 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head + "ldr q25, [x28], #0x10\n" "ldr q30, [x27], #0x10\n" "subs %x[width], %x[width], #0x8\n" - "ldr q29, [x26], #0x10\n" "cmp %x[width], #0x8\n" + "ldr q29, [x26], #0x10\n" "ldr q28, [x25], #0x10\n" - "ldr q27, [x24], #0x10\n" - "ldr q25, [x23], #0x10\n" - "zip1 v26.8h, v30.8h, v25.8h\n" - "ldr q21, [x22], #0x10\n" - "zip2 v25.8h, v30.8h, v25.8h\n" - "ldr q24, [x21], #0x10\n" - "ldr q23, [x20], #0x10\n" - "zip1 v22.8h, v29.8h, v21.8h\n" + "ldr q21, [x24], #0x10\n" + "ldr q27, [x23], #0x10\n" + "zip1 v23.8h, v25.8h, v21.8h\n" + "zip1 v26.8h, v30.8h, v27.8h\n" + "ldr q20, [x22], #0x10\n" + "ldr q22, [x21], #0x10\n" + "zip1 v19.8h, v29.8h, v20.8h\n" + "zip1 v18.8h, v28.8h, v22.8h\n" + "zip2 v25.8h, v25.8h, v21.8h\n" + "zip2 v21.8h, v29.8h, v20.8h\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v21.8h, v29.8h, v21.8h\n" + "zip2 v20.8h, v30.8h, v27.8h\n" + "zip2 v16.8h, v28.8h, v22.8h\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip1 v20.8h, v28.8h, v24.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v18.8h, v26.8h, v20.8h\n" + "zip1 v24.8h, v23.8h, v19.8h\n" + "zip1 v17.8h, v26.8h, v18.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "zip1 v19.8h, v27.8h, v23.8h\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v16.8h, v22.8h, v19.8h\n" + "zip2 v23.8h, v23.8h, v19.8h\n" + "zip2 v19.8h, v26.8h, v18.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip1 v17.8h, v18.8h, v16.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip2 v16.8h, v18.8h, v16.8h\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip2 v18.8h, v26.8h, v20.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" - "zip2 v17.8h, v22.8h, v19.8h\n" + "zip1 v22.8h, v25.8h, v21.8h\n" + "zip1 v18.8h, v20.8h, v16.8h\n" + "zip2 v21.8h, v25.8h, v21.8h\n" + "zip2 v20.8h, v20.8h, v16.8h\n" + "zip1 v16.8h, v24.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.8h, v24.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x10]\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x20]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip1 v17.8h, v23.8h, v19.8h\n" + "zip2 v16.8h, v23.8h, v19.8h\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip1 v19.8h, v22.8h, v18.8h\n" + "zip2 v18.8h, v22.8h, v18.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v20.8h, v28.8h, v24.8h\n" - "zip1 v18.8h, v25.8h, v20.8h\n" - "zip2 v19.8h, v27.8h, v23.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x40]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v18.8h, v25.8h, v20.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x60]\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "zip1 v17.8h, v21.8h, v20.8h\n" + "zip2 v16.8h, v21.8h, v20.8h\n" + "str q19, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" + "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" + "ldr d25, [x28], #0x8\n" "ldr d30, [x27], #0x8\n" "ldr d29, [x26], #0x8\n" "ldr d28, [x25], #0x8\n" - "ldr d27, [x24], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d21, [x22], #0x8\n" - "ldr d24, [x21], #0x8\n" - "ldr d23, [x20], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" "tbz %x[width], #1, 4f\n" + "ld1 { v25.s }[2], [x28], #0x4\n" "ld1 { v30.s }[2], [x27], #0x4\n" - "mov x19, #0x6\n" + "mov x20, #0x6\n" "ld1 { v29.s }[2], [x26], #0x4\n" "ld1 { v28.s }[2], [x25], #0x4\n" - "ld1 { v27.s }[2], [x24], #0x4\n" - "ld1 { v25.s }[2], [x23], #0x4\n" - "ld1 { v21.s }[2], [x22], #0x4\n" - "ld1 { v24.s }[2], [x21], #0x4\n" - "ld1 { v23.s }[2], [x20], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v27.s }[2], [x23], #0x4\n" + "ld1 { v20.s }[2], [x22], #0x4\n" + "ld1 { v22.s }[2], [x21], #0x4\n" "tbz %x[width], #0, 7f\n" + "ld1 { v25.h }[6], [x28]\n" "ld1 { v30.h }[6], [x27]\n" - "mov x19, #0x7\n" + "mov x20, #0x7\n" "ld1 { v29.h }[6], [x26]\n" "ld1 { v28.h }[6], [x25]\n" - "ld1 { v27.h }[6], [x24]\n" - "ld1 { v25.h }[6], [x23]\n" - "ld1 { v21.h }[6], [x22]\n" - "ld1 { v24.h }[6], [x21]\n" - "ld1 { v23.h }[6], [x20]\n" + "ld1 { v21.h }[6], [x24]\n" + "ld1 { v27.h }[6], [x23]\n" + "ld1 { v20.h }[6], [x22]\n" + "ld1 { v22.h }[6], [x21]\n" "b 7f\n" "4:" // odd_loads_1_4 - "mov x19, #0x4\n" + "mov x20, #0x4\n" "tbz %x[width], #0, 7f\n" + "ld1 { v25.h }[4], [x28]\n" "ld1 { v30.h }[4], [x27]\n" + "mov x20, #0x5\n" "ld1 { v29.h }[4], [x26]\n" - "mov x19, #0x5\n" "ld1 { v28.h }[4], [x25]\n" - "ld1 { v27.h }[4], [x24]\n" - "ld1 { v25.h }[4], [x23]\n" - "ld1 { v21.h }[4], [x22]\n" - "ld1 { v24.h }[4], [x21]\n" - "ld1 { v23.h }[4], [x20]\n" + "ld1 { v21.h }[4], [x24]\n" + "ld1 { v27.h }[4], [x23]\n" + "ld1 { v20.h }[4], [x22]\n" + "ld1 { v22.h }[4], [x21]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" + "ldr s25, [x28], #0x4\n" "ldr s30, [x27], #0x4\n" + "mov x20, #0x2\n" "ldr s29, [x26], #0x4\n" - "mov x19, #0x2\n" "ldr s28, [x25], #0x4\n" - "ldr s27, [x24], #0x4\n" - "ldr s25, [x23], #0x4\n" - "ldr s21, [x22], #0x4\n" - "ldr s24, [x21], #0x4\n" - "ldr s23, [x20], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s27, [x23], #0x4\n" + "ldr s20, [x22], #0x4\n" + "ldr s22, [x21], #0x4\n" "tbz %x[width], #0, 7f\n" + "ld1 { v25.h }[2], [x28]\n" "ld1 { v30.h }[2], [x27]\n" - "mov x19, #0x3\n" + "mov x20, #0x3\n" "ld1 { v29.h }[2], [x26]\n" "ld1 { v28.h }[2], [x25]\n" - "ld1 { v27.h }[2], [x24]\n" - "ld1 { v25.h }[2], [x23]\n" - "ld1 { v21.h }[2], [x22]\n" - "ld1 { v24.h }[2], [x21]\n" - "ld1 { v23.h }[2], [x20]\n" + "ld1 { v21.h }[2], [x24]\n" + "ld1 { v27.h }[2], [x23]\n" + "ld1 { v20.h }[2], [x22]\n" + "ld1 { v22.h }[2], [x21]\n" "b 7f\n" "6:" // odd_loads_1_0 + "ldr h25, [x28, #0x0]\n" "ldr h30, [x27, #0x0]\n" - "mov x19, #0x1\n" + "mov x20, #0x1\n" "ldr h29, [x26, #0x0]\n" "ldr h28, [x25, #0x0]\n" - "ldr h27, [x24, #0x0]\n" - "ldr h25, [x23, #0x0]\n" - "ldr h21, [x22, #0x0]\n" - "ldr h24, [x21, #0x0]\n" - "ldr h23, [x20, #0x0]\n" + "ldr h21, [x24, #0x0]\n" + "ldr h27, [x23, #0x0]\n" + "ldr h20, [x22, #0x0]\n" + "ldr h22, [x21, #0x0]\n" "7:" // Odd load end - "zip1 v26.8h, v30.8h, v25.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v20.8h, v28.8h, v24.8h\n" - "zip1 v18.8h, v26.8h, v20.8h\n" - "zip1 v22.8h, v29.8h, v21.8h\n" - "zip1 v19.8h, v27.8h, v23.8h\n" - "zip1 v16.8h, v22.8h, v19.8h\n" - "zip1 v17.8h, v18.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip1 v23.8h, v25.8h, v21.8h\n" + "zip1 v19.8h, v29.8h, v20.8h\n" + "subs x20, x20, #0x1\n" + "zip1 v26.8h, v30.8h, v27.8h\n" + "zip1 v18.8h, v28.8h, v22.8h\n" + "zip1 v24.8h, v23.8h, v19.8h\n" + "zip1 v17.8h, v26.8h, v18.8h\n" + "zip1 v16.8h, v24.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v18.8h, v16.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v16.8h, v24.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v18.8h, v26.8h, v20.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v23.8h, v23.8h, v19.8h\n" + "zip2 v19.8h, v26.8h, v18.8h\n" + "subs x20, x20, #0x1\n" + "zip1 v17.8h, v23.8h, v19.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v16.8h, v23.8h, v19.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v25.8h, v30.8h, v25.8h\n" - "zip2 v20.8h, v28.8h, v24.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v18.8h, v25.8h, v20.8h\n" - "zip2 v21.8h, v29.8h, v21.8h\n" - "zip2 v19.8h, v27.8h, v23.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v25.8h, v25.8h, v21.8h\n" + "zip2 v21.8h, v29.8h, v20.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v20.8h, v30.8h, v27.8h\n" + "zip2 v16.8h, v28.8h, v22.8h\n" + "zip1 v22.8h, v25.8h, v21.8h\n" + "zip1 v18.8h, v20.8h, v16.8h\n" + "zip1 v19.8h, v22.8h, v18.8h\n" + "str q19, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" + "subs x20, x20, #0x1\n" + "zip2 v18.8h, v22.8h, v18.8h\n" + "str q18, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v18.8h, v25.8h, v20.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v21.8h, v25.8h, v21.8h\n" + "zip2 v20.8h, v20.8h, v16.8h\n" + "zip1 v17.8h, v21.8h, v20.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "8:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp index 292a38f401..a73792036a 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,39 +31,40 @@ void interleave_block<8, 1, VLType::None, true>( ) { __asm__ __volatile__( - "movi v1.8h, #0x0\n" - "ldr x27, [%x[in], #0x0]\n" - "mov x19, #0x0\n" - "movi v0.4s, #0x0\n" - "ldr x26, [%x[in], #0x8]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "movi v31.4s, #0x0\n" - "ldr x25, [%x[in], #0x10]\n" + "mov x20, #0x0\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" + "movi v2.8h, #0x0\n" + "movi v1.4s, #0x0\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" + "movi v0.4s, #0x0\n" + "add x28, x28, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x27, x27, %x[row_offset], LSL #1\n" - "ldr x24, [%x[in], #0x18]\n" - "ldr x23, [%x[in], #0x20]\n" "add x26, x26, %x[row_offset], LSL #1\n" - "ldr x22, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset], LSL #1\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset], LSL #1\n" - "ldr x20, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset], LSL #1\n" "add x22, x22, %x[row_offset], LSL #1\n" "add x21, x21, %x[row_offset], LSL #1\n" - "add x20, x20, %x[row_offset], LSL #1\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj + "prfm pldl1keep, [x28, #0x0]\n" "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" @@ -71,7 +72,7 @@ void interleave_block<8, 1, VLType::None, true>( "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -79,226 +80,225 @@ void interleave_block<8, 1, VLType::None, true>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "cbnz %w[first], 2f\n" "sub %x[out_ptr], %x[out_ptr], #0x20\n" - "ld1 { v0.4s }, [%x[out_ptr]]\n" - "ldr q31, [%x[out_ptr], #0x10]\n" + "ld1 { v1.4s }, [%x[out_ptr]]\n" + "ldr q0, [%x[out_ptr], #0x10]\n" "2:" // first_pass "cmp %x[width], #0x8\n" "blt 5f\n" "3:" // Main loop head - "cmp x19, #0xe\n" + "cmp x20, #0xe\n" "ble 4f\n" - "saddw v0.4s, v0.4s, v1.4h\n" - "saddw2 v31.4s, v31.4s, v1.8h\n" - "mov x19, #0x0\n" - "movi v1.8h, #0x0\n" + "saddw v1.4s, v1.4s, v2.4h\n" + "saddw2 v0.4s, v0.4s, v2.8h\n" + "mov x20, #0x0\n" + "movi v2.8h, #0x0\n" "4:" // no_accumulate_16 + "ldr q31, [x28], #0x10\n" "ldr q30, [x27], #0x10\n" - "add x19, x19, #0x1\n" - "ldr q29, [x26], #0x10\n" "subs %x[width], %x[width], #0x8\n" - "ldr q28, [x25], #0x10\n" "cmp %x[width], #0x8\n" + "ldr q29, [x26], #0x10\n" + "ldr q28, [x25], #0x10\n" + "add x20, x20, #0x1\n" "ldr q27, [x24], #0x10\n" - "ldr q25, [x23], #0x10\n" - "zip1 v26.8h, v30.8h, v25.8h\n" - "ldr q21, [x22], #0x10\n" - "zip2 v25.8h, v30.8h, v25.8h\n" - "ldr q24, [x21], #0x10\n" - "ldr q23, [x20], #0x10\n" - "zip1 v22.8h, v29.8h, v21.8h\n" + "ldr q26, [x23], #0x10\n" + "zip1 v25.8h, v31.8h, v27.8h\n" + "zip1 v22.8h, v30.8h, v26.8h\n" + "ldr q24, [x22], #0x10\n" + "ldr q23, [x21], #0x10\n" + "zip1 v18.8h, v29.8h, v24.8h\n" + "zip1 v21.8h, v28.8h, v23.8h\n" + "zip1 v17.8h, v25.8h, v18.8h\n" + "zip1 v16.8h, v22.8h, v21.8h\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v21.8h, v29.8h, v21.8h\n" + "zip1 v20.8h, v17.8h, v16.8h\n" + "add v2.8h, v2.8h, v20.8h\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip1 v20.8h, v28.8h, v24.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v18.8h, v26.8h, v20.8h\n" + "zip2 v19.8h, v17.8h, v16.8h\n" + "zip2 v18.8h, v25.8h, v18.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "zip1 v19.8h, v27.8h, v23.8h\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v16.8h, v22.8h, v19.8h\n" + "zip2 v17.8h, v22.8h, v21.8h\n" + "add v2.8h, v2.8h, v19.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip1 v17.8h, v18.8h, v16.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "add v1.8h, v1.8h, v17.8h\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip2 v16.8h, v18.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" - "zip2 v18.8h, v26.8h, v20.8h\n" - "str q16, [%x[out_ptr], #0x10]\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v22.8h, v31.8h, v27.8h\n" + "str q20, [%x[out_ptr], #0x0]\n" + "zip2 v21.8h, v29.8h, v24.8h\n" + "zip2 v20.8h, v30.8h, v26.8h\n" + "str q19, [%x[out_ptr], #0x10]\n" + "zip2 v19.8h, v28.8h, v23.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x20]\n" - "add v1.8h, v1.8h, v16.8h\n" "zip2 v16.8h, v18.8h, v17.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v20.8h, v28.8h, v24.8h\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip1 v18.8h, v25.8h, v20.8h\n" - "zip2 v19.8h, v27.8h, v23.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x40]\n" - "add v1.8h, v1.8h, v16.8h\n" "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v18.8h, v25.8h, v20.8h\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x60]\n" - "add v1.8h, v1.8h, v16.8h\n" "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x70]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" - "add v1.8h, v1.8h, v16.8h\n" "bge 3b\n" "5:" // Main loop skip "cbz %x[width], 10f\n" "tbz %x[width], #2, 7f\n" + "ldr d31, [x28], #0x8\n" "ldr d30, [x27], #0x8\n" "ldr d29, [x26], #0x8\n" "ldr d28, [x25], #0x8\n" "ldr d27, [x24], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d21, [x22], #0x8\n" - "ldr d24, [x21], #0x8\n" - "ldr d23, [x20], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" "tbz %x[width], #1, 6f\n" + "ld1 { v31.s }[2], [x28], #0x4\n" "ld1 { v30.s }[2], [x27], #0x4\n" - "mov x19, #0x6\n" + "mov x20, #0x6\n" "ld1 { v29.s }[2], [x26], #0x4\n" "ld1 { v28.s }[2], [x25], #0x4\n" "ld1 { v27.s }[2], [x24], #0x4\n" - "ld1 { v25.s }[2], [x23], #0x4\n" - "ld1 { v21.s }[2], [x22], #0x4\n" - "ld1 { v24.s }[2], [x21], #0x4\n" - "ld1 { v23.s }[2], [x20], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "ld1 { v24.s }[2], [x22], #0x4\n" + "ld1 { v23.s }[2], [x21], #0x4\n" "tbz %x[width], #0, 9f\n" + "ld1 { v31.h }[6], [x28]\n" "ld1 { v30.h }[6], [x27]\n" - "mov x19, #0x7\n" + "mov x20, #0x7\n" "ld1 { v29.h }[6], [x26]\n" "ld1 { v28.h }[6], [x25]\n" "ld1 { v27.h }[6], [x24]\n" - "ld1 { v25.h }[6], [x23]\n" - "ld1 { v21.h }[6], [x22]\n" - "ld1 { v24.h }[6], [x21]\n" - "ld1 { v23.h }[6], [x20]\n" + "ld1 { v26.h }[6], [x23]\n" + "ld1 { v24.h }[6], [x22]\n" + "ld1 { v23.h }[6], [x21]\n" "b 9f\n" "6:" // odd_loads_1_4 - "mov x19, #0x4\n" + "mov x20, #0x4\n" "tbz %x[width], #0, 9f\n" + "ld1 { v31.h }[4], [x28]\n" "ld1 { v30.h }[4], [x27]\n" + "mov x20, #0x5\n" "ld1 { v29.h }[4], [x26]\n" - "mov x19, #0x5\n" "ld1 { v28.h }[4], [x25]\n" "ld1 { v27.h }[4], [x24]\n" - "ld1 { v25.h }[4], [x23]\n" - "ld1 { v21.h }[4], [x22]\n" - "ld1 { v24.h }[4], [x21]\n" - "ld1 { v23.h }[4], [x20]\n" + "ld1 { v26.h }[4], [x23]\n" + "ld1 { v24.h }[4], [x22]\n" + "ld1 { v23.h }[4], [x21]\n" "b 9f\n" "7:" // odd_loads_2_0 "tbz %x[width], #1, 8f\n" + "ldr s31, [x28], #0x4\n" "ldr s30, [x27], #0x4\n" + "mov x20, #0x2\n" "ldr s29, [x26], #0x4\n" - "mov x19, #0x2\n" "ldr s28, [x25], #0x4\n" "ldr s27, [x24], #0x4\n" - "ldr s25, [x23], #0x4\n" - "ldr s21, [x22], #0x4\n" - "ldr s24, [x21], #0x4\n" - "ldr s23, [x20], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" "tbz %x[width], #0, 9f\n" + "ld1 { v31.h }[2], [x28]\n" "ld1 { v30.h }[2], [x27]\n" - "mov x19, #0x3\n" + "mov x20, #0x3\n" "ld1 { v29.h }[2], [x26]\n" "ld1 { v28.h }[2], [x25]\n" "ld1 { v27.h }[2], [x24]\n" - "ld1 { v25.h }[2], [x23]\n" - "ld1 { v21.h }[2], [x22]\n" - "ld1 { v24.h }[2], [x21]\n" - "ld1 { v23.h }[2], [x20]\n" + "ld1 { v26.h }[2], [x23]\n" + "ld1 { v24.h }[2], [x22]\n" + "ld1 { v23.h }[2], [x21]\n" "b 9f\n" "8:" // odd_loads_1_0 + "ldr h31, [x28, #0x0]\n" "ldr h30, [x27, #0x0]\n" - "mov x19, #0x1\n" + "mov x20, #0x1\n" "ldr h29, [x26, #0x0]\n" "ldr h28, [x25, #0x0]\n" "ldr h27, [x24, #0x0]\n" - "ldr h25, [x23, #0x0]\n" - "ldr h21, [x22, #0x0]\n" - "ldr h24, [x21, #0x0]\n" - "ldr h23, [x20, #0x0]\n" + "ldr h26, [x23, #0x0]\n" + "ldr h24, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" "9:" // Odd load end - "zip1 v26.8h, v30.8h, v25.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v20.8h, v28.8h, v24.8h\n" - "zip1 v18.8h, v26.8h, v20.8h\n" - "zip1 v22.8h, v29.8h, v21.8h\n" - "zip1 v19.8h, v27.8h, v23.8h\n" - "zip1 v16.8h, v22.8h, v19.8h\n" - "zip1 v17.8h, v18.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip1 v25.8h, v31.8h, v27.8h\n" + "zip1 v18.8h, v29.8h, v24.8h\n" + "subs x20, x20, #0x1\n" + "zip1 v22.8h, v30.8h, v26.8h\n" + "zip1 v21.8h, v28.8h, v23.8h\n" + "zip1 v17.8h, v25.8h, v18.8h\n" + "zip1 v16.8h, v22.8h, v21.8h\n" + "zip1 v20.8h, v17.8h, v16.8h\n" + "str q20, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v20.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v17.8h\n" "beq 10f\n" - "zip2 v16.8h, v18.8h, v16.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v16.8h\n" + "zip2 v19.8h, v17.8h, v16.8h\n" + "subs x20, x20, #0x1\n" + "str q19, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v19.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v26.8h, v20.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" - "subs x19, x19, #0x1\n" + "zip2 v18.8h, v25.8h, v18.8h\n" + "zip2 v17.8h, v22.8h, v21.8h\n" + "subs x20, x20, #0x1\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x20, x20, #0x1\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v16.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v25.8h, v30.8h, v25.8h\n" - "zip2 v20.8h, v28.8h, v24.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v18.8h, v25.8h, v20.8h\n" - "zip2 v21.8h, v29.8h, v21.8h\n" - "zip2 v19.8h, v27.8h, v23.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" + "zip2 v22.8h, v31.8h, v27.8h\n" + "zip2 v21.8h, v29.8h, v24.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v20.8h, v30.8h, v26.8h\n" + "zip2 v19.8h, v28.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x20, x20, #0x1\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v16.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v25.8h, v20.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v16.8h\n" "10:" // Odds skip - "saddw v0.4s, v0.4s, v1.4h\n" - "str q0, [%x[out_ptr], #0x0]\n" - "saddw2 v31.4s, v31.4s, v1.8h\n" - "str q31, [%x[out_ptr], #0x10]\n" + "saddw v1.4s, v1.4s, v2.4h\n" + "saddw2 v0.4s, v0.4s, v2.8h\n" + "str q1, [%x[out_ptr], #0x0]\n" + "str q0, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp index 6cfed8f3a4..4a38187638 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,45 @@ void interleave_block<8, 1, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "add x28, x28, %x[row_offset]\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "add x27, x27, %x[row_offset]\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" "add x26, x26, %x[row_offset]\n" - "ldr x23, [%x[in], #0x20]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset]\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset]\n" - "ldr x20, [%x[in], #0x38]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset]\n" "add x22, x22, %x[row_offset]\n" "add x21, x21, %x[row_offset]\n" - "add x20, x20, %x[row_offset]\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -76,209 +77,208 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr d31, [x27], #0x8\n" - "sshll v31.8h, v31.8b, #0x0\n" - "ldr d30, [x26], #0x8\n" - "subs %x[width], %x[width], #0x8\n" + "ldr d25, [x28], #0x8\n" + "ldr d30, [x27], #0x8\n" + "sshll v25.8h, v25.8b, #0x0\n" "sshll v30.8h, v30.8b, #0x0\n" - "ldr d29, [x25], #0x8\n" - "cmp %x[width], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" "sshll v29.8h, v29.8b, #0x0\n" - "ldr d28, [x24], #0x8\n" - "ldr d25, [x23], #0x8\n" "sshll v28.8h, v28.8b, #0x0\n" - "ldr d23, [x22], #0x8\n" - "sshll v25.8h, v25.8b, #0x0\n" - "ldr d27, [x21], #0x8\n" - "zip1 v20.8h, v31.8h, v25.8h\n" - "ldr d26, [x20], #0x8\n" - "zip2 v25.8h, v31.8h, v25.8h\n" + "ldr d21, [x24], #0x8\n" + "ldr d27, [x23], #0x8\n" + "sshll v21.8h, v21.8b, #0x0\n" + "sshll v27.8h, v27.8b, #0x0\n" + "ldr d20, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "sshll v20.8h, v20.8b, #0x0\n" + "sshll v26.8h, v26.8b, #0x0\n" + "zip1 v23.8h, v25.8h, v21.8h\n" + "zip1 v22.8h, v29.8h, v20.8h\n" + "subs %x[width], %x[width], #0x8\n" + "cmp %x[width], #0x8\n" + "zip1 v19.8h, v30.8h, v27.8h\n" + "zip1 v18.8h, v28.8h, v26.8h\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" + "zip2 v25.8h, v25.8h, v21.8h\n" + "zip2 v21.8h, v29.8h, v20.8h\n" "prfm pldl1keep, [x26, #0x70]\n" - "sshll v23.8h, v23.8b, #0x0\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v24.8h, v30.8h, v23.8h\n" - "zip2 v23.8h, v30.8h, v23.8h\n" + "zip2 v20.8h, v30.8h, v27.8h\n" + "zip2 v16.8h, v28.8h, v26.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "sshll v27.8h, v27.8b, #0x0\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v19.8h, v29.8h, v27.8h\n" + "zip1 v24.8h, v23.8h, v22.8h\n" + "zip1 v17.8h, v19.8h, v18.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip1 v22.8h, v20.8h, v19.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip2 v21.8h, v20.8h, v19.8h\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip2 v19.8h, v29.8h, v27.8h\n" - "zip1 v20.8h, v25.8h, v19.8h\n" - "zip2 v19.8h, v25.8h, v19.8h\n" - "sshll v26.8h, v26.8b, #0x0\n" - "zip1 v18.8h, v28.8h, v26.8h\n" - "zip1 v17.8h, v24.8h, v18.8h\n" - "zip1 v16.8h, v22.8h, v17.8h\n" + "zip2 v23.8h, v23.8h, v22.8h\n" + "zip2 v19.8h, v19.8h, v18.8h\n" + "zip1 v22.8h, v25.8h, v21.8h\n" + "zip1 v18.8h, v20.8h, v16.8h\n" + "zip2 v21.8h, v25.8h, v21.8h\n" + "zip2 v20.8h, v20.8h, v16.8h\n" + "zip1 v16.8h, v24.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v16.8h, v22.8h, v17.8h\n" + "zip2 v16.8h, v24.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x10]\n" - "zip2 v17.8h, v24.8h, v18.8h\n" - "zip1 v16.8h, v21.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x20]\n" - "zip2 v16.8h, v21.8h, v17.8h\n" + "zip1 v17.8h, v23.8h, v19.8h\n" + "zip2 v16.8h, v23.8h, v19.8h\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip1 v19.8h, v22.8h, v18.8h\n" + "zip2 v18.8h, v22.8h, v18.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v18.8h, v28.8h, v26.8h\n" - "zip1 v17.8h, v23.8h, v18.8h\n" - "zip1 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x40]\n" - "zip2 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v17.8h, v23.8h, v18.8h\n" - "zip1 v16.8h, v19.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x60]\n" - "zip2 v16.8h, v19.8h, v17.8h\n" + "zip1 v17.8h, v21.8h, v20.8h\n" + "zip2 v16.8h, v21.8h, v20.8h\n" + "str q19, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" + "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" - "ldr s31, [x27], #0x4\n" - "ldr s30, [x26], #0x4\n" - "ldr s29, [x25], #0x4\n" - "ldr s28, [x24], #0x4\n" - "ldr s25, [x23], #0x4\n" - "ldr s23, [x22], #0x4\n" - "ldr s27, [x21], #0x4\n" - "ldr s26, [x20], #0x4\n" + "ldr s25, [x28], #0x4\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s27, [x23], #0x4\n" + "ldr s20, [x22], #0x4\n" + "ldr s26, [x21], #0x4\n" "tbz %x[width], #1, 4f\n" - "ld1 { v31.h }[2], [x27], #0x2\n" - "mov x19, #0x6\n" - "ld1 { v30.h }[2], [x26], #0x2\n" - "ld1 { v29.h }[2], [x25], #0x2\n" - "ld1 { v28.h }[2], [x24], #0x2\n" - "ld1 { v25.h }[2], [x23], #0x2\n" - "ld1 { v23.h }[2], [x22], #0x2\n" - "ld1 { v27.h }[2], [x21], #0x2\n" - "ld1 { v26.h }[2], [x20], #0x2\n" + "ld1 { v25.h }[2], [x28], #0x2\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "mov x20, #0x6\n" + "ld1 { v29.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v21.h }[2], [x24], #0x2\n" + "ld1 { v27.h }[2], [x23], #0x2\n" + "ld1 { v20.h }[2], [x22], #0x2\n" + "ld1 { v26.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v31.b }[6], [x27]\n" - "mov x19, #0x7\n" - "ld1 { v30.b }[6], [x26]\n" - "ld1 { v29.b }[6], [x25]\n" - "ld1 { v28.b }[6], [x24]\n" - "ld1 { v25.b }[6], [x23]\n" - "ld1 { v23.b }[6], [x22]\n" - "ld1 { v27.b }[6], [x21]\n" - "ld1 { v26.b }[6], [x20]\n" + "ld1 { v25.b }[6], [x28]\n" + "ld1 { v30.b }[6], [x27]\n" + "mov x20, #0x7\n" + "ld1 { v29.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v21.b }[6], [x24]\n" + "ld1 { v27.b }[6], [x23]\n" + "ld1 { v20.b }[6], [x22]\n" + "ld1 { v26.b }[6], [x21]\n" "b 7f\n" "4:" // odd_loads_1_4 - "mov x19, #0x4\n" + "mov x20, #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v31.b }[4], [x27]\n" - "ld1 { v30.b }[4], [x26]\n" - "mov x19, #0x5\n" - "ld1 { v29.b }[4], [x25]\n" - "ld1 { v28.b }[4], [x24]\n" - "ld1 { v25.b }[4], [x23]\n" - "ld1 { v23.b }[4], [x22]\n" - "ld1 { v27.b }[4], [x21]\n" - "ld1 { v26.b }[4], [x20]\n" + "ld1 { v25.b }[4], [x28]\n" + "ld1 { v30.b }[4], [x27]\n" + "mov x20, #0x5\n" + "ld1 { v29.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v21.b }[4], [x24]\n" + "ld1 { v27.b }[4], [x23]\n" + "ld1 { v20.b }[4], [x22]\n" + "ld1 { v26.b }[4], [x21]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" - "ldr h31, [x27], #0x2\n" - "ldr h30, [x26], #0x2\n" - "mov x19, #0x2\n" - "ldr h29, [x25], #0x2\n" - "ldr h28, [x24], #0x2\n" - "ldr h25, [x23], #0x2\n" - "ldr h23, [x22], #0x2\n" - "ldr h27, [x21], #0x2\n" - "ldr h26, [x20], #0x2\n" + "ldr h25, [x28], #0x2\n" + "ldr h30, [x27], #0x2\n" + "mov x20, #0x2\n" + "ldr h29, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h21, [x24], #0x2\n" + "ldr h27, [x23], #0x2\n" + "ldr h20, [x22], #0x2\n" + "ldr h26, [x21], #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v31.b }[2], [x27]\n" - "mov x19, #0x3\n" - "ld1 { v30.b }[2], [x26]\n" - "ld1 { v29.b }[2], [x25]\n" - "ld1 { v28.b }[2], [x24]\n" - "ld1 { v25.b }[2], [x23]\n" - "ld1 { v23.b }[2], [x22]\n" - "ld1 { v27.b }[2], [x21]\n" - "ld1 { v26.b }[2], [x20]\n" + "ld1 { v25.b }[2], [x28]\n" + "ld1 { v30.b }[2], [x27]\n" + "mov x20, #0x3\n" + "ld1 { v29.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v21.b }[2], [x24]\n" + "ld1 { v27.b }[2], [x23]\n" + "ld1 { v20.b }[2], [x22]\n" + "ld1 { v26.b }[2], [x21]\n" "b 7f\n" "6:" // odd_loads_1_0 - "ldr b31, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr b30, [x26, #0x0]\n" - "ldr b29, [x25, #0x0]\n" - "ldr b28, [x24, #0x0]\n" - "ldr b25, [x23, #0x0]\n" - "ldr b23, [x22, #0x0]\n" - "ldr b27, [x21, #0x0]\n" - "ldr b26, [x20, #0x0]\n" + "ldr b25, [x28, #0x0]\n" + "ldr b30, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr b29, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b21, [x24, #0x0]\n" + "ldr b27, [x23, #0x0]\n" + "ldr b20, [x22, #0x0]\n" + "ldr b26, [x21, #0x0]\n" "7:" // Odd load end - "sshll v31.8h, v31.8b, #0x0\n" - "subs x19, x19, #0x1\n" + "sshll v25.8h, v25.8b, #0x0\n" "sshll v30.8h, v30.8b, #0x0\n" + "subs x20, x20, #0x1\n" "sshll v29.8h, v29.8b, #0x0\n" "sshll v28.8h, v28.8b, #0x0\n" - "sshll v25.8h, v25.8b, #0x0\n" - "zip1 v20.8h, v31.8h, v25.8h\n" - "sshll v23.8h, v23.8b, #0x0\n" - "zip1 v24.8h, v30.8h, v23.8h\n" + "sshll v21.8h, v21.8b, #0x0\n" "sshll v27.8h, v27.8b, #0x0\n" - "zip1 v19.8h, v29.8h, v27.8h\n" - "zip1 v22.8h, v20.8h, v19.8h\n" + "sshll v20.8h, v20.8b, #0x0\n" "sshll v26.8h, v26.8b, #0x0\n" + "zip1 v23.8h, v25.8h, v21.8h\n" + "zip1 v22.8h, v29.8h, v20.8h\n" + "zip1 v19.8h, v30.8h, v27.8h\n" "zip1 v18.8h, v28.8h, v26.8h\n" - "zip1 v17.8h, v24.8h, v18.8h\n" - "zip1 v16.8h, v22.8h, v17.8h\n" + "zip1 v24.8h, v23.8h, v22.8h\n" + "zip1 v17.8h, v19.8h, v18.8h\n" + "zip1 v16.8h, v24.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v22.8h, v17.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v16.8h, v24.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v21.8h, v20.8h, v19.8h\n" - "zip2 v17.8h, v24.8h, v18.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v16.8h, v21.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v23.8h, v23.8h, v22.8h\n" + "zip2 v19.8h, v19.8h, v18.8h\n" + "subs x20, x20, #0x1\n" + "zip1 v17.8h, v23.8h, v19.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v21.8h, v17.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v16.8h, v23.8h, v19.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v25.8h, v31.8h, v25.8h\n" - "zip2 v19.8h, v29.8h, v27.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v20.8h, v25.8h, v19.8h\n" - "zip2 v23.8h, v30.8h, v23.8h\n" - "zip2 v18.8h, v28.8h, v26.8h\n" - "zip1 v17.8h, v23.8h, v18.8h\n" - "zip1 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v25.8h, v25.8h, v21.8h\n" + "zip2 v21.8h, v29.8h, v20.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v20.8h, v30.8h, v27.8h\n" + "zip2 v16.8h, v28.8h, v26.8h\n" + "zip1 v22.8h, v25.8h, v21.8h\n" + "zip1 v18.8h, v20.8h, v16.8h\n" + "zip1 v19.8h, v22.8h, v18.8h\n" + "str q19, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" + "subs x20, x20, #0x1\n" + "zip2 v18.8h, v22.8h, v18.8h\n" + "str q18, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v19.8h, v25.8h, v19.8h\n" - "zip2 v17.8h, v23.8h, v18.8h\n" - "zip1 v16.8h, v19.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v21.8h, v25.8h, v21.8h\n" + "zip2 v20.8h, v20.8h, v16.8h\n" + "zip1 v17.8h, v21.8h, v20.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "8:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp index b710861417..3ad103c8d4 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,39 +31,40 @@ void interleave_block<8, 1, VLType::None, true>( ) { __asm__ __volatile__( - "movi v1.8h, #0x0\n" - "ldr x27, [%x[in], #0x0]\n" - "mov x19, #0x0\n" - "movi v0.4s, #0x0\n" - "ldr x26, [%x[in], #0x8]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "movi v31.4s, #0x0\n" - "ldr x25, [%x[in], #0x10]\n" + "mov x20, #0x0\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" + "movi v2.8h, #0x0\n" + "movi v1.4s, #0x0\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" + "movi v0.4s, #0x0\n" + "add x28, x28, %x[row_offset]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x27, x27, %x[row_offset]\n" - "ldr x24, [%x[in], #0x18]\n" - "ldr x23, [%x[in], #0x20]\n" "add x26, x26, %x[row_offset]\n" - "ldr x22, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset]\n" - "ldr x20, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset]\n" "add x22, x22, %x[row_offset]\n" "add x21, x21, %x[row_offset]\n" - "add x20, x20, %x[row_offset]\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj + "prfm pldl1keep, [x28, #0x0]\n" "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" @@ -71,7 +72,7 @@ void interleave_block<8, 1, VLType::None, true>( "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -79,242 +80,241 @@ void interleave_block<8, 1, VLType::None, true>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "cbnz %w[first], 2f\n" "sub %x[out_ptr], %x[out_ptr], #0x20\n" - "ld1 { v0.4s }, [%x[out_ptr]]\n" - "ldr q31, [%x[out_ptr], #0x10]\n" + "ld1 { v1.4s }, [%x[out_ptr]]\n" + "ldr q0, [%x[out_ptr], #0x10]\n" "2:" // first_pass "cmp %x[width], #0x8\n" "blt 5f\n" "3:" // Main loop head - "cmp x19, #0xe\n" + "cmp x20, #0xe\n" "ble 4f\n" - "saddw v0.4s, v0.4s, v1.4h\n" - "saddw2 v31.4s, v31.4s, v1.8h\n" - "mov x19, #0x0\n" - "movi v1.8h, #0x0\n" + "saddw v1.4s, v1.4s, v2.4h\n" + "saddw2 v0.4s, v0.4s, v2.8h\n" + "mov x20, #0x0\n" + "movi v2.8h, #0x0\n" "4:" // no_accumulate_16 + "ldr d31, [x28], #0x8\n" "ldr d30, [x27], #0x8\n" + "sshll v31.8h, v31.8b, #0x0\n" "sshll v30.8h, v30.8b, #0x0\n" "ldr d29, [x26], #0x8\n" - "add x19, x19, #0x1\n" - "sshll v29.8h, v29.8b, #0x0\n" "ldr d28, [x25], #0x8\n" - "subs %x[width], %x[width], #0x8\n" + "sshll v29.8h, v29.8b, #0x0\n" "sshll v28.8h, v28.8b, #0x0\n" "ldr d27, [x24], #0x8\n" - "cmp %x[width], #0x8\n" + "ldr d26, [x23], #0x8\n" "sshll v27.8h, v27.8b, #0x0\n" - "ldr d24, [x23], #0x8\n" - "ldr d23, [x22], #0x8\n" + "sshll v26.8h, v26.8b, #0x0\n" + "ldr d25, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "sshll v25.8h, v25.8b, #0x0\n" "sshll v24.8h, v24.8b, #0x0\n" - "ldr d21, [x21], #0x8\n" - "sshll v23.8h, v23.8b, #0x0\n" - "ldr d26, [x20], #0x8\n" - "zip1 v20.8h, v30.8h, v24.8h\n" + "zip1 v23.8h, v31.8h, v27.8h\n" + "zip1 v22.8h, v29.8h, v25.8h\n" + "subs %x[width], %x[width], #0x8\n" + "cmp %x[width], #0x8\n" + "zip1 v21.8h, v30.8h, v26.8h\n" + "zip1 v20.8h, v28.8h, v24.8h\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip1 v25.8h, v29.8h, v23.8h\n" + "zip1 v18.8h, v23.8h, v22.8h\n" + "zip1 v17.8h, v21.8h, v20.8h\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip2 v24.8h, v30.8h, v24.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip2 v23.8h, v29.8h, v23.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "sshll v21.8h, v21.8b, #0x0\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v19.8h, v28.8h, v21.8h\n" + "zip2 v19.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v23.8h, v22.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip1 v22.8h, v20.8h, v19.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip2 v19.8h, v20.8h, v19.8h\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip2 v20.8h, v28.8h, v21.8h\n" - "zip1 v21.8h, v24.8h, v20.8h\n" - "zip2 v20.8h, v24.8h, v20.8h\n" - "sshll v26.8h, v26.8b, #0x0\n" - "zip1 v18.8h, v27.8h, v26.8h\n" - "zip1 v17.8h, v25.8h, v18.8h\n" - "zip1 v16.8h, v22.8h, v17.8h\n" + "zip2 v17.8h, v21.8h, v20.8h\n" + "add v2.8h, v2.8h, v19.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v22.8h, v17.8h\n" - "str q17, [%x[out_ptr], #0x10]\n" - "zip2 v16.8h, v25.8h, v18.8h\n" - "add v1.8h, v1.8h, v17.8h\n" - "zip1 v17.8h, v19.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x20]\n" - "zip2 v16.8h, v19.8h, v16.8h\n" + "add x20, x20, #0x1\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v22.8h, v31.8h, v27.8h\n" + "str q19, [%x[out_ptr], #0x10]\n" + "zip2 v21.8h, v29.8h, v25.8h\n" + "zip2 v20.8h, v30.8h, v26.8h\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v19.8h, v28.8h, v24.8h\n" + "add v2.8h, v2.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "add v1.8h, v1.8h, v17.8h\n" - "zip2 v19.8h, v27.8h, v26.8h\n" - "zip1 v17.8h, v23.8h, v19.8h\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip1 v16.8h, v21.8h, v17.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "add v2.8h, v2.8h, v16.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x40]\n" - "zip2 v18.8h, v21.8h, v17.8h\n" - "str q18, [%x[out_ptr], #0x50]\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v16.8h, v23.8h, v19.8h\n" - "zip1 v17.8h, v20.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x60]\n" - "add v1.8h, v1.8h, v18.8h\n" - "zip2 v16.8h, v20.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "add v2.8h, v2.8h, v16.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "add v2.8h, v2.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x60]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x70]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" - "add v1.8h, v1.8h, v17.8h\n" - "add v1.8h, v1.8h, v16.8h\n" "bge 3b\n" "5:" // Main loop skip "cbz %x[width], 10f\n" "tbz %x[width], #2, 7f\n" + "ldr s31, [x28], #0x4\n" "ldr s30, [x27], #0x4\n" "ldr s29, [x26], #0x4\n" "ldr s28, [x25], #0x4\n" "ldr s27, [x24], #0x4\n" - "ldr s24, [x23], #0x4\n" - "ldr s23, [x22], #0x4\n" - "ldr s21, [x21], #0x4\n" - "ldr s26, [x20], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s24, [x21], #0x4\n" "tbz %x[width], #1, 6f\n" + "ld1 { v31.h }[2], [x28], #0x2\n" "ld1 { v30.h }[2], [x27], #0x2\n" - "mov x19, #0x6\n" + "mov x20, #0x6\n" "ld1 { v29.h }[2], [x26], #0x2\n" "ld1 { v28.h }[2], [x25], #0x2\n" "ld1 { v27.h }[2], [x24], #0x2\n" - "ld1 { v24.h }[2], [x23], #0x2\n" - "ld1 { v23.h }[2], [x22], #0x2\n" - "ld1 { v21.h }[2], [x21], #0x2\n" - "ld1 { v26.h }[2], [x20], #0x2\n" + "ld1 { v26.h }[2], [x23], #0x2\n" + "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v24.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 9f\n" + "ld1 { v31.b }[6], [x28]\n" "ld1 { v30.b }[6], [x27]\n" - "mov x19, #0x7\n" + "mov x20, #0x7\n" "ld1 { v29.b }[6], [x26]\n" "ld1 { v28.b }[6], [x25]\n" "ld1 { v27.b }[6], [x24]\n" - "ld1 { v24.b }[6], [x23]\n" - "ld1 { v23.b }[6], [x22]\n" - "ld1 { v21.b }[6], [x21]\n" - "ld1 { v26.b }[6], [x20]\n" + "ld1 { v26.b }[6], [x23]\n" + "ld1 { v25.b }[6], [x22]\n" + "ld1 { v24.b }[6], [x21]\n" "b 9f\n" "6:" // odd_loads_1_4 - "mov x19, #0x4\n" + "mov x20, #0x4\n" "tbz %x[width], #0, 9f\n" + "ld1 { v31.b }[4], [x28]\n" "ld1 { v30.b }[4], [x27]\n" + "mov x20, #0x5\n" "ld1 { v29.b }[4], [x26]\n" - "mov x19, #0x5\n" "ld1 { v28.b }[4], [x25]\n" "ld1 { v27.b }[4], [x24]\n" - "ld1 { v24.b }[4], [x23]\n" - "ld1 { v23.b }[4], [x22]\n" - "ld1 { v21.b }[4], [x21]\n" - "ld1 { v26.b }[4], [x20]\n" + "ld1 { v26.b }[4], [x23]\n" + "ld1 { v25.b }[4], [x22]\n" + "ld1 { v24.b }[4], [x21]\n" "b 9f\n" "7:" // odd_loads_2_0 "tbz %x[width], #1, 8f\n" + "ldr h31, [x28], #0x2\n" "ldr h30, [x27], #0x2\n" + "mov x20, #0x2\n" "ldr h29, [x26], #0x2\n" - "mov x19, #0x2\n" "ldr h28, [x25], #0x2\n" "ldr h27, [x24], #0x2\n" - "ldr h24, [x23], #0x2\n" - "ldr h23, [x22], #0x2\n" - "ldr h21, [x21], #0x2\n" - "ldr h26, [x20], #0x2\n" + "ldr h26, [x23], #0x2\n" + "ldr h25, [x22], #0x2\n" + "ldr h24, [x21], #0x2\n" "tbz %x[width], #0, 9f\n" + "ld1 { v31.b }[2], [x28]\n" "ld1 { v30.b }[2], [x27]\n" - "mov x19, #0x3\n" + "mov x20, #0x3\n" "ld1 { v29.b }[2], [x26]\n" "ld1 { v28.b }[2], [x25]\n" "ld1 { v27.b }[2], [x24]\n" - "ld1 { v24.b }[2], [x23]\n" - "ld1 { v23.b }[2], [x22]\n" - "ld1 { v21.b }[2], [x21]\n" - "ld1 { v26.b }[2], [x20]\n" + "ld1 { v26.b }[2], [x23]\n" + "ld1 { v25.b }[2], [x22]\n" + "ld1 { v24.b }[2], [x21]\n" "b 9f\n" "8:" // odd_loads_1_0 + "ldr b31, [x28, #0x0]\n" "ldr b30, [x27, #0x0]\n" - "mov x19, #0x1\n" + "mov x20, #0x1\n" "ldr b29, [x26, #0x0]\n" "ldr b28, [x25, #0x0]\n" "ldr b27, [x24, #0x0]\n" - "ldr b24, [x23, #0x0]\n" - "ldr b23, [x22, #0x0]\n" - "ldr b21, [x21, #0x0]\n" - "ldr b26, [x20, #0x0]\n" + "ldr b26, [x23, #0x0]\n" + "ldr b25, [x22, #0x0]\n" + "ldr b24, [x21, #0x0]\n" "9:" // Odd load end + "sshll v31.8h, v31.8b, #0x0\n" "sshll v30.8h, v30.8b, #0x0\n" - "subs x19, x19, #0x1\n" + "subs x20, x20, #0x1\n" "sshll v29.8h, v29.8b, #0x0\n" "sshll v28.8h, v28.8b, #0x0\n" "sshll v27.8h, v27.8b, #0x0\n" - "sshll v24.8h, v24.8b, #0x0\n" - "zip1 v20.8h, v30.8h, v24.8h\n" - "sshll v23.8h, v23.8b, #0x0\n" - "zip1 v25.8h, v29.8h, v23.8h\n" - "sshll v21.8h, v21.8b, #0x0\n" - "zip1 v19.8h, v28.8h, v21.8h\n" - "zip1 v22.8h, v20.8h, v19.8h\n" "sshll v26.8h, v26.8b, #0x0\n" - "zip1 v18.8h, v27.8h, v26.8h\n" - "zip1 v17.8h, v25.8h, v18.8h\n" - "zip1 v16.8h, v22.8h, v17.8h\n" + "sshll v25.8h, v25.8b, #0x0\n" + "sshll v24.8h, v24.8b, #0x0\n" + "zip1 v23.8h, v31.8h, v27.8h\n" + "zip1 v22.8h, v29.8h, v25.8h\n" + "zip1 v21.8h, v30.8h, v26.8h\n" + "zip1 v20.8h, v28.8h, v24.8h\n" + "zip1 v18.8h, v23.8h, v22.8h\n" + "zip1 v17.8h, v21.8h, v20.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" - "zip2 v17.8h, v22.8h, v17.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v17.8h\n" + "zip2 v19.8h, v18.8h, v17.8h\n" + "subs x20, x20, #0x1\n" + "str q19, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v19.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v19.8h, v20.8h, v19.8h\n" - "zip2 v16.8h, v25.8h, v18.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v17.8h, v19.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v18.8h, v23.8h, v22.8h\n" + "zip2 v17.8h, v21.8h, v20.8h\n" + "subs x20, x20, #0x1\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v17.8h\n" "beq 10f\n" - "zip2 v16.8h, v19.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x20, x20, #0x1\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v16.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v24.8h, v30.8h, v24.8h\n" - "zip2 v20.8h, v28.8h, v21.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v21.8h, v24.8h, v20.8h\n" - "zip2 v23.8h, v29.8h, v23.8h\n" - "zip2 v19.8h, v27.8h, v26.8h\n" - "zip1 v17.8h, v23.8h, v19.8h\n" - "zip1 v16.8h, v21.8h, v17.8h\n" + "zip2 v22.8h, v31.8h, v27.8h\n" + "zip2 v21.8h, v29.8h, v25.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v20.8h, v30.8h, v26.8h\n" + "zip2 v19.8h, v28.8h, v24.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" - "zip2 v18.8h, v21.8h, v17.8h\n" - "str q18, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v18.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x20, x20, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v20.8h, v24.8h, v20.8h\n" - "zip2 v16.8h, v23.8h, v19.8h\n" - "zip1 v17.8h, v20.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v17.8h\n" "10:" // Odds skip - "saddw v0.4s, v0.4s, v1.4h\n" - "str q0, [%x[out_ptr], #0x0]\n" - "saddw2 v31.4s, v31.4s, v1.8h\n" - "str q31, [%x[out_ptr], #0x10]\n" + "saddw v1.4s, v1.4s, v2.4h\n" + "saddw2 v0.4s, v0.4s, v2.8h\n" + "str q1, [%x[out_ptr], #0x0]\n" + "str q0, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp index 24ece9a68e..de29d77a22 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,39 +31,40 @@ void interleave_block<8, 1, VLType::None, true>( ) { __asm__ __volatile__( - "movi v1.8h, #0x0\n" - "ldr x27, [%x[in], #0x0]\n" - "mov x19, #0x0\n" - "movi v0.4s, #0x0\n" - "ldr x26, [%x[in], #0x8]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "movi v31.4s, #0x0\n" - "ldr x25, [%x[in], #0x10]\n" + "mov x20, #0x0\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" + "movi v2.8h, #0x0\n" + "movi v1.4s, #0x0\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" + "movi v0.4s, #0x0\n" + "add x28, x28, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x27, x27, %x[row_offset], LSL #1\n" - "ldr x24, [%x[in], #0x18]\n" - "ldr x23, [%x[in], #0x20]\n" "add x26, x26, %x[row_offset], LSL #1\n" - "ldr x22, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset], LSL #1\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset], LSL #1\n" - "ldr x20, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset], LSL #1\n" "add x22, x22, %x[row_offset], LSL #1\n" "add x21, x21, %x[row_offset], LSL #1\n" - "add x20, x20, %x[row_offset], LSL #1\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj + "prfm pldl1keep, [x28, #0x0]\n" "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" @@ -71,7 +72,7 @@ void interleave_block<8, 1, VLType::None, true>( "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -79,226 +80,225 @@ void interleave_block<8, 1, VLType::None, true>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "cbnz %w[first], 2f\n" "sub %x[out_ptr], %x[out_ptr], #0x20\n" - "ld1 { v0.4s }, [%x[out_ptr]]\n" - "ldr q31, [%x[out_ptr], #0x10]\n" + "ld1 { v1.4s }, [%x[out_ptr]]\n" + "ldr q0, [%x[out_ptr], #0x10]\n" "2:" // first_pass "cmp %x[width], #0x8\n" "blt 5f\n" "3:" // Main loop head - "cmp x19, #0xe\n" + "cmp x20, #0xe\n" "ble 4f\n" - "uaddw v0.4s, v0.4s, v1.4h\n" - "uaddw2 v31.4s, v31.4s, v1.8h\n" - "mov x19, #0x0\n" - "movi v1.8h, #0x0\n" + "uaddw v1.4s, v1.4s, v2.4h\n" + "uaddw2 v0.4s, v0.4s, v2.8h\n" + "mov x20, #0x0\n" + "movi v2.8h, #0x0\n" "4:" // no_accumulate_16 + "ldr q31, [x28], #0x10\n" "ldr q30, [x27], #0x10\n" - "add x19, x19, #0x1\n" - "ldr q29, [x26], #0x10\n" "subs %x[width], %x[width], #0x8\n" - "ldr q28, [x25], #0x10\n" "cmp %x[width], #0x8\n" + "ldr q29, [x26], #0x10\n" + "ldr q28, [x25], #0x10\n" + "add x20, x20, #0x1\n" "ldr q27, [x24], #0x10\n" - "ldr q25, [x23], #0x10\n" - "zip1 v26.8h, v30.8h, v25.8h\n" - "ldr q21, [x22], #0x10\n" - "zip2 v25.8h, v30.8h, v25.8h\n" - "ldr q24, [x21], #0x10\n" - "ldr q23, [x20], #0x10\n" - "zip1 v22.8h, v29.8h, v21.8h\n" + "ldr q26, [x23], #0x10\n" + "zip1 v25.8h, v31.8h, v27.8h\n" + "zip1 v22.8h, v30.8h, v26.8h\n" + "ldr q24, [x22], #0x10\n" + "ldr q23, [x21], #0x10\n" + "zip1 v18.8h, v29.8h, v24.8h\n" + "zip1 v21.8h, v28.8h, v23.8h\n" + "zip1 v17.8h, v25.8h, v18.8h\n" + "zip1 v16.8h, v22.8h, v21.8h\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v21.8h, v29.8h, v21.8h\n" + "zip1 v20.8h, v17.8h, v16.8h\n" + "add v2.8h, v2.8h, v20.8h\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip1 v20.8h, v28.8h, v24.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v18.8h, v26.8h, v20.8h\n" + "zip2 v19.8h, v17.8h, v16.8h\n" + "zip2 v18.8h, v25.8h, v18.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "zip1 v19.8h, v27.8h, v23.8h\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v16.8h, v22.8h, v19.8h\n" + "zip2 v17.8h, v22.8h, v21.8h\n" + "add v2.8h, v2.8h, v19.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip1 v17.8h, v18.8h, v16.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "add v1.8h, v1.8h, v17.8h\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip2 v16.8h, v18.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" - "zip2 v18.8h, v26.8h, v20.8h\n" - "str q16, [%x[out_ptr], #0x10]\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v22.8h, v31.8h, v27.8h\n" + "str q20, [%x[out_ptr], #0x0]\n" + "zip2 v21.8h, v29.8h, v24.8h\n" + "zip2 v20.8h, v30.8h, v26.8h\n" + "str q19, [%x[out_ptr], #0x10]\n" + "zip2 v19.8h, v28.8h, v23.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x20]\n" - "add v1.8h, v1.8h, v16.8h\n" "zip2 v16.8h, v18.8h, v17.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v20.8h, v28.8h, v24.8h\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip1 v18.8h, v25.8h, v20.8h\n" - "zip2 v19.8h, v27.8h, v23.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x40]\n" - "add v1.8h, v1.8h, v16.8h\n" "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v18.8h, v25.8h, v20.8h\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x60]\n" - "add v1.8h, v1.8h, v16.8h\n" "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x70]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" - "add v1.8h, v1.8h, v16.8h\n" "bge 3b\n" "5:" // Main loop skip "cbz %x[width], 10f\n" "tbz %x[width], #2, 7f\n" + "ldr d31, [x28], #0x8\n" "ldr d30, [x27], #0x8\n" "ldr d29, [x26], #0x8\n" "ldr d28, [x25], #0x8\n" "ldr d27, [x24], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d21, [x22], #0x8\n" - "ldr d24, [x21], #0x8\n" - "ldr d23, [x20], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" "tbz %x[width], #1, 6f\n" + "ld1 { v31.s }[2], [x28], #0x4\n" "ld1 { v30.s }[2], [x27], #0x4\n" - "mov x19, #0x6\n" + "mov x20, #0x6\n" "ld1 { v29.s }[2], [x26], #0x4\n" "ld1 { v28.s }[2], [x25], #0x4\n" "ld1 { v27.s }[2], [x24], #0x4\n" - "ld1 { v25.s }[2], [x23], #0x4\n" - "ld1 { v21.s }[2], [x22], #0x4\n" - "ld1 { v24.s }[2], [x21], #0x4\n" - "ld1 { v23.s }[2], [x20], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "ld1 { v24.s }[2], [x22], #0x4\n" + "ld1 { v23.s }[2], [x21], #0x4\n" "tbz %x[width], #0, 9f\n" + "ld1 { v31.h }[6], [x28]\n" "ld1 { v30.h }[6], [x27]\n" - "mov x19, #0x7\n" + "mov x20, #0x7\n" "ld1 { v29.h }[6], [x26]\n" "ld1 { v28.h }[6], [x25]\n" "ld1 { v27.h }[6], [x24]\n" - "ld1 { v25.h }[6], [x23]\n" - "ld1 { v21.h }[6], [x22]\n" - "ld1 { v24.h }[6], [x21]\n" - "ld1 { v23.h }[6], [x20]\n" + "ld1 { v26.h }[6], [x23]\n" + "ld1 { v24.h }[6], [x22]\n" + "ld1 { v23.h }[6], [x21]\n" "b 9f\n" "6:" // odd_loads_1_4 - "mov x19, #0x4\n" + "mov x20, #0x4\n" "tbz %x[width], #0, 9f\n" + "ld1 { v31.h }[4], [x28]\n" "ld1 { v30.h }[4], [x27]\n" + "mov x20, #0x5\n" "ld1 { v29.h }[4], [x26]\n" - "mov x19, #0x5\n" "ld1 { v28.h }[4], [x25]\n" "ld1 { v27.h }[4], [x24]\n" - "ld1 { v25.h }[4], [x23]\n" - "ld1 { v21.h }[4], [x22]\n" - "ld1 { v24.h }[4], [x21]\n" - "ld1 { v23.h }[4], [x20]\n" + "ld1 { v26.h }[4], [x23]\n" + "ld1 { v24.h }[4], [x22]\n" + "ld1 { v23.h }[4], [x21]\n" "b 9f\n" "7:" // odd_loads_2_0 "tbz %x[width], #1, 8f\n" + "ldr s31, [x28], #0x4\n" "ldr s30, [x27], #0x4\n" + "mov x20, #0x2\n" "ldr s29, [x26], #0x4\n" - "mov x19, #0x2\n" "ldr s28, [x25], #0x4\n" "ldr s27, [x24], #0x4\n" - "ldr s25, [x23], #0x4\n" - "ldr s21, [x22], #0x4\n" - "ldr s24, [x21], #0x4\n" - "ldr s23, [x20], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" "tbz %x[width], #0, 9f\n" + "ld1 { v31.h }[2], [x28]\n" "ld1 { v30.h }[2], [x27]\n" - "mov x19, #0x3\n" + "mov x20, #0x3\n" "ld1 { v29.h }[2], [x26]\n" "ld1 { v28.h }[2], [x25]\n" "ld1 { v27.h }[2], [x24]\n" - "ld1 { v25.h }[2], [x23]\n" - "ld1 { v21.h }[2], [x22]\n" - "ld1 { v24.h }[2], [x21]\n" - "ld1 { v23.h }[2], [x20]\n" + "ld1 { v26.h }[2], [x23]\n" + "ld1 { v24.h }[2], [x22]\n" + "ld1 { v23.h }[2], [x21]\n" "b 9f\n" "8:" // odd_loads_1_0 + "ldr h31, [x28, #0x0]\n" "ldr h30, [x27, #0x0]\n" - "mov x19, #0x1\n" + "mov x20, #0x1\n" "ldr h29, [x26, #0x0]\n" "ldr h28, [x25, #0x0]\n" "ldr h27, [x24, #0x0]\n" - "ldr h25, [x23, #0x0]\n" - "ldr h21, [x22, #0x0]\n" - "ldr h24, [x21, #0x0]\n" - "ldr h23, [x20, #0x0]\n" + "ldr h26, [x23, #0x0]\n" + "ldr h24, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" "9:" // Odd load end - "zip1 v26.8h, v30.8h, v25.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v20.8h, v28.8h, v24.8h\n" - "zip1 v18.8h, v26.8h, v20.8h\n" - "zip1 v22.8h, v29.8h, v21.8h\n" - "zip1 v19.8h, v27.8h, v23.8h\n" - "zip1 v16.8h, v22.8h, v19.8h\n" - "zip1 v17.8h, v18.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip1 v25.8h, v31.8h, v27.8h\n" + "zip1 v18.8h, v29.8h, v24.8h\n" + "subs x20, x20, #0x1\n" + "zip1 v22.8h, v30.8h, v26.8h\n" + "zip1 v21.8h, v28.8h, v23.8h\n" + "zip1 v17.8h, v25.8h, v18.8h\n" + "zip1 v16.8h, v22.8h, v21.8h\n" + "zip1 v20.8h, v17.8h, v16.8h\n" + "str q20, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v20.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v17.8h\n" "beq 10f\n" - "zip2 v16.8h, v18.8h, v16.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v16.8h\n" + "zip2 v19.8h, v17.8h, v16.8h\n" + "subs x20, x20, #0x1\n" + "str q19, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v19.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v26.8h, v20.8h\n" - "zip2 v17.8h, v22.8h, v19.8h\n" - "subs x19, x19, #0x1\n" + "zip2 v18.8h, v25.8h, v18.8h\n" + "zip2 v17.8h, v22.8h, v21.8h\n" + "subs x20, x20, #0x1\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x20, x20, #0x1\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v16.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v25.8h, v30.8h, v25.8h\n" - "zip2 v20.8h, v28.8h, v24.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v18.8h, v25.8h, v20.8h\n" - "zip2 v21.8h, v29.8h, v21.8h\n" - "zip2 v19.8h, v27.8h, v23.8h\n" - "zip1 v17.8h, v21.8h, v19.8h\n" + "zip2 v22.8h, v31.8h, v27.8h\n" + "zip2 v21.8h, v29.8h, v24.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v20.8h, v30.8h, v26.8h\n" + "zip2 v19.8h, v28.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x20, x20, #0x1\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v16.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v25.8h, v20.8h\n" - "zip2 v17.8h, v21.8h, v19.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v16.8h\n" "10:" // Odds skip - "uaddw v0.4s, v0.4s, v1.4h\n" - "str q0, [%x[out_ptr], #0x0]\n" - "uaddw2 v31.4s, v31.4s, v1.8h\n" - "str q31, [%x[out_ptr], #0x10]\n" + "uaddw v1.4s, v1.4s, v2.4h\n" + "uaddw2 v0.4s, v0.4s, v2.8h\n" + "str q1, [%x[out_ptr], #0x0]\n" + "str q0, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp index 0db2f7fd51..43a3a46801 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,45 @@ void interleave_block<8, 1, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "add x28, x28, %x[row_offset]\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "add x27, x27, %x[row_offset]\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" "add x26, x26, %x[row_offset]\n" - "ldr x23, [%x[in], #0x20]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset]\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset]\n" - "ldr x20, [%x[in], #0x38]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset]\n" "add x22, x22, %x[row_offset]\n" "add x21, x21, %x[row_offset]\n" - "add x20, x20, %x[row_offset]\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -76,209 +77,208 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr d31, [x27], #0x8\n" - "ushll v31.8h, v31.8b, #0x0\n" - "ldr d30, [x26], #0x8\n" - "subs %x[width], %x[width], #0x8\n" + "ldr d25, [x28], #0x8\n" + "ldr d30, [x27], #0x8\n" + "ushll v25.8h, v25.8b, #0x0\n" "ushll v30.8h, v30.8b, #0x0\n" - "ldr d29, [x25], #0x8\n" - "cmp %x[width], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" "ushll v29.8h, v29.8b, #0x0\n" - "ldr d28, [x24], #0x8\n" - "ldr d25, [x23], #0x8\n" "ushll v28.8h, v28.8b, #0x0\n" - "ldr d23, [x22], #0x8\n" - "ushll v25.8h, v25.8b, #0x0\n" - "ldr d27, [x21], #0x8\n" - "zip1 v20.8h, v31.8h, v25.8h\n" - "ldr d26, [x20], #0x8\n" - "zip2 v25.8h, v31.8h, v25.8h\n" + "ldr d21, [x24], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ushll v21.8h, v21.8b, #0x0\n" + "ushll v27.8h, v27.8b, #0x0\n" + "ldr d20, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "ushll v20.8h, v20.8b, #0x0\n" + "ushll v26.8h, v26.8b, #0x0\n" + "zip1 v23.8h, v25.8h, v21.8h\n" + "zip1 v22.8h, v29.8h, v20.8h\n" + "subs %x[width], %x[width], #0x8\n" + "cmp %x[width], #0x8\n" + "zip1 v19.8h, v30.8h, v27.8h\n" + "zip1 v18.8h, v28.8h, v26.8h\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" + "zip2 v25.8h, v25.8h, v21.8h\n" + "zip2 v21.8h, v29.8h, v20.8h\n" "prfm pldl1keep, [x26, #0x70]\n" - "ushll v23.8h, v23.8b, #0x0\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v24.8h, v30.8h, v23.8h\n" - "zip2 v23.8h, v30.8h, v23.8h\n" + "zip2 v20.8h, v30.8h, v27.8h\n" + "zip2 v16.8h, v28.8h, v26.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "ushll v27.8h, v27.8b, #0x0\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v19.8h, v29.8h, v27.8h\n" + "zip1 v24.8h, v23.8h, v22.8h\n" + "zip1 v17.8h, v19.8h, v18.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip1 v22.8h, v20.8h, v19.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip2 v21.8h, v20.8h, v19.8h\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip2 v19.8h, v29.8h, v27.8h\n" - "zip1 v20.8h, v25.8h, v19.8h\n" - "zip2 v19.8h, v25.8h, v19.8h\n" - "ushll v26.8h, v26.8b, #0x0\n" - "zip1 v18.8h, v28.8h, v26.8h\n" - "zip1 v17.8h, v24.8h, v18.8h\n" - "zip1 v16.8h, v22.8h, v17.8h\n" + "zip2 v23.8h, v23.8h, v22.8h\n" + "zip2 v19.8h, v19.8h, v18.8h\n" + "zip1 v22.8h, v25.8h, v21.8h\n" + "zip1 v18.8h, v20.8h, v16.8h\n" + "zip2 v21.8h, v25.8h, v21.8h\n" + "zip2 v20.8h, v20.8h, v16.8h\n" + "zip1 v16.8h, v24.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v16.8h, v22.8h, v17.8h\n" + "zip2 v16.8h, v24.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x10]\n" - "zip2 v17.8h, v24.8h, v18.8h\n" - "zip1 v16.8h, v21.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x20]\n" - "zip2 v16.8h, v21.8h, v17.8h\n" + "zip1 v17.8h, v23.8h, v19.8h\n" + "zip2 v16.8h, v23.8h, v19.8h\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip1 v19.8h, v22.8h, v18.8h\n" + "zip2 v18.8h, v22.8h, v18.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "zip2 v18.8h, v28.8h, v26.8h\n" - "zip1 v17.8h, v23.8h, v18.8h\n" - "zip1 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x40]\n" - "zip2 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x50]\n" - "zip2 v17.8h, v23.8h, v18.8h\n" - "zip1 v16.8h, v19.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x60]\n" - "zip2 v16.8h, v19.8h, v17.8h\n" + "zip1 v17.8h, v21.8h, v20.8h\n" + "zip2 v16.8h, v21.8h, v20.8h\n" + "str q19, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" + "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" - "ldr s31, [x27], #0x4\n" - "ldr s30, [x26], #0x4\n" - "ldr s29, [x25], #0x4\n" - "ldr s28, [x24], #0x4\n" - "ldr s25, [x23], #0x4\n" - "ldr s23, [x22], #0x4\n" - "ldr s27, [x21], #0x4\n" - "ldr s26, [x20], #0x4\n" + "ldr s25, [x28], #0x4\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s27, [x23], #0x4\n" + "ldr s20, [x22], #0x4\n" + "ldr s26, [x21], #0x4\n" "tbz %x[width], #1, 4f\n" - "ld1 { v31.h }[2], [x27], #0x2\n" - "mov x19, #0x6\n" - "ld1 { v30.h }[2], [x26], #0x2\n" - "ld1 { v29.h }[2], [x25], #0x2\n" - "ld1 { v28.h }[2], [x24], #0x2\n" - "ld1 { v25.h }[2], [x23], #0x2\n" - "ld1 { v23.h }[2], [x22], #0x2\n" - "ld1 { v27.h }[2], [x21], #0x2\n" - "ld1 { v26.h }[2], [x20], #0x2\n" + "ld1 { v25.h }[2], [x28], #0x2\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "mov x20, #0x6\n" + "ld1 { v29.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v21.h }[2], [x24], #0x2\n" + "ld1 { v27.h }[2], [x23], #0x2\n" + "ld1 { v20.h }[2], [x22], #0x2\n" + "ld1 { v26.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v31.b }[6], [x27]\n" - "mov x19, #0x7\n" - "ld1 { v30.b }[6], [x26]\n" - "ld1 { v29.b }[6], [x25]\n" - "ld1 { v28.b }[6], [x24]\n" - "ld1 { v25.b }[6], [x23]\n" - "ld1 { v23.b }[6], [x22]\n" - "ld1 { v27.b }[6], [x21]\n" - "ld1 { v26.b }[6], [x20]\n" + "ld1 { v25.b }[6], [x28]\n" + "ld1 { v30.b }[6], [x27]\n" + "mov x20, #0x7\n" + "ld1 { v29.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v21.b }[6], [x24]\n" + "ld1 { v27.b }[6], [x23]\n" + "ld1 { v20.b }[6], [x22]\n" + "ld1 { v26.b }[6], [x21]\n" "b 7f\n" "4:" // odd_loads_1_4 - "mov x19, #0x4\n" + "mov x20, #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v31.b }[4], [x27]\n" - "ld1 { v30.b }[4], [x26]\n" - "mov x19, #0x5\n" - "ld1 { v29.b }[4], [x25]\n" - "ld1 { v28.b }[4], [x24]\n" - "ld1 { v25.b }[4], [x23]\n" - "ld1 { v23.b }[4], [x22]\n" - "ld1 { v27.b }[4], [x21]\n" - "ld1 { v26.b }[4], [x20]\n" + "ld1 { v25.b }[4], [x28]\n" + "ld1 { v30.b }[4], [x27]\n" + "mov x20, #0x5\n" + "ld1 { v29.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v21.b }[4], [x24]\n" + "ld1 { v27.b }[4], [x23]\n" + "ld1 { v20.b }[4], [x22]\n" + "ld1 { v26.b }[4], [x21]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" - "ldr h31, [x27], #0x2\n" - "ldr h30, [x26], #0x2\n" - "mov x19, #0x2\n" - "ldr h29, [x25], #0x2\n" - "ldr h28, [x24], #0x2\n" - "ldr h25, [x23], #0x2\n" - "ldr h23, [x22], #0x2\n" - "ldr h27, [x21], #0x2\n" - "ldr h26, [x20], #0x2\n" + "ldr h25, [x28], #0x2\n" + "ldr h30, [x27], #0x2\n" + "mov x20, #0x2\n" + "ldr h29, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h21, [x24], #0x2\n" + "ldr h27, [x23], #0x2\n" + "ldr h20, [x22], #0x2\n" + "ldr h26, [x21], #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v31.b }[2], [x27]\n" - "mov x19, #0x3\n" - "ld1 { v30.b }[2], [x26]\n" - "ld1 { v29.b }[2], [x25]\n" - "ld1 { v28.b }[2], [x24]\n" - "ld1 { v25.b }[2], [x23]\n" - "ld1 { v23.b }[2], [x22]\n" - "ld1 { v27.b }[2], [x21]\n" - "ld1 { v26.b }[2], [x20]\n" + "ld1 { v25.b }[2], [x28]\n" + "ld1 { v30.b }[2], [x27]\n" + "mov x20, #0x3\n" + "ld1 { v29.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v21.b }[2], [x24]\n" + "ld1 { v27.b }[2], [x23]\n" + "ld1 { v20.b }[2], [x22]\n" + "ld1 { v26.b }[2], [x21]\n" "b 7f\n" "6:" // odd_loads_1_0 - "ldr b31, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr b30, [x26, #0x0]\n" - "ldr b29, [x25, #0x0]\n" - "ldr b28, [x24, #0x0]\n" - "ldr b25, [x23, #0x0]\n" - "ldr b23, [x22, #0x0]\n" - "ldr b27, [x21, #0x0]\n" - "ldr b26, [x20, #0x0]\n" + "ldr b25, [x28, #0x0]\n" + "ldr b30, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr b29, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b21, [x24, #0x0]\n" + "ldr b27, [x23, #0x0]\n" + "ldr b20, [x22, #0x0]\n" + "ldr b26, [x21, #0x0]\n" "7:" // Odd load end - "ushll v31.8h, v31.8b, #0x0\n" - "subs x19, x19, #0x1\n" + "ushll v25.8h, v25.8b, #0x0\n" "ushll v30.8h, v30.8b, #0x0\n" + "subs x20, x20, #0x1\n" "ushll v29.8h, v29.8b, #0x0\n" "ushll v28.8h, v28.8b, #0x0\n" - "ushll v25.8h, v25.8b, #0x0\n" - "zip1 v20.8h, v31.8h, v25.8h\n" - "ushll v23.8h, v23.8b, #0x0\n" - "zip1 v24.8h, v30.8h, v23.8h\n" + "ushll v21.8h, v21.8b, #0x0\n" "ushll v27.8h, v27.8b, #0x0\n" - "zip1 v19.8h, v29.8h, v27.8h\n" - "zip1 v22.8h, v20.8h, v19.8h\n" + "ushll v20.8h, v20.8b, #0x0\n" "ushll v26.8h, v26.8b, #0x0\n" + "zip1 v23.8h, v25.8h, v21.8h\n" + "zip1 v22.8h, v29.8h, v20.8h\n" + "zip1 v19.8h, v30.8h, v27.8h\n" "zip1 v18.8h, v28.8h, v26.8h\n" - "zip1 v17.8h, v24.8h, v18.8h\n" - "zip1 v16.8h, v22.8h, v17.8h\n" + "zip1 v24.8h, v23.8h, v22.8h\n" + "zip1 v17.8h, v19.8h, v18.8h\n" + "zip1 v16.8h, v24.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v22.8h, v17.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v16.8h, v24.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v21.8h, v20.8h, v19.8h\n" - "zip2 v17.8h, v24.8h, v18.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v16.8h, v21.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v23.8h, v23.8h, v22.8h\n" + "zip2 v19.8h, v19.8h, v18.8h\n" + "subs x20, x20, #0x1\n" + "zip1 v17.8h, v23.8h, v19.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v21.8h, v17.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v16.8h, v23.8h, v19.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v25.8h, v31.8h, v25.8h\n" - "zip2 v19.8h, v29.8h, v27.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v20.8h, v25.8h, v19.8h\n" - "zip2 v23.8h, v30.8h, v23.8h\n" - "zip2 v18.8h, v28.8h, v26.8h\n" - "zip1 v17.8h, v23.8h, v18.8h\n" - "zip1 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v25.8h, v25.8h, v21.8h\n" + "zip2 v21.8h, v29.8h, v20.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v20.8h, v30.8h, v27.8h\n" + "zip2 v16.8h, v28.8h, v26.8h\n" + "zip1 v22.8h, v25.8h, v21.8h\n" + "zip1 v18.8h, v20.8h, v16.8h\n" + "zip1 v19.8h, v22.8h, v18.8h\n" + "str q19, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v16.8h, v20.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" + "subs x20, x20, #0x1\n" + "zip2 v18.8h, v22.8h, v18.8h\n" + "str q18, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v19.8h, v25.8h, v19.8h\n" - "zip2 v17.8h, v23.8h, v18.8h\n" - "zip1 v16.8h, v19.8h, v17.8h\n" - "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v21.8h, v25.8h, v21.8h\n" + "zip2 v20.8h, v20.8h, v16.8h\n" + "zip1 v17.8h, v21.8h, v20.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "8:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp index 7c7d774a6b..3ab24365af 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,39 +31,40 @@ void interleave_block<8, 1, VLType::None, true>( ) { __asm__ __volatile__( - "movi v1.8h, #0x0\n" - "ldr x27, [%x[in], #0x0]\n" - "mov x19, #0x0\n" - "movi v0.4s, #0x0\n" - "ldr x26, [%x[in], #0x8]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "movi v31.4s, #0x0\n" - "ldr x25, [%x[in], #0x10]\n" + "mov x20, #0x0\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" + "movi v2.8h, #0x0\n" + "movi v1.4s, #0x0\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" + "movi v0.4s, #0x0\n" + "add x28, x28, %x[row_offset]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x27, x27, %x[row_offset]\n" - "ldr x24, [%x[in], #0x18]\n" - "ldr x23, [%x[in], #0x20]\n" "add x26, x26, %x[row_offset]\n" - "ldr x22, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset]\n" - "ldr x20, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset]\n" "add x22, x22, %x[row_offset]\n" "add x21, x21, %x[row_offset]\n" - "add x20, x20, %x[row_offset]\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj + "prfm pldl1keep, [x28, #0x0]\n" "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" @@ -71,7 +72,7 @@ void interleave_block<8, 1, VLType::None, true>( "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -79,242 +80,241 @@ void interleave_block<8, 1, VLType::None, true>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "cbnz %w[first], 2f\n" "sub %x[out_ptr], %x[out_ptr], #0x20\n" - "ld1 { v0.4s }, [%x[out_ptr]]\n" - "ldr q31, [%x[out_ptr], #0x10]\n" + "ld1 { v1.4s }, [%x[out_ptr]]\n" + "ldr q0, [%x[out_ptr], #0x10]\n" "2:" // first_pass "cmp %x[width], #0x8\n" "blt 5f\n" "3:" // Main loop head - "cmp x19, #0xe\n" + "cmp x20, #0xe\n" "ble 4f\n" - "uaddw v0.4s, v0.4s, v1.4h\n" - "uaddw2 v31.4s, v31.4s, v1.8h\n" - "mov x19, #0x0\n" - "movi v1.8h, #0x0\n" + "uaddw v1.4s, v1.4s, v2.4h\n" + "uaddw2 v0.4s, v0.4s, v2.8h\n" + "mov x20, #0x0\n" + "movi v2.8h, #0x0\n" "4:" // no_accumulate_16 + "ldr d31, [x28], #0x8\n" "ldr d30, [x27], #0x8\n" + "ushll v31.8h, v31.8b, #0x0\n" "ushll v30.8h, v30.8b, #0x0\n" "ldr d29, [x26], #0x8\n" - "add x19, x19, #0x1\n" - "ushll v29.8h, v29.8b, #0x0\n" "ldr d28, [x25], #0x8\n" - "subs %x[width], %x[width], #0x8\n" + "ushll v29.8h, v29.8b, #0x0\n" "ushll v28.8h, v28.8b, #0x0\n" "ldr d27, [x24], #0x8\n" - "cmp %x[width], #0x8\n" + "ldr d26, [x23], #0x8\n" "ushll v27.8h, v27.8b, #0x0\n" - "ldr d24, [x23], #0x8\n" - "ldr d23, [x22], #0x8\n" + "ushll v26.8h, v26.8b, #0x0\n" + "ldr d25, [x22], #0x8\n" + "ldr d24, [x21], #0x8\n" + "ushll v25.8h, v25.8b, #0x0\n" "ushll v24.8h, v24.8b, #0x0\n" - "ldr d21, [x21], #0x8\n" - "ushll v23.8h, v23.8b, #0x0\n" - "ldr d26, [x20], #0x8\n" - "zip1 v20.8h, v30.8h, v24.8h\n" + "zip1 v23.8h, v31.8h, v27.8h\n" + "zip1 v22.8h, v29.8h, v25.8h\n" + "subs %x[width], %x[width], #0x8\n" + "cmp %x[width], #0x8\n" + "zip1 v21.8h, v30.8h, v26.8h\n" + "zip1 v20.8h, v28.8h, v24.8h\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip1 v25.8h, v29.8h, v23.8h\n" + "zip1 v18.8h, v23.8h, v22.8h\n" + "zip1 v17.8h, v21.8h, v20.8h\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip2 v24.8h, v30.8h, v24.8h\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip2 v23.8h, v29.8h, v23.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "prfm pldl1keep, [x24, #0x70]\n" - "ushll v21.8h, v21.8b, #0x0\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v19.8h, v28.8h, v21.8h\n" + "zip2 v19.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v23.8h, v22.8h\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip1 v22.8h, v20.8h, v19.8h\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip2 v19.8h, v20.8h, v19.8h\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip2 v20.8h, v28.8h, v21.8h\n" - "zip1 v21.8h, v24.8h, v20.8h\n" - "zip2 v20.8h, v24.8h, v20.8h\n" - "ushll v26.8h, v26.8b, #0x0\n" - "zip1 v18.8h, v27.8h, v26.8h\n" - "zip1 v17.8h, v25.8h, v18.8h\n" - "zip1 v16.8h, v22.8h, v17.8h\n" + "zip2 v17.8h, v21.8h, v20.8h\n" + "add v2.8h, v2.8h, v19.8h\n" "str q16, [%x[out_ptr], #0x0]\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v17.8h, v22.8h, v17.8h\n" - "str q17, [%x[out_ptr], #0x10]\n" - "zip2 v16.8h, v25.8h, v18.8h\n" - "add v1.8h, v1.8h, v17.8h\n" - "zip1 v17.8h, v19.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x20]\n" - "zip2 v16.8h, v19.8h, v16.8h\n" + "add x20, x20, #0x1\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v22.8h, v31.8h, v27.8h\n" + "str q19, [%x[out_ptr], #0x10]\n" + "zip2 v21.8h, v29.8h, v25.8h\n" + "zip2 v20.8h, v30.8h, v26.8h\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v19.8h, v28.8h, v24.8h\n" + "add v2.8h, v2.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" "str q16, [%x[out_ptr], #0x30]\n" - "add v1.8h, v1.8h, v17.8h\n" - "zip2 v19.8h, v27.8h, v26.8h\n" - "zip1 v17.8h, v23.8h, v19.8h\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip1 v16.8h, v21.8h, v17.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "add v2.8h, v2.8h, v16.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x40]\n" - "zip2 v18.8h, v21.8h, v17.8h\n" - "str q18, [%x[out_ptr], #0x50]\n" - "add v1.8h, v1.8h, v16.8h\n" - "zip2 v16.8h, v23.8h, v19.8h\n" - "zip1 v17.8h, v20.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x60]\n" - "add v1.8h, v1.8h, v18.8h\n" - "zip2 v16.8h, v20.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "add v2.8h, v2.8h, v16.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "add v2.8h, v2.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x60]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x70]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" - "add v1.8h, v1.8h, v17.8h\n" - "add v1.8h, v1.8h, v16.8h\n" "bge 3b\n" "5:" // Main loop skip "cbz %x[width], 10f\n" "tbz %x[width], #2, 7f\n" + "ldr s31, [x28], #0x4\n" "ldr s30, [x27], #0x4\n" "ldr s29, [x26], #0x4\n" "ldr s28, [x25], #0x4\n" "ldr s27, [x24], #0x4\n" - "ldr s24, [x23], #0x4\n" - "ldr s23, [x22], #0x4\n" - "ldr s21, [x21], #0x4\n" - "ldr s26, [x20], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s24, [x21], #0x4\n" "tbz %x[width], #1, 6f\n" + "ld1 { v31.h }[2], [x28], #0x2\n" "ld1 { v30.h }[2], [x27], #0x2\n" - "mov x19, #0x6\n" + "mov x20, #0x6\n" "ld1 { v29.h }[2], [x26], #0x2\n" "ld1 { v28.h }[2], [x25], #0x2\n" "ld1 { v27.h }[2], [x24], #0x2\n" - "ld1 { v24.h }[2], [x23], #0x2\n" - "ld1 { v23.h }[2], [x22], #0x2\n" - "ld1 { v21.h }[2], [x21], #0x2\n" - "ld1 { v26.h }[2], [x20], #0x2\n" + "ld1 { v26.h }[2], [x23], #0x2\n" + "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v24.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 9f\n" + "ld1 { v31.b }[6], [x28]\n" "ld1 { v30.b }[6], [x27]\n" - "mov x19, #0x7\n" + "mov x20, #0x7\n" "ld1 { v29.b }[6], [x26]\n" "ld1 { v28.b }[6], [x25]\n" "ld1 { v27.b }[6], [x24]\n" - "ld1 { v24.b }[6], [x23]\n" - "ld1 { v23.b }[6], [x22]\n" - "ld1 { v21.b }[6], [x21]\n" - "ld1 { v26.b }[6], [x20]\n" + "ld1 { v26.b }[6], [x23]\n" + "ld1 { v25.b }[6], [x22]\n" + "ld1 { v24.b }[6], [x21]\n" "b 9f\n" "6:" // odd_loads_1_4 - "mov x19, #0x4\n" + "mov x20, #0x4\n" "tbz %x[width], #0, 9f\n" + "ld1 { v31.b }[4], [x28]\n" "ld1 { v30.b }[4], [x27]\n" + "mov x20, #0x5\n" "ld1 { v29.b }[4], [x26]\n" - "mov x19, #0x5\n" "ld1 { v28.b }[4], [x25]\n" "ld1 { v27.b }[4], [x24]\n" - "ld1 { v24.b }[4], [x23]\n" - "ld1 { v23.b }[4], [x22]\n" - "ld1 { v21.b }[4], [x21]\n" - "ld1 { v26.b }[4], [x20]\n" + "ld1 { v26.b }[4], [x23]\n" + "ld1 { v25.b }[4], [x22]\n" + "ld1 { v24.b }[4], [x21]\n" "b 9f\n" "7:" // odd_loads_2_0 "tbz %x[width], #1, 8f\n" + "ldr h31, [x28], #0x2\n" "ldr h30, [x27], #0x2\n" + "mov x20, #0x2\n" "ldr h29, [x26], #0x2\n" - "mov x19, #0x2\n" "ldr h28, [x25], #0x2\n" "ldr h27, [x24], #0x2\n" - "ldr h24, [x23], #0x2\n" - "ldr h23, [x22], #0x2\n" - "ldr h21, [x21], #0x2\n" - "ldr h26, [x20], #0x2\n" + "ldr h26, [x23], #0x2\n" + "ldr h25, [x22], #0x2\n" + "ldr h24, [x21], #0x2\n" "tbz %x[width], #0, 9f\n" + "ld1 { v31.b }[2], [x28]\n" "ld1 { v30.b }[2], [x27]\n" - "mov x19, #0x3\n" + "mov x20, #0x3\n" "ld1 { v29.b }[2], [x26]\n" "ld1 { v28.b }[2], [x25]\n" "ld1 { v27.b }[2], [x24]\n" - "ld1 { v24.b }[2], [x23]\n" - "ld1 { v23.b }[2], [x22]\n" - "ld1 { v21.b }[2], [x21]\n" - "ld1 { v26.b }[2], [x20]\n" + "ld1 { v26.b }[2], [x23]\n" + "ld1 { v25.b }[2], [x22]\n" + "ld1 { v24.b }[2], [x21]\n" "b 9f\n" "8:" // odd_loads_1_0 + "ldr b31, [x28, #0x0]\n" "ldr b30, [x27, #0x0]\n" - "mov x19, #0x1\n" + "mov x20, #0x1\n" "ldr b29, [x26, #0x0]\n" "ldr b28, [x25, #0x0]\n" "ldr b27, [x24, #0x0]\n" - "ldr b24, [x23, #0x0]\n" - "ldr b23, [x22, #0x0]\n" - "ldr b21, [x21, #0x0]\n" - "ldr b26, [x20, #0x0]\n" + "ldr b26, [x23, #0x0]\n" + "ldr b25, [x22, #0x0]\n" + "ldr b24, [x21, #0x0]\n" "9:" // Odd load end + "ushll v31.8h, v31.8b, #0x0\n" "ushll v30.8h, v30.8b, #0x0\n" - "subs x19, x19, #0x1\n" + "subs x20, x20, #0x1\n" "ushll v29.8h, v29.8b, #0x0\n" "ushll v28.8h, v28.8b, #0x0\n" "ushll v27.8h, v27.8b, #0x0\n" - "ushll v24.8h, v24.8b, #0x0\n" - "zip1 v20.8h, v30.8h, v24.8h\n" - "ushll v23.8h, v23.8b, #0x0\n" - "zip1 v25.8h, v29.8h, v23.8h\n" - "ushll v21.8h, v21.8b, #0x0\n" - "zip1 v19.8h, v28.8h, v21.8h\n" - "zip1 v22.8h, v20.8h, v19.8h\n" "ushll v26.8h, v26.8b, #0x0\n" - "zip1 v18.8h, v27.8h, v26.8h\n" - "zip1 v17.8h, v25.8h, v18.8h\n" - "zip1 v16.8h, v22.8h, v17.8h\n" + "ushll v25.8h, v25.8b, #0x0\n" + "ushll v24.8h, v24.8b, #0x0\n" + "zip1 v23.8h, v31.8h, v27.8h\n" + "zip1 v22.8h, v29.8h, v25.8h\n" + "zip1 v21.8h, v30.8h, v26.8h\n" + "zip1 v20.8h, v28.8h, v24.8h\n" + "zip1 v18.8h, v23.8h, v22.8h\n" + "zip1 v17.8h, v21.8h, v20.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" - "zip2 v17.8h, v22.8h, v17.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v17.8h\n" + "zip2 v19.8h, v18.8h, v17.8h\n" + "subs x20, x20, #0x1\n" + "str q19, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v19.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v19.8h, v20.8h, v19.8h\n" - "zip2 v16.8h, v25.8h, v18.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v17.8h, v19.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v18.8h, v23.8h, v22.8h\n" + "zip2 v17.8h, v21.8h, v20.8h\n" + "subs x20, x20, #0x1\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v17.8h\n" "beq 10f\n" - "zip2 v16.8h, v19.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x20, x20, #0x1\n" "str q16, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v16.8h\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v24.8h, v30.8h, v24.8h\n" - "zip2 v20.8h, v28.8h, v21.8h\n" - "subs x19, x19, #0x1\n" - "zip1 v21.8h, v24.8h, v20.8h\n" - "zip2 v23.8h, v29.8h, v23.8h\n" - "zip2 v19.8h, v27.8h, v26.8h\n" - "zip1 v17.8h, v23.8h, v19.8h\n" - "zip1 v16.8h, v21.8h, v17.8h\n" + "zip2 v22.8h, v31.8h, v27.8h\n" + "zip2 v21.8h, v29.8h, v25.8h\n" + "subs x20, x20, #0x1\n" + "zip2 v20.8h, v30.8h, v26.8h\n" + "zip2 v19.8h, v28.8h, v24.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v16.8h\n" "beq 10f\n" - "zip2 v18.8h, v21.8h, v17.8h\n" - "str q18, [%x[out_ptr], #0x0]\n" - "subs x19, x19, #0x1\n" - "add v1.8h, v1.8h, v18.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x20, x20, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v20.8h, v24.8h, v20.8h\n" - "zip2 v16.8h, v23.8h, v19.8h\n" - "zip1 v17.8h, v20.8h, v16.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" - "add v1.8h, v1.8h, v17.8h\n" "10:" // Odds skip - "uaddw v0.4s, v0.4s, v1.4h\n" - "str q0, [%x[out_ptr], #0x0]\n" - "uaddw2 v31.4s, v31.4s, v1.8h\n" - "str q31, [%x[out_ptr], #0x10]\n" + "uaddw v1.4s, v1.4s, v2.4h\n" + "uaddw2 v0.4s, v0.4s, v2.8h\n" + "str q1, [%x[out_ptr], #0x0]\n" + "str q0, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp index 1e5d395667..d4d150456f 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,45 @@ void interleave_block<8, 2, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "add x28, x28, %x[row_offset], LSL #1\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "add x27, x27, %x[row_offset], LSL #1\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" "add x26, x26, %x[row_offset], LSL #1\n" - "ldr x23, [%x[in], #0x20]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset], LSL #1\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset], LSL #1\n" - "ldr x20, [%x[in], #0x38]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset], LSL #1\n" "add x22, x22, %x[row_offset], LSL #1\n" "add x21, x21, %x[row_offset], LSL #1\n" - "add x20, x20, %x[row_offset], LSL #1\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -76,49 +77,48 @@ void interleave_block<8, 2, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q28, [x27], #0x10\n" + "ldr q28, [x28], #0x10\n" + "ldr q27, [x27], #0x10\n" "subs %x[width], %x[width], #0x8\n" - "ldr q29, [x26], #0x10\n" "cmp %x[width], #0x8\n" - "ldr q25, [x25], #0x10\n" - "zip1 v22.4s, v28.4s, v25.4s\n" - "ldr q21, [x24], #0x10\n" - "zip2 v28.4s, v28.4s, v25.4s\n" - "ldr q27, [x23], #0x10\n" - "ldr q26, [x22], #0x10\n" - "zip1 v20.4s, v29.4s, v21.4s\n" - "ldr q19, [x21], #0x10\n" - "zip2 v25.4s, v29.4s, v21.4s\n" - "ldr q24, [x20], #0x10\n" - "zip1 v23.4s, v22.4s, v20.4s\n" + "ldr q22, [x26], #0x10\n" + "ldr q21, [x25], #0x10\n" + "zip1 v26.4s, v28.4s, v22.4s\n" + "zip1 v25.4s, v27.4s, v21.4s\n" + "ldr q24, [x24], #0x10\n" + "ldr q23, [x23], #0x10\n" + "zip2 v22.4s, v28.4s, v22.4s\n" + "zip2 v21.4s, v27.4s, v21.4s\n" + "ldr q19, [x22], #0x10\n" + "ldr q18, [x21], #0x10\n" + "zip1 v20.4s, v24.4s, v19.4s\n" + "zip1 v17.4s, v23.4s, v18.4s\n" + "zip2 v19.4s, v24.4s, v19.4s\n" + "zip2 v18.4s, v23.4s, v18.4s\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v22.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip1 v21.4s, v28.4s, v25.4s\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v18.4s, v27.4s, v19.4s\n" + "zip1 v16.4s, v26.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" "prfm pldl1keep, [x24, #0x70]\n" - "zip1 v16.4s, v26.4s, v24.4s\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v17.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v20.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip2 v20.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip2 v19.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip2 v16.4s, v26.4s, v24.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v19.4s, v16.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" - "zip2 v17.4s, v28.4s, v25.4s\n" - "str q22, [%x[out_ptr], #0x20]\n" - "zip2 v16.4s, v19.4s, v16.4s\n" - "str q20, [%x[out_ptr], #0x30]\n" - "str q21, [%x[out_ptr], #0x40]\n" - "str q18, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v26.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v16.4s, v20.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip1 v16.4s, v22.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x40]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "zip2 v17.4s, v22.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v19.4s, v18.4s\n" "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" @@ -126,120 +126,120 @@ void interleave_block<8, 2, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" - "ldr d28, [x27], #0x8\n" - "ldr d29, [x26], #0x8\n" - "ldr d25, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d26, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" - "ldr d24, [x20], #0x8\n" + "ldr d28, [x28], #0x8\n" + "ldr d27, [x27], #0x8\n" + "ldr d22, [x26], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d24, [x24], #0x8\n" + "ldr d23, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d18, [x21], #0x8\n" "tbz %x[width], #1, 4f\n" - "ld1 { v28.s }[2], [x27], #0x4\n" - "mov x19, #0x3\n" - "ld1 { v29.s }[2], [x26], #0x4\n" - "ld1 { v25.s }[2], [x25], #0x4\n" - "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v27.s }[2], [x23], #0x4\n" - "ld1 { v26.s }[2], [x22], #0x4\n" - "ld1 { v19.s }[2], [x21], #0x4\n" - "ld1 { v24.s }[2], [x20], #0x4\n" + "ld1 { v28.s }[2], [x28], #0x4\n" + "ld1 { v27.s }[2], [x27], #0x4\n" + "mov x20, #0x3\n" + "ld1 { v22.s }[2], [x26], #0x4\n" + "ld1 { v21.s }[2], [x25], #0x4\n" + "ld1 { v24.s }[2], [x24], #0x4\n" + "ld1 { v23.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "ld1 { v18.s }[2], [x21], #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v28.h }[6], [x27]\n" - "mov x19, #0x4\n" - "ld1 { v29.h }[6], [x26]\n" - "ld1 { v25.h }[6], [x25]\n" - "ld1 { v21.h }[6], [x24]\n" - "ld1 { v27.h }[6], [x23]\n" - "ld1 { v26.h }[6], [x22]\n" - "ld1 { v19.h }[6], [x21]\n" - "ld1 { v24.h }[6], [x20]\n" + "ld1 { v28.h }[6], [x28]\n" + "ld1 { v27.h }[6], [x27]\n" + "mov x20, #0x4\n" + "ld1 { v22.h }[6], [x26]\n" + "ld1 { v21.h }[6], [x25]\n" + "ld1 { v24.h }[6], [x24]\n" + "ld1 { v23.h }[6], [x23]\n" + "ld1 { v19.h }[6], [x22]\n" + "ld1 { v18.h }[6], [x21]\n" "b 7f\n" "4:" // odd_loads_1_4 - "mov x19, #0x2\n" + "mov x20, #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v28.h }[4], [x27]\n" - "ld1 { v29.h }[4], [x26]\n" - "mov x19, #0x3\n" - "ld1 { v25.h }[4], [x25]\n" - "ld1 { v21.h }[4], [x24]\n" - "ld1 { v27.h }[4], [x23]\n" - "ld1 { v26.h }[4], [x22]\n" - "ld1 { v19.h }[4], [x21]\n" - "ld1 { v24.h }[4], [x20]\n" + "ld1 { v28.h }[4], [x28]\n" + "ld1 { v27.h }[4], [x27]\n" + "mov x20, #0x3\n" + "ld1 { v22.h }[4], [x26]\n" + "ld1 { v21.h }[4], [x25]\n" + "ld1 { v24.h }[4], [x24]\n" + "ld1 { v23.h }[4], [x23]\n" + "ld1 { v19.h }[4], [x22]\n" + "ld1 { v18.h }[4], [x21]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" - "ldr s28, [x27], #0x4\n" - "ldr s29, [x26], #0x4\n" - "mov x19, #0x1\n" - "ldr s25, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s27, [x23], #0x4\n" - "ldr s26, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" - "ldr s24, [x20], #0x4\n" + "ldr s28, [x28], #0x4\n" + "ldr s27, [x27], #0x4\n" + "mov x20, #0x1\n" + "ldr s22, [x26], #0x4\n" + "ldr s21, [x25], #0x4\n" + "ldr s24, [x24], #0x4\n" + "ldr s23, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "ldr s18, [x21], #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v28.h }[2], [x27]\n" - "mov x19, #0x2\n" - "ld1 { v29.h }[2], [x26]\n" - "ld1 { v25.h }[2], [x25]\n" - "ld1 { v21.h }[2], [x24]\n" - "ld1 { v27.h }[2], [x23]\n" - "ld1 { v26.h }[2], [x22]\n" - "ld1 { v19.h }[2], [x21]\n" - "ld1 { v24.h }[2], [x20]\n" + "ld1 { v28.h }[2], [x28]\n" + "ld1 { v27.h }[2], [x27]\n" + "mov x20, #0x2\n" + "ld1 { v22.h }[2], [x26]\n" + "ld1 { v21.h }[2], [x25]\n" + "ld1 { v24.h }[2], [x24]\n" + "ld1 { v23.h }[2], [x23]\n" + "ld1 { v19.h }[2], [x22]\n" + "ld1 { v18.h }[2], [x21]\n" "b 7f\n" "6:" // odd_loads_1_0 - "ldr h28, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr h29, [x26, #0x0]\n" - "ldr h25, [x25, #0x0]\n" - "ldr h21, [x24, #0x0]\n" - "ldr h27, [x23, #0x0]\n" - "ldr h26, [x22, #0x0]\n" - "ldr h19, [x21, #0x0]\n" - "ldr h24, [x20, #0x0]\n" + "ldr h28, [x28, #0x0]\n" + "ldr h27, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr h22, [x26, #0x0]\n" + "ldr h21, [x25, #0x0]\n" + "ldr h24, [x24, #0x0]\n" + "ldr h23, [x23, #0x0]\n" + "ldr h19, [x22, #0x0]\n" + "ldr h18, [x21, #0x0]\n" "7:" // Odd load end - "zip1 v22.4s, v28.4s, v25.4s\n" - "subs x19, x19, #0x1\n" - "zip1 v20.4s, v29.4s, v21.4s\n" - "zip1 v23.4s, v22.4s, v20.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v27.4s, v19.4s\n" - "zip1 v16.4s, v26.4s, v24.4s\n" - "zip1 v17.4s, v18.4s, v16.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" + "zip1 v26.4s, v28.4s, v22.4s\n" + "zip1 v25.4s, v27.4s, v21.4s\n" + "subs x20, x20, #0x1\n" + "zip1 v20.4s, v24.4s, v19.4s\n" + "zip1 v17.4s, v23.4s, v18.4s\n" + "zip1 v16.4s, v26.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.4s, v20.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 8f\n" - "zip2 v22.4s, v22.4s, v20.4s\n" - "str q22, [%x[out_ptr], #0x0]\n" - "zip2 v20.4s, v18.4s, v16.4s\n" - "subs x19, x19, #0x1\n" - "str q20, [%x[out_ptr], #0x10]\n" + "subs x20, x20, #0x1\n" + "zip2 v16.4s, v26.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.4s, v20.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 8f\n" - "zip2 v28.4s, v28.4s, v25.4s\n" - "zip2 v25.4s, v29.4s, v21.4s\n" - "subs x19, x19, #0x1\n" - "zip1 v21.4s, v28.4s, v25.4s\n" - "str q21, [%x[out_ptr], #0x0]\n" - "zip2 v19.4s, v27.4s, v19.4s\n" - "zip2 v16.4s, v26.4s, v24.4s\n" - "zip1 v18.4s, v19.4s, v16.4s\n" - "str q18, [%x[out_ptr], #0x10]\n" + "zip2 v22.4s, v28.4s, v22.4s\n" + "zip2 v21.4s, v27.4s, v21.4s\n" + "subs x20, x20, #0x1\n" + "zip2 v19.4s, v24.4s, v19.4s\n" + "zip2 v18.4s, v23.4s, v18.4s\n" + "zip1 v16.4s, v22.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 8f\n" - "zip2 v17.4s, v28.4s, v25.4s\n" + "zip2 v17.4s, v22.4s, v21.4s\n" "str q17, [%x[out_ptr], #0x0]\n" - "zip2 v16.4s, v19.4s, v16.4s\n" + "zip2 v16.4s, v19.4s, v18.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "8:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp index 064207c0fa..358b83ad1b 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,45 @@ void interleave_block<8, 2, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "add x28, x28, %x[row_offset], LSL #2\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "add x27, x27, %x[row_offset], LSL #2\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" "add x26, x26, %x[row_offset], LSL #2\n" - "ldr x23, [%x[in], #0x20]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset], LSL #2\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset], LSL #2\n" - "ldr x20, [%x[in], #0x38]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset], LSL #2\n" "add x22, x22, %x[row_offset], LSL #2\n" "add x21, x21, %x[row_offset], LSL #2\n" - "add x20, x20, %x[row_offset], LSL #2\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -76,104 +77,103 @@ void interleave_block<8, 2, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q27, [x27], #0x10\n" + "ldr q26, [x28], #0x10\n" + "ldr q21, [x27], #0x10\n" "subs %x[width], %x[width], #0x4\n" - "ldr q24, [x26], #0x10\n" - "zip1 v26.2d, v27.2d, v24.2d\n" - "ldr q25, [x25], #0x10\n" "cmp %x[width], #0x4\n" - "zip2 v24.2d, v27.2d, v24.2d\n" - "ldr q21, [x24], #0x10\n" - "ldr q23, [x23], #0x10\n" - "zip1 v22.2d, v25.2d, v21.2d\n" - "ldr q18, [x22], #0x10\n" - "zip2 v21.2d, v25.2d, v21.2d\n" - "ldr q20, [x21], #0x10\n" - "ldr q16, [x20], #0x10\n" - "zip1 v19.2d, v23.2d, v18.2d\n" + "ldr q25, [x26], #0x10\n" + "ldr q24, [x25], #0x10\n" + "zip1 v16.2d, v26.2d, v21.2d\n" + "zip1 v18.2d, v25.2d, v24.2d\n" + "ldr q23, [x24], #0x10\n" + "ldr q22, [x23], #0x10\n" + "zip1 v17.2d, v23.2d, v22.2d\n" + "zip2 v21.2d, v26.2d, v21.2d\n" + "ldr q20, [x22], #0x10\n" + "ldr q19, [x21], #0x10\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.2d, v20.2d, v19.2d\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v18.2d, v23.2d, v18.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "zip2 v18.2d, v25.2d, v24.2d\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip1 v17.2d, v20.2d, v16.2d\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip2 v16.2d, v20.2d, v16.2d\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip2 v17.2d, v23.2d, v22.2d\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v16.2d, v20.2d, v19.2d\n" "prfm pldl1keep, [x22, #0x70]\n" "prfm pldl1keep, [x21, #0x70]\n" - "prfm pldl1keep, [x20, #0x70]\n" - "str q26, [%x[out_ptr], #0x0]\n" - "str q22, [%x[out_ptr], #0x10]\n" - "str q19, [%x[out_ptr], #0x20]\n" - "str q17, [%x[out_ptr], #0x30]\n" - "str q24, [%x[out_ptr], #0x40]\n" - "str q21, [%x[out_ptr], #0x50]\n" - "str q18, [%x[out_ptr], #0x60]\n" + "str q21, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" + "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 6f\n" "tbz %x[width], #1, 4f\n" - "ldr d27, [x27], #0x8\n" - "ldr d24, [x26], #0x8\n" - "mov x19, #0x1\n" - "ldr d25, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d23, [x23], #0x8\n" - "ldr d18, [x22], #0x8\n" - "ldr d20, [x21], #0x8\n" - "ldr d16, [x20], #0x8\n" + "ldr d26, [x28], #0x8\n" + "ldr d21, [x27], #0x8\n" + "mov x20, #0x1\n" + "ldr d25, [x26], #0x8\n" + "ldr d24, [x25], #0x8\n" + "ldr d23, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" "tbz %x[width], #0, 5f\n" - "ld1 { v27.s }[2], [x27]\n" - "mov x19, #0x2\n" - "ld1 { v24.s }[2], [x26]\n" - "ld1 { v25.s }[2], [x25]\n" - "ld1 { v21.s }[2], [x24]\n" - "ld1 { v23.s }[2], [x23]\n" - "ld1 { v18.s }[2], [x22]\n" - "ld1 { v20.s }[2], [x21]\n" - "ld1 { v16.s }[2], [x20]\n" + "ld1 { v26.s }[2], [x28]\n" + "ld1 { v21.s }[2], [x27]\n" + "mov x20, #0x2\n" + "ld1 { v25.s }[2], [x26]\n" + "ld1 { v24.s }[2], [x25]\n" + "ld1 { v23.s }[2], [x24]\n" + "ld1 { v22.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" + "ld1 { v19.s }[2], [x21]\n" "b 5f\n" "4:" // odd_loads_1_0 - "ldr s27, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr s24, [x26, #0x0]\n" - "ldr s25, [x25, #0x0]\n" - "ldr s21, [x24, #0x0]\n" - "ldr s23, [x23, #0x0]\n" - "ldr s18, [x22, #0x0]\n" - "ldr s20, [x21, #0x0]\n" - "ldr s16, [x20, #0x0]\n" + "ldr s26, [x28, #0x0]\n" + "ldr s21, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr s25, [x26, #0x0]\n" + "ldr s24, [x25, #0x0]\n" + "ldr s23, [x24, #0x0]\n" + "ldr s22, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" + "ldr s19, [x21, #0x0]\n" "5:" // Odd load end - "zip1 v26.2d, v27.2d, v24.2d\n" - "str q26, [%x[out_ptr], #0x0]\n" - "zip1 v22.2d, v25.2d, v21.2d\n" - "subs x19, x19, #0x1\n" - "zip1 v19.2d, v23.2d, v18.2d\n" - "str q22, [%x[out_ptr], #0x10]\n" - "zip1 v17.2d, v20.2d, v16.2d\n" - "str q19, [%x[out_ptr], #0x20]\n" - "str q17, [%x[out_ptr], #0x30]\n" + "subs x20, x20, #0x1\n" + "zip1 v16.2d, v26.2d, v21.2d\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.2d, v25.2d, v24.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "zip1 v17.2d, v23.2d, v22.2d\n" + "zip1 v16.2d, v20.2d, v19.2d\n" + "str q17, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 6f\n" - "zip2 v24.2d, v27.2d, v24.2d\n" - "str q24, [%x[out_ptr], #0x0]\n" - "zip2 v21.2d, v25.2d, v21.2d\n" - "zip2 v18.2d, v23.2d, v18.2d\n" - "str q21, [%x[out_ptr], #0x10]\n" - "zip2 v16.2d, v20.2d, v16.2d\n" - "str q18, [%x[out_ptr], #0x20]\n" + "zip2 v21.2d, v26.2d, v21.2d\n" + "str q21, [%x[out_ptr], #0x0]\n" + "zip2 v18.2d, v25.2d, v24.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "zip2 v17.2d, v23.2d, v22.2d\n" + "zip2 v16.2d, v20.2d, v19.2d\n" + "str q17, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "6:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp index 1f86722bc1..d606d5a5b6 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,45 @@ void interleave_block<8, 4, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "add x28, x28, %x[row_offset], LSL #1\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "add x27, x27, %x[row_offset], LSL #1\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" "add x26, x26, %x[row_offset], LSL #1\n" - "ldr x23, [%x[in], #0x20]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset], LSL #1\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset], LSL #1\n" - "ldr x20, [%x[in], #0x38]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset], LSL #1\n" "add x22, x22, %x[row_offset], LSL #1\n" "add x21, x21, %x[row_offset], LSL #1\n" - "add x20, x20, %x[row_offset], LSL #1\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -76,146 +77,145 @@ void interleave_block<8, 4, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q27, [x27], #0x10\n" + "ldr q26, [x28], #0x10\n" + "ldr q21, [x27], #0x10\n" "subs %x[width], %x[width], #0x8\n" - "ldr q24, [x26], #0x10\n" - "zip1 v26.2d, v27.2d, v24.2d\n" - "ldr q25, [x25], #0x10\n" "cmp %x[width], #0x8\n" - "zip2 v24.2d, v27.2d, v24.2d\n" - "ldr q21, [x24], #0x10\n" - "ldr q23, [x23], #0x10\n" - "zip1 v22.2d, v25.2d, v21.2d\n" - "ldr q18, [x22], #0x10\n" - "zip2 v21.2d, v25.2d, v21.2d\n" - "ldr q20, [x21], #0x10\n" - "ldr q16, [x20], #0x10\n" - "zip1 v19.2d, v23.2d, v18.2d\n" + "ldr q25, [x26], #0x10\n" + "ldr q24, [x25], #0x10\n" + "zip1 v16.2d, v26.2d, v21.2d\n" + "zip1 v18.2d, v25.2d, v24.2d\n" + "ldr q23, [x24], #0x10\n" + "ldr q22, [x23], #0x10\n" + "zip1 v17.2d, v23.2d, v22.2d\n" + "zip2 v21.2d, v26.2d, v21.2d\n" + "ldr q20, [x22], #0x10\n" + "ldr q19, [x21], #0x10\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.2d, v20.2d, v19.2d\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v18.2d, v23.2d, v18.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "zip2 v18.2d, v25.2d, v24.2d\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip1 v17.2d, v20.2d, v16.2d\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip2 v16.2d, v20.2d, v16.2d\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip2 v17.2d, v23.2d, v22.2d\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v16.2d, v20.2d, v19.2d\n" "prfm pldl1keep, [x22, #0x70]\n" "prfm pldl1keep, [x21, #0x70]\n" - "prfm pldl1keep, [x20, #0x70]\n" - "str q26, [%x[out_ptr], #0x0]\n" - "str q22, [%x[out_ptr], #0x10]\n" - "str q19, [%x[out_ptr], #0x20]\n" - "str q17, [%x[out_ptr], #0x30]\n" - "str q24, [%x[out_ptr], #0x40]\n" - "str q21, [%x[out_ptr], #0x50]\n" - "str q18, [%x[out_ptr], #0x60]\n" + "str q21, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" + "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" - "ldr d27, [x27], #0x8\n" - "ldr d24, [x26], #0x8\n" - "ldr d25, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d23, [x23], #0x8\n" - "ldr d18, [x22], #0x8\n" - "ldr d20, [x21], #0x8\n" - "ldr d16, [x20], #0x8\n" + "ldr d26, [x28], #0x8\n" + "ldr d21, [x27], #0x8\n" + "ldr d25, [x26], #0x8\n" + "ldr d24, [x25], #0x8\n" + "ldr d23, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" "tbz %x[width], #1, 4f\n" - "ld1 { v27.s }[2], [x27], #0x4\n" - "mov x19, #0x2\n" - "ld1 { v24.s }[2], [x26], #0x4\n" - "ld1 { v25.s }[2], [x25], #0x4\n" - "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v23.s }[2], [x23], #0x4\n" - "ld1 { v18.s }[2], [x22], #0x4\n" - "ld1 { v20.s }[2], [x21], #0x4\n" - "ld1 { v16.s }[2], [x20], #0x4\n" + "ld1 { v26.s }[2], [x28], #0x4\n" + "ld1 { v21.s }[2], [x27], #0x4\n" + "mov x20, #0x2\n" + "ld1 { v25.s }[2], [x26], #0x4\n" + "ld1 { v24.s }[2], [x25], #0x4\n" + "ld1 { v23.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v20.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v27.h }[6], [x27]\n" - "ld1 { v24.h }[6], [x26]\n" - "ld1 { v25.h }[6], [x25]\n" - "ld1 { v21.h }[6], [x24]\n" - "ld1 { v23.h }[6], [x23]\n" - "ld1 { v18.h }[6], [x22]\n" - "ld1 { v20.h }[6], [x21]\n" - "ld1 { v16.h }[6], [x20]\n" + "ld1 { v26.h }[6], [x28]\n" + "ld1 { v21.h }[6], [x27]\n" + "ld1 { v25.h }[6], [x26]\n" + "ld1 { v24.h }[6], [x25]\n" + "ld1 { v23.h }[6], [x24]\n" + "ld1 { v22.h }[6], [x23]\n" + "ld1 { v20.h }[6], [x22]\n" + "ld1 { v19.h }[6], [x21]\n" "b 7f\n" "4:" // odd_loads_1_4 - "mov x19, #0x1\n" + "mov x20, #0x1\n" "tbz %x[width], #0, 7f\n" - "ld1 { v27.h }[4], [x27]\n" - "ld1 { v24.h }[4], [x26]\n" - "mov x19, #0x2\n" - "ld1 { v25.h }[4], [x25]\n" - "ld1 { v21.h }[4], [x24]\n" - "ld1 { v23.h }[4], [x23]\n" - "ld1 { v18.h }[4], [x22]\n" - "ld1 { v20.h }[4], [x21]\n" - "ld1 { v16.h }[4], [x20]\n" + "ld1 { v26.h }[4], [x28]\n" + "ld1 { v21.h }[4], [x27]\n" + "mov x20, #0x2\n" + "ld1 { v25.h }[4], [x26]\n" + "ld1 { v24.h }[4], [x25]\n" + "ld1 { v23.h }[4], [x24]\n" + "ld1 { v22.h }[4], [x23]\n" + "ld1 { v20.h }[4], [x22]\n" + "ld1 { v19.h }[4], [x21]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" - "ldr s27, [x27], #0x4\n" - "ldr s24, [x26], #0x4\n" - "mov x19, #0x1\n" - "ldr s25, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s23, [x23], #0x4\n" - "ldr s18, [x22], #0x4\n" - "ldr s20, [x21], #0x4\n" - "ldr s16, [x20], #0x4\n" + "ldr s26, [x28], #0x4\n" + "ldr s21, [x27], #0x4\n" + "mov x20, #0x1\n" + "ldr s25, [x26], #0x4\n" + "ldr s24, [x25], #0x4\n" + "ldr s23, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s20, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v27.h }[2], [x27]\n" - "ld1 { v24.h }[2], [x26]\n" - "ld1 { v25.h }[2], [x25]\n" - "ld1 { v21.h }[2], [x24]\n" - "ld1 { v23.h }[2], [x23]\n" - "ld1 { v18.h }[2], [x22]\n" - "ld1 { v20.h }[2], [x21]\n" - "ld1 { v16.h }[2], [x20]\n" + "ld1 { v26.h }[2], [x28]\n" + "ld1 { v21.h }[2], [x27]\n" + "ld1 { v25.h }[2], [x26]\n" + "ld1 { v24.h }[2], [x25]\n" + "ld1 { v23.h }[2], [x24]\n" + "ld1 { v22.h }[2], [x23]\n" + "ld1 { v20.h }[2], [x22]\n" + "ld1 { v19.h }[2], [x21]\n" "b 7f\n" "6:" // odd_loads_1_0 - "ldr h27, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr h24, [x26, #0x0]\n" - "ldr h25, [x25, #0x0]\n" - "ldr h21, [x24, #0x0]\n" - "ldr h23, [x23, #0x0]\n" - "ldr h18, [x22, #0x0]\n" - "ldr h20, [x21, #0x0]\n" - "ldr h16, [x20, #0x0]\n" + "ldr h26, [x28, #0x0]\n" + "ldr h21, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr h25, [x26, #0x0]\n" + "ldr h24, [x25, #0x0]\n" + "ldr h23, [x24, #0x0]\n" + "ldr h22, [x23, #0x0]\n" + "ldr h20, [x22, #0x0]\n" + "ldr h19, [x21, #0x0]\n" "7:" // Odd load end - "zip1 v26.2d, v27.2d, v24.2d\n" - "str q26, [%x[out_ptr], #0x0]\n" - "zip1 v22.2d, v25.2d, v21.2d\n" - "subs x19, x19, #0x1\n" - "zip1 v19.2d, v23.2d, v18.2d\n" - "str q22, [%x[out_ptr], #0x10]\n" - "zip1 v17.2d, v20.2d, v16.2d\n" - "str q19, [%x[out_ptr], #0x20]\n" - "str q17, [%x[out_ptr], #0x30]\n" + "subs x20, x20, #0x1\n" + "zip1 v16.2d, v26.2d, v21.2d\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.2d, v25.2d, v24.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "zip1 v17.2d, v23.2d, v22.2d\n" + "zip1 v16.2d, v20.2d, v19.2d\n" + "str q17, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 8f\n" - "zip2 v24.2d, v27.2d, v24.2d\n" - "str q24, [%x[out_ptr], #0x0]\n" - "zip2 v21.2d, v25.2d, v21.2d\n" - "zip2 v18.2d, v23.2d, v18.2d\n" - "str q21, [%x[out_ptr], #0x10]\n" - "zip2 v16.2d, v20.2d, v16.2d\n" - "str q18, [%x[out_ptr], #0x20]\n" + "zip2 v21.2d, v26.2d, v21.2d\n" + "str q21, [%x[out_ptr], #0x0]\n" + "zip2 v18.2d, v25.2d, v24.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "zip2 v17.2d, v23.2d, v22.2d\n" + "zip2 v16.2d, v20.2d, v19.2d\n" + "str q17, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "8:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp index 533682c647..dfec14358b 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,45 @@ void interleave_block<8, 4, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "add x28, x28, %x[row_offset], LSL #2\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "add x27, x27, %x[row_offset], LSL #2\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" "add x26, x26, %x[row_offset], LSL #2\n" - "ldr x23, [%x[in], #0x20]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset], LSL #2\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset], LSL #2\n" - "ldr x20, [%x[in], #0x38]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset], LSL #2\n" "add x22, x22, %x[row_offset], LSL #2\n" "add x21, x21, %x[row_offset], LSL #2\n" - "add x20, x20, %x[row_offset], LSL #2\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -76,92 +77,91 @@ void interleave_block<8, 4, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q23, [x27], #0x10\n" - ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" + "ldr q23, [x28], #0x10\n" "ldr q22, [x26], #0x10\n" + ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" + ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n" + "ldr q21, [x24], #0x10\n" + "ldr q20, [x22], #0x10\n" + ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" + ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" + "ldr q19, [x27], #0x10\n" + "ldr q18, [x25], #0x10\n" "subs %x[width], %x[width], #0x4\n" - ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n" - "ldr q21, [x25], #0x10\n" "cmp %x[width], #0x4\n" - ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" - "ldr q20, [x24], #0x10\n" - "ldr q18, [x23], #0x10\n" - ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n" - "ldr q19, [x22], #0x10\n" - ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" + "ldr q17, [x23], #0x10\n" "ldr q16, [x21], #0x10\n" - "ldr q17, [x20], #0x10\n" - ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" + ".inst 0x4ea16a77 // bfcvtn2 v23.8h, v19.4s\n" + ".inst 0x4ea16a56 // bfcvtn2 v22.8h, v18.4s\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" + ".inst 0x4ea16a35 // bfcvtn2 v21.8h, v17.4s\n" + ".inst 0x4ea16a14 // bfcvtn2 v20.8h, v16.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" "prfm pldl1keep, [x25, #0x70]\n" - ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" + "str q22, [%x[out_ptr], #0x10]\n" "prfm pldl1keep, [x22, #0x70]\n" "prfm pldl1keep, [x21, #0x70]\n" - "prfm pldl1keep, [x20, #0x70]\n" - "str q23, [%x[out_ptr], #0x0]\n" - "str q21, [%x[out_ptr], #0x10]\n" - "str q18, [%x[out_ptr], #0x20]\n" - "str q16, [%x[out_ptr], #0x30]\n" + "str q21, [%x[out_ptr], #0x20]\n" + "str q20, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 6f\n" "tbz %x[width], #1, 4f\n" - "ldr d23, [x27], #0x8\n" + "ldr d23, [x28], #0x8\n" + "ldr d19, [x27], #0x8\n" + "mov x20, #0x1\n" "ldr d22, [x26], #0x8\n" - "mov x19, #0x1\n" - "ldr d21, [x25], #0x8\n" - "ldr d20, [x24], #0x8\n" - "ldr d18, [x23], #0x8\n" - "ldr d19, [x22], #0x8\n" + "ldr d18, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" "ldr d16, [x21], #0x8\n" - "ldr d17, [x20], #0x8\n" "tbz %x[width], #0, 5f\n" - "ld1 { v23.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x28]\n" + "ld1 { v19.s }[2], [x27]\n" "ld1 { v22.s }[2], [x26]\n" - "ld1 { v21.s }[2], [x25]\n" - "ld1 { v20.s }[2], [x24]\n" - "ld1 { v18.s }[2], [x23]\n" - "ld1 { v19.s }[2], [x22]\n" + "ld1 { v18.s }[2], [x25]\n" + "ld1 { v21.s }[2], [x24]\n" + "ld1 { v17.s }[2], [x23]\n" + "ld1 { v20.s }[2], [x22]\n" "ld1 { v16.s }[2], [x21]\n" - "ld1 { v17.s }[2], [x20]\n" "b 5f\n" "4:" // odd_loads_1_0 - "ldr s23, [x27, #0x0]\n" - "mov x19, #0x1\n" + "ldr s23, [x28, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "mov x20, #0x1\n" "ldr s22, [x26, #0x0]\n" - "ldr s21, [x25, #0x0]\n" - "ldr s20, [x24, #0x0]\n" - "ldr s18, [x23, #0x0]\n" - "ldr s19, [x22, #0x0]\n" + "ldr s18, [x25, #0x0]\n" + "ldr s21, [x24, #0x0]\n" + "ldr s17, [x23, #0x0]\n" + "ldr s20, [x22, #0x0]\n" "ldr s16, [x21, #0x0]\n" - "ldr s17, [x20, #0x0]\n" "5:" // Odd load end ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" + ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n" ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" - ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" - ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" - ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n" + ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" + ".inst 0x4ea16a77 // bfcvtn2 v23.8h, v19.4s\n" + ".inst 0x4ea16a56 // bfcvtn2 v22.8h, v18.4s\n" "str q23, [%x[out_ptr], #0x0]\n" - ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n" - ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" - "str q21, [%x[out_ptr], #0x10]\n" - ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" - "str q18, [%x[out_ptr], #0x20]\n" - "str q16, [%x[out_ptr], #0x30]\n" + ".inst 0x4ea16a35 // bfcvtn2 v21.8h, v17.4s\n" + ".inst 0x4ea16a14 // bfcvtn2 v20.8h, v16.4s\n" + "str q22, [%x[out_ptr], #0x10]\n" + "str q21, [%x[out_ptr], #0x20]\n" + "str q20, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "6:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp index 659d9947e2..54f15f8a5c 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,45 @@ void interleave_block<8, 4, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "add x28, x28, %x[row_offset]\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "add x27, x27, %x[row_offset]\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" "add x26, x26, %x[row_offset]\n" - "ldr x23, [%x[in], #0x20]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset]\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset]\n" - "ldr x20, [%x[in], #0x38]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset]\n" "add x22, x22, %x[row_offset]\n" "add x21, x21, %x[row_offset]\n" - "add x20, x20, %x[row_offset]\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x10\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -76,49 +77,48 @@ void interleave_block<8, 4, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q28, [x27], #0x10\n" + "ldr q28, [x28], #0x10\n" + "ldr q27, [x27], #0x10\n" "subs %x[width], %x[width], #0x10\n" - "ldr q29, [x26], #0x10\n" "cmp %x[width], #0x10\n" - "ldr q25, [x25], #0x10\n" - "zip1 v22.4s, v28.4s, v25.4s\n" - "ldr q21, [x24], #0x10\n" - "zip2 v28.4s, v28.4s, v25.4s\n" - "ldr q27, [x23], #0x10\n" - "ldr q26, [x22], #0x10\n" - "zip1 v20.4s, v29.4s, v21.4s\n" - "ldr q19, [x21], #0x10\n" - "zip2 v25.4s, v29.4s, v21.4s\n" - "ldr q24, [x20], #0x10\n" - "zip1 v23.4s, v22.4s, v20.4s\n" + "ldr q22, [x26], #0x10\n" + "ldr q21, [x25], #0x10\n" + "zip1 v26.4s, v28.4s, v22.4s\n" + "zip1 v25.4s, v27.4s, v21.4s\n" + "ldr q24, [x24], #0x10\n" + "ldr q23, [x23], #0x10\n" + "zip2 v22.4s, v28.4s, v22.4s\n" + "zip2 v21.4s, v27.4s, v21.4s\n" + "ldr q19, [x22], #0x10\n" + "ldr q18, [x21], #0x10\n" + "zip1 v20.4s, v24.4s, v19.4s\n" + "zip1 v17.4s, v23.4s, v18.4s\n" + "zip2 v19.4s, v24.4s, v19.4s\n" + "zip2 v18.4s, v23.4s, v18.4s\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v22.4s, v22.4s, v20.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip1 v21.4s, v28.4s, v25.4s\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v18.4s, v27.4s, v19.4s\n" + "zip1 v16.4s, v26.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" "prfm pldl1keep, [x24, #0x70]\n" - "zip1 v16.4s, v26.4s, v24.4s\n" "prfm pldl1keep, [x23, #0x70]\n" - "zip1 v17.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v20.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip2 v20.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip2 v19.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x20, #0x70]\n" - "zip2 v16.4s, v26.4s, v24.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v19.4s, v16.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" - "zip2 v17.4s, v28.4s, v25.4s\n" - "str q22, [%x[out_ptr], #0x20]\n" - "zip2 v16.4s, v19.4s, v16.4s\n" - "str q20, [%x[out_ptr], #0x30]\n" - "str q21, [%x[out_ptr], #0x40]\n" - "str q18, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v26.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v16.4s, v20.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip1 v16.4s, v22.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x40]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "zip2 v17.4s, v22.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v19.4s, v18.4s\n" "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" @@ -126,204 +126,204 @@ void interleave_block<8, 4, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 12f\n" "tbz %x[width], #3, 7f\n" - "ldr d28, [x27], #0x8\n" - "ldr d29, [x26], #0x8\n" - "ldr d25, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d26, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" - "ldr d24, [x20], #0x8\n" + "ldr d28, [x28], #0x8\n" + "ldr d27, [x27], #0x8\n" + "ldr d22, [x26], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d24, [x24], #0x8\n" + "ldr d23, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d18, [x21], #0x8\n" "tbz %x[width], #2, 5f\n" - "ld1 { v28.s }[2], [x27], #0x4\n" - "ld1 { v29.s }[2], [x26], #0x4\n" - "ld1 { v25.s }[2], [x25], #0x4\n" - "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v27.s }[2], [x23], #0x4\n" - "ld1 { v26.s }[2], [x22], #0x4\n" - "ld1 { v19.s }[2], [x21], #0x4\n" - "ld1 { v24.s }[2], [x20], #0x4\n" + "ld1 { v28.s }[2], [x28], #0x4\n" + "ld1 { v27.s }[2], [x27], #0x4\n" + "ld1 { v22.s }[2], [x26], #0x4\n" + "ld1 { v21.s }[2], [x25], #0x4\n" + "ld1 { v24.s }[2], [x24], #0x4\n" + "ld1 { v23.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "ld1 { v18.s }[2], [x21], #0x4\n" "tbz %x[width], #1, 4f\n" - "ld1 { v28.h }[6], [x27], #0x2\n" - "mov x19, #0x4\n" - "ld1 { v29.h }[6], [x26], #0x2\n" - "ld1 { v25.h }[6], [x25], #0x2\n" - "ld1 { v21.h }[6], [x24], #0x2\n" - "ld1 { v27.h }[6], [x23], #0x2\n" - "ld1 { v26.h }[6], [x22], #0x2\n" - "ld1 { v19.h }[6], [x21], #0x2\n" - "ld1 { v24.h }[6], [x20], #0x2\n" + "ld1 { v28.h }[6], [x28], #0x2\n" + "ld1 { v27.h }[6], [x27], #0x2\n" + "mov x20, #0x4\n" + "ld1 { v22.h }[6], [x26], #0x2\n" + "ld1 { v21.h }[6], [x25], #0x2\n" + "ld1 { v24.h }[6], [x24], #0x2\n" + "ld1 { v23.h }[6], [x23], #0x2\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "ld1 { v18.h }[6], [x21], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v28.b }[14], [x27]\n" - "ld1 { v29.b }[14], [x26]\n" - "ld1 { v25.b }[14], [x25]\n" - "ld1 { v21.b }[14], [x24]\n" - "ld1 { v27.b }[14], [x23]\n" - "ld1 { v26.b }[14], [x22]\n" - "ld1 { v19.b }[14], [x21]\n" - "ld1 { v24.b }[14], [x20]\n" + "ld1 { v28.b }[14], [x28]\n" + "ld1 { v27.b }[14], [x27]\n" + "ld1 { v22.b }[14], [x26]\n" + "ld1 { v21.b }[14], [x25]\n" + "ld1 { v24.b }[14], [x24]\n" + "ld1 { v23.b }[14], [x23]\n" + "ld1 { v19.b }[14], [x22]\n" + "ld1 { v18.b }[14], [x21]\n" "b 11f\n" "4:" // odd_loads_1_12 - "mov x19, #0x3\n" + "mov x20, #0x3\n" "tbz %x[width], #0, 11f\n" - "ld1 { v28.b }[12], [x27]\n" - "ld1 { v29.b }[12], [x26]\n" - "mov x19, #0x4\n" - "ld1 { v25.b }[12], [x25]\n" - "ld1 { v21.b }[12], [x24]\n" - "ld1 { v27.b }[12], [x23]\n" - "ld1 { v26.b }[12], [x22]\n" - "ld1 { v19.b }[12], [x21]\n" - "ld1 { v24.b }[12], [x20]\n" + "ld1 { v28.b }[12], [x28]\n" + "ld1 { v27.b }[12], [x27]\n" + "mov x20, #0x4\n" + "ld1 { v22.b }[12], [x26]\n" + "ld1 { v21.b }[12], [x25]\n" + "ld1 { v24.b }[12], [x24]\n" + "ld1 { v23.b }[12], [x23]\n" + "ld1 { v19.b }[12], [x22]\n" + "ld1 { v18.b }[12], [x21]\n" "b 11f\n" "5:" // odd_loads_2_8 "tbz %x[width], #1, 6f\n" - "ld1 { v28.h }[4], [x27], #0x2\n" - "ld1 { v29.h }[4], [x26], #0x2\n" - "mov x19, #0x3\n" - "ld1 { v25.h }[4], [x25], #0x2\n" - "ld1 { v21.h }[4], [x24], #0x2\n" - "ld1 { v27.h }[4], [x23], #0x2\n" - "ld1 { v26.h }[4], [x22], #0x2\n" - "ld1 { v19.h }[4], [x21], #0x2\n" - "ld1 { v24.h }[4], [x20], #0x2\n" + "ld1 { v28.h }[4], [x28], #0x2\n" + "ld1 { v27.h }[4], [x27], #0x2\n" + "mov x20, #0x3\n" + "ld1 { v22.h }[4], [x26], #0x2\n" + "ld1 { v21.h }[4], [x25], #0x2\n" + "ld1 { v24.h }[4], [x24], #0x2\n" + "ld1 { v23.h }[4], [x23], #0x2\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "ld1 { v18.h }[4], [x21], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v28.b }[10], [x27]\n" - "ld1 { v29.b }[10], [x26]\n" - "ld1 { v25.b }[10], [x25]\n" - "ld1 { v21.b }[10], [x24]\n" - "ld1 { v27.b }[10], [x23]\n" - "ld1 { v26.b }[10], [x22]\n" - "ld1 { v19.b }[10], [x21]\n" - "ld1 { v24.b }[10], [x20]\n" + "ld1 { v28.b }[10], [x28]\n" + "ld1 { v27.b }[10], [x27]\n" + "ld1 { v22.b }[10], [x26]\n" + "ld1 { v21.b }[10], [x25]\n" + "ld1 { v24.b }[10], [x24]\n" + "ld1 { v23.b }[10], [x23]\n" + "ld1 { v19.b }[10], [x22]\n" + "ld1 { v18.b }[10], [x21]\n" "b 11f\n" "6:" // odd_loads_1_8 - "mov x19, #0x2\n" + "mov x20, #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v28.b }[8], [x27]\n" - "ld1 { v29.b }[8], [x26]\n" - "mov x19, #0x3\n" - "ld1 { v25.b }[8], [x25]\n" - "ld1 { v21.b }[8], [x24]\n" - "ld1 { v27.b }[8], [x23]\n" - "ld1 { v26.b }[8], [x22]\n" - "ld1 { v19.b }[8], [x21]\n" - "ld1 { v24.b }[8], [x20]\n" + "ld1 { v28.b }[8], [x28]\n" + "ld1 { v27.b }[8], [x27]\n" + "mov x20, #0x3\n" + "ld1 { v22.b }[8], [x26]\n" + "ld1 { v21.b }[8], [x25]\n" + "ld1 { v24.b }[8], [x24]\n" + "ld1 { v23.b }[8], [x23]\n" + "ld1 { v19.b }[8], [x22]\n" + "ld1 { v18.b }[8], [x21]\n" "b 11f\n" "7:" // odd_loads_4_0 "tbz %x[width], #2, 9f\n" - "ldr s28, [x27], #0x4\n" - "ldr s29, [x26], #0x4\n" - "ldr s25, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s27, [x23], #0x4\n" - "ldr s26, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" - "ldr s24, [x20], #0x4\n" + "ldr s28, [x28], #0x4\n" + "ldr s27, [x27], #0x4\n" + "ldr s22, [x26], #0x4\n" + "ldr s21, [x25], #0x4\n" + "ldr s24, [x24], #0x4\n" + "ldr s23, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "ldr s18, [x21], #0x4\n" "tbz %x[width], #1, 8f\n" - "ld1 { v28.h }[2], [x27], #0x2\n" - "mov x19, #0x2\n" - "ld1 { v29.h }[2], [x26], #0x2\n" - "ld1 { v25.h }[2], [x25], #0x2\n" - "ld1 { v21.h }[2], [x24], #0x2\n" - "ld1 { v27.h }[2], [x23], #0x2\n" - "ld1 { v26.h }[2], [x22], #0x2\n" - "ld1 { v19.h }[2], [x21], #0x2\n" - "ld1 { v24.h }[2], [x20], #0x2\n" + "ld1 { v28.h }[2], [x28], #0x2\n" + "ld1 { v27.h }[2], [x27], #0x2\n" + "mov x20, #0x2\n" + "ld1 { v22.h }[2], [x26], #0x2\n" + "ld1 { v21.h }[2], [x25], #0x2\n" + "ld1 { v24.h }[2], [x24], #0x2\n" + "ld1 { v23.h }[2], [x23], #0x2\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "ld1 { v18.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v28.b }[6], [x27]\n" - "ld1 { v29.b }[6], [x26]\n" - "ld1 { v25.b }[6], [x25]\n" - "ld1 { v21.b }[6], [x24]\n" - "ld1 { v27.b }[6], [x23]\n" - "ld1 { v26.b }[6], [x22]\n" - "ld1 { v19.b }[6], [x21]\n" - "ld1 { v24.b }[6], [x20]\n" + "ld1 { v28.b }[6], [x28]\n" + "ld1 { v27.b }[6], [x27]\n" + "ld1 { v22.b }[6], [x26]\n" + "ld1 { v21.b }[6], [x25]\n" + "ld1 { v24.b }[6], [x24]\n" + "ld1 { v23.b }[6], [x23]\n" + "ld1 { v19.b }[6], [x22]\n" + "ld1 { v18.b }[6], [x21]\n" "b 11f\n" "8:" // odd_loads_1_4 - "mov x19, #0x1\n" + "mov x20, #0x1\n" "tbz %x[width], #0, 11f\n" - "ld1 { v28.b }[4], [x27]\n" - "ld1 { v29.b }[4], [x26]\n" - "mov x19, #0x2\n" - "ld1 { v25.b }[4], [x25]\n" - "ld1 { v21.b }[4], [x24]\n" - "ld1 { v27.b }[4], [x23]\n" - "ld1 { v26.b }[4], [x22]\n" - "ld1 { v19.b }[4], [x21]\n" - "ld1 { v24.b }[4], [x20]\n" + "ld1 { v28.b }[4], [x28]\n" + "ld1 { v27.b }[4], [x27]\n" + "mov x20, #0x2\n" + "ld1 { v22.b }[4], [x26]\n" + "ld1 { v21.b }[4], [x25]\n" + "ld1 { v24.b }[4], [x24]\n" + "ld1 { v23.b }[4], [x23]\n" + "ld1 { v19.b }[4], [x22]\n" + "ld1 { v18.b }[4], [x21]\n" "b 11f\n" "9:" // odd_loads_2_0 "tbz %x[width], #1, 10f\n" - "ldr h28, [x27], #0x2\n" - "ldr h29, [x26], #0x2\n" - "mov x19, #0x1\n" - "ldr h25, [x25], #0x2\n" - "ldr h21, [x24], #0x2\n" - "ldr h27, [x23], #0x2\n" - "ldr h26, [x22], #0x2\n" - "ldr h19, [x21], #0x2\n" - "ldr h24, [x20], #0x2\n" + "ldr h28, [x28], #0x2\n" + "ldr h27, [x27], #0x2\n" + "mov x20, #0x1\n" + "ldr h22, [x26], #0x2\n" + "ldr h21, [x25], #0x2\n" + "ldr h24, [x24], #0x2\n" + "ldr h23, [x23], #0x2\n" + "ldr h19, [x22], #0x2\n" + "ldr h18, [x21], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v28.b }[2], [x27]\n" - "ld1 { v29.b }[2], [x26]\n" - "ld1 { v25.b }[2], [x25]\n" - "ld1 { v21.b }[2], [x24]\n" - "ld1 { v27.b }[2], [x23]\n" - "ld1 { v26.b }[2], [x22]\n" - "ld1 { v19.b }[2], [x21]\n" - "ld1 { v24.b }[2], [x20]\n" + "ld1 { v28.b }[2], [x28]\n" + "ld1 { v27.b }[2], [x27]\n" + "ld1 { v22.b }[2], [x26]\n" + "ld1 { v21.b }[2], [x25]\n" + "ld1 { v24.b }[2], [x24]\n" + "ld1 { v23.b }[2], [x23]\n" + "ld1 { v19.b }[2], [x22]\n" + "ld1 { v18.b }[2], [x21]\n" "b 11f\n" "10:" // odd_loads_1_0 - "ldr b28, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr b29, [x26, #0x0]\n" - "ldr b25, [x25, #0x0]\n" - "ldr b21, [x24, #0x0]\n" - "ldr b27, [x23, #0x0]\n" - "ldr b26, [x22, #0x0]\n" - "ldr b19, [x21, #0x0]\n" - "ldr b24, [x20, #0x0]\n" + "ldr b28, [x28, #0x0]\n" + "ldr b27, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr b22, [x26, #0x0]\n" + "ldr b21, [x25, #0x0]\n" + "ldr b24, [x24, #0x0]\n" + "ldr b23, [x23, #0x0]\n" + "ldr b19, [x22, #0x0]\n" + "ldr b18, [x21, #0x0]\n" "11:" // Odd load end - "zip1 v22.4s, v28.4s, v25.4s\n" - "subs x19, x19, #0x1\n" - "zip1 v20.4s, v29.4s, v21.4s\n" - "zip1 v23.4s, v22.4s, v20.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v27.4s, v19.4s\n" - "zip1 v16.4s, v26.4s, v24.4s\n" - "zip1 v17.4s, v18.4s, v16.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" + "zip1 v26.4s, v28.4s, v22.4s\n" + "zip1 v25.4s, v27.4s, v21.4s\n" + "subs x20, x20, #0x1\n" + "zip1 v20.4s, v24.4s, v19.4s\n" + "zip1 v17.4s, v23.4s, v18.4s\n" + "zip1 v16.4s, v26.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.4s, v20.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 12f\n" - "zip2 v22.4s, v22.4s, v20.4s\n" - "str q22, [%x[out_ptr], #0x0]\n" - "zip2 v20.4s, v18.4s, v16.4s\n" - "subs x19, x19, #0x1\n" - "str q20, [%x[out_ptr], #0x10]\n" + "subs x20, x20, #0x1\n" + "zip2 v16.4s, v26.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.4s, v20.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 12f\n" - "zip2 v28.4s, v28.4s, v25.4s\n" - "zip2 v25.4s, v29.4s, v21.4s\n" - "subs x19, x19, #0x1\n" - "zip1 v21.4s, v28.4s, v25.4s\n" - "str q21, [%x[out_ptr], #0x0]\n" - "zip2 v19.4s, v27.4s, v19.4s\n" - "zip2 v16.4s, v26.4s, v24.4s\n" - "zip1 v18.4s, v19.4s, v16.4s\n" - "str q18, [%x[out_ptr], #0x10]\n" + "zip2 v22.4s, v28.4s, v22.4s\n" + "zip2 v21.4s, v27.4s, v21.4s\n" + "subs x20, x20, #0x1\n" + "zip2 v19.4s, v24.4s, v19.4s\n" + "zip2 v18.4s, v23.4s, v18.4s\n" + "zip1 v16.4s, v22.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 12f\n" - "zip2 v17.4s, v28.4s, v25.4s\n" + "zip2 v17.4s, v22.4s, v21.4s\n" "str q17, [%x[out_ptr], #0x0]\n" - "zip2 v16.4s, v19.4s, v16.4s\n" + "zip2 v16.4s, v19.4s, v18.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "12:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp index dfec94c952..2db54126c0 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,40 +31,41 @@ void interleave_block<8, 4, VLType::None, true>( ) { __asm__ __volatile__( - "movi v1.8h, #0x0\n" - "ldr x27, [%x[in], #0x0]\n" - "mov x19, #0x0\n" - "movi v0.8h, #0x0\n" - "ldr x26, [%x[in], #0x8]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" + "mov x20, #0x0\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" + "movi v2.8h, #0x0\n" + "movi v1.8h, #0x0\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" + "movi v0.4s, #0x0\n" "movi v31.4s, #0x0\n" - "ldr x25, [%x[in], #0x10]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" + "add x28, x28, %x[row_offset]\n" "add x27, x27, %x[row_offset]\n" - "movi v30.4s, #0x0\n" - "ldr x24, [%x[in], #0x18]\n" - "ldr x23, [%x[in], #0x20]\n" "add x26, x26, %x[row_offset]\n" - "ldr x22, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset]\n" - "ldr x20, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset]\n" "add x22, x22, %x[row_offset]\n" "add x21, x21, %x[row_offset]\n" - "add x20, x20, %x[row_offset]\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj + "prfm pldl1keep, [x28, #0x0]\n" "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" @@ -72,7 +73,7 @@ void interleave_block<8, 4, VLType::None, true>( "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -80,71 +81,70 @@ void interleave_block<8, 4, VLType::None, true>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "cbnz %w[first], 2f\n" "sub %x[out_ptr], %x[out_ptr], #0x20\n" - "ld1 { v31.4s }, [%x[out_ptr]]\n" - "ldr q30, [%x[out_ptr], #0x10]\n" + "ld1 { v0.4s }, [%x[out_ptr]]\n" + "ldr q31, [%x[out_ptr], #0x10]\n" "2:" // first_pass "cmp %x[width], #0x10\n" "blt 5f\n" "3:" // Main loop head - "cmp x19, #0x1e\n" + "cmp x20, #0x1e\n" "ble 4f\n" + "sadalp v0.4s, v2.8h\n" + "movi v2.8h, #0x0\n" + "mov x20, #0x0\n" "sadalp v31.4s, v1.8h\n" "movi v1.8h, #0x0\n" - "sadalp v30.4s, v0.8h\n" - "movi v0.8h, #0x0\n" - "mov x19, #0x0\n" "4:" // no_accumulate_16 - "ldr q28, [x27], #0x10\n" - "add x19, x19, #0x1\n" - "ldr q29, [x26], #0x10\n" + "ldr q30, [x28], #0x10\n" + "ldr q29, [x27], #0x10\n" "subs %x[width], %x[width], #0x10\n" - "ldr q25, [x25], #0x10\n" - "zip1 v22.4s, v28.4s, v25.4s\n" - "ldr q21, [x24], #0x10\n" "cmp %x[width], #0x10\n" - "zip2 v28.4s, v28.4s, v25.4s\n" - "ldr q27, [x23], #0x10\n" - "ldr q26, [x22], #0x10\n" - "zip1 v20.4s, v29.4s, v21.4s\n" - "ldr q19, [x21], #0x10\n" - "zip2 v25.4s, v29.4s, v21.4s\n" - "ldr q24, [x20], #0x10\n" - "zip1 v23.4s, v22.4s, v20.4s\n" + "ldr q28, [x26], #0x10\n" + "ldr q27, [x25], #0x10\n" + "zip1 v22.4s, v30.4s, v28.4s\n" + "zip1 v21.4s, v29.4s, v27.4s\n" + "ldr q20, [x24], #0x10\n" + "ldr q26, [x23], #0x10\n" + "zip1 v25.4s, v22.4s, v21.4s\n" + "sadalp v2.8h, v25.16b\n" + "ldr q19, [x22], #0x10\n" + "ldr q18, [x21], #0x10\n" + "zip1 v17.4s, v20.4s, v19.4s\n" + "zip1 v16.4s, v26.4s, v18.4s\n" + "zip1 v24.4s, v17.4s, v16.4s\n" + "sadalp v1.8h, v24.16b\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "sadalp v1.8h, v23.16b\n" - "zip2 v22.4s, v22.4s, v20.4s\n" + "zip2 v23.4s, v22.4s, v21.4s\n" + "zip2 v22.4s, v17.4s, v16.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "sadalp v1.8h, v22.16b\n" - "zip1 v18.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v16.4s, v26.4s, v24.4s\n" + "zip2 v21.4s, v30.4s, v28.4s\n" + "zip2 v17.4s, v29.4s, v27.4s\n" "prfm pldl1keep, [x24, #0x70]\n" - "zip1 v21.4s, v28.4s, v25.4s\n" "prfm pldl1keep, [x23, #0x70]\n" - "sadalp v1.8h, v21.16b\n" - "zip1 v17.4s, v18.4s, v16.4s\n" + "zip2 v20.4s, v20.4s, v19.4s\n" + "zip2 v16.4s, v26.4s, v18.4s\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip2 v20.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x21, #0x70]\n" - "sadalp v0.8h, v17.16b\n" - "zip2 v19.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x20, #0x70]\n" - "sadalp v0.8h, v20.16b\n" - "zip2 v16.4s, v26.4s, v24.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v19.4s, v16.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" - "sadalp v0.8h, v18.16b\n" - "zip2 v17.4s, v28.4s, v25.4s\n" - "str q22, [%x[out_ptr], #0x20]\n" - "zip2 v16.4s, v19.4s, v16.4s\n" - "str q20, [%x[out_ptr], #0x30]\n" - "sadalp v1.8h, v17.16b\n" - "str q21, [%x[out_ptr], #0x40]\n" - "sadalp v0.8h, v16.16b\n" + "sadalp v2.8h, v23.16b\n" + "sadalp v1.8h, v22.16b\n" + "str q25, [%x[out_ptr], #0x0]\n" + "add x20, x20, #0x1\n" + "zip1 v19.4s, v21.4s, v17.4s\n" + "zip1 v18.4s, v20.4s, v16.4s\n" + "str q24, [%x[out_ptr], #0x10]\n" + "sadalp v2.8h, v19.16b\n" + "sadalp v1.8h, v18.16b\n" + "str q23, [%x[out_ptr], #0x20]\n" + "zip2 v17.4s, v21.4s, v17.4s\n" + "zip2 v16.4s, v20.4s, v16.4s\n" + "str q22, [%x[out_ptr], #0x30]\n" + "str q19, [%x[out_ptr], #0x40]\n" + "sadalp v2.8h, v17.16b\n" + "sadalp v1.8h, v16.16b\n" "str q18, [%x[out_ptr], #0x50]\n" "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" @@ -153,216 +153,216 @@ void interleave_block<8, 4, VLType::None, true>( "5:" // Main loop skip "cbz %x[width], 14f\n" "tbz %x[width], #3, 9f\n" - "ldr d28, [x27], #0x8\n" - "ldr d29, [x26], #0x8\n" - "ldr d25, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d26, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" - "ldr d24, [x20], #0x8\n" + "ldr d30, [x28], #0x8\n" + "ldr d29, [x27], #0x8\n" + "ldr d28, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d20, [x24], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d18, [x21], #0x8\n" "tbz %x[width], #2, 7f\n" - "ld1 { v28.s }[2], [x27], #0x4\n" - "ld1 { v29.s }[2], [x26], #0x4\n" - "ld1 { v25.s }[2], [x25], #0x4\n" - "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v27.s }[2], [x23], #0x4\n" - "ld1 { v26.s }[2], [x22], #0x4\n" - "ld1 { v19.s }[2], [x21], #0x4\n" - "ld1 { v24.s }[2], [x20], #0x4\n" + "ld1 { v30.s }[2], [x28], #0x4\n" + "ld1 { v29.s }[2], [x27], #0x4\n" + "ld1 { v28.s }[2], [x26], #0x4\n" + "ld1 { v27.s }[2], [x25], #0x4\n" + "ld1 { v20.s }[2], [x24], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "ld1 { v18.s }[2], [x21], #0x4\n" "tbz %x[width], #1, 6f\n" - "ld1 { v28.h }[6], [x27], #0x2\n" - "mov x19, #0x4\n" - "ld1 { v29.h }[6], [x26], #0x2\n" - "ld1 { v25.h }[6], [x25], #0x2\n" - "ld1 { v21.h }[6], [x24], #0x2\n" - "ld1 { v27.h }[6], [x23], #0x2\n" - "ld1 { v26.h }[6], [x22], #0x2\n" - "ld1 { v19.h }[6], [x21], #0x2\n" - "ld1 { v24.h }[6], [x20], #0x2\n" + "ld1 { v30.h }[6], [x28], #0x2\n" + "ld1 { v29.h }[6], [x27], #0x2\n" + "mov x20, #0x4\n" + "ld1 { v28.h }[6], [x26], #0x2\n" + "ld1 { v27.h }[6], [x25], #0x2\n" + "ld1 { v20.h }[6], [x24], #0x2\n" + "ld1 { v26.h }[6], [x23], #0x2\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "ld1 { v18.h }[6], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[14], [x27]\n" - "ld1 { v29.b }[14], [x26]\n" - "ld1 { v25.b }[14], [x25]\n" - "ld1 { v21.b }[14], [x24]\n" - "ld1 { v27.b }[14], [x23]\n" - "ld1 { v26.b }[14], [x22]\n" - "ld1 { v19.b }[14], [x21]\n" - "ld1 { v24.b }[14], [x20]\n" + "ld1 { v30.b }[14], [x28]\n" + "ld1 { v29.b }[14], [x27]\n" + "ld1 { v28.b }[14], [x26]\n" + "ld1 { v27.b }[14], [x25]\n" + "ld1 { v20.b }[14], [x24]\n" + "ld1 { v26.b }[14], [x23]\n" + "ld1 { v19.b }[14], [x22]\n" + "ld1 { v18.b }[14], [x21]\n" "b 13f\n" "6:" // odd_loads_1_12 - "mov x19, #0x3\n" + "mov x20, #0x3\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[12], [x27]\n" - "ld1 { v29.b }[12], [x26]\n" - "mov x19, #0x4\n" - "ld1 { v25.b }[12], [x25]\n" - "ld1 { v21.b }[12], [x24]\n" - "ld1 { v27.b }[12], [x23]\n" - "ld1 { v26.b }[12], [x22]\n" - "ld1 { v19.b }[12], [x21]\n" - "ld1 { v24.b }[12], [x20]\n" + "ld1 { v30.b }[12], [x28]\n" + "ld1 { v29.b }[12], [x27]\n" + "mov x20, #0x4\n" + "ld1 { v28.b }[12], [x26]\n" + "ld1 { v27.b }[12], [x25]\n" + "ld1 { v20.b }[12], [x24]\n" + "ld1 { v26.b }[12], [x23]\n" + "ld1 { v19.b }[12], [x22]\n" + "ld1 { v18.b }[12], [x21]\n" "b 13f\n" "7:" // odd_loads_2_8 "tbz %x[width], #1, 8f\n" - "ld1 { v28.h }[4], [x27], #0x2\n" - "ld1 { v29.h }[4], [x26], #0x2\n" - "mov x19, #0x3\n" - "ld1 { v25.h }[4], [x25], #0x2\n" - "ld1 { v21.h }[4], [x24], #0x2\n" - "ld1 { v27.h }[4], [x23], #0x2\n" - "ld1 { v26.h }[4], [x22], #0x2\n" - "ld1 { v19.h }[4], [x21], #0x2\n" - "ld1 { v24.h }[4], [x20], #0x2\n" + "ld1 { v30.h }[4], [x28], #0x2\n" + "ld1 { v29.h }[4], [x27], #0x2\n" + "mov x20, #0x3\n" + "ld1 { v28.h }[4], [x26], #0x2\n" + "ld1 { v27.h }[4], [x25], #0x2\n" + "ld1 { v20.h }[4], [x24], #0x2\n" + "ld1 { v26.h }[4], [x23], #0x2\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "ld1 { v18.h }[4], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[10], [x27]\n" - "ld1 { v29.b }[10], [x26]\n" - "ld1 { v25.b }[10], [x25]\n" - "ld1 { v21.b }[10], [x24]\n" - "ld1 { v27.b }[10], [x23]\n" - "ld1 { v26.b }[10], [x22]\n" - "ld1 { v19.b }[10], [x21]\n" - "ld1 { v24.b }[10], [x20]\n" + "ld1 { v30.b }[10], [x28]\n" + "ld1 { v29.b }[10], [x27]\n" + "ld1 { v28.b }[10], [x26]\n" + "ld1 { v27.b }[10], [x25]\n" + "ld1 { v20.b }[10], [x24]\n" + "ld1 { v26.b }[10], [x23]\n" + "ld1 { v19.b }[10], [x22]\n" + "ld1 { v18.b }[10], [x21]\n" "b 13f\n" "8:" // odd_loads_1_8 - "mov x19, #0x2\n" + "mov x20, #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[8], [x27]\n" - "ld1 { v29.b }[8], [x26]\n" - "mov x19, #0x3\n" - "ld1 { v25.b }[8], [x25]\n" - "ld1 { v21.b }[8], [x24]\n" - "ld1 { v27.b }[8], [x23]\n" - "ld1 { v26.b }[8], [x22]\n" - "ld1 { v19.b }[8], [x21]\n" - "ld1 { v24.b }[8], [x20]\n" + "ld1 { v30.b }[8], [x28]\n" + "ld1 { v29.b }[8], [x27]\n" + "mov x20, #0x3\n" + "ld1 { v28.b }[8], [x26]\n" + "ld1 { v27.b }[8], [x25]\n" + "ld1 { v20.b }[8], [x24]\n" + "ld1 { v26.b }[8], [x23]\n" + "ld1 { v19.b }[8], [x22]\n" + "ld1 { v18.b }[8], [x21]\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" - "ldr s28, [x27], #0x4\n" - "ldr s29, [x26], #0x4\n" - "ldr s25, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s27, [x23], #0x4\n" - "ldr s26, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" - "ldr s24, [x20], #0x4\n" + "ldr s30, [x28], #0x4\n" + "ldr s29, [x27], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s20, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "ldr s18, [x21], #0x4\n" "tbz %x[width], #1, 10f\n" - "ld1 { v28.h }[2], [x27], #0x2\n" - "mov x19, #0x2\n" - "ld1 { v29.h }[2], [x26], #0x2\n" - "ld1 { v25.h }[2], [x25], #0x2\n" - "ld1 { v21.h }[2], [x24], #0x2\n" - "ld1 { v27.h }[2], [x23], #0x2\n" - "ld1 { v26.h }[2], [x22], #0x2\n" - "ld1 { v19.h }[2], [x21], #0x2\n" - "ld1 { v24.h }[2], [x20], #0x2\n" + "ld1 { v30.h }[2], [x28], #0x2\n" + "ld1 { v29.h }[2], [x27], #0x2\n" + "mov x20, #0x2\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "ld1 { v27.h }[2], [x25], #0x2\n" + "ld1 { v20.h }[2], [x24], #0x2\n" + "ld1 { v26.h }[2], [x23], #0x2\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "ld1 { v18.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[6], [x27]\n" - "ld1 { v29.b }[6], [x26]\n" - "ld1 { v25.b }[6], [x25]\n" - "ld1 { v21.b }[6], [x24]\n" - "ld1 { v27.b }[6], [x23]\n" - "ld1 { v26.b }[6], [x22]\n" - "ld1 { v19.b }[6], [x21]\n" - "ld1 { v24.b }[6], [x20]\n" + "ld1 { v30.b }[6], [x28]\n" + "ld1 { v29.b }[6], [x27]\n" + "ld1 { v28.b }[6], [x26]\n" + "ld1 { v27.b }[6], [x25]\n" + "ld1 { v20.b }[6], [x24]\n" + "ld1 { v26.b }[6], [x23]\n" + "ld1 { v19.b }[6], [x22]\n" + "ld1 { v18.b }[6], [x21]\n" "b 13f\n" "10:" // odd_loads_1_4 - "mov x19, #0x1\n" + "mov x20, #0x1\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[4], [x27]\n" - "ld1 { v29.b }[4], [x26]\n" - "mov x19, #0x2\n" - "ld1 { v25.b }[4], [x25]\n" - "ld1 { v21.b }[4], [x24]\n" - "ld1 { v27.b }[4], [x23]\n" - "ld1 { v26.b }[4], [x22]\n" - "ld1 { v19.b }[4], [x21]\n" - "ld1 { v24.b }[4], [x20]\n" + "ld1 { v30.b }[4], [x28]\n" + "ld1 { v29.b }[4], [x27]\n" + "mov x20, #0x2\n" + "ld1 { v28.b }[4], [x26]\n" + "ld1 { v27.b }[4], [x25]\n" + "ld1 { v20.b }[4], [x24]\n" + "ld1 { v26.b }[4], [x23]\n" + "ld1 { v19.b }[4], [x22]\n" + "ld1 { v18.b }[4], [x21]\n" "b 13f\n" "11:" // odd_loads_2_0 "tbz %x[width], #1, 12f\n" - "ldr h28, [x27], #0x2\n" - "ldr h29, [x26], #0x2\n" - "mov x19, #0x1\n" - "ldr h25, [x25], #0x2\n" - "ldr h21, [x24], #0x2\n" - "ldr h27, [x23], #0x2\n" - "ldr h26, [x22], #0x2\n" - "ldr h19, [x21], #0x2\n" - "ldr h24, [x20], #0x2\n" + "ldr h30, [x28], #0x2\n" + "ldr h29, [x27], #0x2\n" + "mov x20, #0x1\n" + "ldr h28, [x26], #0x2\n" + "ldr h27, [x25], #0x2\n" + "ldr h20, [x24], #0x2\n" + "ldr h26, [x23], #0x2\n" + "ldr h19, [x22], #0x2\n" + "ldr h18, [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[2], [x27]\n" - "ld1 { v29.b }[2], [x26]\n" - "ld1 { v25.b }[2], [x25]\n" - "ld1 { v21.b }[2], [x24]\n" - "ld1 { v27.b }[2], [x23]\n" - "ld1 { v26.b }[2], [x22]\n" - "ld1 { v19.b }[2], [x21]\n" - "ld1 { v24.b }[2], [x20]\n" + "ld1 { v30.b }[2], [x28]\n" + "ld1 { v29.b }[2], [x27]\n" + "ld1 { v28.b }[2], [x26]\n" + "ld1 { v27.b }[2], [x25]\n" + "ld1 { v20.b }[2], [x24]\n" + "ld1 { v26.b }[2], [x23]\n" + "ld1 { v19.b }[2], [x22]\n" + "ld1 { v18.b }[2], [x21]\n" "b 13f\n" "12:" // odd_loads_1_0 - "ldr b28, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr b29, [x26, #0x0]\n" - "ldr b25, [x25, #0x0]\n" - "ldr b21, [x24, #0x0]\n" - "ldr b27, [x23, #0x0]\n" - "ldr b26, [x22, #0x0]\n" - "ldr b19, [x21, #0x0]\n" - "ldr b24, [x20, #0x0]\n" + "ldr b30, [x28, #0x0]\n" + "ldr b29, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr b28, [x26, #0x0]\n" + "ldr b27, [x25, #0x0]\n" + "ldr b20, [x24, #0x0]\n" + "ldr b26, [x23, #0x0]\n" + "ldr b19, [x22, #0x0]\n" + "ldr b18, [x21, #0x0]\n" "13:" // Odd load end - "zip1 v22.4s, v28.4s, v25.4s\n" - "subs x19, x19, #0x1\n" - "zip1 v20.4s, v29.4s, v21.4s\n" - "zip1 v23.4s, v22.4s, v20.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "sadalp v1.8h, v23.16b\n" - "zip1 v18.4s, v27.4s, v19.4s\n" - "zip1 v16.4s, v26.4s, v24.4s\n" - "zip1 v17.4s, v18.4s, v16.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" - "sadalp v0.8h, v17.16b\n" + "zip1 v22.4s, v30.4s, v28.4s\n" + "zip1 v21.4s, v29.4s, v27.4s\n" + "subs x20, x20, #0x1\n" + "zip1 v17.4s, v20.4s, v19.4s\n" + "zip1 v16.4s, v26.4s, v18.4s\n" + "zip1 v25.4s, v22.4s, v21.4s\n" + "zip1 v24.4s, v17.4s, v16.4s\n" + "str q25, [%x[out_ptr], #0x0]\n" + "sadalp v2.8h, v25.16b\n" + "str q24, [%x[out_ptr], #0x10]\n" + "sadalp v1.8h, v24.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v22.4s, v22.4s, v20.4s\n" - "str q22, [%x[out_ptr], #0x0]\n" - "zip2 v20.4s, v18.4s, v16.4s\n" + "zip2 v23.4s, v22.4s, v21.4s\n" + "zip2 v22.4s, v17.4s, v16.4s\n" + "subs x20, x20, #0x1\n" + "str q23, [%x[out_ptr], #0x0]\n" + "sadalp v2.8h, v23.16b\n" + "str q22, [%x[out_ptr], #0x10]\n" "sadalp v1.8h, v22.16b\n" - "str q20, [%x[out_ptr], #0x10]\n" - "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" - "sadalp v0.8h, v20.16b\n" "beq 14f\n" - "zip2 v28.4s, v28.4s, v25.4s\n" - "zip2 v25.4s, v29.4s, v21.4s\n" - "subs x19, x19, #0x1\n" - "zip1 v21.4s, v28.4s, v25.4s\n" - "str q21, [%x[out_ptr], #0x0]\n" - "sadalp v1.8h, v21.16b\n" - "zip2 v19.4s, v27.4s, v19.4s\n" - "zip2 v16.4s, v26.4s, v24.4s\n" - "zip1 v18.4s, v19.4s, v16.4s\n" + "zip2 v21.4s, v30.4s, v28.4s\n" + "zip2 v17.4s, v29.4s, v27.4s\n" + "subs x20, x20, #0x1\n" + "zip2 v20.4s, v20.4s, v19.4s\n" + "zip2 v16.4s, v26.4s, v18.4s\n" + "zip1 v19.4s, v21.4s, v17.4s\n" + "zip1 v18.4s, v20.4s, v16.4s\n" + "str q19, [%x[out_ptr], #0x0]\n" + "sadalp v2.8h, v19.16b\n" "str q18, [%x[out_ptr], #0x10]\n" - "sadalp v0.8h, v18.16b\n" + "sadalp v1.8h, v18.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v17.4s, v28.4s, v25.4s\n" + "zip2 v17.4s, v21.4s, v17.4s\n" + "zip2 v16.4s, v20.4s, v16.4s\n" "str q17, [%x[out_ptr], #0x0]\n" - "zip2 v16.4s, v19.4s, v16.4s\n" - "sadalp v1.8h, v17.16b\n" + "sadalp v2.8h, v17.16b\n" "str q16, [%x[out_ptr], #0x10]\n" + "sadalp v1.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" - "sadalp v0.8h, v16.16b\n" "14:" // Odds skip + "sadalp v0.4s, v2.8h\n" "sadalp v31.4s, v1.8h\n" - "str q31, [%x[out_ptr], #0x0]\n" - "sadalp v30.4s, v0.8h\n" - "str q30, [%x[out_ptr], #0x10]\n" + "str q0, [%x[out_ptr], #0x0]\n" + "str q31, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp index 1b94c7f1f1..44a79c0f0a 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,40 +31,41 @@ void interleave_block<8, 4, VLType::None, true>( ) { __asm__ __volatile__( - "movi v1.8h, #0x0\n" - "ldr x27, [%x[in], #0x0]\n" - "mov x19, #0x0\n" - "movi v0.8h, #0x0\n" - "ldr x26, [%x[in], #0x8]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" + "mov x20, #0x0\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" + "movi v2.8h, #0x0\n" + "movi v1.8h, #0x0\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" + "movi v0.4s, #0x0\n" "movi v31.4s, #0x0\n" - "ldr x25, [%x[in], #0x10]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" + "add x28, x28, %x[row_offset]\n" "add x27, x27, %x[row_offset]\n" - "movi v30.4s, #0x0\n" - "ldr x24, [%x[in], #0x18]\n" - "ldr x23, [%x[in], #0x20]\n" "add x26, x26, %x[row_offset]\n" - "ldr x22, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset]\n" - "ldr x20, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset]\n" "add x22, x22, %x[row_offset]\n" "add x21, x21, %x[row_offset]\n" - "add x20, x20, %x[row_offset]\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj + "prfm pldl1keep, [x28, #0x0]\n" "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" @@ -72,7 +73,7 @@ void interleave_block<8, 4, VLType::None, true>( "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -80,71 +81,70 @@ void interleave_block<8, 4, VLType::None, true>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "cbnz %w[first], 2f\n" "sub %x[out_ptr], %x[out_ptr], #0x20\n" - "ld1 { v31.4s }, [%x[out_ptr]]\n" - "ldr q30, [%x[out_ptr], #0x10]\n" + "ld1 { v0.4s }, [%x[out_ptr]]\n" + "ldr q31, [%x[out_ptr], #0x10]\n" "2:" // first_pass "cmp %x[width], #0x10\n" "blt 5f\n" "3:" // Main loop head - "cmp x19, #0x1e\n" + "cmp x20, #0x1e\n" "ble 4f\n" + "uadalp v0.4s, v2.8h\n" + "movi v2.8h, #0x0\n" + "mov x20, #0x0\n" "uadalp v31.4s, v1.8h\n" "movi v1.8h, #0x0\n" - "uadalp v30.4s, v0.8h\n" - "movi v0.8h, #0x0\n" - "mov x19, #0x0\n" "4:" // no_accumulate_16 - "ldr q28, [x27], #0x10\n" - "add x19, x19, #0x1\n" - "ldr q29, [x26], #0x10\n" + "ldr q30, [x28], #0x10\n" + "ldr q29, [x27], #0x10\n" "subs %x[width], %x[width], #0x10\n" - "ldr q25, [x25], #0x10\n" - "zip1 v22.4s, v28.4s, v25.4s\n" - "ldr q21, [x24], #0x10\n" "cmp %x[width], #0x10\n" - "zip2 v28.4s, v28.4s, v25.4s\n" - "ldr q27, [x23], #0x10\n" - "ldr q26, [x22], #0x10\n" - "zip1 v20.4s, v29.4s, v21.4s\n" - "ldr q19, [x21], #0x10\n" - "zip2 v25.4s, v29.4s, v21.4s\n" - "ldr q24, [x20], #0x10\n" - "zip1 v23.4s, v22.4s, v20.4s\n" + "ldr q28, [x26], #0x10\n" + "ldr q27, [x25], #0x10\n" + "zip1 v22.4s, v30.4s, v28.4s\n" + "zip1 v21.4s, v29.4s, v27.4s\n" + "ldr q20, [x24], #0x10\n" + "ldr q26, [x23], #0x10\n" + "zip1 v25.4s, v22.4s, v21.4s\n" + "uadalp v2.8h, v25.16b\n" + "ldr q19, [x22], #0x10\n" + "ldr q18, [x21], #0x10\n" + "zip1 v17.4s, v20.4s, v19.4s\n" + "zip1 v16.4s, v26.4s, v18.4s\n" + "zip1 v24.4s, v17.4s, v16.4s\n" + "uadalp v1.8h, v24.16b\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "uadalp v1.8h, v23.16b\n" - "zip2 v22.4s, v22.4s, v20.4s\n" + "zip2 v23.4s, v22.4s, v21.4s\n" + "zip2 v22.4s, v17.4s, v16.4s\n" "prfm pldl1keep, [x26, #0x70]\n" - "uadalp v1.8h, v22.16b\n" - "zip1 v18.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v16.4s, v26.4s, v24.4s\n" + "zip2 v21.4s, v30.4s, v28.4s\n" + "zip2 v17.4s, v29.4s, v27.4s\n" "prfm pldl1keep, [x24, #0x70]\n" - "zip1 v21.4s, v28.4s, v25.4s\n" "prfm pldl1keep, [x23, #0x70]\n" - "uadalp v1.8h, v21.16b\n" - "zip1 v17.4s, v18.4s, v16.4s\n" + "zip2 v20.4s, v20.4s, v19.4s\n" + "zip2 v16.4s, v26.4s, v18.4s\n" "prfm pldl1keep, [x22, #0x70]\n" - "zip2 v20.4s, v18.4s, v16.4s\n" "prfm pldl1keep, [x21, #0x70]\n" - "uadalp v0.8h, v17.16b\n" - "zip2 v19.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x20, #0x70]\n" - "uadalp v0.8h, v20.16b\n" - "zip2 v16.4s, v26.4s, v24.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v19.4s, v16.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" - "uadalp v0.8h, v18.16b\n" - "zip2 v17.4s, v28.4s, v25.4s\n" - "str q22, [%x[out_ptr], #0x20]\n" - "zip2 v16.4s, v19.4s, v16.4s\n" - "str q20, [%x[out_ptr], #0x30]\n" - "uadalp v1.8h, v17.16b\n" - "str q21, [%x[out_ptr], #0x40]\n" - "uadalp v0.8h, v16.16b\n" + "uadalp v2.8h, v23.16b\n" + "uadalp v1.8h, v22.16b\n" + "str q25, [%x[out_ptr], #0x0]\n" + "add x20, x20, #0x1\n" + "zip1 v19.4s, v21.4s, v17.4s\n" + "zip1 v18.4s, v20.4s, v16.4s\n" + "str q24, [%x[out_ptr], #0x10]\n" + "uadalp v2.8h, v19.16b\n" + "uadalp v1.8h, v18.16b\n" + "str q23, [%x[out_ptr], #0x20]\n" + "zip2 v17.4s, v21.4s, v17.4s\n" + "zip2 v16.4s, v20.4s, v16.4s\n" + "str q22, [%x[out_ptr], #0x30]\n" + "str q19, [%x[out_ptr], #0x40]\n" + "uadalp v2.8h, v17.16b\n" + "uadalp v1.8h, v16.16b\n" "str q18, [%x[out_ptr], #0x50]\n" "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" @@ -153,216 +153,216 @@ void interleave_block<8, 4, VLType::None, true>( "5:" // Main loop skip "cbz %x[width], 14f\n" "tbz %x[width], #3, 9f\n" - "ldr d28, [x27], #0x8\n" - "ldr d29, [x26], #0x8\n" - "ldr d25, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d26, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" - "ldr d24, [x20], #0x8\n" + "ldr d30, [x28], #0x8\n" + "ldr d29, [x27], #0x8\n" + "ldr d28, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d20, [x24], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d18, [x21], #0x8\n" "tbz %x[width], #2, 7f\n" - "ld1 { v28.s }[2], [x27], #0x4\n" - "ld1 { v29.s }[2], [x26], #0x4\n" - "ld1 { v25.s }[2], [x25], #0x4\n" - "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v27.s }[2], [x23], #0x4\n" - "ld1 { v26.s }[2], [x22], #0x4\n" - "ld1 { v19.s }[2], [x21], #0x4\n" - "ld1 { v24.s }[2], [x20], #0x4\n" + "ld1 { v30.s }[2], [x28], #0x4\n" + "ld1 { v29.s }[2], [x27], #0x4\n" + "ld1 { v28.s }[2], [x26], #0x4\n" + "ld1 { v27.s }[2], [x25], #0x4\n" + "ld1 { v20.s }[2], [x24], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "ld1 { v18.s }[2], [x21], #0x4\n" "tbz %x[width], #1, 6f\n" - "ld1 { v28.h }[6], [x27], #0x2\n" - "mov x19, #0x4\n" - "ld1 { v29.h }[6], [x26], #0x2\n" - "ld1 { v25.h }[6], [x25], #0x2\n" - "ld1 { v21.h }[6], [x24], #0x2\n" - "ld1 { v27.h }[6], [x23], #0x2\n" - "ld1 { v26.h }[6], [x22], #0x2\n" - "ld1 { v19.h }[6], [x21], #0x2\n" - "ld1 { v24.h }[6], [x20], #0x2\n" + "ld1 { v30.h }[6], [x28], #0x2\n" + "ld1 { v29.h }[6], [x27], #0x2\n" + "mov x20, #0x4\n" + "ld1 { v28.h }[6], [x26], #0x2\n" + "ld1 { v27.h }[6], [x25], #0x2\n" + "ld1 { v20.h }[6], [x24], #0x2\n" + "ld1 { v26.h }[6], [x23], #0x2\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "ld1 { v18.h }[6], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[14], [x27]\n" - "ld1 { v29.b }[14], [x26]\n" - "ld1 { v25.b }[14], [x25]\n" - "ld1 { v21.b }[14], [x24]\n" - "ld1 { v27.b }[14], [x23]\n" - "ld1 { v26.b }[14], [x22]\n" - "ld1 { v19.b }[14], [x21]\n" - "ld1 { v24.b }[14], [x20]\n" + "ld1 { v30.b }[14], [x28]\n" + "ld1 { v29.b }[14], [x27]\n" + "ld1 { v28.b }[14], [x26]\n" + "ld1 { v27.b }[14], [x25]\n" + "ld1 { v20.b }[14], [x24]\n" + "ld1 { v26.b }[14], [x23]\n" + "ld1 { v19.b }[14], [x22]\n" + "ld1 { v18.b }[14], [x21]\n" "b 13f\n" "6:" // odd_loads_1_12 - "mov x19, #0x3\n" + "mov x20, #0x3\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[12], [x27]\n" - "ld1 { v29.b }[12], [x26]\n" - "mov x19, #0x4\n" - "ld1 { v25.b }[12], [x25]\n" - "ld1 { v21.b }[12], [x24]\n" - "ld1 { v27.b }[12], [x23]\n" - "ld1 { v26.b }[12], [x22]\n" - "ld1 { v19.b }[12], [x21]\n" - "ld1 { v24.b }[12], [x20]\n" + "ld1 { v30.b }[12], [x28]\n" + "ld1 { v29.b }[12], [x27]\n" + "mov x20, #0x4\n" + "ld1 { v28.b }[12], [x26]\n" + "ld1 { v27.b }[12], [x25]\n" + "ld1 { v20.b }[12], [x24]\n" + "ld1 { v26.b }[12], [x23]\n" + "ld1 { v19.b }[12], [x22]\n" + "ld1 { v18.b }[12], [x21]\n" "b 13f\n" "7:" // odd_loads_2_8 "tbz %x[width], #1, 8f\n" - "ld1 { v28.h }[4], [x27], #0x2\n" - "ld1 { v29.h }[4], [x26], #0x2\n" - "mov x19, #0x3\n" - "ld1 { v25.h }[4], [x25], #0x2\n" - "ld1 { v21.h }[4], [x24], #0x2\n" - "ld1 { v27.h }[4], [x23], #0x2\n" - "ld1 { v26.h }[4], [x22], #0x2\n" - "ld1 { v19.h }[4], [x21], #0x2\n" - "ld1 { v24.h }[4], [x20], #0x2\n" + "ld1 { v30.h }[4], [x28], #0x2\n" + "ld1 { v29.h }[4], [x27], #0x2\n" + "mov x20, #0x3\n" + "ld1 { v28.h }[4], [x26], #0x2\n" + "ld1 { v27.h }[4], [x25], #0x2\n" + "ld1 { v20.h }[4], [x24], #0x2\n" + "ld1 { v26.h }[4], [x23], #0x2\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "ld1 { v18.h }[4], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[10], [x27]\n" - "ld1 { v29.b }[10], [x26]\n" - "ld1 { v25.b }[10], [x25]\n" - "ld1 { v21.b }[10], [x24]\n" - "ld1 { v27.b }[10], [x23]\n" - "ld1 { v26.b }[10], [x22]\n" - "ld1 { v19.b }[10], [x21]\n" - "ld1 { v24.b }[10], [x20]\n" + "ld1 { v30.b }[10], [x28]\n" + "ld1 { v29.b }[10], [x27]\n" + "ld1 { v28.b }[10], [x26]\n" + "ld1 { v27.b }[10], [x25]\n" + "ld1 { v20.b }[10], [x24]\n" + "ld1 { v26.b }[10], [x23]\n" + "ld1 { v19.b }[10], [x22]\n" + "ld1 { v18.b }[10], [x21]\n" "b 13f\n" "8:" // odd_loads_1_8 - "mov x19, #0x2\n" + "mov x20, #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[8], [x27]\n" - "ld1 { v29.b }[8], [x26]\n" - "mov x19, #0x3\n" - "ld1 { v25.b }[8], [x25]\n" - "ld1 { v21.b }[8], [x24]\n" - "ld1 { v27.b }[8], [x23]\n" - "ld1 { v26.b }[8], [x22]\n" - "ld1 { v19.b }[8], [x21]\n" - "ld1 { v24.b }[8], [x20]\n" + "ld1 { v30.b }[8], [x28]\n" + "ld1 { v29.b }[8], [x27]\n" + "mov x20, #0x3\n" + "ld1 { v28.b }[8], [x26]\n" + "ld1 { v27.b }[8], [x25]\n" + "ld1 { v20.b }[8], [x24]\n" + "ld1 { v26.b }[8], [x23]\n" + "ld1 { v19.b }[8], [x22]\n" + "ld1 { v18.b }[8], [x21]\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" - "ldr s28, [x27], #0x4\n" - "ldr s29, [x26], #0x4\n" - "ldr s25, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s27, [x23], #0x4\n" - "ldr s26, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" - "ldr s24, [x20], #0x4\n" + "ldr s30, [x28], #0x4\n" + "ldr s29, [x27], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s20, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "ldr s18, [x21], #0x4\n" "tbz %x[width], #1, 10f\n" - "ld1 { v28.h }[2], [x27], #0x2\n" - "mov x19, #0x2\n" - "ld1 { v29.h }[2], [x26], #0x2\n" - "ld1 { v25.h }[2], [x25], #0x2\n" - "ld1 { v21.h }[2], [x24], #0x2\n" - "ld1 { v27.h }[2], [x23], #0x2\n" - "ld1 { v26.h }[2], [x22], #0x2\n" - "ld1 { v19.h }[2], [x21], #0x2\n" - "ld1 { v24.h }[2], [x20], #0x2\n" + "ld1 { v30.h }[2], [x28], #0x2\n" + "ld1 { v29.h }[2], [x27], #0x2\n" + "mov x20, #0x2\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "ld1 { v27.h }[2], [x25], #0x2\n" + "ld1 { v20.h }[2], [x24], #0x2\n" + "ld1 { v26.h }[2], [x23], #0x2\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "ld1 { v18.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[6], [x27]\n" - "ld1 { v29.b }[6], [x26]\n" - "ld1 { v25.b }[6], [x25]\n" - "ld1 { v21.b }[6], [x24]\n" - "ld1 { v27.b }[6], [x23]\n" - "ld1 { v26.b }[6], [x22]\n" - "ld1 { v19.b }[6], [x21]\n" - "ld1 { v24.b }[6], [x20]\n" + "ld1 { v30.b }[6], [x28]\n" + "ld1 { v29.b }[6], [x27]\n" + "ld1 { v28.b }[6], [x26]\n" + "ld1 { v27.b }[6], [x25]\n" + "ld1 { v20.b }[6], [x24]\n" + "ld1 { v26.b }[6], [x23]\n" + "ld1 { v19.b }[6], [x22]\n" + "ld1 { v18.b }[6], [x21]\n" "b 13f\n" "10:" // odd_loads_1_4 - "mov x19, #0x1\n" + "mov x20, #0x1\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[4], [x27]\n" - "ld1 { v29.b }[4], [x26]\n" - "mov x19, #0x2\n" - "ld1 { v25.b }[4], [x25]\n" - "ld1 { v21.b }[4], [x24]\n" - "ld1 { v27.b }[4], [x23]\n" - "ld1 { v26.b }[4], [x22]\n" - "ld1 { v19.b }[4], [x21]\n" - "ld1 { v24.b }[4], [x20]\n" + "ld1 { v30.b }[4], [x28]\n" + "ld1 { v29.b }[4], [x27]\n" + "mov x20, #0x2\n" + "ld1 { v28.b }[4], [x26]\n" + "ld1 { v27.b }[4], [x25]\n" + "ld1 { v20.b }[4], [x24]\n" + "ld1 { v26.b }[4], [x23]\n" + "ld1 { v19.b }[4], [x22]\n" + "ld1 { v18.b }[4], [x21]\n" "b 13f\n" "11:" // odd_loads_2_0 "tbz %x[width], #1, 12f\n" - "ldr h28, [x27], #0x2\n" - "ldr h29, [x26], #0x2\n" - "mov x19, #0x1\n" - "ldr h25, [x25], #0x2\n" - "ldr h21, [x24], #0x2\n" - "ldr h27, [x23], #0x2\n" - "ldr h26, [x22], #0x2\n" - "ldr h19, [x21], #0x2\n" - "ldr h24, [x20], #0x2\n" + "ldr h30, [x28], #0x2\n" + "ldr h29, [x27], #0x2\n" + "mov x20, #0x1\n" + "ldr h28, [x26], #0x2\n" + "ldr h27, [x25], #0x2\n" + "ldr h20, [x24], #0x2\n" + "ldr h26, [x23], #0x2\n" + "ldr h19, [x22], #0x2\n" + "ldr h18, [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v28.b }[2], [x27]\n" - "ld1 { v29.b }[2], [x26]\n" - "ld1 { v25.b }[2], [x25]\n" - "ld1 { v21.b }[2], [x24]\n" - "ld1 { v27.b }[2], [x23]\n" - "ld1 { v26.b }[2], [x22]\n" - "ld1 { v19.b }[2], [x21]\n" - "ld1 { v24.b }[2], [x20]\n" + "ld1 { v30.b }[2], [x28]\n" + "ld1 { v29.b }[2], [x27]\n" + "ld1 { v28.b }[2], [x26]\n" + "ld1 { v27.b }[2], [x25]\n" + "ld1 { v20.b }[2], [x24]\n" + "ld1 { v26.b }[2], [x23]\n" + "ld1 { v19.b }[2], [x22]\n" + "ld1 { v18.b }[2], [x21]\n" "b 13f\n" "12:" // odd_loads_1_0 - "ldr b28, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr b29, [x26, #0x0]\n" - "ldr b25, [x25, #0x0]\n" - "ldr b21, [x24, #0x0]\n" - "ldr b27, [x23, #0x0]\n" - "ldr b26, [x22, #0x0]\n" - "ldr b19, [x21, #0x0]\n" - "ldr b24, [x20, #0x0]\n" + "ldr b30, [x28, #0x0]\n" + "ldr b29, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr b28, [x26, #0x0]\n" + "ldr b27, [x25, #0x0]\n" + "ldr b20, [x24, #0x0]\n" + "ldr b26, [x23, #0x0]\n" + "ldr b19, [x22, #0x0]\n" + "ldr b18, [x21, #0x0]\n" "13:" // Odd load end - "zip1 v22.4s, v28.4s, v25.4s\n" - "subs x19, x19, #0x1\n" - "zip1 v20.4s, v29.4s, v21.4s\n" - "zip1 v23.4s, v22.4s, v20.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - "uadalp v1.8h, v23.16b\n" - "zip1 v18.4s, v27.4s, v19.4s\n" - "zip1 v16.4s, v26.4s, v24.4s\n" - "zip1 v17.4s, v18.4s, v16.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" - "uadalp v0.8h, v17.16b\n" + "zip1 v22.4s, v30.4s, v28.4s\n" + "zip1 v21.4s, v29.4s, v27.4s\n" + "subs x20, x20, #0x1\n" + "zip1 v17.4s, v20.4s, v19.4s\n" + "zip1 v16.4s, v26.4s, v18.4s\n" + "zip1 v25.4s, v22.4s, v21.4s\n" + "zip1 v24.4s, v17.4s, v16.4s\n" + "str q25, [%x[out_ptr], #0x0]\n" + "uadalp v2.8h, v25.16b\n" + "str q24, [%x[out_ptr], #0x10]\n" + "uadalp v1.8h, v24.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v22.4s, v22.4s, v20.4s\n" - "str q22, [%x[out_ptr], #0x0]\n" - "zip2 v20.4s, v18.4s, v16.4s\n" + "zip2 v23.4s, v22.4s, v21.4s\n" + "zip2 v22.4s, v17.4s, v16.4s\n" + "subs x20, x20, #0x1\n" + "str q23, [%x[out_ptr], #0x0]\n" + "uadalp v2.8h, v23.16b\n" + "str q22, [%x[out_ptr], #0x10]\n" "uadalp v1.8h, v22.16b\n" - "str q20, [%x[out_ptr], #0x10]\n" - "subs x19, x19, #0x1\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" - "uadalp v0.8h, v20.16b\n" "beq 14f\n" - "zip2 v28.4s, v28.4s, v25.4s\n" - "zip2 v25.4s, v29.4s, v21.4s\n" - "subs x19, x19, #0x1\n" - "zip1 v21.4s, v28.4s, v25.4s\n" - "str q21, [%x[out_ptr], #0x0]\n" - "uadalp v1.8h, v21.16b\n" - "zip2 v19.4s, v27.4s, v19.4s\n" - "zip2 v16.4s, v26.4s, v24.4s\n" - "zip1 v18.4s, v19.4s, v16.4s\n" + "zip2 v21.4s, v30.4s, v28.4s\n" + "zip2 v17.4s, v29.4s, v27.4s\n" + "subs x20, x20, #0x1\n" + "zip2 v20.4s, v20.4s, v19.4s\n" + "zip2 v16.4s, v26.4s, v18.4s\n" + "zip1 v19.4s, v21.4s, v17.4s\n" + "zip1 v18.4s, v20.4s, v16.4s\n" + "str q19, [%x[out_ptr], #0x0]\n" + "uadalp v2.8h, v19.16b\n" "str q18, [%x[out_ptr], #0x10]\n" - "uadalp v0.8h, v18.16b\n" + "uadalp v1.8h, v18.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v17.4s, v28.4s, v25.4s\n" + "zip2 v17.4s, v21.4s, v17.4s\n" + "zip2 v16.4s, v20.4s, v16.4s\n" "str q17, [%x[out_ptr], #0x0]\n" - "zip2 v16.4s, v19.4s, v16.4s\n" - "uadalp v1.8h, v17.16b\n" + "uadalp v2.8h, v17.16b\n" "str q16, [%x[out_ptr], #0x10]\n" + "uadalp v1.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" - "uadalp v0.8h, v16.16b\n" "14:" // Odds skip + "uadalp v0.4s, v2.8h\n" "uadalp v31.4s, v1.8h\n" - "str q31, [%x[out_ptr], #0x0]\n" - "uadalp v30.4s, v0.8h\n" - "str q30, [%x[out_ptr], #0x10]\n" + "str q0, [%x[out_ptr], #0x0]\n" + "str q31, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp index 1330593cbf..4bfb36082e 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,44 +31,45 @@ void interleave_block<8, 8, VLType::None, false>( ) { __asm__ __volatile__( - "ldr x27, [%x[in], #0x0]\n" + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" "cmp %x[height], #0x8\n" - "ldr x26, [%x[in], #0x8]\n" + "add x28, x28, %x[row_offset]\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "add x27, x27, %x[row_offset]\n" - "ldr x25, [%x[in], #0x10]\n" - "ldr x24, [%x[in], #0x18]\n" "add x26, x26, %x[row_offset]\n" - "ldr x23, [%x[in], #0x20]\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "add x25, x25, %x[row_offset]\n" - "ldr x22, [%x[in], #0x28]\n" - "ldr x21, [%x[in], #0x30]\n" "add x24, x24, %x[row_offset]\n" - "ldr x20, [%x[in], #0x38]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "add x23, x23, %x[row_offset]\n" "add x22, x22, %x[row_offset]\n" "add x21, x21, %x[row_offset]\n" - "add x20, x20, %x[row_offset]\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "prfm pldl1keep, [x27, #0x0]\n" "cmp %x[width], #0x10\n" + "prfm pldl1keep, [x28, #0x0]\n" + "prfm pldl1keep, [x27, #0x0]\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" "prfm pldl1keep, [x24, #0x0]\n" "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -76,230 +77,229 @@ void interleave_block<8, 8, VLType::None, false>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q27, [x27], #0x10\n" + "ldr q26, [x28], #0x10\n" + "ldr q21, [x27], #0x10\n" "subs %x[width], %x[width], #0x10\n" - "ldr q24, [x26], #0x10\n" - "zip1 v26.2d, v27.2d, v24.2d\n" - "ldr q25, [x25], #0x10\n" "cmp %x[width], #0x10\n" - "zip2 v24.2d, v27.2d, v24.2d\n" - "ldr q21, [x24], #0x10\n" - "ldr q23, [x23], #0x10\n" - "zip1 v22.2d, v25.2d, v21.2d\n" - "ldr q18, [x22], #0x10\n" - "zip2 v21.2d, v25.2d, v21.2d\n" - "ldr q20, [x21], #0x10\n" - "ldr q16, [x20], #0x10\n" - "zip1 v19.2d, v23.2d, v18.2d\n" + "ldr q25, [x26], #0x10\n" + "ldr q24, [x25], #0x10\n" + "zip1 v16.2d, v26.2d, v21.2d\n" + "zip1 v18.2d, v25.2d, v24.2d\n" + "ldr q23, [x24], #0x10\n" + "ldr q22, [x23], #0x10\n" + "zip1 v17.2d, v23.2d, v22.2d\n" + "zip2 v21.2d, v26.2d, v21.2d\n" + "ldr q20, [x22], #0x10\n" + "ldr q19, [x21], #0x10\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v16.2d, v20.2d, v19.2d\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v18.2d, v23.2d, v18.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "zip2 v18.2d, v25.2d, v24.2d\n" "prfm pldl1keep, [x26, #0x70]\n" - "zip1 v17.2d, v20.2d, v16.2d\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip2 v16.2d, v20.2d, v16.2d\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip2 v17.2d, v23.2d, v22.2d\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v16.2d, v20.2d, v19.2d\n" "prfm pldl1keep, [x22, #0x70]\n" "prfm pldl1keep, [x21, #0x70]\n" - "prfm pldl1keep, [x20, #0x70]\n" - "str q26, [%x[out_ptr], #0x0]\n" - "str q22, [%x[out_ptr], #0x10]\n" - "str q19, [%x[out_ptr], #0x20]\n" - "str q17, [%x[out_ptr], #0x30]\n" - "str q24, [%x[out_ptr], #0x40]\n" - "str q21, [%x[out_ptr], #0x50]\n" - "str q18, [%x[out_ptr], #0x60]\n" + "str q21, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" + "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 2b\n" "3:" // Main loop skip "cbz %x[width], 12f\n" "tbz %x[width], #3, 7f\n" - "ldr d27, [x27], #0x8\n" - "ldr d24, [x26], #0x8\n" - "ldr d25, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d23, [x23], #0x8\n" - "ldr d18, [x22], #0x8\n" - "ldr d20, [x21], #0x8\n" - "ldr d16, [x20], #0x8\n" + "ldr d26, [x28], #0x8\n" + "ldr d21, [x27], #0x8\n" + "ldr d25, [x26], #0x8\n" + "ldr d24, [x25], #0x8\n" + "ldr d23, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d20, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" "tbz %x[width], #2, 5f\n" - "ld1 { v27.s }[2], [x27], #0x4\n" - "ld1 { v24.s }[2], [x26], #0x4\n" - "ld1 { v25.s }[2], [x25], #0x4\n" - "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v23.s }[2], [x23], #0x4\n" - "ld1 { v18.s }[2], [x22], #0x4\n" - "ld1 { v20.s }[2], [x21], #0x4\n" - "ld1 { v16.s }[2], [x20], #0x4\n" + "ld1 { v26.s }[2], [x28], #0x4\n" + "ld1 { v21.s }[2], [x27], #0x4\n" + "ld1 { v25.s }[2], [x26], #0x4\n" + "ld1 { v24.s }[2], [x25], #0x4\n" + "ld1 { v23.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v20.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" "tbz %x[width], #1, 4f\n" - "ld1 { v27.h }[6], [x27], #0x2\n" - "mov x19, #0x2\n" - "ld1 { v24.h }[6], [x26], #0x2\n" - "ld1 { v25.h }[6], [x25], #0x2\n" - "ld1 { v21.h }[6], [x24], #0x2\n" - "ld1 { v23.h }[6], [x23], #0x2\n" - "ld1 { v18.h }[6], [x22], #0x2\n" - "ld1 { v20.h }[6], [x21], #0x2\n" - "ld1 { v16.h }[6], [x20], #0x2\n" + "ld1 { v26.h }[6], [x28], #0x2\n" + "ld1 { v21.h }[6], [x27], #0x2\n" + "mov x20, #0x2\n" + "ld1 { v25.h }[6], [x26], #0x2\n" + "ld1 { v24.h }[6], [x25], #0x2\n" + "ld1 { v23.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v20.h }[6], [x22], #0x2\n" + "ld1 { v19.h }[6], [x21], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v27.b }[14], [x27]\n" - "ld1 { v24.b }[14], [x26]\n" - "ld1 { v25.b }[14], [x25]\n" - "ld1 { v21.b }[14], [x24]\n" - "ld1 { v23.b }[14], [x23]\n" - "ld1 { v18.b }[14], [x22]\n" - "ld1 { v20.b }[14], [x21]\n" - "ld1 { v16.b }[14], [x20]\n" + "ld1 { v26.b }[14], [x28]\n" + "ld1 { v21.b }[14], [x27]\n" + "ld1 { v25.b }[14], [x26]\n" + "ld1 { v24.b }[14], [x25]\n" + "ld1 { v23.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v20.b }[14], [x22]\n" + "ld1 { v19.b }[14], [x21]\n" "b 11f\n" "4:" // odd_loads_1_12 - "mov x19, #0x2\n" + "mov x20, #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v27.b }[12], [x27]\n" - "ld1 { v24.b }[12], [x26]\n" - "ld1 { v25.b }[12], [x25]\n" - "ld1 { v21.b }[12], [x24]\n" - "ld1 { v23.b }[12], [x23]\n" - "ld1 { v18.b }[12], [x22]\n" - "ld1 { v20.b }[12], [x21]\n" - "ld1 { v16.b }[12], [x20]\n" + "ld1 { v26.b }[12], [x28]\n" + "ld1 { v21.b }[12], [x27]\n" + "ld1 { v25.b }[12], [x26]\n" + "ld1 { v24.b }[12], [x25]\n" + "ld1 { v23.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v20.b }[12], [x22]\n" + "ld1 { v19.b }[12], [x21]\n" "b 11f\n" "5:" // odd_loads_2_8 "tbz %x[width], #1, 6f\n" - "ld1 { v27.h }[4], [x27], #0x2\n" - "ld1 { v24.h }[4], [x26], #0x2\n" - "mov x19, #0x2\n" - "ld1 { v25.h }[4], [x25], #0x2\n" - "ld1 { v21.h }[4], [x24], #0x2\n" - "ld1 { v23.h }[4], [x23], #0x2\n" - "ld1 { v18.h }[4], [x22], #0x2\n" - "ld1 { v20.h }[4], [x21], #0x2\n" - "ld1 { v16.h }[4], [x20], #0x2\n" + "ld1 { v26.h }[4], [x28], #0x2\n" + "ld1 { v21.h }[4], [x27], #0x2\n" + "mov x20, #0x2\n" + "ld1 { v25.h }[4], [x26], #0x2\n" + "ld1 { v24.h }[4], [x25], #0x2\n" + "ld1 { v23.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v20.h }[4], [x22], #0x2\n" + "ld1 { v19.h }[4], [x21], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v27.b }[10], [x27]\n" - "ld1 { v24.b }[10], [x26]\n" - "ld1 { v25.b }[10], [x25]\n" - "ld1 { v21.b }[10], [x24]\n" - "ld1 { v23.b }[10], [x23]\n" - "ld1 { v18.b }[10], [x22]\n" - "ld1 { v20.b }[10], [x21]\n" - "ld1 { v16.b }[10], [x20]\n" + "ld1 { v26.b }[10], [x28]\n" + "ld1 { v21.b }[10], [x27]\n" + "ld1 { v25.b }[10], [x26]\n" + "ld1 { v24.b }[10], [x25]\n" + "ld1 { v23.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v20.b }[10], [x22]\n" + "ld1 { v19.b }[10], [x21]\n" "b 11f\n" "6:" // odd_loads_1_8 - "mov x19, #0x1\n" + "mov x20, #0x1\n" "tbz %x[width], #0, 11f\n" - "ld1 { v27.b }[8], [x27]\n" - "ld1 { v24.b }[8], [x26]\n" - "mov x19, #0x2\n" - "ld1 { v25.b }[8], [x25]\n" - "ld1 { v21.b }[8], [x24]\n" - "ld1 { v23.b }[8], [x23]\n" - "ld1 { v18.b }[8], [x22]\n" - "ld1 { v20.b }[8], [x21]\n" - "ld1 { v16.b }[8], [x20]\n" + "ld1 { v26.b }[8], [x28]\n" + "ld1 { v21.b }[8], [x27]\n" + "mov x20, #0x2\n" + "ld1 { v25.b }[8], [x26]\n" + "ld1 { v24.b }[8], [x25]\n" + "ld1 { v23.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v20.b }[8], [x22]\n" + "ld1 { v19.b }[8], [x21]\n" "b 11f\n" "7:" // odd_loads_4_0 "tbz %x[width], #2, 9f\n" - "ldr s27, [x27], #0x4\n" - "ldr s24, [x26], #0x4\n" - "ldr s25, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s23, [x23], #0x4\n" - "ldr s18, [x22], #0x4\n" - "ldr s20, [x21], #0x4\n" - "ldr s16, [x20], #0x4\n" + "ldr s26, [x28], #0x4\n" + "ldr s21, [x27], #0x4\n" + "ldr s25, [x26], #0x4\n" + "ldr s24, [x25], #0x4\n" + "ldr s23, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s20, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" "tbz %x[width], #1, 8f\n" - "ld1 { v27.h }[2], [x27], #0x2\n" - "mov x19, #0x1\n" - "ld1 { v24.h }[2], [x26], #0x2\n" - "ld1 { v25.h }[2], [x25], #0x2\n" - "ld1 { v21.h }[2], [x24], #0x2\n" - "ld1 { v23.h }[2], [x23], #0x2\n" - "ld1 { v18.h }[2], [x22], #0x2\n" - "ld1 { v20.h }[2], [x21], #0x2\n" - "ld1 { v16.h }[2], [x20], #0x2\n" + "ld1 { v26.h }[2], [x28], #0x2\n" + "ld1 { v21.h }[2], [x27], #0x2\n" + "mov x20, #0x1\n" + "ld1 { v25.h }[2], [x26], #0x2\n" + "ld1 { v24.h }[2], [x25], #0x2\n" + "ld1 { v23.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v20.h }[2], [x22], #0x2\n" + "ld1 { v19.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v27.b }[6], [x27]\n" - "ld1 { v24.b }[6], [x26]\n" - "ld1 { v25.b }[6], [x25]\n" - "ld1 { v21.b }[6], [x24]\n" - "ld1 { v23.b }[6], [x23]\n" - "ld1 { v18.b }[6], [x22]\n" - "ld1 { v20.b }[6], [x21]\n" - "ld1 { v16.b }[6], [x20]\n" + "ld1 { v26.b }[6], [x28]\n" + "ld1 { v21.b }[6], [x27]\n" + "ld1 { v25.b }[6], [x26]\n" + "ld1 { v24.b }[6], [x25]\n" + "ld1 { v23.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v20.b }[6], [x22]\n" + "ld1 { v19.b }[6], [x21]\n" "b 11f\n" "8:" // odd_loads_1_4 - "mov x19, #0x1\n" + "mov x20, #0x1\n" "tbz %x[width], #0, 11f\n" - "ld1 { v27.b }[4], [x27]\n" - "ld1 { v24.b }[4], [x26]\n" - "ld1 { v25.b }[4], [x25]\n" - "ld1 { v21.b }[4], [x24]\n" - "ld1 { v23.b }[4], [x23]\n" - "ld1 { v18.b }[4], [x22]\n" - "ld1 { v20.b }[4], [x21]\n" - "ld1 { v16.b }[4], [x20]\n" + "ld1 { v26.b }[4], [x28]\n" + "ld1 { v21.b }[4], [x27]\n" + "ld1 { v25.b }[4], [x26]\n" + "ld1 { v24.b }[4], [x25]\n" + "ld1 { v23.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v20.b }[4], [x22]\n" + "ld1 { v19.b }[4], [x21]\n" "b 11f\n" "9:" // odd_loads_2_0 "tbz %x[width], #1, 10f\n" - "ldr h27, [x27], #0x2\n" - "ldr h24, [x26], #0x2\n" - "mov x19, #0x1\n" - "ldr h25, [x25], #0x2\n" - "ldr h21, [x24], #0x2\n" - "ldr h23, [x23], #0x2\n" - "ldr h18, [x22], #0x2\n" - "ldr h20, [x21], #0x2\n" - "ldr h16, [x20], #0x2\n" + "ldr h26, [x28], #0x2\n" + "ldr h21, [x27], #0x2\n" + "mov x20, #0x1\n" + "ldr h25, [x26], #0x2\n" + "ldr h24, [x25], #0x2\n" + "ldr h23, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h20, [x22], #0x2\n" + "ldr h19, [x21], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v27.b }[2], [x27]\n" - "ld1 { v24.b }[2], [x26]\n" - "ld1 { v25.b }[2], [x25]\n" - "ld1 { v21.b }[2], [x24]\n" - "ld1 { v23.b }[2], [x23]\n" - "ld1 { v18.b }[2], [x22]\n" - "ld1 { v20.b }[2], [x21]\n" - "ld1 { v16.b }[2], [x20]\n" + "ld1 { v26.b }[2], [x28]\n" + "ld1 { v21.b }[2], [x27]\n" + "ld1 { v25.b }[2], [x26]\n" + "ld1 { v24.b }[2], [x25]\n" + "ld1 { v23.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "ld1 { v20.b }[2], [x22]\n" + "ld1 { v19.b }[2], [x21]\n" "b 11f\n" "10:" // odd_loads_1_0 - "ldr b27, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr b24, [x26, #0x0]\n" - "ldr b25, [x25, #0x0]\n" - "ldr b21, [x24, #0x0]\n" - "ldr b23, [x23, #0x0]\n" - "ldr b18, [x22, #0x0]\n" - "ldr b20, [x21, #0x0]\n" - "ldr b16, [x20, #0x0]\n" + "ldr b26, [x28, #0x0]\n" + "ldr b21, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr b25, [x26, #0x0]\n" + "ldr b24, [x25, #0x0]\n" + "ldr b23, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b20, [x22, #0x0]\n" + "ldr b19, [x21, #0x0]\n" "11:" // Odd load end - "zip1 v26.2d, v27.2d, v24.2d\n" - "str q26, [%x[out_ptr], #0x0]\n" - "zip1 v22.2d, v25.2d, v21.2d\n" - "subs x19, x19, #0x1\n" - "zip1 v19.2d, v23.2d, v18.2d\n" - "str q22, [%x[out_ptr], #0x10]\n" - "zip1 v17.2d, v20.2d, v16.2d\n" - "str q19, [%x[out_ptr], #0x20]\n" - "str q17, [%x[out_ptr], #0x30]\n" + "subs x20, x20, #0x1\n" + "zip1 v16.2d, v26.2d, v21.2d\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.2d, v25.2d, v24.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "zip1 v17.2d, v23.2d, v22.2d\n" + "zip1 v16.2d, v20.2d, v19.2d\n" + "str q17, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 12f\n" - "zip2 v24.2d, v27.2d, v24.2d\n" - "str q24, [%x[out_ptr], #0x0]\n" - "zip2 v21.2d, v25.2d, v21.2d\n" - "zip2 v18.2d, v23.2d, v18.2d\n" - "str q21, [%x[out_ptr], #0x10]\n" - "zip2 v16.2d, v20.2d, v16.2d\n" - "str q18, [%x[out_ptr], #0x20]\n" + "zip2 v21.2d, v26.2d, v21.2d\n" + "str q21, [%x[out_ptr], #0x0]\n" + "zip2 v18.2d, v25.2d, v24.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "zip2 v17.2d, v23.2d, v22.2d\n" + "zip2 v16.2d, v20.2d, v19.2d\n" + "str q17, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "12:" // Odds skip : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp index 3550830fc3..c6ad2949f5 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,46 +31,47 @@ void interleave_block<8, 8, VLType::None, true>( ) { __asm__ __volatile__( + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "mov x20, #0x0\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "movi v5.8h, #0x0\n" - "ldr x27, [%x[in], #0x0]\n" - "mov x19, #0x0\n" "movi v4.8h, #0x0\n" - "ldr x26, [%x[in], #0x8]\n" - "cmp %x[height], #0x8\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "movi v3.8h, #0x0\n" - "ldr x25, [%x[in], #0x10]\n" - "add x27, x27, %x[row_offset]\n" "movi v2.8h, #0x0\n" - "ldr x24, [%x[in], #0x18]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "movi v1.4s, #0x0\n" - "ldr x23, [%x[in], #0x20]\n" - "add x26, x26, %x[row_offset]\n" "movi v0.4s, #0x0\n" - "ldr x22, [%x[in], #0x28]\n" - "add x25, x25, %x[row_offset]\n" "movi v31.4s, #0x0\n" - "ldr x21, [%x[in], #0x30]\n" - "add x24, x24, %x[row_offset]\n" "movi v30.4s, #0x0\n" - "ldr x20, [%x[in], #0x38]\n" + "add x28, x28, %x[row_offset]\n" + "add x27, x27, %x[row_offset]\n" + "add x26, x26, %x[row_offset]\n" + "add x25, x25, %x[row_offset]\n" + "add x24, x24, %x[row_offset]\n" "add x23, x23, %x[row_offset]\n" "add x22, x22, %x[row_offset]\n" "add x21, x21, %x[row_offset]\n" - "add x20, x20, %x[row_offset]\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "movi v29.4s, #0x0\n" + "prfm pldl1keep, [x28, #0x0]\n" "prfm pldl1keep, [x27, #0x0]\n" + "movi v29.4s, #0x0\n" "movi v28.4s, #0x0\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" @@ -78,7 +79,7 @@ void interleave_block<8, 8, VLType::None, true>( "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -86,7 +87,6 @@ void interleave_block<8, 8, VLType::None, true>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "cbnz %w[first], 2f\n" "sub %x[out_ptr], %x[out_ptr], #0x20\n" "ld1 { v29.4s }, [%x[out_ptr]]\n" @@ -95,266 +95,266 @@ void interleave_block<8, 8, VLType::None, true>( "cmp %x[width], #0x10\n" "blt 5f\n" "3:" // Main loop head - "cmp x19, #0x3e\n" + "cmp x20, #0x3e\n" "ble 4f\n" "sadalp v1.4s, v5.8h\n" "movi v5.8h, #0x0\n" + "mov x20, #0x0\n" "sadalp v0.4s, v4.8h\n" "movi v4.8h, #0x0\n" "sadalp v31.4s, v3.8h\n" "movi v3.8h, #0x0\n" "sadalp v30.4s, v2.8h\n" "movi v2.8h, #0x0\n" - "mov x19, #0x0\n" "4:" // no_accumulate_16 - "ldr q27, [x27], #0x10\n" - "add x19, x19, #0x1\n" - "ldr q24, [x26], #0x10\n" - "zip1 v26.2d, v27.2d, v24.2d\n" - "ldr q25, [x25], #0x10\n" - "subs %x[width], %x[width], #0x10\n" - "zip2 v24.2d, v27.2d, v24.2d\n" - "ldr q21, [x24], #0x10\n" + "ldr q27, [x28], #0x10\n" + "ldr q19, [x27], #0x10\n" + "zip1 v26.2d, v27.2d, v19.2d\n" "sadalp v5.8h, v26.16b\n" - "zip1 v23.2d, v25.2d, v21.2d\n" - "ldr q22, [x23], #0x10\n" + "ldr q25, [x26], #0x10\n" + "ldr q18, [x25], #0x10\n" + "zip1 v24.2d, v25.2d, v18.2d\n" + "sadalp v4.8h, v24.16b\n" + "ldr q23, [x24], #0x10\n" + "ldr q17, [x23], #0x10\n" + "zip1 v22.2d, v23.2d, v17.2d\n" + "sadalp v3.8h, v22.16b\n" + "ldr q21, [x22], #0x10\n" + "ldr q16, [x21], #0x10\n" + "zip1 v20.2d, v21.2d, v16.2d\n" + "sadalp v2.8h, v20.16b\n" + "zip2 v19.2d, v27.2d, v19.2d\n" + "zip2 v18.2d, v25.2d, v18.2d\n" + "subs %x[width], %x[width], #0x10\n" "cmp %x[width], #0x10\n" - "zip2 v21.2d, v25.2d, v21.2d\n" - "ldr q18, [x22], #0x10\n" - "sadalp v4.8h, v23.16b\n" - "zip1 v20.2d, v22.2d, v18.2d\n" - "ldr q19, [x21], #0x10\n" - "sadalp v5.8h, v24.16b\n" - "zip2 v18.2d, v22.2d, v18.2d\n" - "ldr q16, [x20], #0x10\n" - "sadalp v3.8h, v20.16b\n" - "zip1 v17.2d, v19.2d, v16.2d\n" + "zip2 v17.2d, v23.2d, v17.2d\n" + "zip2 v16.2d, v21.2d, v16.2d\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "sadalp v4.8h, v21.16b\n" - "zip2 v16.2d, v19.2d, v16.2d\n" "prfm pldl1keep, [x26, #0x70]\n" - "sadalp v2.8h, v17.16b\n" "prfm pldl1keep, [x25, #0x70]\n" - "sadalp v3.8h, v18.16b\n" + "str q26, [%x[out_ptr], #0x0]\n" + "sadalp v5.8h, v19.16b\n" "prfm pldl1keep, [x24, #0x70]\n" - "sadalp v2.8h, v16.16b\n" "prfm pldl1keep, [x23, #0x70]\n" + "str q24, [%x[out_ptr], #0x10]\n" + "sadalp v4.8h, v18.16b\n" "prfm pldl1keep, [x22, #0x70]\n" "prfm pldl1keep, [x21, #0x70]\n" - "prfm pldl1keep, [x20, #0x70]\n" - "str q26, [%x[out_ptr], #0x0]\n" - "str q23, [%x[out_ptr], #0x10]\n" - "str q20, [%x[out_ptr], #0x20]\n" - "str q17, [%x[out_ptr], #0x30]\n" - "str q24, [%x[out_ptr], #0x40]\n" - "str q21, [%x[out_ptr], #0x50]\n" - "str q18, [%x[out_ptr], #0x60]\n" + "str q22, [%x[out_ptr], #0x20]\n" + "sadalp v3.8h, v17.16b\n" + "str q20, [%x[out_ptr], #0x30]\n" + "sadalp v2.8h, v16.16b\n" + "add x20, x20, #0x1\n" + "str q19, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" + "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 3b\n" "5:" // Main loop skip "cbz %x[width], 14f\n" "tbz %x[width], #3, 9f\n" - "ldr d27, [x27], #0x8\n" - "ldr d24, [x26], #0x8\n" - "ldr d25, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d22, [x23], #0x8\n" - "ldr d18, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" - "ldr d16, [x20], #0x8\n" + "ldr d27, [x28], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d25, [x26], #0x8\n" + "ldr d18, [x25], #0x8\n" + "ldr d23, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d16, [x21], #0x8\n" "tbz %x[width], #2, 7f\n" - "ld1 { v27.s }[2], [x27], #0x4\n" - "ld1 { v24.s }[2], [x26], #0x4\n" - "ld1 { v25.s }[2], [x25], #0x4\n" - "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v22.s }[2], [x23], #0x4\n" - "ld1 { v18.s }[2], [x22], #0x4\n" - "ld1 { v19.s }[2], [x21], #0x4\n" - "ld1 { v16.s }[2], [x20], #0x4\n" + "ld1 { v27.s }[2], [x28], #0x4\n" + "ld1 { v19.s }[2], [x27], #0x4\n" + "ld1 { v25.s }[2], [x26], #0x4\n" + "ld1 { v18.s }[2], [x25], #0x4\n" + "ld1 { v23.s }[2], [x24], #0x4\n" + "ld1 { v17.s }[2], [x23], #0x4\n" + "ld1 { v21.s }[2], [x22], #0x4\n" + "ld1 { v16.s }[2], [x21], #0x4\n" "tbz %x[width], #1, 6f\n" - "ld1 { v27.h }[6], [x27], #0x2\n" - "mov x19, #0x2\n" - "ld1 { v24.h }[6], [x26], #0x2\n" - "ld1 { v25.h }[6], [x25], #0x2\n" - "ld1 { v21.h }[6], [x24], #0x2\n" - "ld1 { v22.h }[6], [x23], #0x2\n" - "ld1 { v18.h }[6], [x22], #0x2\n" - "ld1 { v19.h }[6], [x21], #0x2\n" - "ld1 { v16.h }[6], [x20], #0x2\n" + "ld1 { v27.h }[6], [x28], #0x2\n" + "ld1 { v19.h }[6], [x27], #0x2\n" + "mov x20, #0x2\n" + "ld1 { v25.h }[6], [x26], #0x2\n" + "ld1 { v18.h }[6], [x25], #0x2\n" + "ld1 { v23.h }[6], [x24], #0x2\n" + "ld1 { v17.h }[6], [x23], #0x2\n" + "ld1 { v21.h }[6], [x22], #0x2\n" + "ld1 { v16.h }[6], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[14], [x27]\n" - "ld1 { v24.b }[14], [x26]\n" - "ld1 { v25.b }[14], [x25]\n" - "ld1 { v21.b }[14], [x24]\n" - "ld1 { v22.b }[14], [x23]\n" - "ld1 { v18.b }[14], [x22]\n" - "ld1 { v19.b }[14], [x21]\n" - "ld1 { v16.b }[14], [x20]\n" + "ld1 { v27.b }[14], [x28]\n" + "ld1 { v19.b }[14], [x27]\n" + "ld1 { v25.b }[14], [x26]\n" + "ld1 { v18.b }[14], [x25]\n" + "ld1 { v23.b }[14], [x24]\n" + "ld1 { v17.b }[14], [x23]\n" + "ld1 { v21.b }[14], [x22]\n" + "ld1 { v16.b }[14], [x21]\n" "b 13f\n" "6:" // odd_loads_1_12 - "mov x19, #0x2\n" + "mov x20, #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[12], [x27]\n" - "ld1 { v24.b }[12], [x26]\n" - "ld1 { v25.b }[12], [x25]\n" - "ld1 { v21.b }[12], [x24]\n" - "ld1 { v22.b }[12], [x23]\n" - "ld1 { v18.b }[12], [x22]\n" - "ld1 { v19.b }[12], [x21]\n" - "ld1 { v16.b }[12], [x20]\n" + "ld1 { v27.b }[12], [x28]\n" + "ld1 { v19.b }[12], [x27]\n" + "ld1 { v25.b }[12], [x26]\n" + "ld1 { v18.b }[12], [x25]\n" + "ld1 { v23.b }[12], [x24]\n" + "ld1 { v17.b }[12], [x23]\n" + "ld1 { v21.b }[12], [x22]\n" + "ld1 { v16.b }[12], [x21]\n" "b 13f\n" "7:" // odd_loads_2_8 "tbz %x[width], #1, 8f\n" - "ld1 { v27.h }[4], [x27], #0x2\n" - "ld1 { v24.h }[4], [x26], #0x2\n" - "mov x19, #0x2\n" - "ld1 { v25.h }[4], [x25], #0x2\n" - "ld1 { v21.h }[4], [x24], #0x2\n" - "ld1 { v22.h }[4], [x23], #0x2\n" - "ld1 { v18.h }[4], [x22], #0x2\n" - "ld1 { v19.h }[4], [x21], #0x2\n" - "ld1 { v16.h }[4], [x20], #0x2\n" + "ld1 { v27.h }[4], [x28], #0x2\n" + "ld1 { v19.h }[4], [x27], #0x2\n" + "mov x20, #0x2\n" + "ld1 { v25.h }[4], [x26], #0x2\n" + "ld1 { v18.h }[4], [x25], #0x2\n" + "ld1 { v23.h }[4], [x24], #0x2\n" + "ld1 { v17.h }[4], [x23], #0x2\n" + "ld1 { v21.h }[4], [x22], #0x2\n" + "ld1 { v16.h }[4], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[10], [x27]\n" - "ld1 { v24.b }[10], [x26]\n" - "ld1 { v25.b }[10], [x25]\n" - "ld1 { v21.b }[10], [x24]\n" - "ld1 { v22.b }[10], [x23]\n" - "ld1 { v18.b }[10], [x22]\n" - "ld1 { v19.b }[10], [x21]\n" - "ld1 { v16.b }[10], [x20]\n" + "ld1 { v27.b }[10], [x28]\n" + "ld1 { v19.b }[10], [x27]\n" + "ld1 { v25.b }[10], [x26]\n" + "ld1 { v18.b }[10], [x25]\n" + "ld1 { v23.b }[10], [x24]\n" + "ld1 { v17.b }[10], [x23]\n" + "ld1 { v21.b }[10], [x22]\n" + "ld1 { v16.b }[10], [x21]\n" "b 13f\n" "8:" // odd_loads_1_8 - "mov x19, #0x1\n" + "mov x20, #0x1\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[8], [x27]\n" - "ld1 { v24.b }[8], [x26]\n" - "mov x19, #0x2\n" - "ld1 { v25.b }[8], [x25]\n" - "ld1 { v21.b }[8], [x24]\n" - "ld1 { v22.b }[8], [x23]\n" - "ld1 { v18.b }[8], [x22]\n" - "ld1 { v19.b }[8], [x21]\n" - "ld1 { v16.b }[8], [x20]\n" + "ld1 { v27.b }[8], [x28]\n" + "ld1 { v19.b }[8], [x27]\n" + "mov x20, #0x2\n" + "ld1 { v25.b }[8], [x26]\n" + "ld1 { v18.b }[8], [x25]\n" + "ld1 { v23.b }[8], [x24]\n" + "ld1 { v17.b }[8], [x23]\n" + "ld1 { v21.b }[8], [x22]\n" + "ld1 { v16.b }[8], [x21]\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" - "ldr s27, [x27], #0x4\n" - "ldr s24, [x26], #0x4\n" - "ldr s25, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s22, [x23], #0x4\n" - "ldr s18, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" - "ldr s16, [x20], #0x4\n" + "ldr s27, [x28], #0x4\n" + "ldr s19, [x27], #0x4\n" + "ldr s25, [x26], #0x4\n" + "ldr s18, [x25], #0x4\n" + "ldr s23, [x24], #0x4\n" + "ldr s17, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "ldr s16, [x21], #0x4\n" "tbz %x[width], #1, 10f\n" - "ld1 { v27.h }[2], [x27], #0x2\n" - "mov x19, #0x1\n" - "ld1 { v24.h }[2], [x26], #0x2\n" - "ld1 { v25.h }[2], [x25], #0x2\n" - "ld1 { v21.h }[2], [x24], #0x2\n" - "ld1 { v22.h }[2], [x23], #0x2\n" - "ld1 { v18.h }[2], [x22], #0x2\n" - "ld1 { v19.h }[2], [x21], #0x2\n" - "ld1 { v16.h }[2], [x20], #0x2\n" + "ld1 { v27.h }[2], [x28], #0x2\n" + "ld1 { v19.h }[2], [x27], #0x2\n" + "mov x20, #0x1\n" + "ld1 { v25.h }[2], [x26], #0x2\n" + "ld1 { v18.h }[2], [x25], #0x2\n" + "ld1 { v23.h }[2], [x24], #0x2\n" + "ld1 { v17.h }[2], [x23], #0x2\n" + "ld1 { v21.h }[2], [x22], #0x2\n" + "ld1 { v16.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[6], [x27]\n" - "ld1 { v24.b }[6], [x26]\n" - "ld1 { v25.b }[6], [x25]\n" - "ld1 { v21.b }[6], [x24]\n" - "ld1 { v22.b }[6], [x23]\n" - "ld1 { v18.b }[6], [x22]\n" - "ld1 { v19.b }[6], [x21]\n" - "ld1 { v16.b }[6], [x20]\n" + "ld1 { v27.b }[6], [x28]\n" + "ld1 { v19.b }[6], [x27]\n" + "ld1 { v25.b }[6], [x26]\n" + "ld1 { v18.b }[6], [x25]\n" + "ld1 { v23.b }[6], [x24]\n" + "ld1 { v17.b }[6], [x23]\n" + "ld1 { v21.b }[6], [x22]\n" + "ld1 { v16.b }[6], [x21]\n" "b 13f\n" "10:" // odd_loads_1_4 - "mov x19, #0x1\n" + "mov x20, #0x1\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[4], [x27]\n" - "ld1 { v24.b }[4], [x26]\n" - "ld1 { v25.b }[4], [x25]\n" - "ld1 { v21.b }[4], [x24]\n" - "ld1 { v22.b }[4], [x23]\n" - "ld1 { v18.b }[4], [x22]\n" - "ld1 { v19.b }[4], [x21]\n" - "ld1 { v16.b }[4], [x20]\n" + "ld1 { v27.b }[4], [x28]\n" + "ld1 { v19.b }[4], [x27]\n" + "ld1 { v25.b }[4], [x26]\n" + "ld1 { v18.b }[4], [x25]\n" + "ld1 { v23.b }[4], [x24]\n" + "ld1 { v17.b }[4], [x23]\n" + "ld1 { v21.b }[4], [x22]\n" + "ld1 { v16.b }[4], [x21]\n" "b 13f\n" "11:" // odd_loads_2_0 "tbz %x[width], #1, 12f\n" - "ldr h27, [x27], #0x2\n" - "ldr h24, [x26], #0x2\n" - "mov x19, #0x1\n" - "ldr h25, [x25], #0x2\n" - "ldr h21, [x24], #0x2\n" - "ldr h22, [x23], #0x2\n" - "ldr h18, [x22], #0x2\n" - "ldr h19, [x21], #0x2\n" - "ldr h16, [x20], #0x2\n" + "ldr h27, [x28], #0x2\n" + "ldr h19, [x27], #0x2\n" + "mov x20, #0x1\n" + "ldr h25, [x26], #0x2\n" + "ldr h18, [x25], #0x2\n" + "ldr h23, [x24], #0x2\n" + "ldr h17, [x23], #0x2\n" + "ldr h21, [x22], #0x2\n" + "ldr h16, [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[2], [x27]\n" - "ld1 { v24.b }[2], [x26]\n" - "ld1 { v25.b }[2], [x25]\n" - "ld1 { v21.b }[2], [x24]\n" - "ld1 { v22.b }[2], [x23]\n" - "ld1 { v18.b }[2], [x22]\n" - "ld1 { v19.b }[2], [x21]\n" - "ld1 { v16.b }[2], [x20]\n" + "ld1 { v27.b }[2], [x28]\n" + "ld1 { v19.b }[2], [x27]\n" + "ld1 { v25.b }[2], [x26]\n" + "ld1 { v18.b }[2], [x25]\n" + "ld1 { v23.b }[2], [x24]\n" + "ld1 { v17.b }[2], [x23]\n" + "ld1 { v21.b }[2], [x22]\n" + "ld1 { v16.b }[2], [x21]\n" "b 13f\n" "12:" // odd_loads_1_0 - "ldr b27, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr b24, [x26, #0x0]\n" - "ldr b25, [x25, #0x0]\n" - "ldr b21, [x24, #0x0]\n" - "ldr b22, [x23, #0x0]\n" - "ldr b18, [x22, #0x0]\n" - "ldr b19, [x21, #0x0]\n" - "ldr b16, [x20, #0x0]\n" + "ldr b27, [x28, #0x0]\n" + "ldr b19, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr b25, [x26, #0x0]\n" + "ldr b18, [x25, #0x0]\n" + "ldr b23, [x24, #0x0]\n" + "ldr b17, [x23, #0x0]\n" + "ldr b21, [x22, #0x0]\n" + "ldr b16, [x21, #0x0]\n" "13:" // Odd load end - "zip1 v26.2d, v27.2d, v24.2d\n" + "zip1 v26.2d, v27.2d, v19.2d\n" + "zip1 v24.2d, v25.2d, v18.2d\n" + "subs x20, x20, #0x1\n" "str q26, [%x[out_ptr], #0x0]\n" - "zip1 v23.2d, v25.2d, v21.2d\n" + "zip1 v22.2d, v23.2d, v17.2d\n" + "zip1 v20.2d, v21.2d, v16.2d\n" + "str q24, [%x[out_ptr], #0x10]\n" "sadalp v5.8h, v26.16b\n" - "zip1 v20.2d, v22.2d, v18.2d\n" - "str q23, [%x[out_ptr], #0x10]\n" - "sadalp v4.8h, v23.16b\n" - "zip1 v17.2d, v19.2d, v16.2d\n" - "str q20, [%x[out_ptr], #0x20]\n" - "sadalp v3.8h, v20.16b\n" - "str q17, [%x[out_ptr], #0x30]\n" - "sadalp v2.8h, v17.16b\n" - "subs x19, x19, #0x1\n" + "sadalp v4.8h, v24.16b\n" + "str q22, [%x[out_ptr], #0x20]\n" + "sadalp v3.8h, v22.16b\n" + "str q20, [%x[out_ptr], #0x30]\n" + "sadalp v2.8h, v20.16b\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 14f\n" - "zip2 v24.2d, v27.2d, v24.2d\n" - "str q24, [%x[out_ptr], #0x0]\n" - "zip2 v21.2d, v25.2d, v21.2d\n" - "sadalp v5.8h, v24.16b\n" - "zip2 v18.2d, v22.2d, v18.2d\n" - "str q21, [%x[out_ptr], #0x10]\n" - "sadalp v4.8h, v21.16b\n" - "zip2 v16.2d, v19.2d, v16.2d\n" - "str q18, [%x[out_ptr], #0x20]\n" - "sadalp v3.8h, v18.16b\n" + "zip2 v19.2d, v27.2d, v19.2d\n" + "zip2 v18.2d, v25.2d, v18.2d\n" + "str q19, [%x[out_ptr], #0x0]\n" + "zip2 v17.2d, v23.2d, v17.2d\n" + "zip2 v16.2d, v21.2d, v16.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "sadalp v5.8h, v19.16b\n" + "sadalp v4.8h, v18.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "sadalp v3.8h, v17.16b\n" "str q16, [%x[out_ptr], #0x30]\n" "sadalp v2.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "14:" // Odds skip "sadalp v1.4s, v5.8h\n" "sadalp v0.4s, v4.8h\n" - "addp v1.4s, v1.4s, v0.4s\n" "sadalp v31.4s, v3.8h\n" "sadalp v30.4s, v2.8h\n" - "add v1.4s, v1.4s, v29.4s\n" - "str q1, [%x[out_ptr], #0x0]\n" + "addp v1.4s, v1.4s, v0.4s\n" "addp v0.4s, v31.4s, v30.4s\n" + "add v1.4s, v1.4s, v29.4s\n" "add v0.4s, v0.4s, v28.4s\n" + "str q1, [%x[out_ptr], #0x0]\n" "str q0, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp index 454260ef1a..6c4a5fa62b 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #ifdef __aarch64__ @@ -31,46 +31,47 @@ void interleave_block<8, 8, VLType::None, true>( ) { __asm__ __volatile__( + "ldr x28, [%x[in], #0x0]\n" + "ldr x27, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "mov x20, #0x0\n" + "ldr x26, [%x[in], #0x10]\n" + "ldr x25, [%x[in], #0x18]\n" "movi v5.8h, #0x0\n" - "ldr x27, [%x[in], #0x0]\n" - "mov x19, #0x0\n" "movi v4.8h, #0x0\n" - "ldr x26, [%x[in], #0x8]\n" - "cmp %x[height], #0x8\n" + "ldr x24, [%x[in], #0x20]\n" + "ldr x23, [%x[in], #0x28]\n" "movi v3.8h, #0x0\n" - "ldr x25, [%x[in], #0x10]\n" - "add x27, x27, %x[row_offset]\n" "movi v2.8h, #0x0\n" - "ldr x24, [%x[in], #0x18]\n" + "ldr x22, [%x[in], #0x30]\n" + "ldr x21, [%x[in], #0x38]\n" "movi v1.4s, #0x0\n" - "ldr x23, [%x[in], #0x20]\n" - "add x26, x26, %x[row_offset]\n" "movi v0.4s, #0x0\n" - "ldr x22, [%x[in], #0x28]\n" - "add x25, x25, %x[row_offset]\n" "movi v31.4s, #0x0\n" - "ldr x21, [%x[in], #0x30]\n" - "add x24, x24, %x[row_offset]\n" "movi v30.4s, #0x0\n" - "ldr x20, [%x[in], #0x38]\n" + "add x28, x28, %x[row_offset]\n" + "add x27, x27, %x[row_offset]\n" + "add x26, x26, %x[row_offset]\n" + "add x25, x25, %x[row_offset]\n" + "add x24, x24, %x[row_offset]\n" "add x23, x23, %x[row_offset]\n" "add x22, x22, %x[row_offset]\n" "add x21, x21, %x[row_offset]\n" - "add x20, x20, %x[row_offset]\n" "beq 1f\n" - "mov x20, x27\n" "cmp %x[height], #0x2\n" - "csel x26, x26, x27, GE\n" - "csel x25, x25, x27, GT\n" + "csel x27, x27, x28, GE\n" + "csel x26, x26, x28, GT\n" "cmp %x[height], #0x4\n" - "csel x24, x24, x27, GE\n" - "csel x23, x23, x27, GT\n" + "csel x25, x25, x28, GE\n" + "csel x24, x24, x28, GT\n" "cmp %x[height], #0x6\n" - "csel x22, x22, x27, GE\n" - "csel x21, x21, x27, GT\n" + "mov x21, x28\n" + "csel x23, x23, x28, GE\n" + "csel x22, x22, x28, GT\n" "1:" // no_pointer_adj - "movi v29.4s, #0x0\n" + "prfm pldl1keep, [x28, #0x0]\n" "prfm pldl1keep, [x27, #0x0]\n" + "movi v29.4s, #0x0\n" "movi v28.4s, #0x0\n" "prfm pldl1keep, [x26, #0x0]\n" "prfm pldl1keep, [x25, #0x0]\n" @@ -78,7 +79,7 @@ void interleave_block<8, 8, VLType::None, true>( "prfm pldl1keep, [x23, #0x0]\n" "prfm pldl1keep, [x22, #0x0]\n" "prfm pldl1keep, [x21, #0x0]\n" - "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x28, #0x40]\n" "prfm pldl1keep, [x27, #0x40]\n" "prfm pldl1keep, [x26, #0x40]\n" "prfm pldl1keep, [x25, #0x40]\n" @@ -86,7 +87,6 @@ void interleave_block<8, 8, VLType::None, true>( "prfm pldl1keep, [x23, #0x40]\n" "prfm pldl1keep, [x22, #0x40]\n" "prfm pldl1keep, [x21, #0x40]\n" - "prfm pldl1keep, [x20, #0x40]\n" "cbnz %w[first], 2f\n" "sub %x[out_ptr], %x[out_ptr], #0x20\n" "ld1 { v29.4s }, [%x[out_ptr]]\n" @@ -95,266 +95,266 @@ void interleave_block<8, 8, VLType::None, true>( "cmp %x[width], #0x10\n" "blt 5f\n" "3:" // Main loop head - "cmp x19, #0x3e\n" + "cmp x20, #0x3e\n" "ble 4f\n" "uadalp v1.4s, v5.8h\n" "movi v5.8h, #0x0\n" + "mov x20, #0x0\n" "uadalp v0.4s, v4.8h\n" "movi v4.8h, #0x0\n" "uadalp v31.4s, v3.8h\n" "movi v3.8h, #0x0\n" "uadalp v30.4s, v2.8h\n" "movi v2.8h, #0x0\n" - "mov x19, #0x0\n" "4:" // no_accumulate_16 - "ldr q27, [x27], #0x10\n" - "add x19, x19, #0x1\n" - "ldr q24, [x26], #0x10\n" - "zip1 v26.2d, v27.2d, v24.2d\n" - "ldr q25, [x25], #0x10\n" - "subs %x[width], %x[width], #0x10\n" - "zip2 v24.2d, v27.2d, v24.2d\n" - "ldr q21, [x24], #0x10\n" + "ldr q27, [x28], #0x10\n" + "ldr q19, [x27], #0x10\n" + "zip1 v26.2d, v27.2d, v19.2d\n" "uadalp v5.8h, v26.16b\n" - "zip1 v23.2d, v25.2d, v21.2d\n" - "ldr q22, [x23], #0x10\n" + "ldr q25, [x26], #0x10\n" + "ldr q18, [x25], #0x10\n" + "zip1 v24.2d, v25.2d, v18.2d\n" + "uadalp v4.8h, v24.16b\n" + "ldr q23, [x24], #0x10\n" + "ldr q17, [x23], #0x10\n" + "zip1 v22.2d, v23.2d, v17.2d\n" + "uadalp v3.8h, v22.16b\n" + "ldr q21, [x22], #0x10\n" + "ldr q16, [x21], #0x10\n" + "zip1 v20.2d, v21.2d, v16.2d\n" + "uadalp v2.8h, v20.16b\n" + "zip2 v19.2d, v27.2d, v19.2d\n" + "zip2 v18.2d, v25.2d, v18.2d\n" + "subs %x[width], %x[width], #0x10\n" "cmp %x[width], #0x10\n" - "zip2 v21.2d, v25.2d, v21.2d\n" - "ldr q18, [x22], #0x10\n" - "uadalp v4.8h, v23.16b\n" - "zip1 v20.2d, v22.2d, v18.2d\n" - "ldr q19, [x21], #0x10\n" - "uadalp v5.8h, v24.16b\n" - "zip2 v18.2d, v22.2d, v18.2d\n" - "ldr q16, [x20], #0x10\n" - "uadalp v3.8h, v20.16b\n" - "zip1 v17.2d, v19.2d, v16.2d\n" + "zip2 v17.2d, v23.2d, v17.2d\n" + "zip2 v16.2d, v21.2d, v16.2d\n" + "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "uadalp v4.8h, v21.16b\n" - "zip2 v16.2d, v19.2d, v16.2d\n" "prfm pldl1keep, [x26, #0x70]\n" - "uadalp v2.8h, v17.16b\n" "prfm pldl1keep, [x25, #0x70]\n" - "uadalp v3.8h, v18.16b\n" + "str q26, [%x[out_ptr], #0x0]\n" + "uadalp v5.8h, v19.16b\n" "prfm pldl1keep, [x24, #0x70]\n" - "uadalp v2.8h, v16.16b\n" "prfm pldl1keep, [x23, #0x70]\n" + "str q24, [%x[out_ptr], #0x10]\n" + "uadalp v4.8h, v18.16b\n" "prfm pldl1keep, [x22, #0x70]\n" "prfm pldl1keep, [x21, #0x70]\n" - "prfm pldl1keep, [x20, #0x70]\n" - "str q26, [%x[out_ptr], #0x0]\n" - "str q23, [%x[out_ptr], #0x10]\n" - "str q20, [%x[out_ptr], #0x20]\n" - "str q17, [%x[out_ptr], #0x30]\n" - "str q24, [%x[out_ptr], #0x40]\n" - "str q21, [%x[out_ptr], #0x50]\n" - "str q18, [%x[out_ptr], #0x60]\n" + "str q22, [%x[out_ptr], #0x20]\n" + "uadalp v3.8h, v17.16b\n" + "str q20, [%x[out_ptr], #0x30]\n" + "uadalp v2.8h, v16.16b\n" + "add x20, x20, #0x1\n" + "str q19, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" + "str q17, [%x[out_ptr], #0x60]\n" "str q16, [%x[out_ptr], #0x70]\n" "add %x[out_ptr], %x[out_ptr], #0x80\n" "bge 3b\n" "5:" // Main loop skip "cbz %x[width], 14f\n" "tbz %x[width], #3, 9f\n" - "ldr d27, [x27], #0x8\n" - "ldr d24, [x26], #0x8\n" - "ldr d25, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d22, [x23], #0x8\n" - "ldr d18, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" - "ldr d16, [x20], #0x8\n" + "ldr d27, [x28], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d25, [x26], #0x8\n" + "ldr d18, [x25], #0x8\n" + "ldr d23, [x24], #0x8\n" + "ldr d17, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "ldr d16, [x21], #0x8\n" "tbz %x[width], #2, 7f\n" - "ld1 { v27.s }[2], [x27], #0x4\n" - "ld1 { v24.s }[2], [x26], #0x4\n" - "ld1 { v25.s }[2], [x25], #0x4\n" - "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v22.s }[2], [x23], #0x4\n" - "ld1 { v18.s }[2], [x22], #0x4\n" - "ld1 { v19.s }[2], [x21], #0x4\n" - "ld1 { v16.s }[2], [x20], #0x4\n" + "ld1 { v27.s }[2], [x28], #0x4\n" + "ld1 { v19.s }[2], [x27], #0x4\n" + "ld1 { v25.s }[2], [x26], #0x4\n" + "ld1 { v18.s }[2], [x25], #0x4\n" + "ld1 { v23.s }[2], [x24], #0x4\n" + "ld1 { v17.s }[2], [x23], #0x4\n" + "ld1 { v21.s }[2], [x22], #0x4\n" + "ld1 { v16.s }[2], [x21], #0x4\n" "tbz %x[width], #1, 6f\n" - "ld1 { v27.h }[6], [x27], #0x2\n" - "mov x19, #0x2\n" - "ld1 { v24.h }[6], [x26], #0x2\n" - "ld1 { v25.h }[6], [x25], #0x2\n" - "ld1 { v21.h }[6], [x24], #0x2\n" - "ld1 { v22.h }[6], [x23], #0x2\n" - "ld1 { v18.h }[6], [x22], #0x2\n" - "ld1 { v19.h }[6], [x21], #0x2\n" - "ld1 { v16.h }[6], [x20], #0x2\n" + "ld1 { v27.h }[6], [x28], #0x2\n" + "ld1 { v19.h }[6], [x27], #0x2\n" + "mov x20, #0x2\n" + "ld1 { v25.h }[6], [x26], #0x2\n" + "ld1 { v18.h }[6], [x25], #0x2\n" + "ld1 { v23.h }[6], [x24], #0x2\n" + "ld1 { v17.h }[6], [x23], #0x2\n" + "ld1 { v21.h }[6], [x22], #0x2\n" + "ld1 { v16.h }[6], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[14], [x27]\n" - "ld1 { v24.b }[14], [x26]\n" - "ld1 { v25.b }[14], [x25]\n" - "ld1 { v21.b }[14], [x24]\n" - "ld1 { v22.b }[14], [x23]\n" - "ld1 { v18.b }[14], [x22]\n" - "ld1 { v19.b }[14], [x21]\n" - "ld1 { v16.b }[14], [x20]\n" + "ld1 { v27.b }[14], [x28]\n" + "ld1 { v19.b }[14], [x27]\n" + "ld1 { v25.b }[14], [x26]\n" + "ld1 { v18.b }[14], [x25]\n" + "ld1 { v23.b }[14], [x24]\n" + "ld1 { v17.b }[14], [x23]\n" + "ld1 { v21.b }[14], [x22]\n" + "ld1 { v16.b }[14], [x21]\n" "b 13f\n" "6:" // odd_loads_1_12 - "mov x19, #0x2\n" + "mov x20, #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[12], [x27]\n" - "ld1 { v24.b }[12], [x26]\n" - "ld1 { v25.b }[12], [x25]\n" - "ld1 { v21.b }[12], [x24]\n" - "ld1 { v22.b }[12], [x23]\n" - "ld1 { v18.b }[12], [x22]\n" - "ld1 { v19.b }[12], [x21]\n" - "ld1 { v16.b }[12], [x20]\n" + "ld1 { v27.b }[12], [x28]\n" + "ld1 { v19.b }[12], [x27]\n" + "ld1 { v25.b }[12], [x26]\n" + "ld1 { v18.b }[12], [x25]\n" + "ld1 { v23.b }[12], [x24]\n" + "ld1 { v17.b }[12], [x23]\n" + "ld1 { v21.b }[12], [x22]\n" + "ld1 { v16.b }[12], [x21]\n" "b 13f\n" "7:" // odd_loads_2_8 "tbz %x[width], #1, 8f\n" - "ld1 { v27.h }[4], [x27], #0x2\n" - "ld1 { v24.h }[4], [x26], #0x2\n" - "mov x19, #0x2\n" - "ld1 { v25.h }[4], [x25], #0x2\n" - "ld1 { v21.h }[4], [x24], #0x2\n" - "ld1 { v22.h }[4], [x23], #0x2\n" - "ld1 { v18.h }[4], [x22], #0x2\n" - "ld1 { v19.h }[4], [x21], #0x2\n" - "ld1 { v16.h }[4], [x20], #0x2\n" + "ld1 { v27.h }[4], [x28], #0x2\n" + "ld1 { v19.h }[4], [x27], #0x2\n" + "mov x20, #0x2\n" + "ld1 { v25.h }[4], [x26], #0x2\n" + "ld1 { v18.h }[4], [x25], #0x2\n" + "ld1 { v23.h }[4], [x24], #0x2\n" + "ld1 { v17.h }[4], [x23], #0x2\n" + "ld1 { v21.h }[4], [x22], #0x2\n" + "ld1 { v16.h }[4], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[10], [x27]\n" - "ld1 { v24.b }[10], [x26]\n" - "ld1 { v25.b }[10], [x25]\n" - "ld1 { v21.b }[10], [x24]\n" - "ld1 { v22.b }[10], [x23]\n" - "ld1 { v18.b }[10], [x22]\n" - "ld1 { v19.b }[10], [x21]\n" - "ld1 { v16.b }[10], [x20]\n" + "ld1 { v27.b }[10], [x28]\n" + "ld1 { v19.b }[10], [x27]\n" + "ld1 { v25.b }[10], [x26]\n" + "ld1 { v18.b }[10], [x25]\n" + "ld1 { v23.b }[10], [x24]\n" + "ld1 { v17.b }[10], [x23]\n" + "ld1 { v21.b }[10], [x22]\n" + "ld1 { v16.b }[10], [x21]\n" "b 13f\n" "8:" // odd_loads_1_8 - "mov x19, #0x1\n" + "mov x20, #0x1\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[8], [x27]\n" - "ld1 { v24.b }[8], [x26]\n" - "mov x19, #0x2\n" - "ld1 { v25.b }[8], [x25]\n" - "ld1 { v21.b }[8], [x24]\n" - "ld1 { v22.b }[8], [x23]\n" - "ld1 { v18.b }[8], [x22]\n" - "ld1 { v19.b }[8], [x21]\n" - "ld1 { v16.b }[8], [x20]\n" + "ld1 { v27.b }[8], [x28]\n" + "ld1 { v19.b }[8], [x27]\n" + "mov x20, #0x2\n" + "ld1 { v25.b }[8], [x26]\n" + "ld1 { v18.b }[8], [x25]\n" + "ld1 { v23.b }[8], [x24]\n" + "ld1 { v17.b }[8], [x23]\n" + "ld1 { v21.b }[8], [x22]\n" + "ld1 { v16.b }[8], [x21]\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" - "ldr s27, [x27], #0x4\n" - "ldr s24, [x26], #0x4\n" - "ldr s25, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s22, [x23], #0x4\n" - "ldr s18, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" - "ldr s16, [x20], #0x4\n" + "ldr s27, [x28], #0x4\n" + "ldr s19, [x27], #0x4\n" + "ldr s25, [x26], #0x4\n" + "ldr s18, [x25], #0x4\n" + "ldr s23, [x24], #0x4\n" + "ldr s17, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "ldr s16, [x21], #0x4\n" "tbz %x[width], #1, 10f\n" - "ld1 { v27.h }[2], [x27], #0x2\n" - "mov x19, #0x1\n" - "ld1 { v24.h }[2], [x26], #0x2\n" - "ld1 { v25.h }[2], [x25], #0x2\n" - "ld1 { v21.h }[2], [x24], #0x2\n" - "ld1 { v22.h }[2], [x23], #0x2\n" - "ld1 { v18.h }[2], [x22], #0x2\n" - "ld1 { v19.h }[2], [x21], #0x2\n" - "ld1 { v16.h }[2], [x20], #0x2\n" + "ld1 { v27.h }[2], [x28], #0x2\n" + "ld1 { v19.h }[2], [x27], #0x2\n" + "mov x20, #0x1\n" + "ld1 { v25.h }[2], [x26], #0x2\n" + "ld1 { v18.h }[2], [x25], #0x2\n" + "ld1 { v23.h }[2], [x24], #0x2\n" + "ld1 { v17.h }[2], [x23], #0x2\n" + "ld1 { v21.h }[2], [x22], #0x2\n" + "ld1 { v16.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[6], [x27]\n" - "ld1 { v24.b }[6], [x26]\n" - "ld1 { v25.b }[6], [x25]\n" - "ld1 { v21.b }[6], [x24]\n" - "ld1 { v22.b }[6], [x23]\n" - "ld1 { v18.b }[6], [x22]\n" - "ld1 { v19.b }[6], [x21]\n" - "ld1 { v16.b }[6], [x20]\n" + "ld1 { v27.b }[6], [x28]\n" + "ld1 { v19.b }[6], [x27]\n" + "ld1 { v25.b }[6], [x26]\n" + "ld1 { v18.b }[6], [x25]\n" + "ld1 { v23.b }[6], [x24]\n" + "ld1 { v17.b }[6], [x23]\n" + "ld1 { v21.b }[6], [x22]\n" + "ld1 { v16.b }[6], [x21]\n" "b 13f\n" "10:" // odd_loads_1_4 - "mov x19, #0x1\n" + "mov x20, #0x1\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[4], [x27]\n" - "ld1 { v24.b }[4], [x26]\n" - "ld1 { v25.b }[4], [x25]\n" - "ld1 { v21.b }[4], [x24]\n" - "ld1 { v22.b }[4], [x23]\n" - "ld1 { v18.b }[4], [x22]\n" - "ld1 { v19.b }[4], [x21]\n" - "ld1 { v16.b }[4], [x20]\n" + "ld1 { v27.b }[4], [x28]\n" + "ld1 { v19.b }[4], [x27]\n" + "ld1 { v25.b }[4], [x26]\n" + "ld1 { v18.b }[4], [x25]\n" + "ld1 { v23.b }[4], [x24]\n" + "ld1 { v17.b }[4], [x23]\n" + "ld1 { v21.b }[4], [x22]\n" + "ld1 { v16.b }[4], [x21]\n" "b 13f\n" "11:" // odd_loads_2_0 "tbz %x[width], #1, 12f\n" - "ldr h27, [x27], #0x2\n" - "ldr h24, [x26], #0x2\n" - "mov x19, #0x1\n" - "ldr h25, [x25], #0x2\n" - "ldr h21, [x24], #0x2\n" - "ldr h22, [x23], #0x2\n" - "ldr h18, [x22], #0x2\n" - "ldr h19, [x21], #0x2\n" - "ldr h16, [x20], #0x2\n" + "ldr h27, [x28], #0x2\n" + "ldr h19, [x27], #0x2\n" + "mov x20, #0x1\n" + "ldr h25, [x26], #0x2\n" + "ldr h18, [x25], #0x2\n" + "ldr h23, [x24], #0x2\n" + "ldr h17, [x23], #0x2\n" + "ldr h21, [x22], #0x2\n" + "ldr h16, [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v27.b }[2], [x27]\n" - "ld1 { v24.b }[2], [x26]\n" - "ld1 { v25.b }[2], [x25]\n" - "ld1 { v21.b }[2], [x24]\n" - "ld1 { v22.b }[2], [x23]\n" - "ld1 { v18.b }[2], [x22]\n" - "ld1 { v19.b }[2], [x21]\n" - "ld1 { v16.b }[2], [x20]\n" + "ld1 { v27.b }[2], [x28]\n" + "ld1 { v19.b }[2], [x27]\n" + "ld1 { v25.b }[2], [x26]\n" + "ld1 { v18.b }[2], [x25]\n" + "ld1 { v23.b }[2], [x24]\n" + "ld1 { v17.b }[2], [x23]\n" + "ld1 { v21.b }[2], [x22]\n" + "ld1 { v16.b }[2], [x21]\n" "b 13f\n" "12:" // odd_loads_1_0 - "ldr b27, [x27, #0x0]\n" - "mov x19, #0x1\n" - "ldr b24, [x26, #0x0]\n" - "ldr b25, [x25, #0x0]\n" - "ldr b21, [x24, #0x0]\n" - "ldr b22, [x23, #0x0]\n" - "ldr b18, [x22, #0x0]\n" - "ldr b19, [x21, #0x0]\n" - "ldr b16, [x20, #0x0]\n" + "ldr b27, [x28, #0x0]\n" + "ldr b19, [x27, #0x0]\n" + "mov x20, #0x1\n" + "ldr b25, [x26, #0x0]\n" + "ldr b18, [x25, #0x0]\n" + "ldr b23, [x24, #0x0]\n" + "ldr b17, [x23, #0x0]\n" + "ldr b21, [x22, #0x0]\n" + "ldr b16, [x21, #0x0]\n" "13:" // Odd load end - "zip1 v26.2d, v27.2d, v24.2d\n" + "zip1 v26.2d, v27.2d, v19.2d\n" + "zip1 v24.2d, v25.2d, v18.2d\n" + "subs x20, x20, #0x1\n" "str q26, [%x[out_ptr], #0x0]\n" - "zip1 v23.2d, v25.2d, v21.2d\n" + "zip1 v22.2d, v23.2d, v17.2d\n" + "zip1 v20.2d, v21.2d, v16.2d\n" + "str q24, [%x[out_ptr], #0x10]\n" "uadalp v5.8h, v26.16b\n" - "zip1 v20.2d, v22.2d, v18.2d\n" - "str q23, [%x[out_ptr], #0x10]\n" - "uadalp v4.8h, v23.16b\n" - "zip1 v17.2d, v19.2d, v16.2d\n" - "str q20, [%x[out_ptr], #0x20]\n" - "uadalp v3.8h, v20.16b\n" - "str q17, [%x[out_ptr], #0x30]\n" - "uadalp v2.8h, v17.16b\n" - "subs x19, x19, #0x1\n" + "uadalp v4.8h, v24.16b\n" + "str q22, [%x[out_ptr], #0x20]\n" + "uadalp v3.8h, v22.16b\n" + "str q20, [%x[out_ptr], #0x30]\n" + "uadalp v2.8h, v20.16b\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 14f\n" - "zip2 v24.2d, v27.2d, v24.2d\n" - "str q24, [%x[out_ptr], #0x0]\n" - "zip2 v21.2d, v25.2d, v21.2d\n" - "uadalp v5.8h, v24.16b\n" - "zip2 v18.2d, v22.2d, v18.2d\n" - "str q21, [%x[out_ptr], #0x10]\n" - "uadalp v4.8h, v21.16b\n" - "zip2 v16.2d, v19.2d, v16.2d\n" - "str q18, [%x[out_ptr], #0x20]\n" - "uadalp v3.8h, v18.16b\n" + "zip2 v19.2d, v27.2d, v19.2d\n" + "zip2 v18.2d, v25.2d, v18.2d\n" + "str q19, [%x[out_ptr], #0x0]\n" + "zip2 v17.2d, v23.2d, v17.2d\n" + "zip2 v16.2d, v21.2d, v16.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "uadalp v5.8h, v19.16b\n" + "uadalp v4.8h, v18.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "uadalp v3.8h, v17.16b\n" "str q16, [%x[out_ptr], #0x30]\n" "uadalp v2.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "14:" // Odds skip "uadalp v1.4s, v5.8h\n" "uadalp v0.4s, v4.8h\n" - "addp v1.4s, v1.4s, v0.4s\n" "uadalp v31.4s, v3.8h\n" "uadalp v30.4s, v2.8h\n" - "add v1.4s, v1.4s, v29.4s\n" - "str q1, [%x[out_ptr], #0x0]\n" + "addp v1.4s, v1.4s, v0.4s\n" "addp v0.4s, v31.4s, v30.4s\n" + "add v1.4s, v1.4s, v29.4s\n" "add v0.4s, v0.4s, v28.4s\n" + "str q1, [%x[out_ptr], #0x0]\n" "str q0, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp index c6ff375ea2..51b91d16e1 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,105 +34,105 @@ void interleave_block<1, 2, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntw x22, ALL, MUL #2\n" + "sub x28, %x[width], #0x1\n" "cntw x21, ALL, MUL #2\n" - "sub x27, %x[width], #0x1\n" - "cntw x20, ALL, MUL #2\n" - "sub x19, x21, #0x1\n" + "sub x20, x22, #0x1\n" "whilelt p10.s, XZR, %x[height]\n" - "add x27, x27, x20\n" - "ands x26, %x[width], x19\n" - "udiv x27, x27, x20\n" - "csel x26, x26, x21, NE\n" - "mov x25, #0x0\n" - "and x24, x27, #0x1\n" - "sub x27, x27, #0x1\n" - "add x26, x26, #0x1\n" - "mov x19, %x[width]\n" + "add x28, x28, x21\n" + "ands x27, %x[width], x20\n" + "udiv x28, x28, x21\n" + "csel x27, x27, x22, NE\n" + "mov x26, #0x0\n" + "and x25, x28, #0x1\n" + "sub x28, x28, #0x1\n" + "add x27, x27, #0x1\n" + "mov x20, %x[width]\n" "ptrue p0.b\n" - "mov x23, %x[outptr_raw]\n" - "mov x22, %x[row_offset]\n" - "cntw x21\n" + "mov x24, %x[outptr_raw]\n" + "mov x23, %x[row_offset]\n" + "cntw x22\n" + "lsr x28, x28, #0x1\n" "lsr x27, x27, #0x1\n" - "lsr x26, x26, #0x1\n" "mov x12, #0x0\n" - ".inst 0x25b34731 // whilelt pn9.s, x25, x19, VLx2\n" - "mov x20, %x[in]\n" + ".inst 0x25b44751 // whilelt pn9.s, x26, x20, VLx2\n" + "mov x21, %x[in]\n" "1:" // Width loop: Preamble: Loop - "ldr x19, [x20], #0x8\n" + "ldr x20, [x21], #0x8\n" ".inst 0x25306548 // psel p8.s, p9.s/Z, p10.s[w12]\n" - ".inst 0xa0164266 // ld1w { z6.s-z7.s }, pn8.s/Z, [x19, x22, LSL #2]\n" + ".inst 0xa0174286 // ld1w { z6.s-z7.s }, pn8.s/Z, [x20, x23, LSL #2]\n" ".inst 0xc160e0c6 // bfcvt z6.h, { z6.s-z7.s }\n" ".inst 0xc08000c0 // mova za0h.s[x12], p0/M, z6.s\n" "add x12, x12, #0x1\n" - "cmp x12, x21\n" + "cmp x12, x22\n" "blt 1b\n" - "incw x22, ALL, MUL #2\n" - "incw x25, ALL, MUL #2\n" - "cbz x27, 5f\n" + "incw x23, ALL, MUL #2\n" + "incw x26, ALL, MUL #2\n" + "cbz x28, 5f\n" "2:" // Width loop - "mov x19, %x[width]\n" + "mov x20, %x[width]\n" "mov x12, #0x0\n" - ".inst 0x25b34731 // whilelt pn9.s, x25, x19, VLx2\n" - "mov x20, %x[in]\n" + ".inst 0x25b44751 // whilelt pn9.s, x26, x20, VLx2\n" + "mov x21, %x[in]\n" "3:" // Width loop: Odd: Loop - "ldr x19, [x20], #0x8\n" + "ldr x20, [x21], #0x8\n" ".inst 0x25306548 // psel p8.s, p9.s/Z, p10.s[w12]\n" - ".inst 0xa016427e // ld1w { z30.s-z31.s }, pn8.s/Z, [x19, x22, LSL #2]\n" + ".inst 0xa017429e // ld1w { z30.s-z31.s }, pn8.s/Z, [x20, x23, LSL #2]\n" ".inst 0xc160e3de // bfcvt z30.h, { z30.s-z31.s }\n" ".inst 0xc08003c8 // mova za2h.s[x12], p0/M, z30.s\n" ".inst 0xc082800f // mova z15.s, p0/M, za0v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x21\n" - "st1w { z15.s }, p0, [x23]\n" - "addvl x23, x23, #1\n" + "cmp x12, x22\n" + "st1w { z15.s }, p0, [x24]\n" + "addvl x24, x24, #1\n" "blt 3b\n" - "incw x25, ALL, MUL #2\n" - "mov x19, %x[width]\n" - "incw x22, ALL, MUL #2\n" + "incw x26, ALL, MUL #2\n" + "mov x20, %x[width]\n" + "incw x23, ALL, MUL #2\n" "mov x12, #0x0\n" - ".inst 0x25b34731 // whilelt pn9.s, x25, x19, VLx2\n" - "mov x20, %x[in]\n" + ".inst 0x25b44751 // whilelt pn9.s, x26, x20, VLx2\n" + "mov x21, %x[in]\n" "4:" // Width loop: Even: Loop - "ldr x19, [x20], #0x8\n" + "ldr x20, [x21], #0x8\n" ".inst 0x25306548 // psel p8.s, p9.s/Z, p10.s[w12]\n" - ".inst 0xa0164278 // ld1w { z24.s-z25.s }, pn8.s/Z, [x19, x22, LSL #2]\n" + ".inst 0xa0174298 // ld1w { z24.s-z25.s }, pn8.s/Z, [x20, x23, LSL #2]\n" ".inst 0xc160e318 // bfcvt z24.h, { z24.s-z25.s }\n" ".inst 0xc0800300 // mova za0h.s[x12], p0/M, z24.s\n" ".inst 0xc0828110 // mova z16.s, p0/M, za2v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x21\n" - "st1w { z16.s }, p0, [x23]\n" - "addvl x23, x23, #1\n" + "cmp x12, x22\n" + "st1w { z16.s }, p0, [x24]\n" + "addvl x24, x24, #1\n" "blt 4b\n" - "subs x27, x27, #0x1\n" - "incw x22, ALL, MUL #2\n" - "incw x25, ALL, MUL #2\n" + "subs x28, x28, #0x1\n" + "incw x23, ALL, MUL #2\n" + "incw x26, ALL, MUL #2\n" "bgt 2b\n" "5:" // Width loop: Tails - "cbnz x24, 8f\n" - "mov x19, %x[width]\n" + "cbnz x25, 8f\n" + "mov x20, %x[width]\n" "mov x12, #0x0\n" - ".inst 0x25b34731 // whilelt pn9.s, x25, x19, VLx2\n" - "mov x20, %x[in]\n" + ".inst 0x25b44751 // whilelt pn9.s, x26, x20, VLx2\n" + "mov x21, %x[in]\n" "6:" // Width loop: Tails: Even: Odd: Loop - "ldr x19, [x20], #0x8\n" + "ldr x20, [x21], #0x8\n" ".inst 0x25306548 // psel p8.s, p9.s/Z, p10.s[w12]\n" - ".inst 0xa016426e // ld1w { z14.s-z15.s }, pn8.s/Z, [x19, x22, LSL #2]\n" + ".inst 0xa017428e // ld1w { z14.s-z15.s }, pn8.s/Z, [x20, x23, LSL #2]\n" ".inst 0xc160e1ce // bfcvt z14.h, { z14.s-z15.s }\n" ".inst 0xc08001c8 // mova za2h.s[x12], p0/M, z14.s\n" ".inst 0xc0828010 // mova z16.s, p0/M, za0v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x21\n" - "st1w { z16.s }, p0, [x23]\n" - "addvl x23, x23, #1\n" + "cmp x12, x22\n" + "st1w { z16.s }, p0, [x24]\n" + "addvl x24, x24, #1\n" "blt 6b\n" "mov x12, #0x0\n" "7:" // Width loop: Tails: Even: Even: Loop ".inst 0xc0828110 // mova z16.s, p0/M, za2v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x26\n" - "st1w { z16.s }, p0, [x23]\n" - "addvl x23, x23, #1\n" + "cmp x12, x27\n" + "st1w { z16.s }, p0, [x24]\n" + "addvl x24, x24, #1\n" "blt 7b\n" "b 10f\n" "8:" // Width loop: Tails: Odd @@ -140,16 +140,16 @@ void interleave_block<1, 2, VLType::SME, false>( "9:" // Width loop: Tails: Odd: Loop ".inst 0xc0828010 // mova z16.s, p0/M, za0v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x26\n" - "st1w { z16.s }, p0, [x23]\n" - "addvl x23, x23, #1\n" + "cmp x12, x27\n" + "st1w { z16.s }, p0, [x24]\n" + "addvl x24, x24, #1\n" "blt 9b\n" "10:" // End - "mov %x[outptr_raw], x23\n" + "mov %x[outptr_raw], x24\n" ".inst 0xd503467f // SMSTOP\n" : [outptr_raw] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp index e712eca3ff..25bfad18b1 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,62 +34,62 @@ void interleave_block<2, 2, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntw x22, ALL, MUL #2\n" + "cntw x9\n" + "sub x28, %x[width], #0x1\n" "cntw x21, ALL, MUL #2\n" - "cntw x28\n" - "sub x27, %x[width], #0x1\n" - "cntw x20, ALL, MUL #2\n" - "sub x19, x21, #0x1\n" + "sub x20, x22, #0x1\n" ".inst 0x25207815 // ptrue pn13.b\n" "whilelt p12.s, XZR, %x[height]\n" - "whilelt p11.s, x28, %x[height]\n" - "add x27, x27, x20\n" - "ands x26, %x[width], x19\n" - "udiv x27, x27, x20\n" - "csel x26, x26, x21, NE\n" - "mov x25, #0x0\n" - "and x24, x27, #0x1\n" - "sub x27, x27, #0x1\n" - "add x26, x26, #0x1\n" - "mov x19, %x[width]\n" - "mov x23, %x[in]\n" + "whilelt p11.s, x9, %x[height]\n" + "add x28, x28, x21\n" + "ands x27, %x[width], x20\n" + "udiv x28, x28, x21\n" + "csel x27, x27, x22, NE\n" + "mov x26, #0x0\n" + "and x25, x28, #0x1\n" + "sub x28, x28, #0x1\n" + "add x27, x27, #0x1\n" + "mov x20, %x[width]\n" + "mov x24, %x[in]\n" "ptrue p0.b\n" - "mov x22, %x[outptr_raw]\n" - "mov x21, %x[row_offset]\n" + "mov x23, %x[outptr_raw]\n" + "mov x22, %x[row_offset]\n" + "lsr x28, x28, #0x1\n" "lsr x27, x27, #0x1\n" - "lsr x26, x26, #0x1\n" "mov x12, #0x0\n" - ".inst 0x25b34732 // whilelt pn10.s, x25, x19, VLx2\n" - "add x20, x23, x28, LSL #3\n" + ".inst 0x25b44752 // whilelt pn10.s, x26, x20, VLx2\n" + "add x21, x24, x9, LSL #3\n" "1:" // Width loop: Preamble: Loop - "ldr x19, [x23], #0x8\n" + "ldr x20, [x24], #0x8\n" ".inst 0x25306989 // psel p9.s, p10.s/Z, p12.s[w12]\n" ".inst 0x25306968 // psel p8.s, p10.s/Z, p11.s[w12]\n" - ".inst 0xa0154678 // ld1w { z24.s-z25.s }, pn9.s/Z, [x19, x21, LSL #2]\n" - "ldr x19, [x20], #0x8\n" - ".inst 0xa0154276 // ld1w { z22.s-z23.s }, pn8.s/Z, [x19, x21, LSL #2]\n" + ".inst 0xa0164698 // ld1w { z24.s-z25.s }, pn9.s/Z, [x20, x22, LSL #2]\n" + "ldr x20, [x21], #0x8\n" + ".inst 0xa0164296 // ld1w { z22.s-z23.s }, pn8.s/Z, [x20, x22, LSL #2]\n" ".inst 0xc160e318 // bfcvt z24.h, { z24.s-z25.s }\n" ".inst 0xc160e2d6 // bfcvt z22.h, { z22.s-z23.s }\n" ".inst 0xc0800300 // mova za0h.s[x12], p0/M, z24.s\n" ".inst 0xc08002c4 // mova za1h.s[x12], p0/M, z22.s\n" "add x12, x12, #0x1\n" - "cmp x12, x28\n" + "cmp x12, x9\n" "blt 1b\n" - "incw x21, ALL, MUL #2\n" - "incw x25, ALL, MUL #2\n" - "cbz x27, 5f\n" + "incw x22, ALL, MUL #2\n" + "incw x26, ALL, MUL #2\n" + "cbz x28, 5f\n" "2:" // Width loop - "mov x19, %x[width]\n" - "mov x23, %x[in]\n" + "mov x20, %x[width]\n" + "mov x24, %x[in]\n" "mov x12, #0x0\n" - ".inst 0x25b34732 // whilelt pn10.s, x25, x19, VLx2\n" - "add x20, x23, x28, LSL #3\n" + ".inst 0x25b44752 // whilelt pn10.s, x26, x20, VLx2\n" + "add x21, x24, x9, LSL #3\n" "3:" // Width loop: Odd: Loop - "ldr x19, [x23], #0x8\n" + "ldr x20, [x24], #0x8\n" ".inst 0x25306989 // psel p9.s, p10.s/Z, p12.s[w12]\n" ".inst 0x25306968 // psel p8.s, p10.s/Z, p11.s[w12]\n" - ".inst 0xa0154676 // ld1w { z22.s-z23.s }, pn9.s/Z, [x19, x21, LSL #2]\n" - "ldr x19, [x20], #0x8\n" - ".inst 0xa015426a // ld1w { z10.s-z11.s }, pn8.s/Z, [x19, x21, LSL #2]\n" + ".inst 0xa0164696 // ld1w { z22.s-z23.s }, pn9.s/Z, [x20, x22, LSL #2]\n" + "ldr x20, [x21], #0x8\n" + ".inst 0xa016428a // ld1w { z10.s-z11.s }, pn8.s/Z, [x20, x22, LSL #2]\n" ".inst 0xc160e2d6 // bfcvt z22.h, { z22.s-z23.s }\n" ".inst 0xc160e14a // bfcvt z10.h, { z10.s-z11.s }\n" ".inst 0xc08002c8 // mova za2h.s[x12], p0/M, z22.s\n" @@ -97,24 +97,24 @@ void interleave_block<2, 2, VLType::SME, false>( ".inst 0xc0828008 // mova z8.s, p0/M, za0v.s[x12]\n" ".inst 0xc0828089 // mova z9.s, p0/M, za1v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x28\n" - ".inst 0xa06056c8 // st1w { z8.s-z9.s }, pn13.b, [x22]\n" - "addvl x22, x22, #2\n" + "cmp x12, x9\n" + ".inst 0xa06056e8 // st1w { z8.s-z9.s }, pn13.b, [x23]\n" + "addvl x23, x23, #2\n" "blt 3b\n" - "incw x25, ALL, MUL #2\n" - "mov x19, %x[width]\n" - "mov x23, %x[in]\n" - "incw x21, ALL, MUL #2\n" + "incw x26, ALL, MUL #2\n" + "mov x20, %x[width]\n" + "mov x24, %x[in]\n" + "incw x22, ALL, MUL #2\n" "mov x12, #0x0\n" - ".inst 0x25b34732 // whilelt pn10.s, x25, x19, VLx2\n" - "add x20, x23, x28, LSL #3\n" + ".inst 0x25b44752 // whilelt pn10.s, x26, x20, VLx2\n" + "add x21, x24, x9, LSL #3\n" "4:" // Width loop: Even: Loop - "ldr x19, [x23], #0x8\n" + "ldr x20, [x24], #0x8\n" ".inst 0x25306989 // psel p9.s, p10.s/Z, p12.s[w12]\n" ".inst 0x25306968 // psel p8.s, p10.s/Z, p11.s[w12]\n" - ".inst 0xa015467a // ld1w { z26.s-z27.s }, pn9.s/Z, [x19, x21, LSL #2]\n" - "ldr x19, [x20], #0x8\n" - ".inst 0xa015427e // ld1w { z30.s-z31.s }, pn8.s/Z, [x19, x21, LSL #2]\n" + ".inst 0xa016469a // ld1w { z26.s-z27.s }, pn9.s/Z, [x20, x22, LSL #2]\n" + "ldr x20, [x21], #0x8\n" + ".inst 0xa016429e // ld1w { z30.s-z31.s }, pn8.s/Z, [x20, x22, LSL #2]\n" ".inst 0xc160e35a // bfcvt z26.h, { z26.s-z27.s }\n" ".inst 0xc160e3de // bfcvt z30.h, { z30.s-z31.s }\n" ".inst 0xc0800340 // mova za0h.s[x12], p0/M, z26.s\n" @@ -122,28 +122,28 @@ void interleave_block<2, 2, VLType::SME, false>( ".inst 0xc0828106 // mova z6.s, p0/M, za2v.s[x12]\n" ".inst 0xc082818e // mova z14.s, p0/M, za3v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x28\n" - ".inst 0xa16056c6 // st1w { z6.s, z14.s }, pn13.b, [x22]\n" - "addvl x22, x22, #2\n" + "cmp x12, x9\n" + ".inst 0xa16056e6 // st1w { z6.s, z14.s }, pn13.b, [x23]\n" + "addvl x23, x23, #2\n" "blt 4b\n" - "subs x27, x27, #0x1\n" - "incw x21, ALL, MUL #2\n" - "incw x25, ALL, MUL #2\n" + "subs x28, x28, #0x1\n" + "incw x22, ALL, MUL #2\n" + "incw x26, ALL, MUL #2\n" "bgt 2b\n" "5:" // Width loop: Tails - "cbnz x24, 8f\n" - "mov x19, %x[width]\n" - "mov x23, %x[in]\n" + "cbnz x25, 8f\n" + "mov x20, %x[width]\n" + "mov x24, %x[in]\n" "mov x12, #0x0\n" - ".inst 0x25b34732 // whilelt pn10.s, x25, x19, VLx2\n" - "add x20, x23, x28, LSL #3\n" + ".inst 0x25b44752 // whilelt pn10.s, x26, x20, VLx2\n" + "add x21, x24, x9, LSL #3\n" "6:" // Width loop: Tails: Even: Odd: Loop - "ldr x19, [x23], #0x8\n" + "ldr x20, [x24], #0x8\n" ".inst 0x25306989 // psel p9.s, p10.s/Z, p12.s[w12]\n" ".inst 0x25306968 // psel p8.s, p10.s/Z, p11.s[w12]\n" - ".inst 0xa015466c // ld1w { z12.s-z13.s }, pn9.s/Z, [x19, x21, LSL #2]\n" - "ldr x19, [x20], #0x8\n" - ".inst 0xa015426e // ld1w { z14.s-z15.s }, pn8.s/Z, [x19, x21, LSL #2]\n" + ".inst 0xa016468c // ld1w { z12.s-z13.s }, pn9.s/Z, [x20, x22, LSL #2]\n" + "ldr x20, [x21], #0x8\n" + ".inst 0xa016428e // ld1w { z14.s-z15.s }, pn8.s/Z, [x20, x22, LSL #2]\n" ".inst 0xc160e18c // bfcvt z12.h, { z12.s-z13.s }\n" ".inst 0xc160e1ce // bfcvt z14.h, { z14.s-z15.s }\n" ".inst 0xc0800188 // mova za2h.s[x12], p0/M, z12.s\n" @@ -151,18 +151,18 @@ void interleave_block<2, 2, VLType::SME, false>( ".inst 0xc0828007 // mova z7.s, p0/M, za0v.s[x12]\n" ".inst 0xc082808f // mova z15.s, p0/M, za1v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x28\n" - ".inst 0xa16056c7 // st1w { z7.s, z15.s }, pn13.b, [x22]\n" - "addvl x22, x22, #2\n" + "cmp x12, x9\n" + ".inst 0xa16056e7 // st1w { z7.s, z15.s }, pn13.b, [x23]\n" + "addvl x23, x23, #2\n" "blt 6b\n" "mov x12, #0x0\n" "7:" // Width loop: Tails: Even: Even: Loop ".inst 0xc082810e // mova z14.s, p0/M, za2v.s[x12]\n" ".inst 0xc082818f // mova z15.s, p0/M, za3v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x26\n" - ".inst 0xa06056ce // st1w { z14.s-z15.s }, pn13.b, [x22]\n" - "addvl x22, x22, #2\n" + "cmp x12, x27\n" + ".inst 0xa06056ee // st1w { z14.s-z15.s }, pn13.b, [x23]\n" + "addvl x23, x23, #2\n" "blt 7b\n" "b 10f\n" "8:" // Width loop: Tails: Odd @@ -171,16 +171,16 @@ void interleave_block<2, 2, VLType::SME, false>( ".inst 0xc0828014 // mova z20.s, p0/M, za0v.s[x12]\n" ".inst 0xc0828095 // mova z21.s, p0/M, za1v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x26\n" - ".inst 0xa06056d4 // st1w { z20.s-z21.s }, pn13.b, [x22]\n" - "addvl x22, x22, #2\n" + "cmp x12, x27\n" + ".inst 0xa06056f4 // st1w { z20.s-z21.s }, pn13.b, [x23]\n" + "addvl x23, x23, #2\n" "blt 9b\n" "10:" // End - "mov %x[outptr_raw], x22\n" + "mov %x[outptr_raw], x23\n" ".inst 0xd503467f // SMSTOP\n" : [outptr_raw] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp index e08d6d992e..9255831e86 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,51 +34,51 @@ void interleave_block<4, 2, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntw x23, ALL, MUL #2\n" + "cntw x10\n" "cntw x22, ALL, MUL #2\n" - "cntw x9\n" - "cntw x21, ALL, MUL #2\n" - "cntw x19, ALL, MUL #3\n" - "sub x20, x22, #0x1\n" + "cntw x20, ALL, MUL #3\n" + "sub x21, x23, #0x1\n" ".inst 0x25207817 // ptrue pn15.b\n" "whilelt p1.s, XZR, %x[height]\n" - "whilelt p14.s, x9, %x[height]\n" - "whilelt p13.s, x21, %x[height]\n" - "whilelt p12.s, x19, %x[height]\n" - "sub x28, %x[width], #0x1\n" - "cntw x19, ALL, MUL #2\n" - "ands x27, %x[width], x20\n" - "mov x26, %x[in]\n" - "add x28, x28, x19\n" - "csel x27, x27, x22, NE\n" - "add x25, x26, x9, LSL #3\n" - "mov x24, #0x0\n" - "udiv x28, x28, x19\n" - "add x27, x27, #0x1\n" - "mov x19, %x[width]\n" - "add x23, x25, x9, LSL #3\n" + "whilelt p14.s, x10, %x[height]\n" + "whilelt p13.s, x22, %x[height]\n" + "whilelt p12.s, x20, %x[height]\n" + "sub x9, %x[width], #0x1\n" + "cntw x20, ALL, MUL #2\n" + "ands x28, %x[width], x21\n" + "mov x27, %x[in]\n" + "add x9, x9, x20\n" + "csel x28, x28, x23, NE\n" + "add x26, x27, x10, LSL #3\n" + "mov x25, #0x0\n" + "udiv x9, x9, x20\n" + "add x28, x28, #0x1\n" + "mov x20, %x[width]\n" + "add x24, x26, x10, LSL #3\n" "ptrue p0.b\n" - "mov x22, %x[outptr_raw]\n" - "mov x21, %x[row_offset]\n" - "sub x28, x28, #0x1\n" - "lsr x27, x27, #0x1\n" + "mov x23, %x[outptr_raw]\n" + "mov x22, %x[row_offset]\n" + "sub x9, x9, #0x1\n" + "lsr x28, x28, #0x1\n" "mov x12, #0x0\n" - ".inst 0x25b34713 // whilelt pn11.s, x24, x19, VLx2\n" - "add x20, x23, x9, LSL #3\n" + ".inst 0x25b44733 // whilelt pn11.s, x25, x20, VLx2\n" + "add x21, x24, x10, LSL #3\n" "1:" // Width loop: Preamble: Loop - "ldr x19, [x26], #0x8\n" + "ldr x20, [x27], #0x8\n" ".inst 0x25306c28 // psel p8.s, p11.s/Z, p1.s[w12]\n" ".inst 0x25306dca // psel p10.s, p11.s/Z, p14.s[w12]\n" - ".inst 0xa0154278 // ld1w { z24.s-z25.s }, pn8.s/Z, [x19, x21, LSL #2]\n" - "ldr x19, [x25], #0x8\n" + ".inst 0xa0164298 // ld1w { z24.s-z25.s }, pn8.s/Z, [x20, x22, LSL #2]\n" + "ldr x20, [x26], #0x8\n" ".inst 0x25306da9 // psel p9.s, p11.s/Z, p13.s[w12]\n" ".inst 0x25306d88 // psel p8.s, p11.s/Z, p12.s[w12]\n" - ".inst 0xa0154a62 // ld1w { z2.s-z3.s }, pn10.s/Z, [x19, x21, LSL #2]\n" - "ldr x19, [x23], #0x8\n" - ".inst 0xa015466a // ld1w { z10.s-z11.s }, pn9.s/Z, [x19, x21, LSL #2]\n" + ".inst 0xa0164a82 // ld1w { z2.s-z3.s }, pn10.s/Z, [x20, x22, LSL #2]\n" + "ldr x20, [x24], #0x8\n" + ".inst 0xa016468a // ld1w { z10.s-z11.s }, pn9.s/Z, [x20, x22, LSL #2]\n" ".inst 0xc160e318 // bfcvt z24.h, { z24.s-z25.s }\n" ".inst 0xc160e042 // bfcvt z2.h, { z2.s-z3.s }\n" - "ldr x19, [x20], #0x8\n" - ".inst 0xa015426c // ld1w { z12.s-z13.s }, pn8.s/Z, [x19, x21, LSL #2]\n" + "ldr x20, [x21], #0x8\n" + ".inst 0xa016428c // ld1w { z12.s-z13.s }, pn8.s/Z, [x20, x22, LSL #2]\n" ".inst 0xc160e14a // bfcvt z10.h, { z10.s-z11.s }\n" ".inst 0xc160e18c // bfcvt z12.h, { z12.s-z13.s }\n" ".inst 0xc0800300 // mova za0h.s[x12], p0/M, z24.s\n" @@ -86,11 +86,11 @@ void interleave_block<4, 2, VLType::SME, false>( ".inst 0xc0800148 // mova za2h.s[x12], p0/M, z10.s\n" ".inst 0xc080018c // mova za3h.s[x12], p0/M, z12.s\n" "add x12, x12, #0x1\n" - "cmp x12, x9\n" + "cmp x12, x10\n" "blt 1b\n" - "incw x21, ALL, MUL #2\n" - "incw x24, ALL, MUL #2\n" - "cbz x28, 5f\n" + "incw x22, ALL, MUL #2\n" + "incw x25, ALL, MUL #2\n" + "cbz x9, 5f\n" "2:" // Width loop "mov x12, #0x0\n" "3:" // Width loop: Store: Loop @@ -99,32 +99,32 @@ void interleave_block<4, 2, VLType::SME, false>( ".inst 0xc0828119 // mova z25.s, p0/M, za2v.s[x12]\n" ".inst 0xc082819d // mova z29.s, p0/M, za3v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x9\n" - ".inst 0xa160ded1 // st1w { z17.s, z21.s, z25.s, z29.s }, pn15.b, [x22]\n" - "addvl x22, x22, #4\n" + "cmp x12, x10\n" + ".inst 0xa160def1 // st1w { z17.s, z21.s, z25.s, z29.s }, pn15.b, [x23]\n" + "addvl x23, x23, #4\n" "blt 3b\n" - "mov x26, %x[in]\n" - "add x25, x26, x9, LSL #3\n" - "mov x19, %x[width]\n" - "add x23, x25, x9, LSL #3\n" + "mov x27, %x[in]\n" + "add x26, x27, x10, LSL #3\n" + "mov x20, %x[width]\n" + "add x24, x26, x10, LSL #3\n" "mov x12, #0x0\n" - ".inst 0x25b34713 // whilelt pn11.s, x24, x19, VLx2\n" - "add x20, x23, x9, LSL #3\n" + ".inst 0x25b44733 // whilelt pn11.s, x25, x20, VLx2\n" + "add x21, x24, x10, LSL #3\n" "4:" // Width loop: Load: Loop - "ldr x19, [x26], #0x8\n" + "ldr x20, [x27], #0x8\n" ".inst 0x25306c28 // psel p8.s, p11.s/Z, p1.s[w12]\n" ".inst 0x25306dca // psel p10.s, p11.s/Z, p14.s[w12]\n" - ".inst 0xa015426c // ld1w { z12.s-z13.s }, pn8.s/Z, [x19, x21, LSL #2]\n" - "ldr x19, [x25], #0x8\n" + ".inst 0xa016428c // ld1w { z12.s-z13.s }, pn8.s/Z, [x20, x22, LSL #2]\n" + "ldr x20, [x26], #0x8\n" ".inst 0x25306da9 // psel p9.s, p11.s/Z, p13.s[w12]\n" ".inst 0x25306d88 // psel p8.s, p11.s/Z, p12.s[w12]\n" - ".inst 0xa0154a6e // ld1w { z14.s-z15.s }, pn10.s/Z, [x19, x21, LSL #2]\n" - "ldr x19, [x23], #0x8\n" - ".inst 0xa0154672 // ld1w { z18.s-z19.s }, pn9.s/Z, [x19, x21, LSL #2]\n" + ".inst 0xa0164a8e // ld1w { z14.s-z15.s }, pn10.s/Z, [x20, x22, LSL #2]\n" + "ldr x20, [x24], #0x8\n" + ".inst 0xa0164692 // ld1w { z18.s-z19.s }, pn9.s/Z, [x20, x22, LSL #2]\n" ".inst 0xc160e18c // bfcvt z12.h, { z12.s-z13.s }\n" ".inst 0xc160e1ce // bfcvt z14.h, { z14.s-z15.s }\n" - "ldr x19, [x20], #0x8\n" - ".inst 0xa015427e // ld1w { z30.s-z31.s }, pn8.s/Z, [x19, x21, LSL #2]\n" + "ldr x20, [x21], #0x8\n" + ".inst 0xa016429e // ld1w { z30.s-z31.s }, pn8.s/Z, [x20, x22, LSL #2]\n" ".inst 0xc160e252 // bfcvt z18.h, { z18.s-z19.s }\n" ".inst 0xc160e3de // bfcvt z30.h, { z30.s-z31.s }\n" ".inst 0xc0800180 // mova za0h.s[x12], p0/M, z12.s\n" @@ -132,11 +132,11 @@ void interleave_block<4, 2, VLType::SME, false>( ".inst 0xc0800248 // mova za2h.s[x12], p0/M, z18.s\n" ".inst 0xc08003cc // mova za3h.s[x12], p0/M, z30.s\n" "add x12, x12, #0x1\n" - "cmp x12, x9\n" + "cmp x12, x10\n" "blt 4b\n" - "subs x28, x28, #0x1\n" - "incw x21, ALL, MUL #2\n" - "incw x24, ALL, MUL #2\n" + "subs x9, x9, #0x1\n" + "incw x22, ALL, MUL #2\n" + "incw x25, ALL, MUL #2\n" "bgt 2b\n" "5:" // Width loop: Tails "mov x12, #0x0\n" @@ -146,16 +146,16 @@ void interleave_block<4, 2, VLType::SME, false>( ".inst 0xc0828119 // mova z25.s, p0/M, za2v.s[x12]\n" ".inst 0xc082819d // mova z29.s, p0/M, za3v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x27\n" - ".inst 0xa160ded1 // st1w { z17.s, z21.s, z25.s, z29.s }, pn15.b, [x22]\n" - "addvl x22, x22, #4\n" + "cmp x12, x28\n" + ".inst 0xa160def1 // st1w { z17.s, z21.s, z25.s, z29.s }, pn15.b, [x23]\n" + "addvl x23, x23, #4\n" "blt 6b\n" "7:" // End - "mov %x[outptr_raw], x22\n" + "mov %x[outptr_raw], x23\n" ".inst 0xd503467f // SMSTOP\n" : [outptr_raw] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp index 3c8c70776a..9b66a6fb10 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,175 +34,175 @@ void interleave_block<1, 1, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "mov x21, %x[width]\n" + "inch x21\n" + "cnth x11\n" + "sub x21, x21, #0x1\n" + "udiv x21, x21, x11\n" // n_passes = ceildiv(width, VL) "mov x20, %x[width]\n" - "inch x20\n" - "cnth x10\n" - "sub x20, x20, #0x1\n" - "udiv x20, x20, x10\n" // n_passes = ceildiv(width, VL) - "mov x19, %x[width]\n" - "sub x9, x10, #0x1\n" - "sub x28, x20, #0x1\n" - "ands x9, x19, x9\n" - "sub x27, x10, #0x2\n" - "lsl x19, %x[height], #0x1\n" // height * 2 - "mov x26, #0x0\n" - "mov x25, %x[in]\n" - "lsr x28, x28, #0x1\n" // n_loops = (n_passes - 1) / 2 - "ldr x24, [x25, #0x0]\n" - "and x23, x20, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "csel x9, x9, x10, NE\n" - "ldr x22, [x25, #0x8]\n" + "sub x10, x11, #0x1\n" + "sub x9, x21, #0x1\n" + "ands x10, x20, x10\n" + "sub x28, x11, #0x2\n" + "lsl x20, %x[height], #0x1\n" // height * 2 + "mov x27, #0x0\n" + "mov x26, %x[in]\n" + "lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2 + "ldr x25, [x26, #0x0]\n" + "and x24, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "csel x10, x10, x11, NE\n" + "ldr x23, [x26, #0x8]\n" "ptrue p11.h\n" - "whilelt p10.h, XZR, x19\n" - "mov x21, %x[row_offset]\n" - "mov x20, %x[out]\n" - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" - "add x25, x25, #0x10\n" + "whilelt p10.h, XZR, x20\n" + "mov x22, %x[row_offset]\n" + "mov x21, %x[out]\n" + "whilelt p9.h, x27, %x[width]\n" + "whilelt p8.h, x27, %x[width]\n" + "add x26, x26, #0x10\n" "mov x12, #0x0\n" - "cbz x27, 2f\n" + "cbz x28, 2f\n" "1:" // K loop: Charge: Loop ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550300 // ld1h { za0h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" + ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" ".inst 0x25386140 // psel p0.h, p8.h/Z, p10.h[w12, #1]\n" - "ldr x24, [x25, #0x0]\n" - ".inst 0xe05502c1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x22, x21, LSL #1]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0xe05602e1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x23, x22, LSL #1]\n" "add x12, x12, #0x2\n" - "cmp x12, x27\n" - "ldr x22, [x25, #0x8]\n" - "add x25, x25, #0x10\n" + "cmp x12, x28\n" + "ldr x23, [x26, #0x8]\n" + "add x26, x26, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550300 // ld1h { za0h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" + ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" ".inst 0x25386140 // psel p0.h, p8.h/Z, p10.h[w12, #1]\n" - "mov x25, %x[in]\n" - ".inst 0xe05502c1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x22, x21, LSL #1]\n" - "ldr x24, [x25, #0x0]\n" - "inch x21\n" - "ldr x22, [x25, #0x8]\n" - "add x25, x25, #0x10\n" - "inch x26\n" - "cbz x28, 8f\n" - "mov x19, x28\n" + "mov x26, %x[in]\n" + ".inst 0xe05602e1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x23, x22, LSL #1]\n" + "ldr x25, [x26, #0x0]\n" + "inch x22\n" + "ldr x23, [x26, #0x8]\n" + "add x26, x26, #0x10\n" + "inch x27\n" + "cbz x9, 8f\n" + "mov x20, x9\n" "3:" // K loop: Main loop - "whilelt p8.h, x26, %x[width]\n" + "whilelt p8.h, x27, %x[width]\n" "mov x12, #0x0\n" - "cbz x27, 5f\n" + "cbz x28, 5f\n" "4:" // K loop: Main loop: First: Loop ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550308 // ld1h { za1h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" + ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n" - "ldr x24, [x25, #0x0]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe05506c9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x22, x21, LSL #1]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe07f8280 // st1h { za0v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" + ".inst 0xe05606e9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n" - ".inst 0xe06a8281 // st1h { za0v.h[x12, #1] }, p0/Z, [x20, x10, LSL #1]\n" + ".inst 0xe06b82a1 // st1h { za0v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n" "add x12, x12, #0x2\n" - "cmp x12, x27\n" - "add x25, x25, #0x10\n" - "addvl x20, x20, #2\n" + "cmp x12, x28\n" + "add x26, x26, #0x10\n" + "addvl x21, x21, #2\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550308 // ld1h { za1h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" - "mov x25, %x[in]\n" - "ldr x24, [x25, #0x0]\n" + ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n" ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe05506c9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x22, x21, LSL #1]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe07f8280 // st1h { za0v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" + ".inst 0xe05606e9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n" - "whilelt p9.h, x26, %x[width]\n" - "inch x26\n" - "add x25, x25, #0x10\n" - ".inst 0xe06a8281 // st1h { za0v.h[x12, #1] }, p0/Z, [x20, x10, LSL #1]\n" - "addvl x20, x20, #2\n" - "inch x21\n" - "whilelt p8.h, x26, %x[width]\n" + "whilelt p9.h, x27, %x[width]\n" + "inch x27\n" + "add x26, x26, #0x10\n" + ".inst 0xe06b82a1 // st1h { za0v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n" + "addvl x21, x21, #2\n" + "inch x22\n" + "whilelt p8.h, x27, %x[width]\n" "mov x12, #0x0\n" - "cbz x27, 7f\n" + "cbz x28, 7f\n" "6:" // K loop: Main loop: Second: Loop ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550300 // ld1h { za0h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" + ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n" - "ldr x24, [x25, #0x0]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe05506c1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x22, x21, LSL #1]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe07f8288 // st1h { za1v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" + ".inst 0xe05606e1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n" - ".inst 0xe06a8289 // st1h { za1v.h[x12, #1] }, p0/Z, [x20, x10, LSL #1]\n" + ".inst 0xe06b82a9 // st1h { za1v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n" "add x12, x12, #0x2\n" - "cmp x12, x27\n" - "add x25, x25, #0x10\n" - "addvl x20, x20, #2\n" + "cmp x12, x28\n" + "add x26, x26, #0x10\n" + "addvl x21, x21, #2\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550300 // ld1h { za0h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" - "mov x25, %x[in]\n" - "ldr x24, [x25, #0x0]\n" + ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n" ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe05506c1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x22, x21, LSL #1]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe07f8288 // st1h { za1v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" + ".inst 0xe05606e1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n" - "whilelt p9.h, x26, %x[width]\n" - "subs x19, x19, #0x1\n" - "add x25, x25, #0x10\n" - ".inst 0xe06a8289 // st1h { za1v.h[x12, #1] }, p0/Z, [x20, x10, LSL #1]\n" - "addvl x20, x20, #2\n" - "inch x26\n" - "inch x21\n" + "whilelt p9.h, x27, %x[width]\n" + "subs x20, x20, #0x1\n" + "add x26, x26, #0x10\n" + ".inst 0xe06b82a9 // st1h { za1v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n" + "addvl x21, x21, #2\n" + "inch x27\n" + "inch x22\n" "bgt 3b\n" "8:" // K loop: Tails - "cbnz x23, 11f\n" - "mov x25, %x[in]\n" - "whilelt p8.h, x26, %x[width]\n" + "cbnz x24, 11f\n" + "mov x26, %x[in]\n" + "whilelt p8.h, x27, %x[width]\n" "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe07f8280 // st1h { za0v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" - "ldr x24, [x25, #0x0]\n" + ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550308 // ld1h { za1h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" + ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" - "add x25, x25, #0x8\n" - "addvl x20, x20, #1\n" + "cmp x12, x11\n" + "add x26, x26, #0x8\n" + "addvl x21, x21, #1\n" "blt 9b\n" - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" + "whilelt p9.h, x27, %x[width]\n" + "whilelt p8.h, x27, %x[width]\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe07f8288 // st1h { za1v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" + ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" "add x12, x12, #0x1\n" - "cmp x12, x9\n" - "addvl x20, x20, #1\n" + "cmp x12, x10\n" + "addvl x21, x21, #1\n" "blt 10b\n" - "whilelt p9.h, x26, %x[width]\n" + "whilelt p9.h, x27, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe07f8280 // st1h { za0v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" + ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" "add x12, x12, #0x1\n" - "cmp x12, x9\n" - "addvl x20, x20, #1\n" + "cmp x12, x10\n" + "addvl x21, x21, #1\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x20\n" + "mov %x[out], x21\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp index 81b346c9ba..d0375de76f 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,186 +34,186 @@ void interleave_block<1, 2, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" - "cnth x20\n" + "cnth x22\n" + "mov x21, %x[width]\n" + "inch x21\n" + "mov x20, %x[width]\n" + "sub x11, x22, #0x1\n" + "sub x21, x21, #0x1\n" + "ands x11, x20, x11\n" "cntw x10\n" - "mov x19, %x[width]\n" - "inch x19\n" - "sub x19, x19, #0x1\n" - "udiv x19, x19, x20\n" // n_passes = ceildiv(width, VL) - "sub x9, x19, #0x1\n" - "lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x28, x19, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "mov x19, %x[width]\n" - "sub x27, x20, #0x1\n" - "ands x27, x19, x27\n" - "csel x27, x27, x20, NE\n" - "add x27, x27, #0x1\n" - "lsr x27, x27, #0x1\n" - "sub x26, x10, #0x2\n" + "udiv x21, x21, x22\n" // n_passes = ceildiv(width, VL) + "csel x11, x11, x22, NE\n" + "sub x9, x21, #0x1\n" + "add x11, x11, #0x1\n" + "sub x28, x10, #0x2\n" + "lsl x20, %x[height], #0x1\n" // height * 2 + "mov x27, #0x0\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" + "lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2 + "and x24, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "ldr x23, [x26, #0x8]\n" + "lsr x11, x11, #0x1\n" "ptrue p11.s\n" - "lsl x19, %x[height], #0x1\n" // height * 2 - "whilelt p10.h, XZR, x19\n" - "mov x25, %x[row_offset]\n" - "mov x24, %x[out]\n" - "mov x23, #0x0\n" - "whilelt p9.h, x23, %x[width]\n" - "whilelt p8.h, x23, %x[width]\n" - "mov x22, %x[in]\n" - "ldr x21, [x22, #0x0]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" + "whilelt p10.h, XZR, x20\n" + "mov x22, %x[row_offset]\n" + "mov x21, %x[out]\n" + "whilelt p9.h, x27, %x[width]\n" + "whilelt p8.h, x27, %x[width]\n" + "add x26, x26, #0x10\n" "mov x12, #0x0\n" - "cbz x26, 2f\n" + "cbz x28, 2f\n" "1:" // K loop: Charge: Loop - ".inst 0x25286140 // dup p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe05902a0 // ld1h { za0h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n" - ".inst 0x25686140 // dup p0.h, p8.h/Z, p10.h[w12, #2]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0590282 // ld1h { za0h.h[x12, #2] }, p0/Z, [x20, x25, LSL #1]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" + ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" + ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" + ".inst 0x25686140 // psel p0.h, p8.h/Z, p10.h[w12, #2]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0xe05602e2 // ld1h { za0h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n" "add x12, x12, #0x4\n" - "cmp x12, x26, LSL #1\n" + "cmp x12, x28, LSL #1\n" + "ldr x23, [x26, #0x8]\n" + "add x26, x26, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End - ".inst 0x25286140 // dup p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe05902a0 // ld1h { za0h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n" - ".inst 0x25686140 // dup p0.h, p8.h/Z, p10.h[w12, #2]\n" - "mov x22, %x[in]\n" - ".inst 0xe0590282 // ld1h { za0h.h[x12, #2] }, p0/Z, [x20, x25, LSL #1]\n" - "ldr x21, [x22, #0x0]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - "inch x25\n" - "inch x23\n" + ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" + ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" + ".inst 0x25686140 // psel p0.h, p8.h/Z, p10.h[w12, #2]\n" + "mov x26, %x[in]\n" + ".inst 0xe05602e2 // ld1h { za0h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n" + "ldr x25, [x26, #0x0]\n" + "inch x22\n" + "ldr x23, [x26, #0x8]\n" + "add x26, x26, #0x10\n" + "inch x27\n" "cbz x9, 8f\n" - "mov x19, x9\n" + "mov x20, x9\n" "3:" // K loop: Main loop - "whilelt p8.h, x23, %x[width]\n" + "whilelt p8.h, x27, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x26, 5f\n" + "cbz x28, 5f\n" "4:" // K loop: Main loop: First: Loop - ".inst 0x25396140 // dup p0.h, p8.h/Z, p10.h[w13, #1]\n" - ".inst 0xe05922a1 // ld1h { za0h.h[x13, #1] }, p0/Z, [x21, x25, LSL #1]\n" - ".inst 0x25796140 // dup p0.h, p8.h/Z, p10.h[w13, #3]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0592283 // ld1h { za0h.h[x13, #3] }, p0/Z, [x20, x25, LSL #1]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "add x13, x13, #0x4\n" - ".inst 0xe0aa8301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" - "addvl x24, x24, #2\n" + ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n" + ".inst 0xe0562321 // ld1h { za0h.h[x13, #1] }, p0/Z, [x25, x22, LSL #1]\n" + ".inst 0x25796141 // psel p1.h, p8.h/Z, p10.h[w13, #3]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe05626e3 // ld1h { za0h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xe0aa82a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x26\n" + "cmp x12, x28\n" + "add x26, x26, #0x10\n" + "addvl x21, x21, #2\n" + "add x13, x13, #0x4\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail - "mov x22, %x[in]\n" - ".inst 0x25396140 // dup p0.h, p8.h/Z, p10.h[w13, #1]\n" - ".inst 0xe05922a1 // ld1h { za0h.h[x13, #1] }, p0/Z, [x21, x25, LSL #1]\n" - ".inst 0x25796140 // dup p0.h, p8.h/Z, p10.h[w13, #3]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0592283 // ld1h { za0h.h[x13, #3] }, p0/Z, [x20, x25, LSL #1]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.h, x23, %x[width]\n" - ".inst 0xe0aa8301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" - "addvl x24, x24, #2\n" - "inch x23\n" - "inch x25\n" - "whilelt p8.h, x23, %x[width]\n" + ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n" + ".inst 0xe0562321 // ld1h { za0h.h[x13, #1] }, p0/Z, [x25, x22, LSL #1]\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0x25796141 // psel p1.h, p8.h/Z, p10.h[w13, #3]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe05626e3 // ld1h { za0h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + "whilelt p9.h, x27, %x[width]\n" + "inch x27\n" + "add x26, x26, #0x10\n" + ".inst 0xe0aa82a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n" + "addvl x21, x21, #2\n" + "inch x22\n" + "whilelt p8.h, x27, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x26, 7f\n" + "cbz x28, 7f\n" "6:" // K loop: Main loop: Second: Loop - ".inst 0x25296140 // dup p0.h, p8.h/Z, p10.h[w13]\n" - ".inst 0xe05922a0 // ld1h { za0h.h[x13] }, p0/Z, [x21, x25, LSL #1]\n" - ".inst 0x25696140 // dup p0.h, p8.h/Z, p10.h[w13, #2]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0592282 // ld1h { za0h.h[x13, #2] }, p0/Z, [x20, x25, LSL #1]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "add x13, x13, #0x4\n" - ".inst 0xe0aa8309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" - "addvl x24, x24, #2\n" + ".inst 0x25296140 // psel p0.h, p8.h/Z, p10.h[w13]\n" + ".inst 0xe0562320 // ld1h { za0h.h[x13] }, p0/Z, [x25, x22, LSL #1]\n" + ".inst 0x25696141 // psel p1.h, p8.h/Z, p10.h[w13, #2]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe05626e2 // ld1h { za0h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x26\n" + "cmp x12, x28\n" + "add x26, x26, #0x10\n" + "addvl x21, x21, #2\n" + "add x13, x13, #0x4\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail - "mov x22, %x[in]\n" - ".inst 0x25296140 // dup p0.h, p8.h/Z, p10.h[w13]\n" - ".inst 0xe05922a0 // ld1h { za0h.h[x13] }, p0/Z, [x21, x25, LSL #1]\n" - ".inst 0x25696140 // dup p0.h, p8.h/Z, p10.h[w13, #2]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0592282 // ld1h { za0h.h[x13, #2] }, p0/Z, [x20, x25, LSL #1]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.h, x23, %x[width]\n" - ".inst 0xe0aa8309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" - "addvl x24, x24, #2\n" - "inch x23\n" - "inch x25\n" - "subs x19, x19, #0x1\n" + ".inst 0x25296140 // psel p0.h, p8.h/Z, p10.h[w13]\n" + ".inst 0xe0562320 // ld1h { za0h.h[x13] }, p0/Z, [x25, x22, LSL #1]\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0x25696141 // psel p1.h, p8.h/Z, p10.h[w13, #2]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe05626e2 // ld1h { za0h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + "whilelt p9.h, x27, %x[width]\n" + "subs x20, x20, #0x1\n" + "add x26, x26, #0x10\n" + ".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n" + "addvl x21, x21, #2\n" + "inch x27\n" + "inch x22\n" "bgt 3b\n" "8:" // K loop: Tails - "cbnz x28, 11f\n" - "mov x22, %x[in]\n" - "whilelt p8.h, x23, %x[width]\n" + "cbnz x24, 11f\n" + "mov x26, %x[in]\n" + "whilelt p8.h, x27, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25396140 // dup p0.h, p8.h/Z, p10.h[w13, #1]\n" - "addvl x24, x24, #1\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe05922a1 // ld1h { za0h.h[x13, #1] }, p0/Z, [x21, x25, LSL #1]\n" - "add x22, x22, #0x8\n" - "add x13, x13, #0x2\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + "ldr x25, [x26, #0x0]\n" "add x12, x12, #0x1\n" + ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n" "cmp x12, x10\n" + ".inst 0xe0562321 // ld1h { za0h.h[x13, #1] }, p0/Z, [x25, x22, LSL #1]\n" + "add x26, x26, #0x8\n" + "addvl x21, x21, #1\n" + "add x13, x13, #0x2\n" "blt 9b\n" - "whilelt p9.h, x23, %x[width]\n" - "whilelt p8.h, x23, %x[width]\n" - "mov x19, #0x0\n" + "whilelt p9.h, x27, %x[width]\n" + "whilelt p8.h, x27, %x[width]\n" + "mov x20, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - "addvl x24, x24, #1\n" - "add x19, x19, #0x2\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x27\n" + "cmp x12, x11\n" + "addvl x21, x21, #1\n" + "add x20, x20, #0x2\n" "blt 10b\n" - "whilelt p9.h, x23, %x[width]\n" + "whilelt p9.h, x27, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - "addvl x24, x24, #1\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x27\n" + "cmp x12, x11\n" + "addvl x21, x21, #1\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x24\n" + "mov %x[out], x21\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p8", "p9", "p10", "p11", "x9", "x10", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp index bee3cc5649..622d9aa4fc 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,189 +34,189 @@ void interleave_block<1, 4, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" - "cntb x20\n" - "mov x22, %x[width]\n" - "incb x22\n" - "mov x19, %x[width]\n" - "sub x9, x20, #0x1\n" - "cntw x28\n" - "sub x22, x22, #0x1\n" - "ands x9, x19, x9\n" - "udiv x22, x22, x20\n" // n_passes = ceildiv(width, VL) - "csel x9, x9, x20, NE\n" - "lsl x21, %x[height], #0x1\n" // height * 2 - "lsl x20, x28, #0x1\n" - "sub x19, x22, #0x1\n" - "add x9, x9, #0x3\n" - "sub x27, x28, #0x2\n" - "whilelt p9.b, XZR, x21\n" - "whilelt p8.b, x20, x21\n" - "mov x26, #0x0\n" - "mov x25, %x[in]\n" - "lsr x19, x19, #0x1\n" // n_loops = (n_passes - 1) / 2 - "ldr x24, [x25, #0x0]\n" - "and x23, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "lsr x9, x9, #0x2\n" - "ldr x22, [x25, #0x8]\n" + "cntb x21\n" + "mov x23, %x[width]\n" + "incb x23\n" + "mov x20, %x[width]\n" + "sub x10, x21, #0x1\n" + "cntw x9\n" + "sub x23, x23, #0x1\n" + "ands x10, x20, x10\n" + "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL) + "csel x10, x10, x21, NE\n" + "lsl x22, %x[height], #0x1\n" // height * 2 + "lsl x21, x9, #0x1\n" + "sub x20, x23, #0x1\n" + "add x10, x10, #0x3\n" + "sub x28, x9, #0x2\n" + "whilelt p9.b, XZR, x22\n" + "whilelt p8.b, x21, x22\n" + "mov x27, #0x0\n" + "mov x26, %x[in]\n" + "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2 + "ldr x25, [x26, #0x0]\n" + "and x24, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "lsr x10, x10, #0x2\n" + "ldr x23, [x26, #0x8]\n" "ptrue p11.s\n" "zip1 p10.b, p9.b, p8.b\n" - "mov x21, %x[row_offset]\n" - "mov x20, %x[out]\n" - "whilelt p9.b, x26, %x[width]\n" - "whilelt p8.b, x26, %x[width]\n" - "add x25, x25, #0x10\n" + "mov x22, %x[row_offset]\n" + "mov x21, %x[out]\n" + "whilelt p9.b, x27, %x[width]\n" + "whilelt p8.b, x27, %x[width]\n" + "add x26, x26, #0x10\n" "mov x12, #0x0\n" - "cbz x27, 2f\n" + "cbz x28, 2f\n" "1:" // K loop: Charge: Loop ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe0150300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x21]\n" + ".inst 0xe0160320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x22]\n" ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n" - "ldr x24, [x25, #0x0]\n" - ".inst 0xe01502c4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x22, x21]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0xe01602e4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x23, x22]\n" "add x12, x12, #0x8\n" - "cmp x12, x27, LSL #2\n" - "ldr x22, [x25, #0x8]\n" - "add x25, x25, #0x10\n" + "cmp x12, x28, LSL #2\n" + "ldr x23, [x26, #0x8]\n" + "add x26, x26, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe0150300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x21]\n" + ".inst 0xe0160320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x22]\n" ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n" - "mov x25, %x[in]\n" - ".inst 0xe01502c4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x22, x21]\n" - "ldr x24, [x25, #0x0]\n" - "incb x21\n" - "ldr x22, [x25, #0x8]\n" - "add x25, x25, #0x10\n" - "incb x26\n" - "cbz x19, 8f\n" - "mov x19, x19\n" + "mov x26, %x[in]\n" + ".inst 0xe01602e4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x23, x22]\n" + "ldr x25, [x26, #0x0]\n" + "incb x22\n" + "ldr x23, [x26, #0x8]\n" + "add x26, x26, #0x10\n" + "incb x27\n" + "cbz x20, 8f\n" + "mov x20, x20\n" "3:" // K loop: Main loop - "whilelt p8.b, x26, %x[width]\n" + "whilelt p8.b, x27, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x27, 5f\n" + "cbz x28, 5f\n" "4:" // K loop: Main loop: First: Loop ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe0152302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x21]\n" + ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n" ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n" - "ldr x24, [x25, #0x0]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe01526c6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x22, x21]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe0bf8280 // st1w { za0v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" + ".inst 0xe01626e6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x23, x22]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0bc8281 // st1w { za0v.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" + ".inst 0xe0a982a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x27\n" - "add x25, x25, #0x10\n" - "addvl x20, x20, #2\n" + "cmp x12, x28\n" + "add x26, x26, #0x10\n" + "addvl x21, x21, #2\n" "add x13, x13, #0x8\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe0152302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x21]\n" - "mov x25, %x[in]\n" - "ldr x24, [x25, #0x0]\n" + ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe01526c6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x22, x21]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe0bf8280 // st1w { za0v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" + ".inst 0xe01626e6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x23, x22]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x26, %x[width]\n" - "incb x26\n" - "add x25, x25, #0x10\n" - ".inst 0xe0bc8281 // st1w { za0v.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" - "addvl x20, x20, #2\n" - "incb x21\n" - "whilelt p8.b, x26, %x[width]\n" + "whilelt p9.b, x27, %x[width]\n" + "incb x27\n" + "add x26, x26, #0x10\n" + ".inst 0xe0a982a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n" + "addvl x21, x21, #2\n" + "incb x22\n" + "whilelt p8.b, x27, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x27, 7f\n" + "cbz x28, 7f\n" "6:" // K loop: Main loop: Second: Loop ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe0152300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x21]\n" + ".inst 0xe0162320 // ld1b { za0h.b[x13] }, p0/Z, [x25, x22]\n" ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n" - "ldr x24, [x25, #0x0]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe01526c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x21]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe0bf8288 // st1w { za2v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" + ".inst 0xe01626e4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x23, x22]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0bc8289 // st1w { za2v.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" + ".inst 0xe0a982a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x27\n" - "add x25, x25, #0x10\n" - "addvl x20, x20, #2\n" + "cmp x12, x28\n" + "add x26, x26, #0x10\n" + "addvl x21, x21, #2\n" "add x13, x13, #0x8\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe0152300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x21]\n" - "mov x25, %x[in]\n" - "ldr x24, [x25, #0x0]\n" + ".inst 0xe0162320 // ld1b { za0h.b[x13] }, p0/Z, [x25, x22]\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe01526c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x21]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe0bf8288 // st1w { za2v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" + ".inst 0xe01626e4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x23, x22]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x26, %x[width]\n" - "subs x19, x19, #0x1\n" - "add x25, x25, #0x10\n" - ".inst 0xe0bc8289 // st1w { za2v.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" - "addvl x20, x20, #2\n" - "incb x26\n" - "incb x21\n" + "whilelt p9.b, x27, %x[width]\n" + "subs x20, x20, #0x1\n" + "add x26, x26, #0x10\n" + ".inst 0xe0a982a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n" + "addvl x21, x21, #2\n" + "incb x27\n" + "incb x22\n" "bgt 3b\n" "8:" // K loop: Tails - "cbnz x23, 11f\n" - "mov x25, %x[in]\n" - "whilelt p8.b, x26, %x[width]\n" + "cbnz x24, 11f\n" + "mov x26, %x[in]\n" + "whilelt p8.b, x27, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8280 // st1w { za0v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" - "ldr x24, [x25, #0x0]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + "ldr x25, [x26, #0x0]\n" "add x12, x12, #0x1\n" ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" - "cmp x12, x28\n" - ".inst 0xe0152302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x21]\n" - "add x25, x25, #0x8\n" - "addvl x20, x20, #1\n" + "cmp x12, x9\n" + ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n" + "add x26, x26, #0x8\n" + "addvl x21, x21, #1\n" "add x13, x13, #0x4\n" "blt 9b\n" - "whilelt p9.b, x26, %x[width]\n" - "whilelt p8.b, x26, %x[width]\n" - "mov x19, #0x0\n" + "whilelt p9.b, x27, %x[width]\n" + "whilelt p8.b, x27, %x[width]\n" + "mov x20, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8288 // st1w { za2v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x9\n" - "addvl x20, x20, #1\n" - "add x19, x19, #0x4\n" + "cmp x12, x10\n" + "addvl x21, x21, #1\n" + "add x20, x20, #0x4\n" "blt 10b\n" - "whilelt p9.b, x26, %x[width]\n" + "whilelt p9.b, x27, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8280 // st1w { za0v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x9\n" - "addvl x20, x20, #1\n" + "cmp x12, x10\n" + "addvl x21, x21, #1\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x20\n" + "mov %x[out], x21\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp index 3ba1b98b73..07f03702d9 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -32,220 +32,220 @@ void interleave_block<1, 4, VLType::SME, true>( { __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntb x21\n" + "mov x23, %x[width]\n" "mov z18.b, #0x1\n" + "incb x23\n" + "mov x20, %x[width]\n" "mov z17.s, #0x0\n" - "cntb x20\n" - "cntw x10\n" - "ptrue p1.b\n" - "mov x19, %x[width]\n" - "incb x19\n" - "sub x19, x19, #0x1\n" - "udiv x19, x19, x20\n" // n_passes = ceildiv(width, VL) - "sub x9, x19, #0x1\n" - "lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x28, x19, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "mov x19, %x[width]\n" - "sub x27, x20, #0x1\n" - "ands x27, x19, x27\n" - "csel x27, x27, x20, NE\n" - "add x27, x27, #0x3\n" - "lsr x27, x27, #0x2\n" - "sub x26, x10, #0x2\n" + "sub x10, x21, #0x1\n" + "cntw x9\n" + "sub x23, x23, #0x1\n" + "ands x10, x20, x10\n" + "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL) + "csel x10, x10, x21, NE\n" + "lsl x22, %x[height], #0x1\n" // height * 2 + "lsl x21, x9, #0x1\n" + "sub x20, x23, #0x1\n" + "add x10, x10, #0x3\n" + "whilelt p9.b, XZR, x22\n" + "whilelt p8.b, x21, x22\n" + "mov x28, #0x0\n" + "ptrue p2.b\n" + "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2 + "and x27, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "lsr x10, x10, #0x2\n" + "sub x26, x9, #0x2\n" "ptrue p11.s\n" - "lsl x20, %x[height], #0x1\n" // height * 2 - "lsl x19, x10, #0x1\n" - "whilelt p9.b, XZR, x20\n" - "whilelt p8.b, x19, x20\n" "zip1 p10.b, p9.b, p8.b\n" "mov x25, %x[row_offset]\n" "mov x24, %x[out]\n" - "mov x23, #0x0\n" - "whilelt p9.b, x23, %x[width]\n" - "whilelt p8.b, x23, %x[width]\n" + "whilelt p9.b, x28, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" "cbnz %x[first], 1f\n" "addvl x24, x24, #-1\n" - "ld1w { z17.s }, p1/Z, [x24]\n" + "ld1w { z17.s }, p2/Z, [x24]\n" "1:" // K loop: Load row sums: End - "mov x22, %x[in]\n" - "ldr x21, [x22, #0x0]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" + "mov x23, %x[in]\n" + "ldr x22, [x23, #0x0]\n" "mov x12, #0x0\n" + "ldr x21, [x23, #0x8]\n" + "add x23, x23, #0x10\n" "cbz x26, 3f\n" "2:" // K loop: Charge: Loop - ".inst 0x25246140 // dup p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01902a0 // ld1b { za0h.b[x12] }, p0/Z, [x21, x25]\n" - ".inst 0x25646140 // dup p0.b, p8.b/Z, p10.b[w12, #4]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0190284 // ld1b { za0h.b[x12, #4] }, p0/Z, [x20, x25]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe01902c0 // ld1b { za0h.b[x12] }, p0/Z, [x22, x25]\n" + ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n" + "ldr x22, [x23, #0x0]\n" + ".inst 0xe01902a4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x21, x25]\n" "add x12, x12, #0x8\n" "cmp x12, x26, LSL #2\n" + "ldr x21, [x23, #0x8]\n" + "add x23, x23, #0x10\n" "blt 2b\n" "3:" // K loop: Charge: End - ".inst 0x25246140 // dup p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01902a0 // ld1b { za0h.b[x12] }, p0/Z, [x21, x25]\n" - ".inst 0x25646140 // dup p0.b, p8.b/Z, p10.b[w12, #4]\n" - "mov x22, %x[in]\n" - ".inst 0xe0190284 // ld1b { za0h.b[x12, #4] }, p0/Z, [x20, x25]\n" - "ldr x21, [x22, #0x0]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe01902c0 // ld1b { za0h.b[x12] }, p0/Z, [x22, x25]\n" + ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n" + "mov x23, %x[in]\n" + ".inst 0xe01902a4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x21, x25]\n" + "ldr x22, [x23, #0x0]\n" "incb x25\n" - "incb x23\n" - "cbz x9, 9f\n" - "mov x19, x9\n" + "ldr x21, [x23, #0x8]\n" + "add x23, x23, #0x10\n" + "incb x28\n" + "cbz x20, 9f\n" + "mov x20, x20\n" "4:" // K loop: Main loop - "whilelt p8.b, x23, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "cbz x26, 6f\n" "5:" // K loop: Main loop: First: Loop - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe01922a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x25]\n" - ".inst 0x25756140 // dup p0.b, p8.b/Z, p10.b[w13, #6]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0192286 // ld1b { za0h.b[x13, #6] }, p0/Z, [x20, x25]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0xc0828410 // mova z16.s, p1/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "add x13, x13, #0x8\n" - ".inst 0xe0aa8301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n" + ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n" + "ldr x22, [x23, #0x0]\n" + ".inst 0xe01922a6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x21, x25]\n" + ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" "sdot z17.s, z16.b, z18.b\n" - ".inst 0xc0828430 // mova z16.s, p1/M, za0v.s[x12, #1]\n" - "addvl x24, x24, #2\n" + "ldr x21, [x23, #0x8]\n" + ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0828830 // mova z16.s, p2/M, za0v.s[x12, #1]\n" + ".inst 0xe0a98301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n" "add x12, x12, #0x2\n" "cmp x12, x26\n" "sdot z17.s, z16.b, z18.b\n" + "add x23, x23, #0x10\n" + "addvl x24, x24, #2\n" + "add x13, x13, #0x8\n" "blt 5b\n" "6:" // K loop: Main loop: First: Tail - "mov x22, %x[in]\n" - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe01922a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x25]\n" - ".inst 0x25756140 // dup p0.b, p8.b/Z, p10.b[w13, #6]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0192286 // ld1b { za0h.b[x13, #6] }, p0/Z, [x20, x25]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0xc0828410 // mova z16.s, p1/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x23, %x[width]\n" - ".inst 0xe0aa8301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n" + ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n" + ".inst 0xe01922a6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x21, x25]\n" + ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n" + "sdot z17.s, z16.b, z18.b\n" + "mov x23, %x[in]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + "ldr x22, [x23, #0x0]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0828830 // mova z16.s, p2/M, za0v.s[x12, #1]\n" + "ldr x21, [x23, #0x8]\n" + ".inst 0xe0bf8700 // st1w { za0v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n" + "whilelt p9.b, x28, %x[width]\n" + "incb x28\n" + "add x23, x23, #0x10\n" + ".inst 0xe0a98301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n" "sdot z17.s, z16.b, z18.b\n" - ".inst 0xc0828430 // mova z16.s, p1/M, za0v.s[x12, #1]\n" "addvl x24, x24, #2\n" - "incb x23\n" "incb x25\n" - "sdot z17.s, z16.b, z18.b\n" - "whilelt p8.b, x23, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "cbz x26, 8f\n" "7:" // K loop: Main loop: Second: Loop - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe01922a0 // ld1b { za0h.b[x13] }, p0/Z, [x21, x25]\n" - ".inst 0x25656140 // dup p0.b, p8.b/Z, p10.b[w13, #4]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0192284 // ld1b { za0h.b[x13, #4] }, p0/Z, [x20, x25]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0xc0828510 // mova z16.s, p1/M, za2v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "add x13, x13, #0x8\n" - ".inst 0xe0aa8309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" + ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" + ".inst 0xe01922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x25]\n" + ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n" + "ldr x22, [x23, #0x0]\n" + ".inst 0xe01922a4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x21, x25]\n" + ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" "sdot z17.s, z16.b, z18.b\n" - ".inst 0xc0828530 // mova z16.s, p1/M, za2v.s[x12, #1]\n" - "addvl x24, x24, #2\n" + "ldr x21, [x23, #0x8]\n" + ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0828930 // mova z16.s, p2/M, za2v.s[x12, #1]\n" + ".inst 0xe0a98309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n" "add x12, x12, #0x2\n" "cmp x12, x26\n" "sdot z17.s, z16.b, z18.b\n" + "add x23, x23, #0x10\n" + "addvl x24, x24, #2\n" + "add x13, x13, #0x8\n" "blt 7b\n" "8:" // K loop: Main loop: Second: Tail - "mov x22, %x[in]\n" - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe01922a0 // ld1b { za0h.b[x13] }, p0/Z, [x21, x25]\n" - ".inst 0x25656140 // dup p0.b, p8.b/Z, p10.b[w13, #4]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0192284 // ld1b { za0h.b[x13, #4] }, p0/Z, [x20, x25]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0xc0828510 // mova z16.s, p1/M, za2v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x23, %x[width]\n" - ".inst 0xe0aa8309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" + ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" + ".inst 0xe01922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x25]\n" + ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n" + ".inst 0xe01922a4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x21, x25]\n" + ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n" + "sdot z17.s, z16.b, z18.b\n" + "mov x23, %x[in]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + "ldr x22, [x23, #0x0]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0828930 // mova z16.s, p2/M, za2v.s[x12, #1]\n" + "ldr x21, [x23, #0x8]\n" + ".inst 0xe0bf8708 // st1w { za2v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n" + "whilelt p9.b, x28, %x[width]\n" + "subs x20, x20, #0x1\n" + "add x23, x23, #0x10\n" + ".inst 0xe0a98309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n" "sdot z17.s, z16.b, z18.b\n" - ".inst 0xc0828530 // mova z16.s, p1/M, za2v.s[x12, #1]\n" "addvl x24, x24, #2\n" - "incb x23\n" + "incb x28\n" "incb x25\n" - "sdot z17.s, z16.b, z18.b\n" - "subs x19, x19, #0x1\n" "bgt 4b\n" "9:" // K loop: Tails - "cbnz x28, 12f\n" - "mov x22, %x[in]\n" - "whilelt p8.b, x23, %x[width]\n" + "cbnz x27, 12f\n" + "mov x23, %x[in]\n" + "whilelt p8.b, x28, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: First - ".inst 0xc0828410 // mova z16.s, p1/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - "addvl x24, x24, #1\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe01922a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x25]\n" + "ldr x22, [x23, #0x0]\n" + ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n" + "add x12, x12, #0x1\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" "sdot z17.s, z16.b, z18.b\n" - "add x22, x22, #0x8\n" + ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n" + "cmp x12, x9\n" + "add x23, x23, #0x8\n" + "addvl x24, x24, #1\n" "add x13, x13, #0x4\n" - "add x12, x12, #0x1\n" - "cmp x12, x10\n" "blt 10b\n" - "whilelt p9.b, x23, %x[width]\n" - "whilelt p8.b, x23, %x[width]\n" - "mov x19, #0x0\n" + "whilelt p9.b, x28, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" + "mov x20, #0x0\n" "mov x12, #0x0\n" "11:" // K loop: Tails: Even: Second - ".inst 0xc0828510 // mova z16.s, p1/M, za2v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - "addvl x24, x24, #1\n" - "add x19, x19, #0x4\n" + ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n" "add x12, x12, #0x1\n" + "cmp x12, x10\n" "sdot z17.s, z16.b, z18.b\n" - "cmp x12, x27\n" + "addvl x24, x24, #1\n" + "add x20, x20, #0x4\n" "blt 11b\n" - "whilelt p9.b, x23, %x[width]\n" + "whilelt p9.b, x28, %x[width]\n" "b 14f\n" "12:" // K loop: Tails: Odd "mov x12, #0x0\n" "13:" // K loop: Tails: Odd: Loop - ".inst 0xc0828410 // mova z16.s, p1/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - "addvl x24, x24, #1\n" + ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x27\n" + "cmp x12, x10\n" "sdot z17.s, z16.b, z18.b\n" + "addvl x24, x24, #1\n" "blt 13b\n" "14:" // K loop: End - "st1w { z17.s }, p1, [x24]\n" + "st1w { z17.s }, p2, [x24]\n" "addvl x24, x24, #1\n" "mov %x[out], x24\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p8", "p9", "p10", "p11", "x9", "x10", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp index 881dfe103e..618570de08 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,189 +34,189 @@ void interleave_block<1, 4, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" - "cntb x20\n" - "mov x22, %x[width]\n" - "incb x22\n" - "mov x19, %x[width]\n" - "sub x9, x20, #0x1\n" - "cntw x28\n" - "sub x22, x22, #0x1\n" - "ands x9, x19, x9\n" - "udiv x22, x22, x20\n" // n_passes = ceildiv(width, VL) - "csel x9, x9, x20, NE\n" - "lsl x21, %x[height], #0x1\n" // height * 2 - "lsl x20, x28, #0x1\n" - "sub x19, x22, #0x1\n" - "add x9, x9, #0x3\n" - "sub x27, x28, #0x2\n" - "whilelt p9.b, XZR, x21\n" - "whilelt p8.b, x20, x21\n" - "mov x26, #0x0\n" - "mov x25, %x[in]\n" - "lsr x19, x19, #0x1\n" // n_loops = (n_passes - 1) / 2 - "ldr x24, [x25, #0x0]\n" - "and x23, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "lsr x9, x9, #0x2\n" - "ldr x22, [x25, #0x8]\n" + "cntb x21\n" + "mov x23, %x[width]\n" + "incb x23\n" + "mov x20, %x[width]\n" + "sub x10, x21, #0x1\n" + "cntw x9\n" + "sub x23, x23, #0x1\n" + "ands x10, x20, x10\n" + "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL) + "csel x10, x10, x21, NE\n" + "lsl x22, %x[height], #0x1\n" // height * 2 + "lsl x21, x9, #0x1\n" + "sub x20, x23, #0x1\n" + "add x10, x10, #0x3\n" + "sub x28, x9, #0x2\n" + "whilelt p9.b, XZR, x22\n" + "whilelt p8.b, x21, x22\n" + "mov x27, #0x0\n" + "mov x26, %x[in]\n" + "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2 + "ldr x25, [x26, #0x0]\n" + "and x24, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "lsr x10, x10, #0x2\n" + "ldr x23, [x26, #0x8]\n" "ptrue p11.s\n" "zip1 p10.b, p9.b, p8.b\n" - "mov x21, %x[row_offset]\n" - "mov x20, %x[out]\n" - "whilelt p9.b, x26, %x[width]\n" - "whilelt p8.b, x26, %x[width]\n" - "add x25, x25, #0x10\n" + "mov x22, %x[row_offset]\n" + "mov x21, %x[out]\n" + "whilelt p9.b, x27, %x[width]\n" + "whilelt p8.b, x27, %x[width]\n" + "add x26, x26, #0x10\n" "mov x12, #0x0\n" - "cbz x27, 2f\n" + "cbz x28, 2f\n" "1:" // K loop: Charge: Loop ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe0150300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x21]\n" + ".inst 0xe0160320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x22]\n" ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n" - "ldr x24, [x25, #0x0]\n" - ".inst 0xe01502c4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x22, x21]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0xe01602e4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x23, x22]\n" "add x12, x12, #0x8\n" - "cmp x12, x27, LSL #2\n" - "ldr x22, [x25, #0x8]\n" - "add x25, x25, #0x10\n" + "cmp x12, x28, LSL #2\n" + "ldr x23, [x26, #0x8]\n" + "add x26, x26, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe0150300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x21]\n" + ".inst 0xe0160320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x22]\n" ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n" - "mov x25, %x[in]\n" - ".inst 0xe01502c4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x22, x21]\n" - "ldr x24, [x25, #0x0]\n" - "incb x21\n" - "ldr x22, [x25, #0x8]\n" - "add x25, x25, #0x10\n" - "incb x26\n" - "cbz x19, 8f\n" - "mov x19, x19\n" + "mov x26, %x[in]\n" + ".inst 0xe01602e4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x23, x22]\n" + "ldr x25, [x26, #0x0]\n" + "incb x22\n" + "ldr x23, [x26, #0x8]\n" + "add x26, x26, #0x10\n" + "incb x27\n" + "cbz x20, 8f\n" + "mov x20, x20\n" "3:" // K loop: Main loop - "whilelt p8.b, x26, %x[width]\n" + "whilelt p8.b, x27, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x27, 5f\n" + "cbz x28, 5f\n" "4:" // K loop: Main loop: First: Loop ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe0152302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x21]\n" + ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n" ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n" - "ldr x24, [x25, #0x0]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe01526c6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x22, x21]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe0bf8280 // st1w { za0v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" + ".inst 0xe01626e6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x23, x22]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0bc8281 // st1w { za0v.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" + ".inst 0xe0a982a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x27\n" - "add x25, x25, #0x10\n" - "addvl x20, x20, #2\n" + "cmp x12, x28\n" + "add x26, x26, #0x10\n" + "addvl x21, x21, #2\n" "add x13, x13, #0x8\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe0152302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x21]\n" - "mov x25, %x[in]\n" - "ldr x24, [x25, #0x0]\n" + ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe01526c6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x22, x21]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe0bf8280 // st1w { za0v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" + ".inst 0xe01626e6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x23, x22]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x26, %x[width]\n" - "incb x26\n" - "add x25, x25, #0x10\n" - ".inst 0xe0bc8281 // st1w { za0v.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" - "addvl x20, x20, #2\n" - "incb x21\n" - "whilelt p8.b, x26, %x[width]\n" + "whilelt p9.b, x27, %x[width]\n" + "incb x27\n" + "add x26, x26, #0x10\n" + ".inst 0xe0a982a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n" + "addvl x21, x21, #2\n" + "incb x22\n" + "whilelt p8.b, x27, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x27, 7f\n" + "cbz x28, 7f\n" "6:" // K loop: Main loop: Second: Loop ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe0152300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x21]\n" + ".inst 0xe0162320 // ld1b { za0h.b[x13] }, p0/Z, [x25, x22]\n" ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n" - "ldr x24, [x25, #0x0]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe01526c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x21]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe0bf8288 // st1w { za2v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" + ".inst 0xe01626e4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x23, x22]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0bc8289 // st1w { za2v.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" + ".inst 0xe0a982a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x27\n" - "add x25, x25, #0x10\n" - "addvl x20, x20, #2\n" + "cmp x12, x28\n" + "add x26, x26, #0x10\n" + "addvl x21, x21, #2\n" "add x13, x13, #0x8\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe0152300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x21]\n" - "mov x25, %x[in]\n" - "ldr x24, [x25, #0x0]\n" + ".inst 0xe0162320 // ld1b { za0h.b[x13] }, p0/Z, [x25, x22]\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe01526c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x21]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe0bf8288 // st1w { za2v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" + ".inst 0xe01626e4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x23, x22]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x26, %x[width]\n" - "subs x19, x19, #0x1\n" - "add x25, x25, #0x10\n" - ".inst 0xe0bc8289 // st1w { za2v.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" - "addvl x20, x20, #2\n" - "incb x26\n" - "incb x21\n" + "whilelt p9.b, x27, %x[width]\n" + "subs x20, x20, #0x1\n" + "add x26, x26, #0x10\n" + ".inst 0xe0a982a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x9, LSL #2]\n" + "addvl x21, x21, #2\n" + "incb x27\n" + "incb x22\n" "bgt 3b\n" "8:" // K loop: Tails - "cbnz x23, 11f\n" - "mov x25, %x[in]\n" - "whilelt p8.b, x26, %x[width]\n" + "cbnz x24, 11f\n" + "mov x26, %x[in]\n" + "whilelt p8.b, x27, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8280 // st1w { za0v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" - "ldr x24, [x25, #0x0]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + "ldr x25, [x26, #0x0]\n" "add x12, x12, #0x1\n" ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" - "cmp x12, x28\n" - ".inst 0xe0152302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x21]\n" - "add x25, x25, #0x8\n" - "addvl x20, x20, #1\n" + "cmp x12, x9\n" + ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n" + "add x26, x26, #0x8\n" + "addvl x21, x21, #1\n" "add x13, x13, #0x4\n" "blt 9b\n" - "whilelt p9.b, x26, %x[width]\n" - "whilelt p8.b, x26, %x[width]\n" - "mov x19, #0x0\n" + "whilelt p9.b, x27, %x[width]\n" + "whilelt p8.b, x27, %x[width]\n" + "mov x20, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8288 // st1w { za2v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x9\n" - "addvl x20, x20, #1\n" - "add x19, x19, #0x4\n" + "cmp x12, x10\n" + "addvl x21, x21, #1\n" + "add x20, x20, #0x4\n" "blt 10b\n" - "whilelt p9.b, x26, %x[width]\n" + "whilelt p9.b, x27, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8280 // st1w { za0v.s[x12] }, p0/Z, [x20, XZR, LSL #2]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x9\n" - "addvl x20, x20, #1\n" + "cmp x12, x10\n" + "addvl x21, x21, #1\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x20\n" + "mov %x[out], x21\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp index 231d7ae213..646db0caa8 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -32,220 +32,220 @@ void interleave_block<1, 4, VLType::SME, true>( { __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntb x21\n" + "mov x23, %x[width]\n" "mov z18.b, #0x1\n" + "incb x23\n" + "mov x20, %x[width]\n" "mov z17.s, #0x0\n" - "cntb x20\n" - "cntw x10\n" - "ptrue p1.b\n" - "mov x19, %x[width]\n" - "incb x19\n" - "sub x19, x19, #0x1\n" - "udiv x19, x19, x20\n" // n_passes = ceildiv(width, VL) - "sub x9, x19, #0x1\n" - "lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x28, x19, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "mov x19, %x[width]\n" - "sub x27, x20, #0x1\n" - "ands x27, x19, x27\n" - "csel x27, x27, x20, NE\n" - "add x27, x27, #0x3\n" - "lsr x27, x27, #0x2\n" - "sub x26, x10, #0x2\n" + "sub x10, x21, #0x1\n" + "cntw x9\n" + "sub x23, x23, #0x1\n" + "ands x10, x20, x10\n" + "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL) + "csel x10, x10, x21, NE\n" + "lsl x22, %x[height], #0x1\n" // height * 2 + "lsl x21, x9, #0x1\n" + "sub x20, x23, #0x1\n" + "add x10, x10, #0x3\n" + "whilelt p9.b, XZR, x22\n" + "whilelt p8.b, x21, x22\n" + "mov x28, #0x0\n" + "ptrue p2.b\n" + "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2 + "and x27, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "lsr x10, x10, #0x2\n" + "sub x26, x9, #0x2\n" "ptrue p11.s\n" - "lsl x20, %x[height], #0x1\n" // height * 2 - "lsl x19, x10, #0x1\n" - "whilelt p9.b, XZR, x20\n" - "whilelt p8.b, x19, x20\n" "zip1 p10.b, p9.b, p8.b\n" "mov x25, %x[row_offset]\n" "mov x24, %x[out]\n" - "mov x23, #0x0\n" - "whilelt p9.b, x23, %x[width]\n" - "whilelt p8.b, x23, %x[width]\n" + "whilelt p9.b, x28, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" "cbnz %x[first], 1f\n" "addvl x24, x24, #-1\n" - "ld1w { z17.s }, p1/Z, [x24]\n" + "ld1w { z17.s }, p2/Z, [x24]\n" "1:" // K loop: Load row sums: End - "mov x22, %x[in]\n" - "ldr x21, [x22, #0x0]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" + "mov x23, %x[in]\n" + "ldr x22, [x23, #0x0]\n" "mov x12, #0x0\n" + "ldr x21, [x23, #0x8]\n" + "add x23, x23, #0x10\n" "cbz x26, 3f\n" "2:" // K loop: Charge: Loop - ".inst 0x25246140 // dup p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01902a0 // ld1b { za0h.b[x12] }, p0/Z, [x21, x25]\n" - ".inst 0x25646140 // dup p0.b, p8.b/Z, p10.b[w12, #4]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0190284 // ld1b { za0h.b[x12, #4] }, p0/Z, [x20, x25]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe01902c0 // ld1b { za0h.b[x12] }, p0/Z, [x22, x25]\n" + ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n" + "ldr x22, [x23, #0x0]\n" + ".inst 0xe01902a4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x21, x25]\n" "add x12, x12, #0x8\n" "cmp x12, x26, LSL #2\n" + "ldr x21, [x23, #0x8]\n" + "add x23, x23, #0x10\n" "blt 2b\n" "3:" // K loop: Charge: End - ".inst 0x25246140 // dup p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01902a0 // ld1b { za0h.b[x12] }, p0/Z, [x21, x25]\n" - ".inst 0x25646140 // dup p0.b, p8.b/Z, p10.b[w12, #4]\n" - "mov x22, %x[in]\n" - ".inst 0xe0190284 // ld1b { za0h.b[x12, #4] }, p0/Z, [x20, x25]\n" - "ldr x21, [x22, #0x0]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe01902c0 // ld1b { za0h.b[x12] }, p0/Z, [x22, x25]\n" + ".inst 0x25646140 // psel p0.b, p8.b/Z, p10.b[w12, #4]\n" + "mov x23, %x[in]\n" + ".inst 0xe01902a4 // ld1b { za0h.b[x12, #4] }, p0/Z, [x21, x25]\n" + "ldr x22, [x23, #0x0]\n" "incb x25\n" - "incb x23\n" - "cbz x9, 9f\n" - "mov x19, x9\n" + "ldr x21, [x23, #0x8]\n" + "add x23, x23, #0x10\n" + "incb x28\n" + "cbz x20, 9f\n" + "mov x20, x20\n" "4:" // K loop: Main loop - "whilelt p8.b, x23, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "cbz x26, 6f\n" "5:" // K loop: Main loop: First: Loop - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe01922a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x25]\n" - ".inst 0x25756140 // dup p0.b, p8.b/Z, p10.b[w13, #6]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0192286 // ld1b { za0h.b[x13, #6] }, p0/Z, [x20, x25]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0xc0828410 // mova z16.s, p1/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "add x13, x13, #0x8\n" - ".inst 0xe0aa8301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n" + ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n" + "ldr x22, [x23, #0x0]\n" + ".inst 0xe01922a6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x21, x25]\n" + ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" "udot z17.s, z16.b, z18.b\n" - ".inst 0xc0828430 // mova z16.s, p1/M, za0v.s[x12, #1]\n" - "addvl x24, x24, #2\n" + "ldr x21, [x23, #0x8]\n" + ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0828830 // mova z16.s, p2/M, za0v.s[x12, #1]\n" + ".inst 0xe0a98301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n" "add x12, x12, #0x2\n" "cmp x12, x26\n" "udot z17.s, z16.b, z18.b\n" + "add x23, x23, #0x10\n" + "addvl x24, x24, #2\n" + "add x13, x13, #0x8\n" "blt 5b\n" "6:" // K loop: Main loop: First: Tail - "mov x22, %x[in]\n" - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe01922a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x25]\n" - ".inst 0x25756140 // dup p0.b, p8.b/Z, p10.b[w13, #6]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0192286 // ld1b { za0h.b[x13, #6] }, p0/Z, [x20, x25]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0xc0828410 // mova z16.s, p1/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x23, %x[width]\n" - ".inst 0xe0aa8301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n" + ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n" + ".inst 0xe01922a6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x21, x25]\n" + ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n" + "udot z17.s, z16.b, z18.b\n" + "mov x23, %x[in]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + "ldr x22, [x23, #0x0]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0828830 // mova z16.s, p2/M, za0v.s[x12, #1]\n" + "ldr x21, [x23, #0x8]\n" + ".inst 0xe0bf8700 // st1w { za0v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n" + "whilelt p9.b, x28, %x[width]\n" + "incb x28\n" + "add x23, x23, #0x10\n" + ".inst 0xe0a98301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n" "udot z17.s, z16.b, z18.b\n" - ".inst 0xc0828430 // mova z16.s, p1/M, za0v.s[x12, #1]\n" "addvl x24, x24, #2\n" - "incb x23\n" "incb x25\n" - "udot z17.s, z16.b, z18.b\n" - "whilelt p8.b, x23, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "cbz x26, 8f\n" "7:" // K loop: Main loop: Second: Loop - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe01922a0 // ld1b { za0h.b[x13] }, p0/Z, [x21, x25]\n" - ".inst 0x25656140 // dup p0.b, p8.b/Z, p10.b[w13, #4]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0192284 // ld1b { za0h.b[x13, #4] }, p0/Z, [x20, x25]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0xc0828510 // mova z16.s, p1/M, za2v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "add x13, x13, #0x8\n" - ".inst 0xe0aa8309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" + ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" + ".inst 0xe01922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x25]\n" + ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n" + "ldr x22, [x23, #0x0]\n" + ".inst 0xe01922a4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x21, x25]\n" + ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" "udot z17.s, z16.b, z18.b\n" - ".inst 0xc0828530 // mova z16.s, p1/M, za2v.s[x12, #1]\n" - "addvl x24, x24, #2\n" + "ldr x21, [x23, #0x8]\n" + ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0828930 // mova z16.s, p2/M, za2v.s[x12, #1]\n" + ".inst 0xe0a98309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n" "add x12, x12, #0x2\n" "cmp x12, x26\n" "udot z17.s, z16.b, z18.b\n" + "add x23, x23, #0x10\n" + "addvl x24, x24, #2\n" + "add x13, x13, #0x8\n" "blt 7b\n" "8:" // K loop: Main loop: Second: Tail - "mov x22, %x[in]\n" - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe01922a0 // ld1b { za0h.b[x13] }, p0/Z, [x21, x25]\n" - ".inst 0x25656140 // dup p0.b, p8.b/Z, p10.b[w13, #4]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0192284 // ld1b { za0h.b[x13, #4] }, p0/Z, [x20, x25]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0xc0828510 // mova z16.s, p1/M, za2v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x23, %x[width]\n" - ".inst 0xe0aa8309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" + ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" + ".inst 0xe01922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x25]\n" + ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n" + ".inst 0xe01922a4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x21, x25]\n" + ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n" + "udot z17.s, z16.b, z18.b\n" + "mov x23, %x[in]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + "ldr x22, [x23, #0x0]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0828930 // mova z16.s, p2/M, za2v.s[x12, #1]\n" + "ldr x21, [x23, #0x8]\n" + ".inst 0xe0bf8708 // st1w { za2v.s[x12] }, p1/Z, [x24, XZR, LSL #2]\n" + "whilelt p9.b, x28, %x[width]\n" + "subs x20, x20, #0x1\n" + "add x23, x23, #0x10\n" + ".inst 0xe0a98309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x9, LSL #2]\n" "udot z17.s, z16.b, z18.b\n" - ".inst 0xc0828530 // mova z16.s, p1/M, za2v.s[x12, #1]\n" "addvl x24, x24, #2\n" - "incb x23\n" + "incb x28\n" "incb x25\n" - "udot z17.s, z16.b, z18.b\n" - "subs x19, x19, #0x1\n" "bgt 4b\n" "9:" // K loop: Tails - "cbnz x28, 12f\n" - "mov x22, %x[in]\n" - "whilelt p8.b, x23, %x[width]\n" + "cbnz x27, 12f\n" + "mov x23, %x[in]\n" + "whilelt p8.b, x28, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: First - ".inst 0xc0828410 // mova z16.s, p1/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - "addvl x24, x24, #1\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe01922a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x25]\n" + "ldr x22, [x23, #0x0]\n" + ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n" + "add x12, x12, #0x1\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" "udot z17.s, z16.b, z18.b\n" - "add x22, x22, #0x8\n" + ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n" + "cmp x12, x9\n" + "add x23, x23, #0x8\n" + "addvl x24, x24, #1\n" "add x13, x13, #0x4\n" - "add x12, x12, #0x1\n" - "cmp x12, x10\n" "blt 10b\n" - "whilelt p9.b, x23, %x[width]\n" - "whilelt p8.b, x23, %x[width]\n" - "mov x19, #0x0\n" + "whilelt p9.b, x28, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" + "mov x20, #0x0\n" "mov x12, #0x0\n" "11:" // K loop: Tails: Even: Second - ".inst 0xc0828510 // mova z16.s, p1/M, za2v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - "addvl x24, x24, #1\n" - "add x19, x19, #0x4\n" + ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n" "add x12, x12, #0x1\n" + "cmp x12, x10\n" "udot z17.s, z16.b, z18.b\n" - "cmp x12, x27\n" + "addvl x24, x24, #1\n" + "add x20, x20, #0x4\n" "blt 11b\n" - "whilelt p9.b, x23, %x[width]\n" + "whilelt p9.b, x28, %x[width]\n" "b 14f\n" "12:" // K loop: Tails: Odd "mov x12, #0x0\n" "13:" // K loop: Tails: Odd: Loop - ".inst 0xc0828410 // mova z16.s, p1/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - "addvl x24, x24, #1\n" + ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x27\n" + "cmp x12, x10\n" "udot z17.s, z16.b, z18.b\n" + "addvl x24, x24, #1\n" "blt 13b\n" "14:" // K loop: End - "st1w { z17.s }, p1, [x24]\n" + "st1w { z17.s }, p2, [x24]\n" "addvl x24, x24, #1\n" "mov %x[out], x24\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p8", "p9", "p10", "p11", "x9", "x10", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp index f80ca640ff..788c1a2eca 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,175 +34,175 @@ void interleave_block<1, 1, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "mov x21, %x[width]\n" + "inch x21\n" + "cnth x11\n" + "sub x21, x21, #0x1\n" + "udiv x21, x21, x11\n" // n_passes = ceildiv(width, VL) "mov x20, %x[width]\n" - "inch x20\n" - "cnth x10\n" - "sub x20, x20, #0x1\n" - "udiv x20, x20, x10\n" // n_passes = ceildiv(width, VL) - "mov x19, %x[width]\n" - "sub x9, x10, #0x1\n" - "sub x28, x20, #0x1\n" - "ands x9, x19, x9\n" - "sub x27, x10, #0x2\n" - "lsl x19, %x[height], #0x1\n" // height * 2 - "mov x26, #0x0\n" - "mov x25, %x[in]\n" - "lsr x28, x28, #0x1\n" // n_loops = (n_passes - 1) / 2 - "ldr x24, [x25, #0x0]\n" - "and x23, x20, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "csel x9, x9, x10, NE\n" - "ldr x22, [x25, #0x8]\n" + "sub x10, x11, #0x1\n" + "sub x9, x21, #0x1\n" + "ands x10, x20, x10\n" + "sub x28, x11, #0x2\n" + "lsl x20, %x[height], #0x1\n" // height * 2 + "mov x27, #0x0\n" + "mov x26, %x[in]\n" + "lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2 + "ldr x25, [x26, #0x0]\n" + "and x24, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "csel x10, x10, x11, NE\n" + "ldr x23, [x26, #0x8]\n" "ptrue p11.h\n" - "whilelt p10.h, XZR, x19\n" - "mov x21, %x[row_offset]\n" - "mov x20, %x[out]\n" - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" - "add x25, x25, #0x10\n" + "whilelt p10.h, XZR, x20\n" + "mov x22, %x[row_offset]\n" + "mov x21, %x[out]\n" + "whilelt p9.h, x27, %x[width]\n" + "whilelt p8.h, x27, %x[width]\n" + "add x26, x26, #0x10\n" "mov x12, #0x0\n" - "cbz x27, 2f\n" + "cbz x28, 2f\n" "1:" // K loop: Charge: Loop ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550300 // ld1h { za0h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" + ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" ".inst 0x25386140 // psel p0.h, p8.h/Z, p10.h[w12, #1]\n" - "ldr x24, [x25, #0x0]\n" - ".inst 0xe05502c1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x22, x21, LSL #1]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0xe05602e1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x23, x22, LSL #1]\n" "add x12, x12, #0x2\n" - "cmp x12, x27\n" - "ldr x22, [x25, #0x8]\n" - "add x25, x25, #0x10\n" + "cmp x12, x28\n" + "ldr x23, [x26, #0x8]\n" + "add x26, x26, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550300 // ld1h { za0h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" + ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" ".inst 0x25386140 // psel p0.h, p8.h/Z, p10.h[w12, #1]\n" - "mov x25, %x[in]\n" - ".inst 0xe05502c1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x22, x21, LSL #1]\n" - "ldr x24, [x25, #0x0]\n" - "inch x21\n" - "ldr x22, [x25, #0x8]\n" - "add x25, x25, #0x10\n" - "inch x26\n" - "cbz x28, 8f\n" - "mov x19, x28\n" + "mov x26, %x[in]\n" + ".inst 0xe05602e1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x23, x22, LSL #1]\n" + "ldr x25, [x26, #0x0]\n" + "inch x22\n" + "ldr x23, [x26, #0x8]\n" + "add x26, x26, #0x10\n" + "inch x27\n" + "cbz x9, 8f\n" + "mov x20, x9\n" "3:" // K loop: Main loop - "whilelt p8.h, x26, %x[width]\n" + "whilelt p8.h, x27, %x[width]\n" "mov x12, #0x0\n" - "cbz x27, 5f\n" + "cbz x28, 5f\n" "4:" // K loop: Main loop: First: Loop ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550308 // ld1h { za1h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" + ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n" - "ldr x24, [x25, #0x0]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe05506c9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x22, x21, LSL #1]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe07f8280 // st1h { za0v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" + ".inst 0xe05606e9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n" - ".inst 0xe06a8281 // st1h { za0v.h[x12, #1] }, p0/Z, [x20, x10, LSL #1]\n" + ".inst 0xe06b82a1 // st1h { za0v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n" "add x12, x12, #0x2\n" - "cmp x12, x27\n" - "add x25, x25, #0x10\n" - "addvl x20, x20, #2\n" + "cmp x12, x28\n" + "add x26, x26, #0x10\n" + "addvl x21, x21, #2\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550308 // ld1h { za1h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" - "mov x25, %x[in]\n" - "ldr x24, [x25, #0x0]\n" + ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n" ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe05506c9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x22, x21, LSL #1]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe07f8280 // st1h { za0v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" + ".inst 0xe05606e9 // ld1h { za1h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n" - "whilelt p9.h, x26, %x[width]\n" - "inch x26\n" - "add x25, x25, #0x10\n" - ".inst 0xe06a8281 // st1h { za0v.h[x12, #1] }, p0/Z, [x20, x10, LSL #1]\n" - "addvl x20, x20, #2\n" - "inch x21\n" - "whilelt p8.h, x26, %x[width]\n" + "whilelt p9.h, x27, %x[width]\n" + "inch x27\n" + "add x26, x26, #0x10\n" + ".inst 0xe06b82a1 // st1h { za0v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n" + "addvl x21, x21, #2\n" + "inch x22\n" + "whilelt p8.h, x27, %x[width]\n" "mov x12, #0x0\n" - "cbz x27, 7f\n" + "cbz x28, 7f\n" "6:" // K loop: Main loop: Second: Loop ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550300 // ld1h { za0h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" + ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n" - "ldr x24, [x25, #0x0]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe05506c1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x22, x21, LSL #1]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe07f8288 // st1h { za1v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" + ".inst 0xe05606e1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n" - ".inst 0xe06a8289 // st1h { za1v.h[x12, #1] }, p0/Z, [x20, x10, LSL #1]\n" + ".inst 0xe06b82a9 // st1h { za1v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n" "add x12, x12, #0x2\n" - "cmp x12, x27\n" - "add x25, x25, #0x10\n" - "addvl x20, x20, #2\n" + "cmp x12, x28\n" + "add x26, x26, #0x10\n" + "addvl x21, x21, #2\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550300 // ld1h { za0h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" - "mov x25, %x[in]\n" - "ldr x24, [x25, #0x0]\n" + ".inst 0xe0560320 // ld1h { za0h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25386141 // psel p1.h, p8.h/Z, p10.h[w12, #1]\n" ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe05506c1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x22, x21, LSL #1]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0xe07f8288 // st1h { za1v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" + ".inst 0xe05606e1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" ".inst 0x25386d20 // psel p0.h, p11.h/Z, p9.h[w12, #1]\n" - "whilelt p9.h, x26, %x[width]\n" - "subs x19, x19, #0x1\n" - "add x25, x25, #0x10\n" - ".inst 0xe06a8289 // st1h { za1v.h[x12, #1] }, p0/Z, [x20, x10, LSL #1]\n" - "addvl x20, x20, #2\n" - "inch x26\n" - "inch x21\n" + "whilelt p9.h, x27, %x[width]\n" + "subs x20, x20, #0x1\n" + "add x26, x26, #0x10\n" + ".inst 0xe06b82a9 // st1h { za1v.h[x12, #1] }, p0/Z, [x21, x11, LSL #1]\n" + "addvl x21, x21, #2\n" + "inch x27\n" + "inch x22\n" "bgt 3b\n" "8:" // K loop: Tails - "cbnz x23, 11f\n" - "mov x25, %x[in]\n" - "whilelt p8.h, x26, %x[width]\n" + "cbnz x24, 11f\n" + "mov x26, %x[in]\n" + "whilelt p8.h, x27, %x[width]\n" "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe07f8280 // st1h { za0v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" - "ldr x24, [x25, #0x0]\n" + ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" + "ldr x25, [x26, #0x0]\n" ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0550308 // ld1h { za1h.h[x12] }, p0/Z, [x24, x21, LSL #1]\n" + ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" - "add x25, x25, #0x8\n" - "addvl x20, x20, #1\n" + "cmp x12, x11\n" + "add x26, x26, #0x8\n" + "addvl x21, x21, #1\n" "blt 9b\n" - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" + "whilelt p9.h, x27, %x[width]\n" + "whilelt p8.h, x27, %x[width]\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe07f8288 // st1h { za1v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" + ".inst 0xe07f82a8 // st1h { za1v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" "add x12, x12, #0x1\n" - "cmp x12, x9\n" - "addvl x20, x20, #1\n" + "cmp x12, x10\n" + "addvl x21, x21, #1\n" "blt 10b\n" - "whilelt p9.h, x26, %x[width]\n" + "whilelt p9.h, x27, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" - ".inst 0xe07f8280 // st1h { za0v.h[x12] }, p0/Z, [x20, XZR, LSL #1]\n" + ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" "add x12, x12, #0x1\n" - "cmp x12, x9\n" - "addvl x20, x20, #1\n" + "cmp x12, x10\n" + "addvl x21, x21, #1\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x20\n" + "mov %x[out], x21\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp index 874fc797a4..7de88543d7 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,174 +34,174 @@ void interleave_block<1, 1, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "mov x22, %x[width]\n" + "incw x22\n" "cntw x10\n" - "mov x19, %x[width]\n" - "incw x19\n" - "sub x19, x19, #0x1\n" - "udiv x19, x19, x10\n" // n_passes = ceildiv(width, VL) - "sub x9, x19, #0x1\n" - "lsr x9, x9, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x28, x19, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "mov x19, %x[width]\n" - "sub x27, x10, #0x1\n" - "ands x27, x19, x27\n" - "csel x27, x27, x10, NE\n" - "sub x26, x10, #0x2\n" + "sub x22, x22, #0x1\n" + "udiv x22, x22, x10\n" // n_passes = ceildiv(width, VL) + "mov x21, %x[width]\n" + "sub x9, x10, #0x1\n" + "sub x20, x22, #0x1\n" + "ands x9, x21, x9\n" + "sub x28, x10, #0x2\n" + "mov x27, #0x0\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" + "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2 + "and x24, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "ldr x23, [x26, #0x8]\n" + "csel x9, x9, x10, NE\n" "ptrue p11.s\n" "whilelt p10.s, XZR, %x[height]\n" - "mov x25, %x[row_offset]\n" - "mov x24, %x[out]\n" - "mov x23, #0x0\n" - "whilelt p9.s, x23, %x[width]\n" - "whilelt p8.s, x23, %x[width]\n" - "mov x22, %x[in]\n" - "ldr x21, [x22, #0x0]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" + "mov x22, %x[row_offset]\n" + "mov x21, %x[out]\n" + "whilelt p9.s, x27, %x[width]\n" + "whilelt p8.s, x27, %x[width]\n" + "add x26, x26, #0x10\n" "mov x12, #0x0\n" - "cbz x26, 2f\n" + "cbz x28, 2f\n" "1:" // K loop: Charge: Loop - ".inst 0x25306140 // dup p0.s, p8.s/Z, p10.s[w12]\n" - ".inst 0xe09902a0 // ld1w { za0h.s[x12] }, p0/Z, [x21, x25, LSL #2]\n" - ".inst 0x25706140 // dup p0.s, p8.s/Z, p10.s[w12, #1]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0990281 // ld1w { za0h.s[x12, #1] }, p0/Z, [x20, x25, LSL #2]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" + ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n" + ".inst 0xe0960320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n" + ".inst 0x25706140 // psel p0.s, p8.s/Z, p10.s[w12, #1]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0xe09602e1 // ld1w { za0h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x26\n" + "cmp x12, x28\n" + "ldr x23, [x26, #0x8]\n" + "add x26, x26, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End - ".inst 0x25306140 // dup p0.s, p8.s/Z, p10.s[w12]\n" - ".inst 0xe09902a0 // ld1w { za0h.s[x12] }, p0/Z, [x21, x25, LSL #2]\n" - ".inst 0x25706140 // dup p0.s, p8.s/Z, p10.s[w12, #1]\n" - "mov x22, %x[in]\n" - ".inst 0xe0990281 // ld1w { za0h.s[x12, #1] }, p0/Z, [x20, x25, LSL #2]\n" - "ldr x21, [x22, #0x0]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - "incw x25\n" - "incw x23\n" - "cbz x9, 8f\n" - "mov x19, x9\n" + ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n" + ".inst 0xe0960320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n" + ".inst 0x25706140 // psel p0.s, p8.s/Z, p10.s[w12, #1]\n" + "mov x26, %x[in]\n" + ".inst 0xe09602e1 // ld1w { za0h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n" + "ldr x25, [x26, #0x0]\n" + "incw x22\n" + "ldr x23, [x26, #0x8]\n" + "add x26, x26, #0x10\n" + "incw x27\n" + "cbz x20, 8f\n" + "mov x20, x20\n" "3:" // K loop: Main loop - "whilelt p8.s, x23, %x[width]\n" + "whilelt p8.s, x27, %x[width]\n" "mov x12, #0x0\n" - "cbz x26, 5f\n" + "cbz x28, 5f\n" "4:" // K loop: Main loop: First: Loop - ".inst 0x25306140 // dup p0.s, p8.s/Z, p10.s[w12]\n" - ".inst 0xe09902a8 // ld1w { za2h.s[x12] }, p0/Z, [x21, x25, LSL #2]\n" - ".inst 0x25706140 // dup p0.s, p8.s/Z, p10.s[w12, #1]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0990289 // ld1w { za2h.s[x12, #1] }, p0/Z, [x20, x25, LSL #2]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0aa8301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" - "addvl x24, x24, #2\n" + ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n" + ".inst 0xe0960328 // ld1w { za2h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n" + ".inst 0x25706141 // psel p1.s, p8.s/Z, p10.s[w12, #1]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe09606e9 // ld1w { za2h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xe0aa82a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x26\n" + "cmp x12, x28\n" + "add x26, x26, #0x10\n" + "addvl x21, x21, #2\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail - "mov x22, %x[in]\n" - ".inst 0x25306140 // dup p0.s, p8.s/Z, p10.s[w12]\n" - ".inst 0xe09902a8 // ld1w { za2h.s[x12] }, p0/Z, [x21, x25, LSL #2]\n" - ".inst 0x25706140 // dup p0.s, p8.s/Z, p10.s[w12, #1]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0990289 // ld1w { za2h.s[x12, #1] }, p0/Z, [x20, x25, LSL #2]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.s, x23, %x[width]\n" - ".inst 0xe0aa8301 // st1w { za0v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" - "addvl x24, x24, #2\n" - "incw x23\n" - "incw x25\n" - "whilelt p8.s, x23, %x[width]\n" + ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n" + ".inst 0xe0960328 // ld1w { za2h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0x25706141 // psel p1.s, p8.s/Z, p10.s[w12, #1]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe09606e9 // ld1w { za2h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + "whilelt p9.s, x27, %x[width]\n" + "incw x27\n" + "add x26, x26, #0x10\n" + ".inst 0xe0aa82a1 // st1w { za0v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n" + "addvl x21, x21, #2\n" + "incw x22\n" + "whilelt p8.s, x27, %x[width]\n" "mov x12, #0x0\n" - "cbz x26, 7f\n" + "cbz x28, 7f\n" "6:" // K loop: Main loop: Second: Loop - ".inst 0x25306140 // dup p0.s, p8.s/Z, p10.s[w12]\n" - ".inst 0xe09902a0 // ld1w { za0h.s[x12] }, p0/Z, [x21, x25, LSL #2]\n" - ".inst 0x25706140 // dup p0.s, p8.s/Z, p10.s[w12, #1]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0990281 // ld1w { za0h.s[x12, #1] }, p0/Z, [x20, x25, LSL #2]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0aa8309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" - "addvl x24, x24, #2\n" + ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n" + ".inst 0xe0960320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n" + ".inst 0x25706141 // psel p1.s, p8.s/Z, p10.s[w12, #1]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe09606e1 // ld1w { za0h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x26\n" + "cmp x12, x28\n" + "add x26, x26, #0x10\n" + "addvl x21, x21, #2\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail - "mov x22, %x[in]\n" - ".inst 0x25306140 // dup p0.s, p8.s/Z, p10.s[w12]\n" - ".inst 0xe09902a0 // ld1w { za0h.s[x12] }, p0/Z, [x21, x25, LSL #2]\n" - ".inst 0x25706140 // dup p0.s, p8.s/Z, p10.s[w12, #1]\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe0990281 // ld1w { za0h.s[x12, #1] }, p0/Z, [x20, x25, LSL #2]\n" - "ldr x20, [x22, #0x8]\n" - "add x22, x22, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.s, x23, %x[width]\n" - ".inst 0xe0aa8309 // st1w { za2v.s[x12, #1] }, p0/Z, [x24, x10, LSL #2]\n" - "addvl x24, x24, #2\n" - "incw x23\n" - "incw x25\n" - "subs x19, x19, #0x1\n" + ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n" + ".inst 0xe0960320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n" + "mov x26, %x[in]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0x25706141 // psel p1.s, p8.s/Z, p10.s[w12, #1]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe09606e1 // ld1w { za0h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n" + "ldr x23, [x26, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + "whilelt p9.s, x27, %x[width]\n" + "subs x20, x20, #0x1\n" + "add x26, x26, #0x10\n" + ".inst 0xe0aa82a9 // st1w { za2v.s[x12, #1] }, p0/Z, [x21, x10, LSL #2]\n" + "addvl x21, x21, #2\n" + "incw x27\n" + "incw x22\n" "bgt 3b\n" "8:" // K loop: Tails - "cbnz x28, 11f\n" - "mov x22, %x[in]\n" - "whilelt p8.s, x23, %x[width]\n" + "cbnz x24, 11f\n" + "mov x26, %x[in]\n" + "whilelt p8.s, x27, %x[width]\n" "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - ".inst 0x25306140 // dup p0.s, p8.s/Z, p10.s[w12]\n" - "addvl x24, x24, #1\n" - "ldr x21, [x22, #0x0]\n" - ".inst 0xe09902a8 // ld1w { za2h.s[x12] }, p0/Z, [x21, x25, LSL #2]\n" - "add x22, x22, #0x8\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + "ldr x25, [x26, #0x0]\n" + ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n" + ".inst 0xe0960328 // ld1w { za2h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n" "add x12, x12, #0x1\n" "cmp x12, x10\n" + "add x26, x26, #0x8\n" + "addvl x21, x21, #1\n" "blt 9b\n" - "whilelt p9.s, x23, %x[width]\n" - "whilelt p8.s, x23, %x[width]\n" + "whilelt p9.s, x27, %x[width]\n" + "whilelt p8.s, x27, %x[width]\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8308 // st1w { za2v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - "addvl x24, x24, #1\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x27\n" + "cmp x12, x9\n" + "addvl x21, x21, #1\n" "blt 10b\n" - "whilelt p9.s, x23, %x[width]\n" + "whilelt p9.s, x27, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - "addvl x24, x24, #1\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x27\n" + "cmp x12, x9\n" + "addvl x21, x21, #1\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x24\n" + "mov %x[out], x21\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p8", "p9", "p10", "p11", "x9", "x10", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp index 61fed43394..14ee5d6304 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,66 +34,66 @@ void interleave_block<2, 1, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cnth x28\n" + "cmp %x[height], x28\n" "cnth x27\n" - "cmp %x[height], x27\n" - "cnth x26\n" - "csel x27, %x[height], x27, LT\n" - "mov x25, #0x0\n" + "csel x28, %x[height], x28, LT\n" + "mov x26, #0x0\n" "ptrue p13.s\n" - "sub x27, x27, #0x1\n" + "sub x28, x28, #0x1\n" "whilelt p12.h, XZR, %x[height]\n" - "whilelt p11.h, x26, %x[height]\n" - "mov x24, %x[row_offset]\n" - "mov x23, %x[out]\n" - "whilelt p10.h, x25, %x[width]\n" - "whilelt p9.h, x25, %x[width]\n" - "whilelt p8.h, x25, %x[width]\n" + "whilelt p11.h, x27, %x[height]\n" + "mov x25, %x[row_offset]\n" + "mov x24, %x[out]\n" + "whilelt p10.h, x26, %x[width]\n" + "whilelt p9.h, x26, %x[width]\n" + "whilelt p8.h, x26, %x[width]\n" "1:" // Width loop - "add x22, %x[in], XZR, LSL #3\n" - "add x19, %x[in], x26, LSL #3\n" - "ldr x21, [x22], #0x8\n" + "add x23, %x[in], XZR, LSL #3\n" + "add x20, %x[in], x27, LSL #3\n" + "ldr x22, [x23], #0x8\n" "mov x12, #0x0\n" - "ldr x20, [x19], #0x8\n" - "cbz x27, 3f\n" + "ldr x21, [x20], #0x8\n" + "cbz x28, 3f\n" "2:" // Loads: Loop ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n" ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" - ".inst 0xe05806a0 // ld1h { za0h.h[x12] }, p1/Z, [x21, x24, LSL #1]\n" - "ldr x21, [x22], #0x8\n" - ".inst 0xe0580288 // ld1h { za1h.h[x12] }, p0/Z, [x20, x24, LSL #1]\n" + ".inst 0xe05906c0 // ld1h { za0h.h[x12] }, p1/Z, [x22, x25, LSL #1]\n" + "ldr x22, [x23], #0x8\n" + ".inst 0xe05902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n" "add x12, x12, #0x2\n" - "cmp x12, x27, LSL #1\n" - "ldr x20, [x19], #0x8\n" + "cmp x12, x28, LSL #1\n" + "ldr x21, [x20], #0x8\n" "blt 2b\n" "3:" // Loads: Tail - "sub x19, %x[width], x25\n" + "sub x20, %x[width], x26\n" ".inst 0x25286580 // psel p0.h, p9.h/Z, p12.h[w12]\n" - ".inst 0xe05802a0 // ld1h { za0h.h[x12] }, p0/Z, [x21, x24, LSL #1]\n" + ".inst 0xe05902c0 // ld1h { za0h.h[x12] }, p0/Z, [x22, x25, LSL #1]\n" ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" - "cmp x19, x26\n" - ".inst 0xe0580288 // ld1h { za1h.h[x12] }, p0/Z, [x20, x24, LSL #1]\n" + "cmp x20, x27\n" + ".inst 0xe05902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n" "mov x12, #0x0\n" - "csel x19, x19, x26, LT\n" + "csel x20, x20, x27, LT\n" "4:" // Stores: Loop ".inst 0x25287540 // psel p0.h, p13.h/Z, p10.h[w12]\n" - ".inst 0xe07f82e0 // st1h { za0v.h[x12] }, p0/Z, [x23, XZR, LSL #1]\n" + ".inst 0xe07f8300 // st1h { za0v.h[x12] }, p0/Z, [x24, XZR, LSL #1]\n" ".inst 0x25287540 // psel p0.h, p13.h/Z, p10.h[w12]\n" - ".inst 0xe07a82e8 // st1h { za1v.h[x12] }, p0/Z, [x23, x26, LSL #1]\n" + ".inst 0xe07b8308 // st1h { za1v.h[x12] }, p0/Z, [x24, x27, LSL #1]\n" "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "addvl x23, x23, #4\n" + "cmp x12, x20\n" + "addvl x24, x24, #4\n" "blt 4b\n" + "inch x26\n" + "whilelt p10.h, x26, %x[width]\n" + "whilelt p9.h, x26, %x[width]\n" + "whilelt p8.h, x26, %x[width]\n" "inch x25\n" - "whilelt p10.h, x25, %x[width]\n" - "whilelt p9.h, x25, %x[width]\n" - "whilelt p8.h, x25, %x[width]\n" - "inch x24\n" "b.any 1b\n" - "mov %x[out], x23\n" + "mov %x[out], x24\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp index fc7596e67b..f648ccf771 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,269 +34,269 @@ void interleave_block<2, 2, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" - "cnth x20\n" + "cnth x21\n" + "mov x22, %x[width]\n" + "inch x22\n" + "mov x20, %x[width]\n" + "sub x17, x21, #0x1\n" + "sub x22, x22, #0x1\n" + "ands x17, x20, x17\n" "cntw x16\n" - "cntw x15, ALL, MUL #2\n" - "cntw x14, ALL, MUL #3\n" - "mov x19, %x[width]\n" - "inch x19\n" - "sub x19, x19, #0x1\n" - "udiv x19, x19, x20\n" // n_passes = ceildiv(width, VL) - "sub x13, x19, #0x1\n" - "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x11, x19, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "mov x19, %x[width]\n" - "sub x10, x20, #0x1\n" - "ands x10, x19, x10\n" - "csel x10, x10, x20, NE\n" - "add x10, x10, #0x1\n" - "lsr x10, x10, #0x1\n" - "sub x9, x16, #0x2\n" + "udiv x22, x22, x21\n" // n_passes = ceildiv(width, VL) + "csel x17, x17, x21, NE\n" + "sub x13, x22, #0x1\n" + "add x17, x17, #0x1\n" + "sub x15, x16, #0x2\n" + "lsl x21, %x[height], #0x1\n" // height * 2 + "lsl x20, x16, #0x1\n" + "mov x14, #0x0\n" + "mov x11, %x[in]\n" + "add x10, %x[in], x16, LSL #3\n" + "ldr x9, [x11, #0x0]\n" + "cntw x28, ALL, MUL #2\n" + "cntw x27, ALL, MUL #3\n" + "ldr x26, [x10, #0x0]\n" + "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2 + "and x25, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "ldr x24, [x11, #0x8]\n" + "lsr x17, x17, #0x1\n" "ptrue p13.s\n" - "lsl x20, %x[height], #0x1\n" // height * 2 - "lsl x19, x16, #0x1\n" - "whilelt p12.h, XZR, x20\n" - "whilelt p11.h, x19, x20\n" - "mov x28, %x[row_offset]\n" - "mov x27, %x[out]\n" - "mov x26, #0x0\n" - "whilelt p10.h, x26, %x[width]\n" - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" + "ldr x23, [x10, #0x8]\n" + "whilelt p12.h, XZR, x21\n" + "whilelt p11.h, x20, x21\n" + "mov x22, %x[row_offset]\n" + "mov x21, %x[out]\n" + "whilelt p10.h, x14, %x[width]\n" + "whilelt p9.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" "mov x12, #0x0\n" - "cbz x9, 2f\n" + "cbz x15, 2f\n" "1:" // K loop: Charge: Loop - ".inst 0x25286580 // dup p0.h, p9.h/Z, p12.h[w12]\n" - ".inst 0xe05c02e0 // ld1h { za0h.h[x12] }, p0/Z, [x23, x28, LSL #1]\n" - ".inst 0x25286160 // dup p0.h, p8.h/Z, p11.h[w12]\n" - ".inst 0xe05c02c8 // ld1h { za1h.h[x12] }, p0/Z, [x22, x28, LSL #1]\n" - ".inst 0x25686580 // dup p0.h, p9.h/Z, p12.h[w12, #2]\n" - ".inst 0xe05c02a2 // ld1h { za0h.h[x12, #2] }, p0/Z, [x21, x28, LSL #1]\n" - ".inst 0x25686160 // dup p0.h, p8.h/Z, p11.h[w12, #2]\n" - ".inst 0xe05c028a // ld1h { za1h.h[x12, #2] }, p0/Z, [x20, x28, LSL #1]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" + ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n" + ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" + ".inst 0xe0560520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0560348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n" + ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0xe0560702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n" + "ldr x24, [x11, #0x8]\n" + "add x11, x11, #0x10\n" + ".inst 0xe05602ea // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n" "add x12, x12, #0x4\n" - "cmp x12, x9, LSL #1\n" + "cmp x12, x15, LSL #1\n" + "ldr x23, [x10, #0x8]\n" + "add x10, x10, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End - ".inst 0x25286580 // dup p0.h, p9.h/Z, p12.h[w12]\n" - ".inst 0xe05c02e0 // ld1h { za0h.h[x12] }, p0/Z, [x23, x28, LSL #1]\n" - ".inst 0x25286160 // dup p0.h, p8.h/Z, p11.h[w12]\n" - ".inst 0xe05c02c8 // ld1h { za1h.h[x12] }, p0/Z, [x22, x28, LSL #1]\n" - ".inst 0x25686580 // dup p0.h, p9.h/Z, p12.h[w12, #2]\n" - ".inst 0xe05c02a2 // ld1h { za0h.h[x12, #2] }, p0/Z, [x21, x28, LSL #1]\n" - ".inst 0x25686160 // dup p0.h, p8.h/Z, p11.h[w12, #2]\n" - ".inst 0xe05c028a // ld1h { za1h.h[x12, #2] }, p0/Z, [x20, x28, LSL #1]\n" - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "inch x28\n" - "inch x26\n" + ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n" + ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" + ".inst 0xe0560520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0560348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n" + ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n" + "mov x11, %x[in]\n" + "add x10, %x[in], x16, LSL #3\n" + ".inst 0xe0560702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe05602ea // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n" + "ldr x26, [x10, #0x0]\n" + "inch x22\n" + "inch x14\n" + "ldr x24, [x11, #0x8]\n" + "add x11, x11, #0x10\n" + "ldr x23, [x10, #0x8]\n" + "add x10, x10, #0x10\n" "cbz x13, 8f\n" - "mov x19, x13\n" + "mov x20, x13\n" "3:" // K loop: Main loop - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" + "whilelt p9.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x9, 5f\n" + "cbz x15, 5f\n" "4:" // K loop: Main loop: First: Loop - ".inst 0x25396580 // dup p0.h, p9.h/Z, p12.h[w13, #1]\n" - ".inst 0xe05c22e1 // ld1h { za0h.h[x13, #1] }, p0/Z, [x23, x28, LSL #1]\n" - ".inst 0x25396160 // dup p0.h, p8.h/Z, p11.h[w13, #1]\n" - ".inst 0xe05c22c9 // ld1h { za1h.h[x13, #1] }, p0/Z, [x22, x28, LSL #1]\n" - ".inst 0x25796580 // dup p0.h, p9.h/Z, p12.h[w13, #3]\n" - ".inst 0xe05c22a3 // ld1h { za0h.h[x13, #3] }, p0/Z, [x21, x28, LSL #1]\n" - ".inst 0x25796160 // dup p0.h, p8.h/Z, p11.h[w13, #3]\n" - ".inst 0xe05c228b // ld1h { za1h.h[x13, #3] }, p0/Z, [x20, x28, LSL #1]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25707541 // dup p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25707540 // dup p0.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n" + ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n" + ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n" + ".inst 0x25796162 // psel p2.h, p8.h/Z, p11.h[w13, #3]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0562303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0562aeb // ld1h { za1h.h[x13, #3] }, p2/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" + "add x11, x11, #0x10\n" + ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x10, x10, #0x10\n" "add x13, x13, #0x4\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x9\n" + "cmp x12, x15\n" + "addvl x21, x21, #4\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25396580 // dup p0.h, p9.h/Z, p12.h[w13, #1]\n" - ".inst 0xe05c22e1 // ld1h { za0h.h[x13, #1] }, p0/Z, [x23, x28, LSL #1]\n" - ".inst 0x25396160 // dup p0.h, p8.h/Z, p11.h[w13, #1]\n" - ".inst 0xe05c22c9 // ld1h { za1h.h[x13, #1] }, p0/Z, [x22, x28, LSL #1]\n" - ".inst 0x25796580 // dup p0.h, p9.h/Z, p12.h[w13, #3]\n" - ".inst 0xe05c22a3 // ld1h { za0h.h[x13, #3] }, p0/Z, [x21, x28, LSL #1]\n" - ".inst 0x25796160 // dup p0.h, p8.h/Z, p11.h[w13, #3]\n" - ".inst 0xe05c228b // ld1h { za1h.h[x13, #3] }, p0/Z, [x20, x28, LSL #1]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25707541 // dup p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25707540 // dup p0.s, p13.s/Z, p10.s[w12, #1]\n" - "whilelt p10.h, x26, %x[width]\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "inch x26\n" - "inch x28\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" + ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n" + ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n" + ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n" + "mov x11, %x[in]\n" + "add x10, %x[in], x16, LSL #3\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n" + ".inst 0x25796161 // psel p1.h, p8.h/Z, p11.h[w13, #3]\n" + ".inst 0xe0562303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe05626eb // ld1h { za1h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + "whilelt p10.h, x14, %x[width]\n" + "inch x14\n" + ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" + ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" + "addvl x21, x21, #4\n" + "inch x22\n" + "whilelt p9.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x9, 7f\n" + "cbz x15, 7f\n" "6:" // K loop: Main loop: Second: Loop - ".inst 0x25296580 // dup p0.h, p9.h/Z, p12.h[w13]\n" - ".inst 0xe05c22e0 // ld1h { za0h.h[x13] }, p0/Z, [x23, x28, LSL #1]\n" - ".inst 0x25296160 // dup p0.h, p8.h/Z, p11.h[w13]\n" - ".inst 0xe05c22c8 // ld1h { za1h.h[x13] }, p0/Z, [x22, x28, LSL #1]\n" - ".inst 0x25696580 // dup p0.h, p9.h/Z, p12.h[w13, #2]\n" - ".inst 0xe05c22a2 // ld1h { za0h.h[x13, #2] }, p0/Z, [x21, x28, LSL #1]\n" - ".inst 0x25696160 // dup p0.h, p8.h/Z, p11.h[w13, #2]\n" - ".inst 0xe05c228a // ld1h { za1h.h[x13, #2] }, p0/Z, [x20, x28, LSL #1]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25707541 // dup p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25707540 // dup p0.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n" + ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n" + ".inst 0xe0562520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0562348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n" + ".inst 0x25696162 // psel p2.h, p8.h/Z, p11.h[w13, #2]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0562302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0562aea // ld1h { za1h.h[x13, #2] }, p2/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" + "add x11, x11, #0x10\n" + ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x10, x10, #0x10\n" "add x13, x13, #0x4\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x9\n" + "cmp x12, x15\n" + "addvl x21, x21, #4\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25296580 // dup p0.h, p9.h/Z, p12.h[w13]\n" - ".inst 0xe05c22e0 // ld1h { za0h.h[x13] }, p0/Z, [x23, x28, LSL #1]\n" - ".inst 0x25296160 // dup p0.h, p8.h/Z, p11.h[w13]\n" - ".inst 0xe05c22c8 // ld1h { za1h.h[x13] }, p0/Z, [x22, x28, LSL #1]\n" - ".inst 0x25696580 // dup p0.h, p9.h/Z, p12.h[w13, #2]\n" - ".inst 0xe05c22a2 // ld1h { za0h.h[x13, #2] }, p0/Z, [x21, x28, LSL #1]\n" - ".inst 0x25696160 // dup p0.h, p8.h/Z, p11.h[w13, #2]\n" - ".inst 0xe05c228a // ld1h { za1h.h[x13, #2] }, p0/Z, [x20, x28, LSL #1]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25707541 // dup p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25707540 // dup p0.s, p13.s/Z, p10.s[w12, #1]\n" - "whilelt p10.h, x26, %x[width]\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "inch x26\n" - "inch x28\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" - "subs x19, x19, #0x1\n" + ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n" + ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n" + ".inst 0xe0562520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0562348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n" + "mov x11, %x[in]\n" + "add x10, %x[in], x16, LSL #3\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n" + ".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n" + ".inst 0xe0562302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe05626ea // ld1h { za1h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + "whilelt p10.h, x14, %x[width]\n" + "subs x20, x20, #0x1\n" + ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" + ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" + "addvl x21, x21, #4\n" + "inch x14\n" + "inch x22\n" "bgt 3b\n" "8:" // K loop: Tails - "cbnz x11, 11f\n" - "mov x25, %x[in]\n" - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" + "cbnz x25, 11f\n" + "mov x11, %x[in]\n" + "whilelt p9.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25396581 // dup p1.h, p9.h/Z, p12.h[w13, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25396160 // dup p0.h, p8.h/Z, p11.h[w13, #1]\n" - "addvl x27, x27, #2\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe05c26e1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x23, x28, LSL #1]\n" - "ldr x22, [x25, x16, LSL #0x3]\n" - ".inst 0xe05c22c9 // ld1h { za1h.h[x13, #1] }, p0/Z, [x22, x28, LSL #1]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + "ldr x9, [x11, #0x0]\n" "add x12, x12, #0x1\n" - "add x25, x25, #0x8\n" - "add x13, x13, #0x2\n" + ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n" + "ldr x26, [x11, x16, LSL #0x3]\n" + ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n" "cmp x12, x16\n" + ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n" + "add x11, x11, #0x8\n" + "addvl x21, x21, #2\n" + "add x13, x13, #0x2\n" "blt 9b\n" - "whilelt p10.h, x26, %x[width]\n" - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" - "mov x19, #0x0\n" + "whilelt p10.h, x14, %x[width]\n" + "whilelt p9.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" + "mov x20, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - "add x19, x19, #0x2\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x17\n" + "addvl x21, x21, #2\n" + "add x20, x20, #0x2\n" "blt 10b\n" - "whilelt p10.h, x26, %x[width]\n" + "whilelt p10.h, x14, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x17\n" + "addvl x21, x21, #2\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x27\n" + "mov %x[out], x21\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p8", "p9", "p10", "p11", "p12", "p13", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp index 67570a1302..61536d38a5 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,269 +34,269 @@ void interleave_block<2, 2, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" - "cnth x20\n" + "cnth x21\n" + "mov x22, %x[width]\n" + "inch x22\n" + "mov x20, %x[width]\n" + "sub x17, x21, #0x1\n" + "sub x22, x22, #0x1\n" + "ands x17, x20, x17\n" "cntw x16\n" - "cntw x15, ALL, MUL #2\n" - "cntw x14, ALL, MUL #3\n" - "mov x19, %x[width]\n" - "inch x19\n" - "sub x19, x19, #0x1\n" - "udiv x19, x19, x20\n" // n_passes = ceildiv(width, VL) - "sub x13, x19, #0x1\n" - "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x11, x19, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "mov x19, %x[width]\n" - "sub x10, x20, #0x1\n" - "ands x10, x19, x10\n" - "csel x10, x10, x20, NE\n" - "add x10, x10, #0x1\n" - "lsr x10, x10, #0x1\n" - "sub x9, x16, #0x2\n" + "udiv x22, x22, x21\n" // n_passes = ceildiv(width, VL) + "csel x17, x17, x21, NE\n" + "sub x13, x22, #0x1\n" + "add x17, x17, #0x1\n" + "sub x15, x16, #0x2\n" + "lsl x21, %x[height], #0x1\n" // height * 2 + "lsl x20, x16, #0x1\n" + "mov x14, #0x0\n" + "mov x11, %x[in]\n" + "add x10, %x[in], x16, LSL #3\n" + "ldr x9, [x11, #0x0]\n" + "cntw x28, ALL, MUL #2\n" + "cntw x27, ALL, MUL #3\n" + "ldr x26, [x10, #0x0]\n" + "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2 + "and x25, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "ldr x24, [x11, #0x8]\n" + "lsr x17, x17, #0x1\n" "ptrue p13.s\n" - "lsl x20, %x[height], #0x1\n" // height * 2 - "lsl x19, x16, #0x1\n" - "whilelt p12.h, XZR, x20\n" - "whilelt p11.h, x19, x20\n" - "mov x28, %x[row_offset]\n" - "mov x27, %x[out]\n" - "mov x26, #0x0\n" - "whilelt p10.h, x26, %x[width]\n" - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" + "ldr x23, [x10, #0x8]\n" + "whilelt p12.h, XZR, x21\n" + "whilelt p11.h, x20, x21\n" + "mov x22, %x[row_offset]\n" + "mov x21, %x[out]\n" + "whilelt p10.h, x14, %x[width]\n" + "whilelt p9.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" "mov x12, #0x0\n" - "cbz x9, 2f\n" + "cbz x15, 2f\n" "1:" // K loop: Charge: Loop - ".inst 0x25286580 // dup p0.h, p9.h/Z, p12.h[w12]\n" - ".inst 0xe05c02e0 // ld1h { za0h.h[x12] }, p0/Z, [x23, x28, LSL #1]\n" - ".inst 0x25286160 // dup p0.h, p8.h/Z, p11.h[w12]\n" - ".inst 0xe05c02c8 // ld1h { za1h.h[x12] }, p0/Z, [x22, x28, LSL #1]\n" - ".inst 0x25686580 // dup p0.h, p9.h/Z, p12.h[w12, #2]\n" - ".inst 0xe05c02a2 // ld1h { za0h.h[x12, #2] }, p0/Z, [x21, x28, LSL #1]\n" - ".inst 0x25686160 // dup p0.h, p8.h/Z, p11.h[w12, #2]\n" - ".inst 0xe05c028a // ld1h { za1h.h[x12, #2] }, p0/Z, [x20, x28, LSL #1]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" + ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n" + ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" + ".inst 0xe0560520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0560348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n" + ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0xe0560702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n" + "ldr x24, [x11, #0x8]\n" + "add x11, x11, #0x10\n" + ".inst 0xe05602ea // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n" "add x12, x12, #0x4\n" - "cmp x12, x9, LSL #1\n" + "cmp x12, x15, LSL #1\n" + "ldr x23, [x10, #0x8]\n" + "add x10, x10, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End - ".inst 0x25286580 // dup p0.h, p9.h/Z, p12.h[w12]\n" - ".inst 0xe05c02e0 // ld1h { za0h.h[x12] }, p0/Z, [x23, x28, LSL #1]\n" - ".inst 0x25286160 // dup p0.h, p8.h/Z, p11.h[w12]\n" - ".inst 0xe05c02c8 // ld1h { za1h.h[x12] }, p0/Z, [x22, x28, LSL #1]\n" - ".inst 0x25686580 // dup p0.h, p9.h/Z, p12.h[w12, #2]\n" - ".inst 0xe05c02a2 // ld1h { za0h.h[x12, #2] }, p0/Z, [x21, x28, LSL #1]\n" - ".inst 0x25686160 // dup p0.h, p8.h/Z, p11.h[w12, #2]\n" - ".inst 0xe05c028a // ld1h { za1h.h[x12, #2] }, p0/Z, [x20, x28, LSL #1]\n" - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "inch x28\n" - "inch x26\n" + ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n" + ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" + ".inst 0xe0560520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0560348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n" + ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n" + "mov x11, %x[in]\n" + "add x10, %x[in], x16, LSL #3\n" + ".inst 0xe0560702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe05602ea // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n" + "ldr x26, [x10, #0x0]\n" + "inch x22\n" + "inch x14\n" + "ldr x24, [x11, #0x8]\n" + "add x11, x11, #0x10\n" + "ldr x23, [x10, #0x8]\n" + "add x10, x10, #0x10\n" "cbz x13, 8f\n" - "mov x19, x13\n" + "mov x20, x13\n" "3:" // K loop: Main loop - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" + "whilelt p9.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x9, 5f\n" + "cbz x15, 5f\n" "4:" // K loop: Main loop: First: Loop - ".inst 0x25396580 // dup p0.h, p9.h/Z, p12.h[w13, #1]\n" - ".inst 0xe05c22e1 // ld1h { za0h.h[x13, #1] }, p0/Z, [x23, x28, LSL #1]\n" - ".inst 0x25396160 // dup p0.h, p8.h/Z, p11.h[w13, #1]\n" - ".inst 0xe05c22c9 // ld1h { za1h.h[x13, #1] }, p0/Z, [x22, x28, LSL #1]\n" - ".inst 0x25796580 // dup p0.h, p9.h/Z, p12.h[w13, #3]\n" - ".inst 0xe05c22a3 // ld1h { za0h.h[x13, #3] }, p0/Z, [x21, x28, LSL #1]\n" - ".inst 0x25796160 // dup p0.h, p8.h/Z, p11.h[w13, #3]\n" - ".inst 0xe05c228b // ld1h { za1h.h[x13, #3] }, p0/Z, [x20, x28, LSL #1]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25707541 // dup p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25707540 // dup p0.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n" + ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n" + ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n" + ".inst 0x25796162 // psel p2.h, p8.h/Z, p11.h[w13, #3]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0562303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0562aeb // ld1h { za1h.h[x13, #3] }, p2/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" + "add x11, x11, #0x10\n" + ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x10, x10, #0x10\n" "add x13, x13, #0x4\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x9\n" + "cmp x12, x15\n" + "addvl x21, x21, #4\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25396580 // dup p0.h, p9.h/Z, p12.h[w13, #1]\n" - ".inst 0xe05c22e1 // ld1h { za0h.h[x13, #1] }, p0/Z, [x23, x28, LSL #1]\n" - ".inst 0x25396160 // dup p0.h, p8.h/Z, p11.h[w13, #1]\n" - ".inst 0xe05c22c9 // ld1h { za1h.h[x13, #1] }, p0/Z, [x22, x28, LSL #1]\n" - ".inst 0x25796580 // dup p0.h, p9.h/Z, p12.h[w13, #3]\n" - ".inst 0xe05c22a3 // ld1h { za0h.h[x13, #3] }, p0/Z, [x21, x28, LSL #1]\n" - ".inst 0x25796160 // dup p0.h, p8.h/Z, p11.h[w13, #3]\n" - ".inst 0xe05c228b // ld1h { za1h.h[x13, #3] }, p0/Z, [x20, x28, LSL #1]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25707541 // dup p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25707540 // dup p0.s, p13.s/Z, p10.s[w12, #1]\n" - "whilelt p10.h, x26, %x[width]\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "inch x26\n" - "inch x28\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" + ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n" + ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n" + ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n" + "mov x11, %x[in]\n" + "add x10, %x[in], x16, LSL #3\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n" + ".inst 0x25796161 // psel p1.h, p8.h/Z, p11.h[w13, #3]\n" + ".inst 0xe0562303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe05626eb // ld1h { za1h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + "whilelt p10.h, x14, %x[width]\n" + "inch x14\n" + ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" + ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" + "addvl x21, x21, #4\n" + "inch x22\n" + "whilelt p9.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x9, 7f\n" + "cbz x15, 7f\n" "6:" // K loop: Main loop: Second: Loop - ".inst 0x25296580 // dup p0.h, p9.h/Z, p12.h[w13]\n" - ".inst 0xe05c22e0 // ld1h { za0h.h[x13] }, p0/Z, [x23, x28, LSL #1]\n" - ".inst 0x25296160 // dup p0.h, p8.h/Z, p11.h[w13]\n" - ".inst 0xe05c22c8 // ld1h { za1h.h[x13] }, p0/Z, [x22, x28, LSL #1]\n" - ".inst 0x25696580 // dup p0.h, p9.h/Z, p12.h[w13, #2]\n" - ".inst 0xe05c22a2 // ld1h { za0h.h[x13, #2] }, p0/Z, [x21, x28, LSL #1]\n" - ".inst 0x25696160 // dup p0.h, p8.h/Z, p11.h[w13, #2]\n" - ".inst 0xe05c228a // ld1h { za1h.h[x13, #2] }, p0/Z, [x20, x28, LSL #1]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25707541 // dup p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25707540 // dup p0.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n" + ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n" + ".inst 0xe0562520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0562348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n" + ".inst 0x25696162 // psel p2.h, p8.h/Z, p11.h[w13, #2]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0562302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0562aea // ld1h { za1h.h[x13, #2] }, p2/Z, [x23, x22, LSL #1]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" + "add x11, x11, #0x10\n" + ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x10, x10, #0x10\n" "add x13, x13, #0x4\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x9\n" + "cmp x12, x15\n" + "addvl x21, x21, #4\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25296580 // dup p0.h, p9.h/Z, p12.h[w13]\n" - ".inst 0xe05c22e0 // ld1h { za0h.h[x13] }, p0/Z, [x23, x28, LSL #1]\n" - ".inst 0x25296160 // dup p0.h, p8.h/Z, p11.h[w13]\n" - ".inst 0xe05c22c8 // ld1h { za1h.h[x13] }, p0/Z, [x22, x28, LSL #1]\n" - ".inst 0x25696580 // dup p0.h, p9.h/Z, p12.h[w13, #2]\n" - ".inst 0xe05c22a2 // ld1h { za0h.h[x13, #2] }, p0/Z, [x21, x28, LSL #1]\n" - ".inst 0x25696160 // dup p0.h, p8.h/Z, p11.h[w13, #2]\n" - ".inst 0xe05c228a // ld1h { za1h.h[x13, #2] }, p0/Z, [x20, x28, LSL #1]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25707541 // dup p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25707540 // dup p0.s, p13.s/Z, p10.s[w12, #1]\n" - "whilelt p10.h, x26, %x[width]\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "inch x26\n" - "inch x28\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" - "subs x19, x19, #0x1\n" + ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n" + ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n" + ".inst 0xe0562520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0562348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n" + "mov x11, %x[in]\n" + "add x10, %x[in], x16, LSL #3\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n" + ".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n" + ".inst 0xe0562302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe05626ea // ld1h { za1h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + "whilelt p10.h, x14, %x[width]\n" + "subs x20, x20, #0x1\n" + ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" + ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" + "addvl x21, x21, #4\n" + "inch x14\n" + "inch x22\n" "bgt 3b\n" "8:" // K loop: Tails - "cbnz x11, 11f\n" - "mov x25, %x[in]\n" - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" + "cbnz x25, 11f\n" + "mov x11, %x[in]\n" + "whilelt p9.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25396581 // dup p1.h, p9.h/Z, p12.h[w13, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25396160 // dup p0.h, p8.h/Z, p11.h[w13, #1]\n" - "addvl x27, x27, #2\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe05c26e1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x23, x28, LSL #1]\n" - "ldr x22, [x25, x16, LSL #0x3]\n" - ".inst 0xe05c22c9 // ld1h { za1h.h[x13, #1] }, p0/Z, [x22, x28, LSL #1]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + "ldr x9, [x11, #0x0]\n" "add x12, x12, #0x1\n" - "add x25, x25, #0x8\n" - "add x13, x13, #0x2\n" + ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n" + "ldr x26, [x11, x16, LSL #0x3]\n" + ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n" "cmp x12, x16\n" + ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n" + "add x11, x11, #0x8\n" + "addvl x21, x21, #2\n" + "add x13, x13, #0x2\n" "blt 9b\n" - "whilelt p10.h, x26, %x[width]\n" - "whilelt p9.h, x26, %x[width]\n" - "whilelt p8.h, x26, %x[width]\n" - "mov x19, #0x0\n" + "whilelt p10.h, x14, %x[width]\n" + "whilelt p9.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" + "mov x20, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - "add x19, x19, #0x2\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x17\n" + "addvl x21, x21, #2\n" + "add x20, x20, #0x2\n" "blt 10b\n" - "whilelt p10.h, x26, %x[width]\n" + "whilelt p10.h, x14, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x17\n" + "addvl x21, x21, #2\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x27\n" + "mov %x[out], x21\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p8", "p9", "p10", "p11", "p12", "p13", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp index 22f09339b2..4c701cff19 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,265 +34,265 @@ void interleave_block<2, 4, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" - "cntb x20\n" + "cntb x21\n" + "mov x23, %x[width]\n" + "incb x23\n" + "mov x20, %x[width]\n" + "sub x17, x21, #0x1\n" "cntw x16\n" - "cntw x15, ALL, MUL #2\n" - "cntw x14, ALL, MUL #3\n" - "mov x19, %x[width]\n" - "incb x19\n" - "sub x19, x19, #0x1\n" - "udiv x19, x19, x20\n" // n_passes = ceildiv(width, VL) - "sub x13, x19, #0x1\n" - "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x11, x19, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "mov x19, %x[width]\n" - "sub x10, x20, #0x1\n" - "ands x10, x19, x10\n" - "csel x10, x10, x20, NE\n" - "add x10, x10, #0x3\n" - "lsr x10, x10, #0x2\n" - "sub x9, x16, #0x2\n" + "sub x23, x23, #0x1\n" + "ands x17, x20, x17\n" + "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL) + "csel x17, x17, x21, NE\n" + "lsl x22, %x[height], #0x1\n" // height * 2 + "lsl x21, x16, #0x1\n" + "sub x20, x23, #0x1\n" + "add x17, x17, #0x3\n" + "sub x15, x16, #0x2\n" + "whilelt p9.b, XZR, x22\n" + "whilelt p8.b, x21, x22\n" + "mov x14, #0x0\n" + "mov x11, %x[in]\n" + "add x10, %x[in], x16, LSL #3\n" + "ldr x9, [x11, #0x0]\n" + "cntw x28, ALL, MUL #2\n" + "cntw x27, ALL, MUL #3\n" + "ldr x26, [x10, #0x0]\n" + "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2 + "and x25, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "ldr x24, [x11, #0x8]\n" + "lsr x17, x17, #0x2\n" "ptrue p11.s\n" - "lsl x20, %x[height], #0x1\n" // height * 2 - "lsl x19, x16, #0x1\n" - "whilelt p9.b, XZR, x20\n" - "whilelt p8.b, x19, x20\n" + "ldr x23, [x10, #0x8]\n" "zip1 p10.b, p9.b, p8.b\n" - "mov x28, %x[row_offset]\n" - "mov x27, %x[out]\n" - "mov x26, #0x0\n" - "whilelt p9.b, x26, %x[width]\n" - "whilelt p8.b, x26, %x[width]\n" - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" + "mov x22, %x[row_offset]\n" + "mov x21, %x[out]\n" + "whilelt p9.b, x14, %x[width]\n" + "whilelt p8.b, x14, %x[width]\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" "mov x12, #0x0\n" - "cbz x9, 2f\n" + "cbz x15, 2f\n" "1:" // K loop: Charge: Loop - ".inst 0x25246140 // dup p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01c02e0 // ld1b { za0h.b[x12] }, p0/Z, [x23, x28]\n" - ".inst 0x252c6140 // dup p0.b, p8.b/Z, p10.b[w12, #1]\n" - ".inst 0x25646141 // dup p1.b, p8.b/Z, p10.b[w12, #4]\n" - ".inst 0xe01c02c1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256c6140 // dup p0.b, p8.b/Z, p10.b[w12, #5]\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c06a4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c0285 // ld1b { za0h.b[x12, #5] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe0160120 // ld1b { za0h.b[x12] }, p0/Z, [x9, x22]\n" + ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0160341 // ld1b { za0h.b[x12, #1] }, p0/Z, [x26, x22]\n" + ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n" + ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0xe0160704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x22]\n" + "ldr x24, [x11, #0x8]\n" + "add x11, x11, #0x10\n" + ".inst 0xe01602e5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x23, x22]\n" "add x12, x12, #0x8\n" - "cmp x12, x9, LSL #2\n" + "cmp x12, x15, LSL #2\n" + "ldr x23, [x10, #0x8]\n" + "add x10, x10, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End - ".inst 0x25246140 // dup p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01c02e0 // ld1b { za0h.b[x12] }, p0/Z, [x23, x28]\n" - ".inst 0x252c6140 // dup p0.b, p8.b/Z, p10.b[w12, #1]\n" - ".inst 0x25646141 // dup p1.b, p8.b/Z, p10.b[w12, #4]\n" - ".inst 0xe01c02c1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256c6140 // dup p0.b, p8.b/Z, p10.b[w12, #5]\n" - "mov x25, %x[in]\n" - ".inst 0xe01c06a4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x21, x28]\n" - "add x24, %x[in], x16, LSL #3\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c0285 // ld1b { za0h.b[x12, #5] }, p0/Z, [x20, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "incb x28\n" - "incb x26\n" - "cbz x13, 8f\n" - "mov x19, x13\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe0160120 // ld1b { za0h.b[x12] }, p0/Z, [x9, x22]\n" + ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" + ".inst 0xe0160341 // ld1b { za0h.b[x12, #1] }, p0/Z, [x26, x22]\n" + ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n" + ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n" + ".inst 0xe0160704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x22]\n" + "mov x11, %x[in]\n" + "add x10, %x[in], x16, LSL #3\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe01602e5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x23, x22]\n" + "ldr x26, [x10, #0x0]\n" + "incb x22\n" + "incb x14\n" + "ldr x24, [x11, #0x8]\n" + "add x11, x11, #0x10\n" + "ldr x23, [x10, #0x8]\n" + "add x10, x10, #0x10\n" + "cbz x20, 8f\n" + "mov x20, x20\n" "3:" // K loop: Main loop - "whilelt p8.b, x26, %x[width]\n" + "whilelt p8.b, x14, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x9, 5f\n" + "cbz x15, 5f\n" "4:" // K loop: Main loop: First: Loop - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe01c22e2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x23, x28]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - ".inst 0x25756141 // dup p1.b, p8.b/Z, p10.b[w13, #6]\n" - ".inst 0xe01c22c3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x22, x28]\n" - ".inst 0x257d6140 // dup p0.b, p8.b/Z, p10.b[w13, #7]\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2287 // ld1b { za0h.b[x13, #7] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n" + ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n" + ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n" + ".inst 0x257d6142 // psel p2.b, p8.b/Z, p10.b[w13, #7]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0xe0162306 // ld1b { za0h.b[x13, #6] }, p0/Z, [x24, x22]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0162ae7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x23, x22]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + "add x11, x11, #0x10\n" + ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x10, x10, #0x10\n" "add x13, x13, #0x8\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x9\n" + "cmp x12, x15\n" + "addvl x21, x21, #4\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe01c22e2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x23, x28]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - ".inst 0x25756141 // dup p1.b, p8.b/Z, p10.b[w13, #6]\n" - ".inst 0xe01c22c3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x22, x28]\n" - ".inst 0x257d6140 // dup p0.b, p8.b/Z, p10.b[w13, #7]\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2287 // ld1b { za0h.b[x13, #7] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x26, %x[width]\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "incb x26\n" - "incb x28\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" - "whilelt p8.b, x26, %x[width]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n" + ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" + ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n" + ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n" + "mov x11, %x[in]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0162306 // ld1b { za0h.b[x13, #6] }, p0/Z, [x24, x22]\n" + "add x10, %x[in], x16, LSL #3\n" + ".inst 0x257d6141 // psel p1.b, p8.b/Z, p10.b[w13, #7]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe01626e7 // ld1b { za0h.b[x13, #7] }, p1/Z, [x23, x22]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + "whilelt p9.b, x14, %x[width]\n" + "incb x14\n" + ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" + ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" + "addvl x21, x21, #4\n" + "incb x22\n" + "whilelt p8.b, x14, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x9, 7f\n" + "cbz x15, 7f\n" "6:" // K loop: Main loop: Second: Loop - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe01c22e0 // ld1b { za0h.b[x13] }, p0/Z, [x23, x28]\n" - ".inst 0x252d6140 // dup p0.b, p8.b/Z, p10.b[w13, #1]\n" - ".inst 0x25656141 // dup p1.b, p8.b/Z, p10.b[w13, #4]\n" - ".inst 0xe01c22c1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256d6140 // dup p0.b, p8.b/Z, p10.b[w13, #5]\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2285 // ld1b { za0h.b[x13, #5] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" + ".inst 0xe0162120 // ld1b { za0h.b[x13] }, p0/Z, [x9, x22]\n" + ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0162341 // ld1b { za0h.b[x13, #1] }, p0/Z, [x26, x22]\n" + ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n" + ".inst 0x256d6142 // psel p2.b, p8.b/Z, p10.b[w13, #5]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0xe0162304 // ld1b { za0h.b[x13, #4] }, p0/Z, [x24, x22]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0162ae5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x23, x22]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + "add x11, x11, #0x10\n" + ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x10, x10, #0x10\n" "add x13, x13, #0x8\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x9\n" + "cmp x12, x15\n" + "addvl x21, x21, #4\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe01c22e0 // ld1b { za0h.b[x13] }, p0/Z, [x23, x28]\n" - ".inst 0x252d6140 // dup p0.b, p8.b/Z, p10.b[w13, #1]\n" - ".inst 0x25656141 // dup p1.b, p8.b/Z, p10.b[w13, #4]\n" - ".inst 0xe01c22c1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256d6140 // dup p0.b, p8.b/Z, p10.b[w13, #5]\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2285 // ld1b { za0h.b[x13, #5] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x26, %x[width]\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "incb x26\n" - "incb x28\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" - "subs x19, x19, #0x1\n" + ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" + ".inst 0xe0162120 // ld1b { za0h.b[x13] }, p0/Z, [x9, x22]\n" + ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n" + ".inst 0xe0162341 // ld1b { za0h.b[x13, #1] }, p0/Z, [x26, x22]\n" + ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n" + "mov x11, %x[in]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0162304 // ld1b { za0h.b[x13, #4] }, p0/Z, [x24, x22]\n" + "add x10, %x[in], x16, LSL #3\n" + ".inst 0x256d6141 // psel p1.b, p8.b/Z, p10.b[w13, #5]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe01626e5 // ld1b { za0h.b[x13, #5] }, p1/Z, [x23, x22]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + "whilelt p9.b, x14, %x[width]\n" + "subs x20, x20, #0x1\n" + ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" + ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" + "addvl x21, x21, #4\n" + "incb x14\n" + "incb x22\n" "bgt 3b\n" "8:" // K loop: Tails - "cbnz x11, 11f\n" - "mov x25, %x[in]\n" - "whilelt p8.b, x26, %x[width]\n" + "cbnz x25, 11f\n" + "mov x11, %x[in]\n" + "whilelt p8.b, x14, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0x25356141 // dup p1.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - "addvl x27, x27, #2\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26e2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x23, x28]\n" - "ldr x22, [x25, x16, LSL #0x3]\n" - ".inst 0xe01c22c3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x22, x28]\n" - "add x25, x25, #0x8\n" - "add x13, x13, #0x4\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n" + "ldr x26, [x11, x16, LSL #0x3]\n" "add x12, x12, #0x1\n" + ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" + ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n" "cmp x12, x16\n" + "add x11, x11, #0x8\n" + "addvl x21, x21, #2\n" + "add x13, x13, #0x4\n" "blt 9b\n" - "whilelt p9.b, x26, %x[width]\n" - "whilelt p8.b, x26, %x[width]\n" - "mov x19, #0x0\n" + "whilelt p9.b, x14, %x[width]\n" + "whilelt p8.b, x14, %x[width]\n" + "mov x20, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - "add x19, x19, #0x4\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x17\n" + "addvl x21, x21, #2\n" + "add x20, x20, #0x4\n" "blt 10b\n" - "whilelt p9.b, x26, %x[width]\n" + "whilelt p9.b, x14, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x17\n" + "addvl x21, x21, #2\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x27\n" + "mov %x[out], x21\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p8", "p9", "p10", "p11", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp index 81cde6c8ee..25262d3db9 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -32,321 +32,321 @@ void interleave_block<2, 4, VLType::SME, true>( { __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntb x21\n" + "mov x23, %x[width]\n" "mov z20.b, #0x1\n" + "incb x23\n" + "mov x20, %x[width]\n" "mov z19.s, #0x0\n" - "cntb x20\n" "mov z18.s, #0x0\n" + "sub x17, x21, #0x1\n" "cntw x16\n" - "cntw x15, ALL, MUL #2\n" - "cntw x14, ALL, MUL #3\n" - "ptrue p2.b\n" - "mov x19, %x[width]\n" - "incb x19\n" - "sub x19, x19, #0x1\n" - "udiv x19, x19, x20\n" // n_passes = ceildiv(width, VL) - "sub x13, x19, #0x1\n" - "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x11, x19, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "mov x19, %x[width]\n" - "sub x10, x20, #0x1\n" - "ands x10, x19, x10\n" - "csel x10, x10, x20, NE\n" - "add x10, x10, #0x3\n" - "lsr x10, x10, #0x2\n" + "sub x23, x23, #0x1\n" + "ands x17, x20, x17\n" + "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL) + "csel x17, x17, x21, NE\n" + "lsl x22, %x[height], #0x1\n" // height * 2 + "lsl x21, x16, #0x1\n" + "sub x20, x23, #0x1\n" + "add x17, x17, #0x3\n" + "whilelt p9.b, XZR, x22\n" + "whilelt p8.b, x21, x22\n" + "mov x15, #0x0\n" + "cntw x14, ALL, MUL #2\n" + "cntw x11, ALL, MUL #3\n" + "ptrue p4.b\n" + "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2 + "and x10, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "lsr x17, x17, #0x2\n" "sub x9, x16, #0x2\n" "ptrue p11.s\n" - "lsl x20, %x[height], #0x1\n" // height * 2 - "lsl x19, x16, #0x1\n" - "whilelt p9.b, XZR, x20\n" - "whilelt p8.b, x19, x20\n" "zip1 p10.b, p9.b, p8.b\n" "mov x28, %x[row_offset]\n" "mov x27, %x[out]\n" - "mov x26, #0x0\n" - "whilelt p9.b, x26, %x[width]\n" - "whilelt p8.b, x26, %x[width]\n" + "whilelt p9.b, x15, %x[width]\n" + "whilelt p8.b, x15, %x[width]\n" "cbnz %x[first], 1f\n" "addvl x27, x27, #-2\n" - "ld1w { z19.s }, p2/Z, [x27]\n" - "ld1w { z18.s }, p2/Z, [x27, #1, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x27]\n" + "ld1w { z18.s }, p4/Z, [x27, #1, MUL VL]\n" "1:" // K loop: Load row sums: End - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" + "mov x26, %x[in]\n" + "add x25, %x[in], x16, LSL #3\n" + "ldr x24, [x26, #0x0]\n" "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" + "mov x12, #0x0\n" + "ldr x22, [x26, #0x8]\n" + "add x26, x26, #0x10\n" "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "mov x12, #0x0\n" "cbz x9, 3f\n" "2:" // K loop: Charge: Loop - ".inst 0x25246140 // dup p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01c02e0 // ld1b { za0h.b[x12] }, p0/Z, [x23, x28]\n" - ".inst 0x252c6140 // dup p0.b, p8.b/Z, p10.b[w12, #1]\n" - ".inst 0x25646141 // dup p1.b, p8.b/Z, p10.b[w12, #4]\n" - ".inst 0xe01c02c1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256c6140 // dup p0.b, p8.b/Z, p10.b[w12, #5]\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n" + ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0xe01c02e1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x23, x28]\n" + ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n" + ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n" "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c06a4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c0285 // ld1b { za0h.b[x12, #5] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" + ".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n" + "ldr x22, [x26, #0x8]\n" + "add x26, x26, #0x10\n" + ".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n" "add x12, x12, #0x8\n" "cmp x12, x9, LSL #2\n" + "ldr x21, [x25, #0x8]\n" + "add x25, x25, #0x10\n" "blt 2b\n" "3:" // K loop: Charge: End - ".inst 0x25246140 // dup p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01c02e0 // ld1b { za0h.b[x12] }, p0/Z, [x23, x28]\n" - ".inst 0x252c6140 // dup p0.b, p8.b/Z, p10.b[w12, #1]\n" - ".inst 0x25646141 // dup p1.b, p8.b/Z, p10.b[w12, #4]\n" - ".inst 0xe01c02c1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256c6140 // dup p0.b, p8.b/Z, p10.b[w12, #5]\n" - "mov x25, %x[in]\n" - ".inst 0xe01c06a4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x21, x28]\n" - "add x24, %x[in], x16, LSL #3\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n" + ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" + ".inst 0xe01c02e1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x23, x28]\n" + ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n" + ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n" + ".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n" + "mov x26, %x[in]\n" + "add x25, %x[in], x16, LSL #3\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n" "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c0285 // ld1b { za0h.b[x12, #5] }, p0/Z, [x20, x28]\n" - "ldr x22, [x24, #0x0]\n" + "incb x28\n" + "incb x15\n" + "ldr x22, [x26, #0x8]\n" + "add x26, x26, #0x10\n" "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "incb x28\n" - "incb x26\n" - "cbz x13, 9f\n" - "mov x19, x13\n" + "cbz x20, 9f\n" + "mov x20, x20\n" "4:" // K loop: Main loop - "whilelt p8.b, x26, %x[width]\n" + "whilelt p8.b, x15, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "cbz x9, 6f\n" "5:" // K loop: Main loop: First: Loop - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe01c22e2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x23, x28]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - ".inst 0x25756141 // dup p1.b, p8.b/Z, p10.b[w13, #6]\n" - ".inst 0xe01c22c3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x22, x28]\n" - ".inst 0x257d6140 // dup p0.b, p8.b/Z, p10.b[w13, #7]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n" + ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n" + ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n" + ".inst 0x257d6142 // psel p2.b, p8.b/Z, p10.b[w13, #7]\n" "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" + ".inst 0xe01c22c6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x22, x28]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + "ldr x22, [x26, #0x8]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe01c2aa7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n" "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2287 // ld1b { za0h.b[x13, #7] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - ".inst 0xc0828811 // mova z17.s, p2/M, za0v.s[x12]\n" - ".inst 0xc0828890 // mova z16.s, p2/M, za1v.s[x12]\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - "sdot z19.s, z17.b, z20.b\n" - ".inst 0xc0828831 // mova z17.s, p2/M, za0v.s[x12, #1]\n" - "sdot z18.s, z16.b, z20.b\n" - ".inst 0xc08288b0 // mova z16.s, p2/M, za1v.s[x12, #1]\n" + ".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n" + ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" + "sdot z19.s, z16.b, z20.b\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "add x13, x13, #0x8\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "sdot z19.s, z17.b, z20.b\n" - "sdot z18.s, z16.b, z20.b\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + "sdot z18.s, z17.b, z20.b\n" + ".inst 0xe0ae8361 // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n" + ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" + ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n" "add x12, x12, #0x2\n" "cmp x12, x9\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "sdot z19.s, z16.b, z20.b\n" + "sdot z18.s, z17.b, z20.b\n" + "addvl x27, x27, #4\n" + "add x13, x13, #0x8\n" "blt 5b\n" "6:" // K loop: Main loop: First: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe01c22e2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x23, x28]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - ".inst 0x25756141 // dup p1.b, p8.b/Z, p10.b[w13, #6]\n" - ".inst 0xe01c22c3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x22, x28]\n" - ".inst 0x257d6140 // dup p0.b, p8.b/Z, p10.b[w13, #7]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n" + ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" + ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n" + ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n" + ".inst 0x257d6140 // psel p0.b, p8.b/Z, p10.b[w13, #7]\n" + ".inst 0xe01c26c6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x22, x28]\n" + "mov x26, %x[in]\n" + "add x25, %x[in], x16, LSL #3\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0xe01c22a7 // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n" + ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" + ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n" + "sdot z19.s, z16.b, z20.b\n" + ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" + "sdot z18.s, z17.b, z20.b\n" "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" + ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" + "ldr x22, [x26, #0x8]\n" + ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2287 // ld1b { za0h.b[x13, #7] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" + ".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n" + ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n" + "whilelt p9.b, x15, %x[width]\n" + ".inst 0xe0b08b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n" + "incb x15\n" + "add x26, x26, #0x10\n" + "sdot z19.s, z16.b, z20.b\n" + ".inst 0xe0ae8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n" "add x25, x25, #0x10\n" - ".inst 0xc0828811 // mova z17.s, p2/M, za0v.s[x12]\n" - ".inst 0xc0828890 // mova z16.s, p2/M, za1v.s[x12]\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - "sdot z19.s, z17.b, z20.b\n" - ".inst 0xc0828831 // mova z17.s, p2/M, za0v.s[x12, #1]\n" - "sdot z18.s, z16.b, z20.b\n" - ".inst 0xc08288b0 // mova z16.s, p2/M, za1v.s[x12, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x26, %x[width]\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "sdot z19.s, z17.b, z20.b\n" - "incb x26\n" - "sdot z18.s, z16.b, z20.b\n" + "sdot z18.s, z17.b, z20.b\n" "incb x28\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" + ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" "addvl x27, x27, #4\n" - "whilelt p8.b, x26, %x[width]\n" + "whilelt p8.b, x15, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "cbz x9, 8f\n" "7:" // K loop: Main loop: Second: Loop - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe01c22e0 // ld1b { za0h.b[x13] }, p0/Z, [x23, x28]\n" - ".inst 0x252d6140 // dup p0.b, p8.b/Z, p10.b[w13, #1]\n" - ".inst 0x25656141 // dup p1.b, p8.b/Z, p10.b[w13, #4]\n" - ".inst 0xe01c22c1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256d6140 // dup p0.b, p8.b/Z, p10.b[w13, #5]\n" + ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" + ".inst 0xe01c2300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x28]\n" + ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0xe01c22e1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x23, x28]\n" + ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n" + ".inst 0x256d6142 // psel p2.b, p8.b/Z, p10.b[w13, #5]\n" "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" + ".inst 0xe01c22c4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x22, x28]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + "ldr x22, [x26, #0x8]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe01c2aa5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n" "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2285 // ld1b { za0h.b[x13, #5] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - ".inst 0xc0828911 // mova z17.s, p2/M, za2v.s[x12]\n" - ".inst 0xc0828990 // mova z16.s, p2/M, za3v.s[x12]\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - "sdot z19.s, z17.b, z20.b\n" - ".inst 0xc0828931 // mova z17.s, p2/M, za2v.s[x12, #1]\n" - "sdot z18.s, z16.b, z20.b\n" - ".inst 0xc08289b0 // mova z16.s, p2/M, za3v.s[x12, #1]\n" + ".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n" + ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n" + ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n" + "sdot z19.s, z16.b, z20.b\n" ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "add x13, x13, #0x8\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "sdot z19.s, z17.b, z20.b\n" - "sdot z18.s, z16.b, z20.b\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + "sdot z18.s, z17.b, z20.b\n" + ".inst 0xe0ae8369 // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n" + ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" + ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n" "add x12, x12, #0x2\n" "cmp x12, x9\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "sdot z19.s, z16.b, z20.b\n" + "sdot z18.s, z17.b, z20.b\n" + "addvl x27, x27, #4\n" + "add x13, x13, #0x8\n" "blt 7b\n" "8:" // K loop: Main loop: Second: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe01c22e0 // ld1b { za0h.b[x13] }, p0/Z, [x23, x28]\n" - ".inst 0x252d6140 // dup p0.b, p8.b/Z, p10.b[w13, #1]\n" - ".inst 0x25656141 // dup p1.b, p8.b/Z, p10.b[w13, #4]\n" - ".inst 0xe01c22c1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256d6140 // dup p0.b, p8.b/Z, p10.b[w13, #5]\n" + ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" + ".inst 0xe01c2300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x28]\n" + ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n" + ".inst 0xe01c22e1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x23, x28]\n" + ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n" + ".inst 0x256d6140 // psel p0.b, p8.b/Z, p10.b[w13, #5]\n" + ".inst 0xe01c26c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x28]\n" + "mov x26, %x[in]\n" + "add x25, %x[in], x16, LSL #3\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n" + ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n" + ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n" + "sdot z19.s, z16.b, z20.b\n" + ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n" + "sdot z18.s, z17.b, z20.b\n" "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" + ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" + "ldr x22, [x26, #0x8]\n" + ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2285 // ld1b { za0h.b[x13, #5] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" + ".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n" + ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n" + "whilelt p9.b, x15, %x[width]\n" + ".inst 0xe0b08b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n" + "subs x20, x20, #0x1\n" + "add x26, x26, #0x10\n" + "sdot z19.s, z16.b, z20.b\n" + ".inst 0xe0ae8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n" "add x25, x25, #0x10\n" - ".inst 0xc0828911 // mova z17.s, p2/M, za2v.s[x12]\n" - ".inst 0xc0828990 // mova z16.s, p2/M, za3v.s[x12]\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - "sdot z19.s, z17.b, z20.b\n" - ".inst 0xc0828931 // mova z17.s, p2/M, za2v.s[x12, #1]\n" - "sdot z18.s, z16.b, z20.b\n" - ".inst 0xc08289b0 // mova z16.s, p2/M, za3v.s[x12, #1]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x26, %x[width]\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "sdot z19.s, z17.b, z20.b\n" - "incb x26\n" - "sdot z18.s, z16.b, z20.b\n" - "incb x28\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" + "sdot z18.s, z17.b, z20.b\n" + "incb x15\n" + ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" "addvl x27, x27, #4\n" - "subs x19, x19, #0x1\n" + "incb x28\n" "bgt 4b\n" "9:" // K loop: Tails - "cbnz x11, 12f\n" - "mov x25, %x[in]\n" - "whilelt p8.b, x26, %x[width]\n" + "cbnz x10, 12f\n" + "mov x26, %x[in]\n" + "whilelt p8.b, x15, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: First - ".inst 0xc0828811 // mova z17.s, p2/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc0828890 // mova z16.s, p2/M, za1v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0x25356141 // dup p1.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - "sdot z19.s, z17.b, z20.b\n" - "sdot z18.s, z16.b, z20.b\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26e2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x23, x28]\n" - "ldr x22, [x25, x16, LSL #0x3]\n" - "addvl x27, x27, #2\n" - ".inst 0xe01c22c3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x22, x28]\n" - "add x25, x25, #0x8\n" - "add x13, x13, #0x4\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" + "ldr x23, [x26, x16, LSL #0x3]\n" + ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n" "add x12, x12, #0x1\n" + ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" "cmp x12, x16\n" + "sdot z19.s, z16.b, z20.b\n" + "sdot z18.s, z17.b, z20.b\n" + ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n" + "add x26, x26, #0x8\n" + "addvl x27, x27, #2\n" + "add x13, x13, #0x4\n" "blt 10b\n" - "whilelt p9.b, x26, %x[width]\n" - "whilelt p8.b, x26, %x[width]\n" - "mov x19, #0x0\n" + "whilelt p9.b, x15, %x[width]\n" + "whilelt p8.b, x15, %x[width]\n" + "mov x20, #0x0\n" "mov x12, #0x0\n" "11:" // K loop: Tails: Even: Second - ".inst 0xc0828911 // mova z17.s, p2/M, za2v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc0828990 // mova z16.s, p2/M, za3v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - "add x19, x19, #0x4\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n" ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" - "sdot z19.s, z17.b, z20.b\n" - "sdot z18.s, z16.b, z20.b\n" + ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x17\n" + "sdot z19.s, z16.b, z20.b\n" + "sdot z18.s, z17.b, z20.b\n" + "addvl x27, x27, #2\n" + "add x20, x20, #0x4\n" "blt 11b\n" - "whilelt p9.b, x26, %x[width]\n" + "whilelt p9.b, x15, %x[width]\n" "b 14f\n" "12:" // K loop: Tails: Odd "mov x12, #0x0\n" "13:" // K loop: Tails: Odd: Loop - ".inst 0xc0828811 // mova z17.s, p2/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc0828890 // mova z16.s, p2/M, za1v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" - "sdot z19.s, z17.b, z20.b\n" - "sdot z18.s, z16.b, z20.b\n" + ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x17\n" + "sdot z19.s, z16.b, z20.b\n" + "sdot z18.s, z17.b, z20.b\n" + "addvl x27, x27, #2\n" "blt 13b\n" "14:" // K loop: End - "st1w { z19.s }, p2, [x27]\n" - "st1w { z18.s }, p2, [x27, #1, MUL VL]\n" + "st1w { z19.s }, p4, [x27]\n" + "st1w { z18.s }, p4, [x27, #1, MUL VL]\n" "addvl x27, x27, #2\n" "mov %x[out], x27\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p8", "p9", "p10", "p11", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp index cd4a76654b..683a315a96 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,265 +34,265 @@ void interleave_block<2, 4, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" - "cntb x20\n" + "cntb x21\n" + "mov x23, %x[width]\n" + "incb x23\n" + "mov x20, %x[width]\n" + "sub x17, x21, #0x1\n" "cntw x16\n" - "cntw x15, ALL, MUL #2\n" - "cntw x14, ALL, MUL #3\n" - "mov x19, %x[width]\n" - "incb x19\n" - "sub x19, x19, #0x1\n" - "udiv x19, x19, x20\n" // n_passes = ceildiv(width, VL) - "sub x13, x19, #0x1\n" - "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x11, x19, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "mov x19, %x[width]\n" - "sub x10, x20, #0x1\n" - "ands x10, x19, x10\n" - "csel x10, x10, x20, NE\n" - "add x10, x10, #0x3\n" - "lsr x10, x10, #0x2\n" - "sub x9, x16, #0x2\n" + "sub x23, x23, #0x1\n" + "ands x17, x20, x17\n" + "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL) + "csel x17, x17, x21, NE\n" + "lsl x22, %x[height], #0x1\n" // height * 2 + "lsl x21, x16, #0x1\n" + "sub x20, x23, #0x1\n" + "add x17, x17, #0x3\n" + "sub x15, x16, #0x2\n" + "whilelt p9.b, XZR, x22\n" + "whilelt p8.b, x21, x22\n" + "mov x14, #0x0\n" + "mov x11, %x[in]\n" + "add x10, %x[in], x16, LSL #3\n" + "ldr x9, [x11, #0x0]\n" + "cntw x28, ALL, MUL #2\n" + "cntw x27, ALL, MUL #3\n" + "ldr x26, [x10, #0x0]\n" + "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2 + "and x25, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "ldr x24, [x11, #0x8]\n" + "lsr x17, x17, #0x2\n" "ptrue p11.s\n" - "lsl x20, %x[height], #0x1\n" // height * 2 - "lsl x19, x16, #0x1\n" - "whilelt p9.b, XZR, x20\n" - "whilelt p8.b, x19, x20\n" + "ldr x23, [x10, #0x8]\n" "zip1 p10.b, p9.b, p8.b\n" - "mov x28, %x[row_offset]\n" - "mov x27, %x[out]\n" - "mov x26, #0x0\n" - "whilelt p9.b, x26, %x[width]\n" - "whilelt p8.b, x26, %x[width]\n" - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" + "mov x22, %x[row_offset]\n" + "mov x21, %x[out]\n" + "whilelt p9.b, x14, %x[width]\n" + "whilelt p8.b, x14, %x[width]\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" "mov x12, #0x0\n" - "cbz x9, 2f\n" + "cbz x15, 2f\n" "1:" // K loop: Charge: Loop - ".inst 0x25246140 // dup p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01c02e0 // ld1b { za0h.b[x12] }, p0/Z, [x23, x28]\n" - ".inst 0x252c6140 // dup p0.b, p8.b/Z, p10.b[w12, #1]\n" - ".inst 0x25646141 // dup p1.b, p8.b/Z, p10.b[w12, #4]\n" - ".inst 0xe01c02c1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256c6140 // dup p0.b, p8.b/Z, p10.b[w12, #5]\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c06a4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c0285 // ld1b { za0h.b[x12, #5] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe0160120 // ld1b { za0h.b[x12] }, p0/Z, [x9, x22]\n" + ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0160341 // ld1b { za0h.b[x12, #1] }, p0/Z, [x26, x22]\n" + ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n" + ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0xe0160704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x22]\n" + "ldr x24, [x11, #0x8]\n" + "add x11, x11, #0x10\n" + ".inst 0xe01602e5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x23, x22]\n" "add x12, x12, #0x8\n" - "cmp x12, x9, LSL #2\n" + "cmp x12, x15, LSL #2\n" + "ldr x23, [x10, #0x8]\n" + "add x10, x10, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End - ".inst 0x25246140 // dup p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01c02e0 // ld1b { za0h.b[x12] }, p0/Z, [x23, x28]\n" - ".inst 0x252c6140 // dup p0.b, p8.b/Z, p10.b[w12, #1]\n" - ".inst 0x25646141 // dup p1.b, p8.b/Z, p10.b[w12, #4]\n" - ".inst 0xe01c02c1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256c6140 // dup p0.b, p8.b/Z, p10.b[w12, #5]\n" - "mov x25, %x[in]\n" - ".inst 0xe01c06a4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x21, x28]\n" - "add x24, %x[in], x16, LSL #3\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c0285 // ld1b { za0h.b[x12, #5] }, p0/Z, [x20, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "incb x28\n" - "incb x26\n" - "cbz x13, 8f\n" - "mov x19, x13\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe0160120 // ld1b { za0h.b[x12] }, p0/Z, [x9, x22]\n" + ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" + ".inst 0xe0160341 // ld1b { za0h.b[x12, #1] }, p0/Z, [x26, x22]\n" + ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n" + ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n" + ".inst 0xe0160704 // ld1b { za0h.b[x12, #4] }, p1/Z, [x24, x22]\n" + "mov x11, %x[in]\n" + "add x10, %x[in], x16, LSL #3\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe01602e5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x23, x22]\n" + "ldr x26, [x10, #0x0]\n" + "incb x22\n" + "incb x14\n" + "ldr x24, [x11, #0x8]\n" + "add x11, x11, #0x10\n" + "ldr x23, [x10, #0x8]\n" + "add x10, x10, #0x10\n" + "cbz x20, 8f\n" + "mov x20, x20\n" "3:" // K loop: Main loop - "whilelt p8.b, x26, %x[width]\n" + "whilelt p8.b, x14, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x9, 5f\n" + "cbz x15, 5f\n" "4:" // K loop: Main loop: First: Loop - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe01c22e2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x23, x28]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - ".inst 0x25756141 // dup p1.b, p8.b/Z, p10.b[w13, #6]\n" - ".inst 0xe01c22c3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x22, x28]\n" - ".inst 0x257d6140 // dup p0.b, p8.b/Z, p10.b[w13, #7]\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2287 // ld1b { za0h.b[x13, #7] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n" + ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n" + ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n" + ".inst 0x257d6142 // psel p2.b, p8.b/Z, p10.b[w13, #7]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0xe0162306 // ld1b { za0h.b[x13, #6] }, p0/Z, [x24, x22]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0162ae7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x23, x22]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + "add x11, x11, #0x10\n" + ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x10, x10, #0x10\n" "add x13, x13, #0x8\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x9\n" + "cmp x12, x15\n" + "addvl x21, x21, #4\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe01c22e2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x23, x28]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - ".inst 0x25756141 // dup p1.b, p8.b/Z, p10.b[w13, #6]\n" - ".inst 0xe01c22c3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x22, x28]\n" - ".inst 0x257d6140 // dup p0.b, p8.b/Z, p10.b[w13, #7]\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2287 // ld1b { za0h.b[x13, #7] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x26, %x[width]\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "incb x26\n" - "incb x28\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" - "whilelt p8.b, x26, %x[width]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n" + ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" + ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n" + ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n" + "mov x11, %x[in]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0162306 // ld1b { za0h.b[x13, #6] }, p0/Z, [x24, x22]\n" + "add x10, %x[in], x16, LSL #3\n" + ".inst 0x257d6141 // psel p1.b, p8.b/Z, p10.b[w13, #7]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe01626e7 // ld1b { za0h.b[x13, #7] }, p1/Z, [x23, x22]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + "whilelt p9.b, x14, %x[width]\n" + "incb x14\n" + ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" + ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" + "addvl x21, x21, #4\n" + "incb x22\n" + "whilelt p8.b, x14, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" - "cbz x9, 7f\n" + "cbz x15, 7f\n" "6:" // K loop: Main loop: Second: Loop - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe01c22e0 // ld1b { za0h.b[x13] }, p0/Z, [x23, x28]\n" - ".inst 0x252d6140 // dup p0.b, p8.b/Z, p10.b[w13, #1]\n" - ".inst 0x25656141 // dup p1.b, p8.b/Z, p10.b[w13, #4]\n" - ".inst 0xe01c22c1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256d6140 // dup p0.b, p8.b/Z, p10.b[w13, #5]\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2285 // ld1b { za0h.b[x13, #5] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" + ".inst 0xe0162120 // ld1b { za0h.b[x13] }, p0/Z, [x9, x22]\n" + ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0162341 // ld1b { za0h.b[x13, #1] }, p0/Z, [x26, x22]\n" + ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n" + ".inst 0x256d6142 // psel p2.b, p8.b/Z, p10.b[w13, #5]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0xe0162304 // ld1b { za0h.b[x13, #4] }, p0/Z, [x24, x22]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0162ae5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x23, x22]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + "add x11, x11, #0x10\n" + ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x10, x10, #0x10\n" "add x13, x13, #0x8\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x9\n" + "cmp x12, x15\n" + "addvl x21, x21, #4\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe01c22e0 // ld1b { za0h.b[x13] }, p0/Z, [x23, x28]\n" - ".inst 0x252d6140 // dup p0.b, p8.b/Z, p10.b[w13, #1]\n" - ".inst 0x25656141 // dup p1.b, p8.b/Z, p10.b[w13, #4]\n" - ".inst 0xe01c22c1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256d6140 // dup p0.b, p8.b/Z, p10.b[w13, #5]\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2285 // ld1b { za0h.b[x13, #5] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x26, %x[width]\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "incb x26\n" - "incb x28\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" - "subs x19, x19, #0x1\n" + ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" + ".inst 0xe0162120 // ld1b { za0h.b[x13] }, p0/Z, [x9, x22]\n" + ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n" + ".inst 0xe0162341 // ld1b { za0h.b[x13, #1] }, p0/Z, [x26, x22]\n" + ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n" + "mov x11, %x[in]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0xe0162304 // ld1b { za0h.b[x13, #4] }, p0/Z, [x24, x22]\n" + "add x10, %x[in], x16, LSL #3\n" + ".inst 0x256d6141 // psel p1.b, p8.b/Z, p10.b[w13, #5]\n" + "ldr x26, [x10, #0x0]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe01626e5 // ld1b { za0h.b[x13, #5] }, p1/Z, [x23, x22]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + "whilelt p9.b, x14, %x[width]\n" + "subs x20, x20, #0x1\n" + ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" + ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" + "addvl x21, x21, #4\n" + "incb x14\n" + "incb x22\n" "bgt 3b\n" "8:" // K loop: Tails - "cbnz x11, 11f\n" - "mov x25, %x[in]\n" - "whilelt p8.b, x26, %x[width]\n" + "cbnz x25, 11f\n" + "mov x11, %x[in]\n" + "whilelt p8.b, x14, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0x25356141 // dup p1.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - "addvl x27, x27, #2\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26e2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x23, x28]\n" - "ldr x22, [x25, x16, LSL #0x3]\n" - ".inst 0xe01c22c3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x22, x28]\n" - "add x25, x25, #0x8\n" - "add x13, x13, #0x4\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + "ldr x9, [x11, #0x0]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n" + "ldr x26, [x11, x16, LSL #0x3]\n" "add x12, x12, #0x1\n" + ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" + ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n" "cmp x12, x16\n" + "add x11, x11, #0x8\n" + "addvl x21, x21, #2\n" + "add x13, x13, #0x4\n" "blt 9b\n" - "whilelt p9.b, x26, %x[width]\n" - "whilelt p8.b, x26, %x[width]\n" - "mov x19, #0x0\n" + "whilelt p9.b, x14, %x[width]\n" + "whilelt p8.b, x14, %x[width]\n" + "mov x20, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - "add x19, x19, #0x4\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x17\n" + "addvl x21, x21, #2\n" + "add x20, x20, #0x4\n" "blt 10b\n" - "whilelt p9.b, x26, %x[width]\n" + "whilelt p9.b, x14, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x17\n" + "addvl x21, x21, #2\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x27\n" + "mov %x[out], x21\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p8", "p9", "p10", "p11", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp index 5a71613feb..e7571f7da7 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -32,321 +32,321 @@ void interleave_block<2, 4, VLType::SME, true>( { __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntb x21\n" + "mov x23, %x[width]\n" "mov z20.b, #0x1\n" + "incb x23\n" + "mov x20, %x[width]\n" "mov z19.s, #0x0\n" - "cntb x20\n" "mov z18.s, #0x0\n" + "sub x17, x21, #0x1\n" "cntw x16\n" - "cntw x15, ALL, MUL #2\n" - "cntw x14, ALL, MUL #3\n" - "ptrue p2.b\n" - "mov x19, %x[width]\n" - "incb x19\n" - "sub x19, x19, #0x1\n" - "udiv x19, x19, x20\n" // n_passes = ceildiv(width, VL) - "sub x13, x19, #0x1\n" - "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x11, x19, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "mov x19, %x[width]\n" - "sub x10, x20, #0x1\n" - "ands x10, x19, x10\n" - "csel x10, x10, x20, NE\n" - "add x10, x10, #0x3\n" - "lsr x10, x10, #0x2\n" + "sub x23, x23, #0x1\n" + "ands x17, x20, x17\n" + "udiv x23, x23, x21\n" // n_passes = ceildiv(width, VL) + "csel x17, x17, x21, NE\n" + "lsl x22, %x[height], #0x1\n" // height * 2 + "lsl x21, x16, #0x1\n" + "sub x20, x23, #0x1\n" + "add x17, x17, #0x3\n" + "whilelt p9.b, XZR, x22\n" + "whilelt p8.b, x21, x22\n" + "mov x15, #0x0\n" + "cntw x14, ALL, MUL #2\n" + "cntw x11, ALL, MUL #3\n" + "ptrue p4.b\n" + "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2 + "and x10, x23, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "lsr x17, x17, #0x2\n" "sub x9, x16, #0x2\n" "ptrue p11.s\n" - "lsl x20, %x[height], #0x1\n" // height * 2 - "lsl x19, x16, #0x1\n" - "whilelt p9.b, XZR, x20\n" - "whilelt p8.b, x19, x20\n" "zip1 p10.b, p9.b, p8.b\n" "mov x28, %x[row_offset]\n" "mov x27, %x[out]\n" - "mov x26, #0x0\n" - "whilelt p9.b, x26, %x[width]\n" - "whilelt p8.b, x26, %x[width]\n" + "whilelt p9.b, x15, %x[width]\n" + "whilelt p8.b, x15, %x[width]\n" "cbnz %x[first], 1f\n" "addvl x27, x27, #-2\n" - "ld1w { z19.s }, p2/Z, [x27]\n" - "ld1w { z18.s }, p2/Z, [x27, #1, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x27]\n" + "ld1w { z18.s }, p4/Z, [x27, #1, MUL VL]\n" "1:" // K loop: Load row sums: End - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" + "mov x26, %x[in]\n" + "add x25, %x[in], x16, LSL #3\n" + "ldr x24, [x26, #0x0]\n" "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" + "mov x12, #0x0\n" + "ldr x22, [x26, #0x8]\n" + "add x26, x26, #0x10\n" "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "mov x12, #0x0\n" "cbz x9, 3f\n" "2:" // K loop: Charge: Loop - ".inst 0x25246140 // dup p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01c02e0 // ld1b { za0h.b[x12] }, p0/Z, [x23, x28]\n" - ".inst 0x252c6140 // dup p0.b, p8.b/Z, p10.b[w12, #1]\n" - ".inst 0x25646141 // dup p1.b, p8.b/Z, p10.b[w12, #4]\n" - ".inst 0xe01c02c1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256c6140 // dup p0.b, p8.b/Z, p10.b[w12, #5]\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n" + ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0xe01c02e1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x23, x28]\n" + ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n" + ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n" "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c06a4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c0285 // ld1b { za0h.b[x12, #5] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" + ".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n" + "ldr x22, [x26, #0x8]\n" + "add x26, x26, #0x10\n" + ".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n" "add x12, x12, #0x8\n" "cmp x12, x9, LSL #2\n" + "ldr x21, [x25, #0x8]\n" + "add x25, x25, #0x10\n" "blt 2b\n" "3:" // K loop: Charge: End - ".inst 0x25246140 // dup p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01c02e0 // ld1b { za0h.b[x12] }, p0/Z, [x23, x28]\n" - ".inst 0x252c6140 // dup p0.b, p8.b/Z, p10.b[w12, #1]\n" - ".inst 0x25646141 // dup p1.b, p8.b/Z, p10.b[w12, #4]\n" - ".inst 0xe01c02c1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256c6140 // dup p0.b, p8.b/Z, p10.b[w12, #5]\n" - "mov x25, %x[in]\n" - ".inst 0xe01c06a4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x21, x28]\n" - "add x24, %x[in], x16, LSL #3\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n" + ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" + ".inst 0xe01c02e1 // ld1b { za0h.b[x12, #1] }, p0/Z, [x23, x28]\n" + ".inst 0x25646141 // psel p1.b, p8.b/Z, p10.b[w12, #4]\n" + ".inst 0x256c6140 // psel p0.b, p8.b/Z, p10.b[w12, #5]\n" + ".inst 0xe01c06c4 // ld1b { za0h.b[x12, #4] }, p1/Z, [x22, x28]\n" + "mov x26, %x[in]\n" + "add x25, %x[in], x16, LSL #3\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0xe01c02a5 // ld1b { za0h.b[x12, #5] }, p0/Z, [x21, x28]\n" "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c0285 // ld1b { za0h.b[x12, #5] }, p0/Z, [x20, x28]\n" - "ldr x22, [x24, #0x0]\n" + "incb x28\n" + "incb x15\n" + "ldr x22, [x26, #0x8]\n" + "add x26, x26, #0x10\n" "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "incb x28\n" - "incb x26\n" - "cbz x13, 9f\n" - "mov x19, x13\n" + "cbz x20, 9f\n" + "mov x20, x20\n" "4:" // K loop: Main loop - "whilelt p8.b, x26, %x[width]\n" + "whilelt p8.b, x15, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "cbz x9, 6f\n" "5:" // K loop: Main loop: First: Loop - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe01c22e2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x23, x28]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - ".inst 0x25756141 // dup p1.b, p8.b/Z, p10.b[w13, #6]\n" - ".inst 0xe01c22c3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x22, x28]\n" - ".inst 0x257d6140 // dup p0.b, p8.b/Z, p10.b[w13, #7]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n" + ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n" + ".inst 0x25756140 // psel p0.b, p8.b/Z, p10.b[w13, #6]\n" + ".inst 0x257d6142 // psel p2.b, p8.b/Z, p10.b[w13, #7]\n" "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" + ".inst 0xe01c22c6 // ld1b { za0h.b[x13, #6] }, p0/Z, [x22, x28]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + "ldr x22, [x26, #0x8]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe01c2aa7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n" "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2287 // ld1b { za0h.b[x13, #7] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - ".inst 0xc0828811 // mova z17.s, p2/M, za0v.s[x12]\n" - ".inst 0xc0828890 // mova z16.s, p2/M, za1v.s[x12]\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n" + ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" "udot z19.s, z17.b, z20.b\n" - ".inst 0xc0828831 // mova z17.s, p2/M, za0v.s[x12, #1]\n" - "udot z18.s, z16.b, z20.b\n" - ".inst 0xc08288b0 // mova z16.s, p2/M, za1v.s[x12, #1]\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "add x13, x13, #0x8\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "udot z19.s, z17.b, z20.b\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" "udot z18.s, z16.b, z20.b\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0xe0ae8361 // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n" + ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" + ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n" "add x12, x12, #0x2\n" "cmp x12, x9\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "udot z19.s, z17.b, z20.b\n" + "udot z18.s, z16.b, z20.b\n" + "addvl x27, x27, #4\n" + "add x13, x13, #0x8\n" "blt 5b\n" "6:" // K loop: Main loop: First: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25356140 // dup p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe01c22e2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x23, x28]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - ".inst 0x25756141 // dup p1.b, p8.b/Z, p10.b[w13, #6]\n" - ".inst 0xe01c22c3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x22, x28]\n" - ".inst 0x257d6140 // dup p0.b, p8.b/Z, p10.b[w13, #7]\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2287 // ld1b { za0h.b[x13, #7] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - ".inst 0xc0828811 // mova z17.s, p2/M, za0v.s[x12]\n" - ".inst 0xc0828890 // mova z16.s, p2/M, za1v.s[x12]\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n" + ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" + ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n" + ".inst 0x25756141 // psel p1.b, p8.b/Z, p10.b[w13, #6]\n" + ".inst 0x257d6140 // psel p0.b, p8.b/Z, p10.b[w13, #7]\n" + ".inst 0xe01c26c6 // ld1b { za0h.b[x13, #6] }, p1/Z, [x22, x28]\n" + "mov x26, %x[in]\n" + "add x25, %x[in], x16, LSL #3\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0xe01c22a7 // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n" + ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" + ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n" "udot z19.s, z17.b, z20.b\n" - ".inst 0xc0828831 // mova z17.s, p2/M, za0v.s[x12, #1]\n" + ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" "udot z18.s, z16.b, z20.b\n" - ".inst 0xc08288b0 // mova z16.s, p2/M, za1v.s[x12, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x26, %x[width]\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" + "ldr x23, [x25, #0x0]\n" + ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" + "ldr x22, [x26, #0x8]\n" + ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + "ldr x21, [x25, #0x8]\n" + ".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n" + ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n" + "whilelt p9.b, x15, %x[width]\n" + ".inst 0xe0b08b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n" + "incb x15\n" + "add x26, x26, #0x10\n" "udot z19.s, z17.b, z20.b\n" - "incb x26\n" + ".inst 0xe0ae8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n" + "add x25, x25, #0x10\n" "udot z18.s, z16.b, z20.b\n" "incb x28\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" + ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" "addvl x27, x27, #4\n" - "whilelt p8.b, x26, %x[width]\n" + "whilelt p8.b, x15, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "cbz x9, 8f\n" "7:" // K loop: Main loop: Second: Loop - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe01c22e0 // ld1b { za0h.b[x13] }, p0/Z, [x23, x28]\n" - ".inst 0x252d6140 // dup p0.b, p8.b/Z, p10.b[w13, #1]\n" - ".inst 0x25656141 // dup p1.b, p8.b/Z, p10.b[w13, #4]\n" - ".inst 0xe01c22c1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256d6140 // dup p0.b, p8.b/Z, p10.b[w13, #5]\n" + ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" + ".inst 0xe01c2300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x28]\n" + ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0xe01c22e1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x23, x28]\n" + ".inst 0x25656140 // psel p0.b, p8.b/Z, p10.b[w13, #4]\n" + ".inst 0x256d6142 // psel p2.b, p8.b/Z, p10.b[w13, #5]\n" "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" + ".inst 0xe01c22c4 // ld1b { za0h.b[x13, #4] }, p0/Z, [x22, x28]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + "ldr x22, [x26, #0x8]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe01c2aa5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n" "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2285 // ld1b { za0h.b[x13, #5] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - ".inst 0xc0828911 // mova z17.s, p2/M, za2v.s[x12]\n" - ".inst 0xc0828990 // mova z16.s, p2/M, za3v.s[x12]\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n" + ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n" + ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n" "udot z19.s, z17.b, z20.b\n" - ".inst 0xc0828931 // mova z17.s, p2/M, za2v.s[x12, #1]\n" - "udot z18.s, z16.b, z20.b\n" - ".inst 0xc08289b0 // mova z16.s, p2/M, za3v.s[x12, #1]\n" ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "add x13, x13, #0x8\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "udot z19.s, z17.b, z20.b\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" "udot z18.s, z16.b, z20.b\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0xe0ae8369 // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n" + ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" + ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n" "add x12, x12, #0x2\n" "cmp x12, x9\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "udot z19.s, z17.b, z20.b\n" + "udot z18.s, z16.b, z20.b\n" + "addvl x27, x27, #4\n" + "add x13, x13, #0x8\n" "blt 7b\n" "8:" // K loop: Main loop: Second: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe01c22e0 // ld1b { za0h.b[x13] }, p0/Z, [x23, x28]\n" - ".inst 0x252d6140 // dup p0.b, p8.b/Z, p10.b[w13, #1]\n" - ".inst 0x25656141 // dup p1.b, p8.b/Z, p10.b[w13, #4]\n" - ".inst 0xe01c22c1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x22, x28]\n" - ".inst 0x256d6140 // dup p0.b, p8.b/Z, p10.b[w13, #5]\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26a4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x21, x28]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - ".inst 0xe01c2285 // ld1b { za0h.b[x13, #5] }, p0/Z, [x20, x28]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - ".inst 0xc0828911 // mova z17.s, p2/M, za2v.s[x12]\n" - ".inst 0xc0828990 // mova z16.s, p2/M, za3v.s[x12]\n" - "add x24, x24, #0x10\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25256140 // psel p0.b, p8.b/Z, p10.b[w13]\n" + ".inst 0xe01c2300 // ld1b { za0h.b[x13] }, p0/Z, [x24, x28]\n" + ".inst 0x252d6140 // psel p0.b, p8.b/Z, p10.b[w13, #1]\n" + ".inst 0xe01c22e1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x23, x28]\n" + ".inst 0x25656141 // psel p1.b, p8.b/Z, p10.b[w13, #4]\n" + ".inst 0x256d6140 // psel p0.b, p8.b/Z, p10.b[w13, #5]\n" + ".inst 0xe01c26c4 // ld1b { za0h.b[x13, #4] }, p1/Z, [x22, x28]\n" + "mov x26, %x[in]\n" + "add x25, %x[in], x16, LSL #3\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n" + ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n" + ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n" "udot z19.s, z17.b, z20.b\n" - ".inst 0xc0828931 // mova z17.s, p2/M, za2v.s[x12, #1]\n" + ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n" "udot z18.s, z16.b, z20.b\n" - ".inst 0xc08289b0 // mova z16.s, p2/M, za3v.s[x12, #1]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25706d21 // dup p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0x25706d20 // dup p0.s, p11.s/Z, p9.s[w12, #1]\n" - "whilelt p9.b, x26, %x[width]\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" + "ldr x23, [x25, #0x0]\n" + ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" + "ldr x22, [x26, #0x8]\n" + ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" + ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n" + ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" + "ldr x21, [x25, #0x8]\n" + ".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n" + ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n" + "whilelt p9.b, x15, %x[width]\n" + ".inst 0xe0b08b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n" + "subs x20, x20, #0x1\n" + "add x26, x26, #0x10\n" "udot z19.s, z17.b, z20.b\n" - "incb x26\n" + ".inst 0xe0ae8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n" + "add x25, x25, #0x10\n" "udot z18.s, z16.b, z20.b\n" - "incb x28\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" + "incb x15\n" + ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" "addvl x27, x27, #4\n" - "subs x19, x19, #0x1\n" + "incb x28\n" "bgt 4b\n" "9:" // K loop: Tails - "cbnz x11, 12f\n" - "mov x25, %x[in]\n" - "whilelt p8.b, x26, %x[width]\n" + "cbnz x10, 12f\n" + "mov x26, %x[in]\n" + "whilelt p8.b, x15, %x[width]\n" "mov x13, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: First - ".inst 0xc0828811 // mova z17.s, p2/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc0828890 // mova z16.s, p2/M, za1v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0x25356141 // dup p1.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" + "ldr x24, [x26, #0x0]\n" + ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" + ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" + "ldr x23, [x26, x16, LSL #0x3]\n" + ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n" + "add x12, x12, #0x1\n" + ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" + "cmp x12, x16\n" "udot z19.s, z17.b, z20.b\n" "udot z18.s, z16.b, z20.b\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe01c26e2 // ld1b { za0h.b[x13, #2] }, p1/Z, [x23, x28]\n" - "ldr x22, [x25, x16, LSL #0x3]\n" + ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n" + "add x26, x26, #0x8\n" "addvl x27, x27, #2\n" - ".inst 0xe01c22c3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x22, x28]\n" - "add x25, x25, #0x8\n" "add x13, x13, #0x4\n" - "add x12, x12, #0x1\n" - "cmp x12, x16\n" "blt 10b\n" - "whilelt p9.b, x26, %x[width]\n" - "whilelt p8.b, x26, %x[width]\n" - "mov x19, #0x0\n" + "whilelt p9.b, x15, %x[width]\n" + "whilelt p8.b, x15, %x[width]\n" + "mov x20, #0x0\n" "mov x12, #0x0\n" "11:" // K loop: Tails: Even: Second - ".inst 0xc0828911 // mova z17.s, p2/M, za2v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc0828990 // mova z16.s, p2/M, za3v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - "add x19, x19, #0x4\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n" ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" + ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n" + "add x12, x12, #0x1\n" + "cmp x12, x17\n" "udot z19.s, z17.b, z20.b\n" "udot z18.s, z16.b, z20.b\n" - "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "addvl x27, x27, #2\n" + "add x20, x20, #0x4\n" "blt 11b\n" - "whilelt p9.b, x26, %x[width]\n" + "whilelt p9.b, x15, %x[width]\n" "b 14f\n" "12:" // K loop: Tails: Odd "mov x12, #0x0\n" "13:" // K loop: Tails: Odd: Loop - ".inst 0xc0828811 // mova z17.s, p2/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc0828890 // mova z16.s, p2/M, za1v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" + ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" + "add x12, x12, #0x1\n" + "cmp x12, x17\n" "udot z19.s, z17.b, z20.b\n" "udot z18.s, z16.b, z20.b\n" - "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "addvl x27, x27, #2\n" "blt 13b\n" "14:" // K loop: End - "st1w { z19.s }, p2, [x27]\n" - "st1w { z18.s }, p2, [x27, #1, MUL VL]\n" + "st1w { z19.s }, p4, [x27]\n" + "st1w { z18.s }, p4, [x27, #1, MUL VL]\n" "addvl x27, x27, #2\n" "mov %x[out], x27\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p8", "p9", "p10", "p11", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp index 3ea616f007..522f310cc0 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,66 +34,66 @@ void interleave_block<2, 1, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cnth x28\n" + "cmp %x[height], x28\n" "cnth x27\n" - "cmp %x[height], x27\n" - "cnth x26\n" - "csel x27, %x[height], x27, LT\n" - "mov x25, #0x0\n" + "csel x28, %x[height], x28, LT\n" + "mov x26, #0x0\n" "ptrue p13.s\n" - "sub x27, x27, #0x1\n" + "sub x28, x28, #0x1\n" "whilelt p12.h, XZR, %x[height]\n" - "whilelt p11.h, x26, %x[height]\n" - "mov x24, %x[row_offset]\n" - "mov x23, %x[out]\n" - "whilelt p10.h, x25, %x[width]\n" - "whilelt p9.h, x25, %x[width]\n" - "whilelt p8.h, x25, %x[width]\n" + "whilelt p11.h, x27, %x[height]\n" + "mov x25, %x[row_offset]\n" + "mov x24, %x[out]\n" + "whilelt p10.h, x26, %x[width]\n" + "whilelt p9.h, x26, %x[width]\n" + "whilelt p8.h, x26, %x[width]\n" "1:" // Width loop - "add x22, %x[in], XZR, LSL #3\n" - "add x19, %x[in], x26, LSL #3\n" - "ldr x21, [x22], #0x8\n" + "add x23, %x[in], XZR, LSL #3\n" + "add x20, %x[in], x27, LSL #3\n" + "ldr x22, [x23], #0x8\n" "mov x12, #0x0\n" - "ldr x20, [x19], #0x8\n" - "cbz x27, 3f\n" + "ldr x21, [x20], #0x8\n" + "cbz x28, 3f\n" "2:" // Loads: Loop ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n" ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" - ".inst 0xe05806a0 // ld1h { za0h.h[x12] }, p1/Z, [x21, x24, LSL #1]\n" - "ldr x21, [x22], #0x8\n" - ".inst 0xe0580288 // ld1h { za1h.h[x12] }, p0/Z, [x20, x24, LSL #1]\n" + ".inst 0xe05906c0 // ld1h { za0h.h[x12] }, p1/Z, [x22, x25, LSL #1]\n" + "ldr x22, [x23], #0x8\n" + ".inst 0xe05902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n" "add x12, x12, #0x2\n" - "cmp x12, x27, LSL #1\n" - "ldr x20, [x19], #0x8\n" + "cmp x12, x28, LSL #1\n" + "ldr x21, [x20], #0x8\n" "blt 2b\n" "3:" // Loads: Tail - "sub x19, %x[width], x25\n" + "sub x20, %x[width], x26\n" ".inst 0x25286580 // psel p0.h, p9.h/Z, p12.h[w12]\n" - ".inst 0xe05802a0 // ld1h { za0h.h[x12] }, p0/Z, [x21, x24, LSL #1]\n" + ".inst 0xe05902c0 // ld1h { za0h.h[x12] }, p0/Z, [x22, x25, LSL #1]\n" ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" - "cmp x19, x26\n" - ".inst 0xe0580288 // ld1h { za1h.h[x12] }, p0/Z, [x20, x24, LSL #1]\n" + "cmp x20, x27\n" + ".inst 0xe05902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x25, LSL #1]\n" "mov x12, #0x0\n" - "csel x19, x19, x26, LT\n" + "csel x20, x20, x27, LT\n" "4:" // Stores: Loop ".inst 0x25287540 // psel p0.h, p13.h/Z, p10.h[w12]\n" - ".inst 0xe07f82e0 // st1h { za0v.h[x12] }, p0/Z, [x23, XZR, LSL #1]\n" + ".inst 0xe07f8300 // st1h { za0v.h[x12] }, p0/Z, [x24, XZR, LSL #1]\n" ".inst 0x25287540 // psel p0.h, p13.h/Z, p10.h[w12]\n" - ".inst 0xe07a82e8 // st1h { za1v.h[x12] }, p0/Z, [x23, x26, LSL #1]\n" + ".inst 0xe07b8308 // st1h { za1v.h[x12] }, p0/Z, [x24, x27, LSL #1]\n" "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "addvl x23, x23, #4\n" + "cmp x12, x20\n" + "addvl x24, x24, #4\n" "blt 4b\n" + "inch x26\n" + "whilelt p10.h, x26, %x[width]\n" + "whilelt p9.h, x26, %x[width]\n" + "whilelt p8.h, x26, %x[width]\n" "inch x25\n" - "whilelt p10.h, x25, %x[width]\n" - "whilelt p9.h, x25, %x[width]\n" - "whilelt p8.h, x25, %x[width]\n" - "inch x24\n" "b.any 1b\n" - "mov %x[out], x23\n" + "mov %x[out], x24\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp index d7025420e9..949e003598 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,256 +34,256 @@ void interleave_block<2, 1, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "mov x22, %x[width]\n" + "incw x22\n" "cntw x16\n" - "cntw x15, ALL, MUL #2\n" - "cntw x14, ALL, MUL #3\n" - "mov x19, %x[width]\n" - "incw x19\n" - "sub x19, x19, #0x1\n" - "udiv x19, x19, x16\n" // n_passes = ceildiv(width, VL) - "sub x13, x19, #0x1\n" - "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x11, x19, #0x1\n" // odd_tail = bool(n_passes & 0x1) - "mov x19, %x[width]\n" - "sub x10, x16, #0x1\n" - "ands x10, x19, x10\n" - "csel x10, x10, x16, NE\n" - "sub x9, x16, #0x2\n" + "sub x22, x22, #0x1\n" + "udiv x22, x22, x16\n" // n_passes = ceildiv(width, VL) + "mov x21, %x[width]\n" + "sub x15, x16, #0x1\n" + "sub x20, x22, #0x1\n" + "ands x15, x21, x15\n" + "sub x14, x16, #0x2\n" + "mov x13, #0x0\n" + "mov x11, %x[in]\n" + "ldr x10, [x11, #0x0]\n" + "add x9, %x[in], x16, LSL #3\n" + "cntw x28, ALL, MUL #2\n" + "ldr x27, [x9, #0x0]\n" + "cntw x26, ALL, MUL #3\n" + "lsr x20, x20, #0x1\n" // n_loops = (n_passes - 1) / 2 + "ldr x25, [x11, #0x8]\n" + "and x24, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "csel x15, x15, x16, NE\n" + "ldr x23, [x9, #0x8]\n" "ptrue p13.s\n" "whilelt p12.s, XZR, %x[height]\n" "whilelt p11.s, x16, %x[height]\n" - "mov x28, %x[row_offset]\n" - "mov x27, %x[out]\n" - "mov x26, #0x0\n" - "whilelt p10.s, x26, %x[width]\n" - "whilelt p9.s, x26, %x[width]\n" - "whilelt p8.s, x26, %x[width]\n" - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" + "mov x22, %x[row_offset]\n" + "mov x21, %x[out]\n" + "whilelt p10.s, x13, %x[width]\n" + "whilelt p9.s, x13, %x[width]\n" + "whilelt p8.s, x13, %x[width]\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" "mov x12, #0x0\n" - "cbz x9, 2f\n" + "cbz x14, 2f\n" "1:" // K loop: Charge: Loop - ".inst 0x25306580 // dup p0.s, p9.s/Z, p12.s[w12]\n" - ".inst 0xe09c02e0 // ld1w { za0h.s[x12] }, p0/Z, [x23, x28, LSL #2]\n" - ".inst 0x25306160 // dup p0.s, p8.s/Z, p11.s[w12]\n" - ".inst 0xe09c02c4 // ld1w { za1h.s[x12] }, p0/Z, [x22, x28, LSL #2]\n" - ".inst 0x25706580 // dup p0.s, p9.s/Z, p12.s[w12, #1]\n" - ".inst 0xe09c02a1 // ld1w { za0h.s[x12, #1] }, p0/Z, [x21, x28, LSL #2]\n" - ".inst 0x25706160 // dup p0.s, p8.s/Z, p11.s[w12, #1]\n" - ".inst 0xe09c0285 // ld1w { za1h.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" + ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" + ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" + ".inst 0xe0960540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" + "ldr x10, [x11, #0x0]\n" + ".inst 0xe0960364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + ".inst 0x25706581 // psel p1.s, p9.s/Z, p12.s[w12, #1]\n" + ".inst 0x25706160 // psel p0.s, p8.s/Z, p11.s[w12, #1]\n" + "ldr x27, [x9, #0x0]\n" + ".inst 0xe0960721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x22, LSL #2]\n" + "ldr x25, [x11, #0x8]\n" + "add x11, x11, #0x10\n" + ".inst 0xe09602e5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x9\n" + "cmp x12, x14\n" + "ldr x23, [x9, #0x8]\n" + "add x9, x9, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End - ".inst 0x25306580 // dup p0.s, p9.s/Z, p12.s[w12]\n" - ".inst 0xe09c02e0 // ld1w { za0h.s[x12] }, p0/Z, [x23, x28, LSL #2]\n" - ".inst 0x25306160 // dup p0.s, p8.s/Z, p11.s[w12]\n" - ".inst 0xe09c02c4 // ld1w { za1h.s[x12] }, p0/Z, [x22, x28, LSL #2]\n" - ".inst 0x25706580 // dup p0.s, p9.s/Z, p12.s[w12, #1]\n" - ".inst 0xe09c02a1 // ld1w { za0h.s[x12, #1] }, p0/Z, [x21, x28, LSL #2]\n" - ".inst 0x25706160 // dup p0.s, p8.s/Z, p11.s[w12, #1]\n" - ".inst 0xe09c0285 // ld1w { za1h.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "incw x28\n" - "incw x26\n" - "cbz x13, 8f\n" - "mov x19, x13\n" + ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" + ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" + ".inst 0xe0960540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" + ".inst 0xe0960364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + ".inst 0x25706581 // psel p1.s, p9.s/Z, p12.s[w12, #1]\n" + ".inst 0x25706160 // psel p0.s, p8.s/Z, p11.s[w12, #1]\n" + "mov x11, %x[in]\n" + "add x9, %x[in], x16, LSL #3\n" + ".inst 0xe0960721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x22, LSL #2]\n" + "ldr x10, [x11, #0x0]\n" + ".inst 0xe09602e5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n" + "ldr x27, [x9, #0x0]\n" + "incw x22\n" + "incw x13\n" + "ldr x25, [x11, #0x8]\n" + "add x11, x11, #0x10\n" + "ldr x23, [x9, #0x8]\n" + "add x9, x9, #0x10\n" + "cbz x20, 8f\n" + "mov x20, x20\n" "3:" // K loop: Main loop - "whilelt p9.s, x26, %x[width]\n" - "whilelt p8.s, x26, %x[width]\n" + "whilelt p9.s, x13, %x[width]\n" + "whilelt p8.s, x13, %x[width]\n" "mov x12, #0x0\n" - "cbz x9, 5f\n" + "cbz x14, 5f\n" "4:" // K loop: Main loop: First: Loop - ".inst 0x25306580 // dup p0.s, p9.s/Z, p12.s[w12]\n" - ".inst 0xe09c02e8 // ld1w { za2h.s[x12] }, p0/Z, [x23, x28, LSL #2]\n" - ".inst 0x25306160 // dup p0.s, p8.s/Z, p11.s[w12]\n" - ".inst 0xe09c02cc // ld1w { za3h.s[x12] }, p0/Z, [x22, x28, LSL #2]\n" - ".inst 0x25706580 // dup p0.s, p9.s/Z, p12.s[w12, #1]\n" - ".inst 0xe09c02a9 // ld1w { za2h.s[x12, #1] }, p0/Z, [x21, x28, LSL #2]\n" - ".inst 0x25706160 // dup p0.s, p8.s/Z, p11.s[w12, #1]\n" - ".inst 0xe09c028d // ld1w { za3h.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25707541 // dup p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25707540 // dup p0.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" + ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" + ".inst 0xe0960548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" + "ldr x10, [x11, #0x0]\n" + ".inst 0xe096036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n" + ".inst 0x25706162 // psel p2.s, p8.s/Z, p11.s[w12, #1]\n" + "ldr x27, [x9, #0x0]\n" + ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0960329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n" + "ldr x25, [x11, #0x8]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0960aed // ld1w { za3h.s[x12, #1] }, p2/Z, [x23, x22, LSL #2]\n" + "ldr x23, [x9, #0x8]\n" + ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" + "add x11, x11, #0x10\n" + ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x9, x9, #0x10\n" + ".inst 0xe0ba82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x9\n" + "cmp x12, x14\n" + "addvl x21, x21, #4\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25306580 // dup p0.s, p9.s/Z, p12.s[w12]\n" - ".inst 0xe09c02e8 // ld1w { za2h.s[x12] }, p0/Z, [x23, x28, LSL #2]\n" - ".inst 0x25306160 // dup p0.s, p8.s/Z, p11.s[w12]\n" - ".inst 0xe09c02cc // ld1w { za3h.s[x12] }, p0/Z, [x22, x28, LSL #2]\n" - ".inst 0x25706580 // dup p0.s, p9.s/Z, p12.s[w12, #1]\n" - ".inst 0xe09c02a9 // ld1w { za2h.s[x12, #1] }, p0/Z, [x21, x28, LSL #2]\n" - ".inst 0x25706160 // dup p0.s, p8.s/Z, p11.s[w12, #1]\n" - ".inst 0xe09c028d // ld1w { za3h.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25707541 // dup p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25707540 // dup p0.s, p13.s/Z, p10.s[w12, #1]\n" - "whilelt p10.s, x26, %x[width]\n" - ".inst 0xe0af8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "incw x26\n" - "incw x28\n" - ".inst 0xe0ae8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" - "whilelt p9.s, x26, %x[width]\n" - "whilelt p8.s, x26, %x[width]\n" + ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" + ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" + ".inst 0xe0960548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" + ".inst 0xe096036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + "mov x11, %x[in]\n" + "add x9, %x[in], x16, LSL #3\n" + "ldr x10, [x11, #0x0]\n" + ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n" + ".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n" + ".inst 0xe0960329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n" + "ldr x27, [x9, #0x0]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe09606ed // ld1w { za3h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n" + "ldr x25, [x11, #0x8]\n" + ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n" + "ldr x23, [x9, #0x8]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + "whilelt p10.s, x13, %x[width]\n" + "incw x13\n" + ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + ".inst 0xe0ba82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n" + "addvl x21, x21, #4\n" + "incw x22\n" + "whilelt p9.s, x13, %x[width]\n" + "whilelt p8.s, x13, %x[width]\n" "mov x12, #0x0\n" - "cbz x9, 7f\n" + "cbz x14, 7f\n" "6:" // K loop: Main loop: Second: Loop - ".inst 0x25306580 // dup p0.s, p9.s/Z, p12.s[w12]\n" - ".inst 0xe09c02e0 // ld1w { za0h.s[x12] }, p0/Z, [x23, x28, LSL #2]\n" - ".inst 0x25306160 // dup p0.s, p8.s/Z, p11.s[w12]\n" - ".inst 0xe09c02c4 // ld1w { za1h.s[x12] }, p0/Z, [x22, x28, LSL #2]\n" - ".inst 0x25706580 // dup p0.s, p9.s/Z, p12.s[w12, #1]\n" - ".inst 0xe09c02a1 // ld1w { za0h.s[x12, #1] }, p0/Z, [x21, x28, LSL #2]\n" - ".inst 0x25706160 // dup p0.s, p8.s/Z, p11.s[w12, #1]\n" - ".inst 0xe09c0285 // ld1w { za1h.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25707541 // dup p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25707540 // dup p0.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" + ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" + ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" + ".inst 0xe0960540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" + "ldr x10, [x11, #0x0]\n" + ".inst 0xe0960364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n" + ".inst 0x25706162 // psel p2.s, p8.s/Z, p11.s[w12, #1]\n" + "ldr x27, [x9, #0x0]\n" + ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0960321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n" + "ldr x25, [x11, #0x8]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0960ae5 // ld1w { za1h.s[x12, #1] }, p2/Z, [x23, x22, LSL #2]\n" + "ldr x23, [x9, #0x8]\n" + ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" + "add x11, x11, #0x10\n" + ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x9, x9, #0x10\n" + ".inst 0xe0ba82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n" "add x12, x12, #0x2\n" - "cmp x12, x9\n" + "cmp x12, x14\n" + "addvl x21, x21, #4\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail - "mov x25, %x[in]\n" - "add x24, %x[in], x16, LSL #3\n" - ".inst 0x25306580 // dup p0.s, p9.s/Z, p12.s[w12]\n" - ".inst 0xe09c02e0 // ld1w { za0h.s[x12] }, p0/Z, [x23, x28, LSL #2]\n" - ".inst 0x25306160 // dup p0.s, p8.s/Z, p11.s[w12]\n" - ".inst 0xe09c02c4 // ld1w { za1h.s[x12] }, p0/Z, [x22, x28, LSL #2]\n" - ".inst 0x25706580 // dup p0.s, p9.s/Z, p12.s[w12, #1]\n" - ".inst 0xe09c02a1 // ld1w { za0h.s[x12, #1] }, p0/Z, [x21, x28, LSL #2]\n" - ".inst 0x25706160 // dup p0.s, p8.s/Z, p11.s[w12, #1]\n" - ".inst 0xe09c0285 // ld1w { za1h.s[x12, #1] }, p0/Z, [x20, x28, LSL #2]\n" - "ldr x23, [x25, #0x0]\n" - "ldr x22, [x24, #0x0]\n" - "ldr x21, [x25, #0x8]\n" - "ldr x20, [x24, #0x8]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25707541 // dup p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25707540 // dup p0.s, p13.s/Z, p10.s[w12, #1]\n" - "whilelt p10.s, x26, %x[width]\n" - ".inst 0xe0af8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x15, LSL #2]\n" - "incw x26\n" - "incw x28\n" - ".inst 0xe0ae836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" - "addvl x27, x27, #4\n" - "subs x19, x19, #0x1\n" + ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" + ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" + ".inst 0xe0960540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" + ".inst 0xe0960364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + "mov x11, %x[in]\n" + "add x9, %x[in], x16, LSL #3\n" + "ldr x10, [x11, #0x0]\n" + ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n" + ".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n" + ".inst 0xe0960321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n" + "ldr x27, [x9, #0x0]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe09606e5 // ld1w { za1h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n" + "ldr x25, [x11, #0x8]\n" + ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n" + "ldr x23, [x9, #0x8]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" + ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + "whilelt p10.s, x13, %x[width]\n" + "subs x20, x20, #0x1\n" + ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + ".inst 0xe0ba82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n" + "addvl x21, x21, #4\n" + "incw x13\n" + "incw x22\n" "bgt 3b\n" "8:" // K loop: Tails - "cbnz x11, 11f\n" - "mov x25, %x[in]\n" - "whilelt p9.s, x26, %x[width]\n" - "whilelt p8.s, x26, %x[width]\n" + "cbnz x24, 11f\n" + "mov x11, %x[in]\n" + "whilelt p9.s, x13, %x[width]\n" + "whilelt p8.s, x13, %x[width]\n" "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25306581 // dup p1.s, p9.s/Z, p12.s[w12]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0x25306160 // dup p0.s, p8.s/Z, p11.s[w12]\n" - "addvl x27, x27, #2\n" - "ldr x23, [x25, #0x0]\n" - ".inst 0xe09c06e8 // ld1w { za2h.s[x12] }, p1/Z, [x23, x28, LSL #2]\n" - "ldr x22, [x25, x16, LSL #0x3]\n" - ".inst 0xe09c02cc // ld1w { za3h.s[x12] }, p0/Z, [x22, x28, LSL #2]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + "ldr x10, [x11, #0x0]\n" + ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" + ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" + "ldr x27, [x11, x16, LSL #0x3]\n" + ".inst 0xe0960548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" + "add x11, x11, #0x8\n" + "addvl x21, x21, #2\n" + ".inst 0xe096036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" "add x12, x12, #0x1\n" "cmp x12, x16\n" - "add x25, x25, #0x8\n" "blt 9b\n" - "whilelt p10.s, x26, %x[width]\n" - "whilelt p9.s, x26, %x[width]\n" - "whilelt p8.s, x26, %x[width]\n" + "whilelt p10.s, x13, %x[width]\n" + "whilelt p9.s, x13, %x[width]\n" + "whilelt p8.s, x13, %x[width]\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x15\n" + "addvl x21, x21, #2\n" "blt 10b\n" - "whilelt p10.s, x26, %x[width]\n" + "whilelt p10.s, x13, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "addvl x27, x27, #2\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x15\n" + "addvl x21, x21, #2\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x27\n" + "mov %x[out], x21\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p8", "p9", "p10", "p11", "p12", "p13", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp index 556d1481de..4cc84d344a 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,93 +34,93 @@ void interleave_block<4, 2, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntw x16\n" "cntw x15\n" "cntw x14, ALL, MUL #2\n" "cntw x13, ALL, MUL #3\n" - "cnth x11\n" - "ptrue p13.s\n" - "cntw x10\n" - "cmp %x[height], x10\n" - "csel x10, %x[height], x10, LT\n" - "sub x10, x10, #0x1\n" - "whilelt p10.h, XZR, %x[height]\n" - "whilelt p9.h, x15, %x[height]\n" - "whilelt p8.h, x14, %x[height]\n" - "zip1 p12.h, p10.h, p8.h\n" + "cmp %x[height], x16\n" + "csel x16, %x[height], x16, LT\n" + "whilelt p11.h, XZR, %x[height]\n" + "whilelt p10.h, x15, %x[height]\n" + "whilelt p9.h, x14, %x[height]\n" "whilelt p8.h, x13, %x[height]\n" - "zip1 p11.h, p9.h, p8.h\n" + "mov x11, #0x0\n" + "cnth x10\n" + "ptrue p13.s\n" + "sub x16, x16, #0x1\n" + "zip1 p12.h, p11.h, p9.h\n" + "zip1 p11.h, p10.h, p8.h\n" "mov x9, %x[row_offset]\n" "mov x28, %x[out]\n" - "mov x27, #0x0\n" - "whilelt p10.h, x27, %x[width]\n" - "whilelt p9.h, x27, %x[width]\n" - "whilelt p8.h, x27, %x[width]\n" + "whilelt p10.h, x11, %x[width]\n" + "whilelt p9.h, x11, %x[width]\n" + "whilelt p8.h, x11, %x[width]\n" "1:" // Width loop - "mov x12, #0x0\n" - "add x26, %x[in], XZR, LSL #3\n" - "add x25, %x[in], x15, LSL #3\n" + "add x27, %x[in], XZR, LSL #3\n" + "add x26, %x[in], x15, LSL #3\n" + "ldr x25, [x27], #0x8\n" "add x24, %x[in], x14, LSL #3\n" - "add x23, %x[in], x13, LSL #3\n" - "ldr x22, [x26], #0x8\n" - "ldr x21, [x25], #0x8\n" - "ldr x20, [x24], #0x8\n" - "ldr x19, [x23], #0x8\n" - "cbz x10, 3f\n" + "add x20, %x[in], x13, LSL #3\n" + "ldr x23, [x26], #0x8\n" + "mov x12, #0x0\n" + "ldr x22, [x24], #0x8\n" + "ldr x21, [x20], #0x8\n" + "cbz x16, 3f\n" "2:" // Loads: Loop - ".inst 0x25286580 // dup p0.h, p9.h/Z, p12.h[w12]\n" - ".inst 0xe04902c0 // ld1h { za0h.h[x12] }, p0/Z, [x22, x9, LSL #1]\n" - ".inst 0x25286160 // dup p0.h, p8.h/Z, p11.h[w12]\n" - ".inst 0xe04902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x9, LSL #1]\n" - ".inst 0x25386580 // dup p0.h, p9.h/Z, p12.h[w12, #1]\n" - ".inst 0xe0490281 // ld1h { za0h.h[x12, #1] }, p0/Z, [x20, x9, LSL #1]\n" - ".inst 0x25386160 // dup p0.h, p8.h/Z, p11.h[w12, #1]\n" - ".inst 0xe0490269 // ld1h { za1h.h[x12, #1] }, p0/Z, [x19, x9, LSL #1]\n" - "ldr x22, [x26], #0x8\n" - "ldr x21, [x25], #0x8\n" - "ldr x20, [x24], #0x8\n" - "ldr x19, [x23], #0x8\n" + ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n" + ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" + ".inst 0xe0490720 // ld1h { za0h.h[x12] }, p1/Z, [x25, x9, LSL #1]\n" + "ldr x25, [x27], #0x8\n" + ".inst 0xe04902e8 // ld1h { za1h.h[x12] }, p0/Z, [x23, x9, LSL #1]\n" + ".inst 0x25386581 // psel p1.h, p9.h/Z, p12.h[w12, #1]\n" + ".inst 0x25386160 // psel p0.h, p8.h/Z, p11.h[w12, #1]\n" + "ldr x23, [x26], #0x8\n" + ".inst 0xe04906c1 // ld1h { za0h.h[x12, #1] }, p1/Z, [x22, x9, LSL #1]\n" + "ldr x22, [x24], #0x8\n" + ".inst 0xe04902a9 // ld1h { za1h.h[x12, #1] }, p0/Z, [x21, x9, LSL #1]\n" "add x12, x12, #0x2\n" - "cmp x12, x10, LSL #1\n" + "cmp x12, x16, LSL #1\n" + "ldr x21, [x20], #0x8\n" "blt 2b\n" "3:" // Loads: Tail - ".inst 0x25286580 // dup p0.h, p9.h/Z, p12.h[w12]\n" - ".inst 0xe04902c0 // ld1h { za0h.h[x12] }, p0/Z, [x22, x9, LSL #1]\n" - ".inst 0x25286160 // dup p0.h, p8.h/Z, p11.h[w12]\n" - ".inst 0xe04902a8 // ld1h { za1h.h[x12] }, p0/Z, [x21, x9, LSL #1]\n" - ".inst 0x25386580 // dup p0.h, p9.h/Z, p12.h[w12, #1]\n" - ".inst 0xe0490281 // ld1h { za0h.h[x12, #1] }, p0/Z, [x20, x9, LSL #1]\n" - ".inst 0x25386160 // dup p0.h, p8.h/Z, p11.h[w12, #1]\n" - ".inst 0xe0490269 // ld1h { za1h.h[x12, #1] }, p0/Z, [x19, x9, LSL #1]\n" + ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n" + ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" + ".inst 0xe0490720 // ld1h { za0h.h[x12] }, p1/Z, [x25, x9, LSL #1]\n" + "sub x20, %x[width], x11\n" + ".inst 0xe04902e8 // ld1h { za1h.h[x12] }, p0/Z, [x23, x9, LSL #1]\n" + "cmp x20, x10\n" + "csel x20, x20, x10, LT\n" + ".inst 0x25386580 // psel p0.h, p9.h/Z, p12.h[w12, #1]\n" + ".inst 0xe04902c1 // ld1h { za0h.h[x12, #1] }, p0/Z, [x22, x9, LSL #1]\n" + ".inst 0x25386160 // psel p0.h, p8.h/Z, p11.h[w12, #1]\n" + "add x20, x20, #0x1\n" + ".inst 0xe04902a9 // ld1h { za1h.h[x12, #1] }, p0/Z, [x21, x9, LSL #1]\n" "mov x12, #0x0\n" - "sub x19, %x[width], x27\n" - "cmp x19, x11\n" - "csel x19, x19, x11, LT\n" - "add x19, x19, #0x1\n" - "lsr x19, x19, #0x1\n" + "lsr x20, x20, #0x1\n" "4:" // Stores: Loop - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" ".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0x25307541 // dup p1.s, p13.s/Z, p10.s[w12]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" ".inst 0xe0af8384 // st1w { za1v.s[x12] }, p0/Z, [x28, x15, LSL #2]\n" - ".inst 0x25307540 // dup p0.s, p13.s/Z, p10.s[w12]\n" + ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" + ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" ".inst 0xe0ae8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x14, LSL #2]\n" ".inst 0xe0ad838c // st1w { za3v.s[x12] }, p0/Z, [x28, x13, LSL #2]\n" - "addvl x28, x28, #4\n" "add x12, x12, #0x1\n" - "cmp x12, x19\n" + "cmp x12, x20\n" + "addvl x28, x28, #4\n" "blt 4b\n" + "inch x11\n" + "whilelt p10.h, x11, %x[width]\n" + "whilelt p9.h, x11, %x[width]\n" + "whilelt p8.h, x11, %x[width]\n" "inch x9\n" - "inch x27\n" - "whilelt p10.h, x27, %x[width]\n" - "whilelt p9.h, x27, %x[width]\n" - "whilelt p8.h, x27, %x[width]\n" "b.any 1b\n" "mov %x[out], x28\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p8", "p9", "p10", "p11", "p12", "p13", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp index 49d1aa5cc5..465939c30d 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,92 +34,92 @@ void interleave_block<4, 4, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntw x16\n" "cntw x15\n" - "cntw x14\n" - "cntw x13, ALL, MUL #2\n" - "cntw x11, ALL, MUL #3\n" - "cmp %x[height], x15\n" - "csel x15, %x[height], x15, LT\n" + "cntw x14, ALL, MUL #2\n" + "cntw x13, ALL, MUL #3\n" + "cmp %x[height], x16\n" + "csel x16, %x[height], x16, LT\n" "whilelt p12.b, XZR, %x[height]\n" - "whilelt p10.b, x14, %x[height]\n" - "whilelt p9.b, x13, %x[height]\n" - "whilelt p8.b, x11, %x[height]\n" + "whilelt p10.b, x15, %x[height]\n" + "whilelt p9.b, x14, %x[height]\n" + "whilelt p8.b, x13, %x[height]\n" "zip1 p12.b, p12.b, p9.b\n" "zip1 p10.b, p10.b, p8.b\n" - "mov x10, #0x0\n" - "cntb x9\n" + "mov x11, #0x0\n" + "cntb x10\n" "ptrue p11.s\n" - "sub x15, x15, #0x1\n" + "sub x16, x16, #0x1\n" "zip1 p10.b, p12.b, p10.b\n" - "mov x28, %x[row_offset]\n" - "mov x27, %x[out]\n" - "whilelt p9.b, x10, %x[width]\n" - "whilelt p8.b, x10, %x[width]\n" + "mov x9, %x[row_offset]\n" + "mov x28, %x[out]\n" + "whilelt p9.b, x11, %x[width]\n" + "whilelt p8.b, x11, %x[width]\n" "1:" // Width loop - "add x26, %x[in], XZR, LSL #3\n" - "add x25, %x[in], x14, LSL #3\n" - "ldr x24, [x26], #0x8\n" + "add x27, %x[in], XZR, LSL #3\n" + "add x26, %x[in], x15, LSL #3\n" + "ldr x25, [x27], #0x8\n" + "add x24, %x[in], x14, LSL #3\n" "add x23, %x[in], x13, LSL #3\n" - "add x22, %x[in], x11, LSL #3\n" - "ldr x19, [x25], #0x8\n" + "ldr x20, [x26], #0x8\n" "mov x12, #0x0\n" + "ldr x22, [x24], #0x8\n" "ldr x21, [x23], #0x8\n" - "ldr x20, [x22], #0x8\n" - "cbz x15, 3f\n" + "cbz x16, 3f\n" "2:" // Loads: Loop ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n" + ".inst 0xe0090320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x9]\n" ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" - "ldr x24, [x26], #0x8\n" - ".inst 0xe01c0261 // ld1b { za0h.b[x12, #1] }, p0/Z, [x19, x28]\n" + "ldr x25, [x27], #0x8\n" + ".inst 0xe0090281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x9]\n" ".inst 0x25346141 // psel p1.b, p8.b/Z, p10.b[w12, #2]\n" ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n" - "ldr x19, [x25], #0x8\n" - ".inst 0xe01c06a2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x21, x28]\n" - "ldr x21, [x23], #0x8\n" - ".inst 0xe01c0283 // ld1b { za0h.b[x12, #3] }, p0/Z, [x20, x28]\n" + "ldr x20, [x26], #0x8\n" + ".inst 0xe00906c2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x22, x9]\n" + "ldr x22, [x24], #0x8\n" + ".inst 0xe00902a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x9]\n" "add x12, x12, #0x4\n" - "cmp x12, x15, LSL #2\n" - "ldr x20, [x22], #0x8\n" + "cmp x12, x16, LSL #2\n" + "ldr x21, [x23], #0x8\n" "blt 2b\n" "3:" // Loads: Tail ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n" + ".inst 0xe0090320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x9]\n" ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" - ".inst 0xe01c0261 // ld1b { za0h.b[x12, #1] }, p0/Z, [x19, x28]\n" + ".inst 0xe0090281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x9]\n" ".inst 0x25346140 // psel p0.b, p8.b/Z, p10.b[w12, #2]\n" - "sub x19, %x[width], x10\n" - ".inst 0xe01c02a2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x21, x28]\n" - "cmp x19, x9\n" - "csel x19, x19, x9, LT\n" + "sub x20, %x[width], x11\n" + ".inst 0xe00902c2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x22, x9]\n" + "cmp x20, x10\n" + "csel x20, x20, x10, LT\n" ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n" - "add x19, x19, #0x3\n" - ".inst 0xe01c0283 // ld1b { za0h.b[x12, #3] }, p0/Z, [x20, x28]\n" + "add x20, x20, #0x3\n" + ".inst 0xe00902a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x9]\n" "mov x12, #0x0\n" - "lsr x19, x19, #0x2\n" + "lsr x20, x20, #0x2\n" "4:" // Stores: Loop ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" + ".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0ae8364 // st1w { za1v.s[x12] }, p0/Z, [x27, x14, LSL #2]\n" + ".inst 0xe0af8384 // st1w { za1v.s[x12] }, p0/Z, [x28, x15, LSL #2]\n" ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0ad8768 // st1w { za2v.s[x12] }, p1/Z, [x27, x13, LSL #2]\n" - ".inst 0xe0ab836c // st1w { za3v.s[x12] }, p0/Z, [x27, x11, LSL #2]\n" + ".inst 0xe0ae8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x14, LSL #2]\n" + ".inst 0xe0ad838c // st1w { za3v.s[x12] }, p0/Z, [x28, x13, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "addvl x27, x27, #4\n" + "cmp x12, x20\n" + "addvl x28, x28, #4\n" "blt 4b\n" - "incb x10\n" - "whilelt p9.b, x10, %x[width]\n" - "whilelt p8.b, x10, %x[width]\n" - "incb x28\n" + "incb x11\n" + "whilelt p9.b, x11, %x[width]\n" + "whilelt p8.b, x11, %x[width]\n" + "incb x9\n" "b.any 1b\n" - "mov %x[out], x27\n" + "mov %x[out], x28\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp index 94673d41d8..ffd9384a13 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -32,118 +32,118 @@ void interleave_block<4, 4, VLType::SME, true>( { __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntw x16\n" + "cntw x15\n" "mov z24.b, #0x1\n" + "cntw x14, ALL, MUL #2\n" + "cntw x13, ALL, MUL #3\n" "mov z23.s, #0x0\n" - "ptrue p2.b\n" "mov z22.s, #0x0\n" - "cntw x16\n" + "cmp %x[height], x16\n" + "csel x16, %x[height], x16, LT\n" "mov z21.s, #0x0\n" - "cntw x15, ALL, MUL #2\n" "mov z20.s, #0x0\n" - "cntw x14, ALL, MUL #3\n" + "whilelt p12.b, XZR, %x[height]\n" + "whilelt p10.b, x15, %x[height]\n" + "whilelt p9.b, x14, %x[height]\n" + "whilelt p8.b, x13, %x[height]\n" + "zip1 p12.b, p12.b, p9.b\n" + "zip1 p10.b, p10.b, p8.b\n" + "ptrue p2.b\n" "cntb x11\n" "ptrue p11.s\n" - "cntw x10\n" - "cmp %x[height], x10\n" - "csel x10, %x[height], x10, LT\n" - "sub x10, x10, #0x1\n" - "whilelt p10.b, XZR, %x[height]\n" - "whilelt p9.b, x16, %x[height]\n" - "whilelt p8.b, x15, %x[height]\n" - "zip1 p10.b, p10.b, p8.b\n" - "whilelt p8.b, x14, %x[height]\n" - "zip1 p9.b, p9.b, p8.b\n" - "mov x9, %x[row_offset]\n" - "mov x28, %x[out]\n" - "zip1 p10.b, p10.b, p9.b\n" + "sub x16, x16, #0x1\n" + "zip1 p10.b, p12.b, p10.b\n" + "mov x10, %x[row_offset]\n" + "mov x9, %x[out]\n" "cbnz %x[first], 1f\n" - "addvl x28, x28, #-4\n" - "ld1w { z23.s }, p2/Z, [x28]\n" - "ld1w { z22.s }, p2/Z, [x28, #1, MUL VL]\n" - "ld1w { z21.s }, p2/Z, [x28, #2, MUL VL]\n" - "ld1w { z20.s }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x9, x9, #-4\n" + "ld1w { z23.s }, p2/Z, [x9]\n" + "ld1w { z22.s }, p2/Z, [x9, #1, MUL VL]\n" + "ld1w { z21.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [x9, #3, MUL VL]\n" "1:" // Initialise row sums: End - "mov x27, #0x0\n" - "whilelt p9.b, x27, %x[width]\n" - "whilelt p8.b, x27, %x[width]\n" + "mov x28, #0x0\n" + "whilelt p9.b, x28, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" "2:" // Width loop - "mov x13, #0x0\n" - "add x26, %x[in], XZR, LSL #3\n" - "add x25, %x[in], x16, LSL #3\n" - "add x24, %x[in], x15, LSL #3\n" - "add x23, %x[in], x14, LSL #3\n" - "ldr x22, [x26], #0x8\n" - "ldr x21, [x25], #0x8\n" - "ldr x19, [x24], #0x8\n" - "ldr x20, [x23], #0x8\n" - "cbz x10, 4f\n" + "add x27, %x[in], XZR, LSL #3\n" + "add x26, %x[in], x15, LSL #3\n" + "ldr x25, [x27], #0x8\n" + "add x24, %x[in], x14, LSL #3\n" + "add x23, %x[in], x13, LSL #3\n" + "ldr x20, [x26], #0x8\n" + "mov x12, #0x0\n" + "ldr x22, [x24], #0x8\n" + "ldr x21, [x23], #0x8\n" + "cbz x16, 4f\n" "3:" // Loads: Loop - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe00922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x9]\n" - ".inst 0x252d6140 // dup p0.b, p8.b/Z, p10.b[w13, #1]\n" - ".inst 0x25356141 // dup p1.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe00922a1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x21, x9]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - "ldr x22, [x26], #0x8\n" - ".inst 0xe0092662 // ld1b { za0h.b[x13, #2] }, p1/Z, [x19, x9]\n" - "ldr x21, [x25], #0x8\n" - "ldr x19, [x24], #0x8\n" - ".inst 0xe0092283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x9]\n" - "ldr x20, [x23], #0x8\n" - "add x13, x13, #0x4\n" - "cmp x13, x10, LSL #2\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe00a0320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x10]\n" + ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" + "ldr x25, [x27], #0x8\n" + ".inst 0xe00a0281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x10]\n" + ".inst 0x25346141 // psel p1.b, p8.b/Z, p10.b[w12, #2]\n" + ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n" + "ldr x20, [x26], #0x8\n" + ".inst 0xe00a06c2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x22, x10]\n" + "ldr x22, [x24], #0x8\n" + ".inst 0xe00a02a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x10]\n" + "add x12, x12, #0x4\n" + "cmp x12, x16, LSL #2\n" + "ldr x21, [x23], #0x8\n" "blt 3b\n" "4:" // Loads: Tail - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe00922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x9]\n" - ".inst 0x252d6140 // dup p0.b, p8.b/Z, p10.b[w13, #1]\n" - ".inst 0x25356141 // dup p1.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe00922a1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x21, x9]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe00a0320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x10]\n" + ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" + ".inst 0xe00a0281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x10]\n" + ".inst 0x25346140 // psel p0.b, p8.b/Z, p10.b[w12, #2]\n" + "sub x20, %x[width], x28\n" + ".inst 0xe00a02c2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x22, x10]\n" + "cmp x20, x11\n" + "csel x20, x20, x11, LT\n" + ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n" + "add x20, x20, #0x3\n" + ".inst 0xe00a02a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x10]\n" "mov x12, #0x0\n" - ".inst 0xe0092662 // ld1b { za0h.b[x13, #2] }, p1/Z, [x19, x9]\n" - "sub x19, %x[width], x27\n" - "cmp x19, x11\n" - ".inst 0xe0092283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x9]\n" - "csel x19, x19, x11, LT\n" - "add x19, x19, #0x3\n" - "lsr x19, x19, #0x2\n" + "lsr x20, x20, #0x2\n" "5:" // Stores: Loop - ".inst 0xc0828813 // mova z19.s, p2/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n" - ".inst 0xc0828892 // mova z18.s, p2/M, za1v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0828911 // mova z17.s, p2/M, za2v.s[x12]\n" - ".inst 0xe0b08384 // st1w { za1v.s[x12] }, p0/Z, [x28, x16, LSL #2]\n" - ".inst 0x25306d21 // dup p1.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0828990 // mova z16.s, p2/M, za3v.s[x12]\n" - "sdot z23.s, z19.b, z24.b\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - "sdot z22.s, z18.b, z24.b\n" - ".inst 0xe0af8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x15, LSL #2]\n" - "sdot z21.s, z17.b, z24.b\n" - "sdot z20.s, z16.b, z24.b\n" - ".inst 0xe0ae838c // st1w { za3v.s[x12] }, p0/Z, [x28, x14, LSL #2]\n" - "addvl x28, x28, #4\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf8120 // st1w { za0v.s[x12] }, p0/Z, [x9, XZR, LSL #2]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xc0828812 // mova z18.s, p2/M, za0v.s[x12]\n" + ".inst 0xe0af8124 // st1w { za1v.s[x12] }, p0/Z, [x9, x15, LSL #2]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xc0828891 // mova z17.s, p2/M, za1v.s[x12]\n" + ".inst 0xe0ae8528 // st1w { za2v.s[x12] }, p1/Z, [x9, x14, LSL #2]\n" + ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n" + "sdot z23.s, z18.b, z24.b\n" + ".inst 0xe0ad812c // st1w { za3v.s[x12] }, p0/Z, [x9, x13, LSL #2]\n" + ".inst 0xc0828993 // mova z19.s, p2/M, za3v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x19\n" + "cmp x12, x20\n" + "sdot z22.s, z17.b, z24.b\n" + "sdot z21.s, z16.b, z24.b\n" + "addvl x9, x9, #4\n" + "sdot z20.s, z19.b, z24.b\n" "blt 5b\n" - "incb x9\n" - "incb x27\n" - "whilelt p9.b, x27, %x[width]\n" - "whilelt p8.b, x27, %x[width]\n" + "incb x28\n" + "whilelt p9.b, x28, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" + "incb x10\n" "b.any 2b\n" - "st1w { z23.s }, p2, [x28]\n" - "st1w { z22.s }, p2, [x28, #1, MUL VL]\n" - "st1w { z21.s }, p2, [x28, #2, MUL VL]\n" - "st1w { z20.s }, p2, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "mov %x[out], x28\n" + "st1w { z23.s }, p2, [x9]\n" + "st1w { z22.s }, p2, [x9, #1, MUL VL]\n" + "st1w { z21.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z20.s }, p2, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "mov %x[out], x9\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p8", "p9", "p10", "p11", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp index bbdaaa3217..9f5db6ba3d 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,92 +34,92 @@ void interleave_block<4, 4, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntw x16\n" "cntw x15\n" - "cntw x14\n" - "cntw x13, ALL, MUL #2\n" - "cntw x11, ALL, MUL #3\n" - "cmp %x[height], x15\n" - "csel x15, %x[height], x15, LT\n" + "cntw x14, ALL, MUL #2\n" + "cntw x13, ALL, MUL #3\n" + "cmp %x[height], x16\n" + "csel x16, %x[height], x16, LT\n" "whilelt p12.b, XZR, %x[height]\n" - "whilelt p10.b, x14, %x[height]\n" - "whilelt p9.b, x13, %x[height]\n" - "whilelt p8.b, x11, %x[height]\n" + "whilelt p10.b, x15, %x[height]\n" + "whilelt p9.b, x14, %x[height]\n" + "whilelt p8.b, x13, %x[height]\n" "zip1 p12.b, p12.b, p9.b\n" "zip1 p10.b, p10.b, p8.b\n" - "mov x10, #0x0\n" - "cntb x9\n" + "mov x11, #0x0\n" + "cntb x10\n" "ptrue p11.s\n" - "sub x15, x15, #0x1\n" + "sub x16, x16, #0x1\n" "zip1 p10.b, p12.b, p10.b\n" - "mov x28, %x[row_offset]\n" - "mov x27, %x[out]\n" - "whilelt p9.b, x10, %x[width]\n" - "whilelt p8.b, x10, %x[width]\n" + "mov x9, %x[row_offset]\n" + "mov x28, %x[out]\n" + "whilelt p9.b, x11, %x[width]\n" + "whilelt p8.b, x11, %x[width]\n" "1:" // Width loop - "add x26, %x[in], XZR, LSL #3\n" - "add x25, %x[in], x14, LSL #3\n" - "ldr x24, [x26], #0x8\n" + "add x27, %x[in], XZR, LSL #3\n" + "add x26, %x[in], x15, LSL #3\n" + "ldr x25, [x27], #0x8\n" + "add x24, %x[in], x14, LSL #3\n" "add x23, %x[in], x13, LSL #3\n" - "add x22, %x[in], x11, LSL #3\n" - "ldr x19, [x25], #0x8\n" + "ldr x20, [x26], #0x8\n" "mov x12, #0x0\n" + "ldr x22, [x24], #0x8\n" "ldr x21, [x23], #0x8\n" - "ldr x20, [x22], #0x8\n" - "cbz x15, 3f\n" + "cbz x16, 3f\n" "2:" // Loads: Loop ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n" + ".inst 0xe0090320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x9]\n" ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" - "ldr x24, [x26], #0x8\n" - ".inst 0xe01c0261 // ld1b { za0h.b[x12, #1] }, p0/Z, [x19, x28]\n" + "ldr x25, [x27], #0x8\n" + ".inst 0xe0090281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x9]\n" ".inst 0x25346141 // psel p1.b, p8.b/Z, p10.b[w12, #2]\n" ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n" - "ldr x19, [x25], #0x8\n" - ".inst 0xe01c06a2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x21, x28]\n" - "ldr x21, [x23], #0x8\n" - ".inst 0xe01c0283 // ld1b { za0h.b[x12, #3] }, p0/Z, [x20, x28]\n" + "ldr x20, [x26], #0x8\n" + ".inst 0xe00906c2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x22, x9]\n" + "ldr x22, [x24], #0x8\n" + ".inst 0xe00902a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x9]\n" "add x12, x12, #0x4\n" - "cmp x12, x15, LSL #2\n" - "ldr x20, [x22], #0x8\n" + "cmp x12, x16, LSL #2\n" + "ldr x21, [x23], #0x8\n" "blt 2b\n" "3:" // Loads: Tail ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" - ".inst 0xe01c0300 // ld1b { za0h.b[x12] }, p0/Z, [x24, x28]\n" + ".inst 0xe0090320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x9]\n" ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" - ".inst 0xe01c0261 // ld1b { za0h.b[x12, #1] }, p0/Z, [x19, x28]\n" + ".inst 0xe0090281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x9]\n" ".inst 0x25346140 // psel p0.b, p8.b/Z, p10.b[w12, #2]\n" - "sub x19, %x[width], x10\n" - ".inst 0xe01c02a2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x21, x28]\n" - "cmp x19, x9\n" - "csel x19, x19, x9, LT\n" + "sub x20, %x[width], x11\n" + ".inst 0xe00902c2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x22, x9]\n" + "cmp x20, x10\n" + "csel x20, x20, x10, LT\n" ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n" - "add x19, x19, #0x3\n" - ".inst 0xe01c0283 // ld1b { za0h.b[x12, #3] }, p0/Z, [x20, x28]\n" + "add x20, x20, #0x3\n" + ".inst 0xe00902a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x9]\n" "mov x12, #0x0\n" - "lsr x19, x19, #0x2\n" + "lsr x20, x20, #0x2\n" "4:" // Stores: Loop ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" + ".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0ae8364 // st1w { za1v.s[x12] }, p0/Z, [x27, x14, LSL #2]\n" + ".inst 0xe0af8384 // st1w { za1v.s[x12] }, p0/Z, [x28, x15, LSL #2]\n" ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0ad8768 // st1w { za2v.s[x12] }, p1/Z, [x27, x13, LSL #2]\n" - ".inst 0xe0ab836c // st1w { za3v.s[x12] }, p0/Z, [x27, x11, LSL #2]\n" + ".inst 0xe0ae8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x14, LSL #2]\n" + ".inst 0xe0ad838c // st1w { za3v.s[x12] }, p0/Z, [x28, x13, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x19\n" - "addvl x27, x27, #4\n" + "cmp x12, x20\n" + "addvl x28, x28, #4\n" "blt 4b\n" - "incb x10\n" - "whilelt p9.b, x10, %x[width]\n" - "whilelt p8.b, x10, %x[width]\n" - "incb x28\n" + "incb x11\n" + "whilelt p9.b, x11, %x[width]\n" + "whilelt p8.b, x11, %x[width]\n" + "incb x9\n" "b.any 1b\n" - "mov %x[out], x27\n" + "mov %x[out], x28\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp index 961008a3f2..49d2acf1cd 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -32,118 +32,118 @@ void interleave_block<4, 4, VLType::SME, true>( { __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntw x16\n" + "cntw x15\n" "mov z24.b, #0x1\n" + "cntw x14, ALL, MUL #2\n" + "cntw x13, ALL, MUL #3\n" "mov z23.s, #0x0\n" - "ptrue p2.b\n" "mov z22.s, #0x0\n" - "cntw x16\n" + "cmp %x[height], x16\n" + "csel x16, %x[height], x16, LT\n" "mov z21.s, #0x0\n" - "cntw x15, ALL, MUL #2\n" "mov z20.s, #0x0\n" - "cntw x14, ALL, MUL #3\n" + "whilelt p12.b, XZR, %x[height]\n" + "whilelt p10.b, x15, %x[height]\n" + "whilelt p9.b, x14, %x[height]\n" + "whilelt p8.b, x13, %x[height]\n" + "zip1 p12.b, p12.b, p9.b\n" + "zip1 p10.b, p10.b, p8.b\n" + "ptrue p2.b\n" "cntb x11\n" "ptrue p11.s\n" - "cntw x10\n" - "cmp %x[height], x10\n" - "csel x10, %x[height], x10, LT\n" - "sub x10, x10, #0x1\n" - "whilelt p10.b, XZR, %x[height]\n" - "whilelt p9.b, x16, %x[height]\n" - "whilelt p8.b, x15, %x[height]\n" - "zip1 p10.b, p10.b, p8.b\n" - "whilelt p8.b, x14, %x[height]\n" - "zip1 p9.b, p9.b, p8.b\n" - "mov x9, %x[row_offset]\n" - "mov x28, %x[out]\n" - "zip1 p10.b, p10.b, p9.b\n" + "sub x16, x16, #0x1\n" + "zip1 p10.b, p12.b, p10.b\n" + "mov x10, %x[row_offset]\n" + "mov x9, %x[out]\n" "cbnz %x[first], 1f\n" - "addvl x28, x28, #-4\n" - "ld1w { z23.s }, p2/Z, [x28]\n" - "ld1w { z22.s }, p2/Z, [x28, #1, MUL VL]\n" - "ld1w { z21.s }, p2/Z, [x28, #2, MUL VL]\n" - "ld1w { z20.s }, p2/Z, [x28, #3, MUL VL]\n" + "addvl x9, x9, #-4\n" + "ld1w { z23.s }, p2/Z, [x9]\n" + "ld1w { z22.s }, p2/Z, [x9, #1, MUL VL]\n" + "ld1w { z21.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [x9, #3, MUL VL]\n" "1:" // Initialise row sums: End - "mov x27, #0x0\n" - "whilelt p9.b, x27, %x[width]\n" - "whilelt p8.b, x27, %x[width]\n" + "mov x28, #0x0\n" + "whilelt p9.b, x28, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" "2:" // Width loop - "mov x13, #0x0\n" - "add x26, %x[in], XZR, LSL #3\n" - "add x25, %x[in], x16, LSL #3\n" - "add x24, %x[in], x15, LSL #3\n" - "add x23, %x[in], x14, LSL #3\n" - "ldr x22, [x26], #0x8\n" - "ldr x21, [x25], #0x8\n" - "ldr x19, [x24], #0x8\n" - "ldr x20, [x23], #0x8\n" - "cbz x10, 4f\n" + "add x27, %x[in], XZR, LSL #3\n" + "add x26, %x[in], x15, LSL #3\n" + "ldr x25, [x27], #0x8\n" + "add x24, %x[in], x14, LSL #3\n" + "add x23, %x[in], x13, LSL #3\n" + "ldr x20, [x26], #0x8\n" + "mov x12, #0x0\n" + "ldr x22, [x24], #0x8\n" + "ldr x21, [x23], #0x8\n" + "cbz x16, 4f\n" "3:" // Loads: Loop - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe00922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x9]\n" - ".inst 0x252d6140 // dup p0.b, p8.b/Z, p10.b[w13, #1]\n" - ".inst 0x25356141 // dup p1.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe00922a1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x21, x9]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" - "ldr x22, [x26], #0x8\n" - ".inst 0xe0092662 // ld1b { za0h.b[x13, #2] }, p1/Z, [x19, x9]\n" - "ldr x21, [x25], #0x8\n" - "ldr x19, [x24], #0x8\n" - ".inst 0xe0092283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x9]\n" - "ldr x20, [x23], #0x8\n" - "add x13, x13, #0x4\n" - "cmp x13, x10, LSL #2\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe00a0320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x10]\n" + ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" + "ldr x25, [x27], #0x8\n" + ".inst 0xe00a0281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x10]\n" + ".inst 0x25346141 // psel p1.b, p8.b/Z, p10.b[w12, #2]\n" + ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n" + "ldr x20, [x26], #0x8\n" + ".inst 0xe00a06c2 // ld1b { za0h.b[x12, #2] }, p1/Z, [x22, x10]\n" + "ldr x22, [x24], #0x8\n" + ".inst 0xe00a02a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x10]\n" + "add x12, x12, #0x4\n" + "cmp x12, x16, LSL #2\n" + "ldr x21, [x23], #0x8\n" "blt 3b\n" "4:" // Loads: Tail - ".inst 0x25256140 // dup p0.b, p8.b/Z, p10.b[w13]\n" - ".inst 0xe00922c0 // ld1b { za0h.b[x13] }, p0/Z, [x22, x9]\n" - ".inst 0x252d6140 // dup p0.b, p8.b/Z, p10.b[w13, #1]\n" - ".inst 0x25356141 // dup p1.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe00922a1 // ld1b { za0h.b[x13, #1] }, p0/Z, [x21, x9]\n" - ".inst 0x253d6140 // dup p0.b, p8.b/Z, p10.b[w13, #3]\n" + ".inst 0x25246140 // psel p0.b, p8.b/Z, p10.b[w12]\n" + ".inst 0xe00a0320 // ld1b { za0h.b[x12] }, p0/Z, [x25, x10]\n" + ".inst 0x252c6140 // psel p0.b, p8.b/Z, p10.b[w12, #1]\n" + ".inst 0xe00a0281 // ld1b { za0h.b[x12, #1] }, p0/Z, [x20, x10]\n" + ".inst 0x25346140 // psel p0.b, p8.b/Z, p10.b[w12, #2]\n" + "sub x20, %x[width], x28\n" + ".inst 0xe00a02c2 // ld1b { za0h.b[x12, #2] }, p0/Z, [x22, x10]\n" + "cmp x20, x11\n" + "csel x20, x20, x11, LT\n" + ".inst 0x253c6140 // psel p0.b, p8.b/Z, p10.b[w12, #3]\n" + "add x20, x20, #0x3\n" + ".inst 0xe00a02a3 // ld1b { za0h.b[x12, #3] }, p0/Z, [x21, x10]\n" "mov x12, #0x0\n" - ".inst 0xe0092662 // ld1b { za0h.b[x13, #2] }, p1/Z, [x19, x9]\n" - "sub x19, %x[width], x27\n" - "cmp x19, x11\n" - ".inst 0xe0092283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x9]\n" - "csel x19, x19, x11, LT\n" - "add x19, x19, #0x3\n" - "lsr x19, x19, #0x2\n" + "lsr x20, x20, #0x2\n" "5:" // Stores: Loop + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xe0bf8120 // st1w { za0v.s[x12] }, p0/Z, [x9, XZR, LSL #2]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xc0828813 // mova z19.s, p2/M, za0v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n" - ".inst 0xc0828892 // mova z18.s, p2/M, za1v.s[x12]\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0828911 // mova z17.s, p2/M, za2v.s[x12]\n" - ".inst 0xe0b08384 // st1w { za1v.s[x12] }, p0/Z, [x28, x16, LSL #2]\n" - ".inst 0x25306d21 // dup p1.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0828990 // mova z16.s, p2/M, za3v.s[x12]\n" + ".inst 0xe0af8124 // st1w { za1v.s[x12] }, p0/Z, [x9, x15, LSL #2]\n" + ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" + ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" + ".inst 0xc0828891 // mova z17.s, p2/M, za1v.s[x12]\n" + ".inst 0xe0ae8528 // st1w { za2v.s[x12] }, p1/Z, [x9, x14, LSL #2]\n" + ".inst 0xc0828912 // mova z18.s, p2/M, za2v.s[x12]\n" "udot z23.s, z19.b, z24.b\n" - ".inst 0x25306d20 // dup p0.s, p11.s/Z, p9.s[w12]\n" - "udot z22.s, z18.b, z24.b\n" - ".inst 0xe0af8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x15, LSL #2]\n" - "udot z21.s, z17.b, z24.b\n" - "udot z20.s, z16.b, z24.b\n" - ".inst 0xe0ae838c // st1w { za3v.s[x12] }, p0/Z, [x28, x14, LSL #2]\n" - "addvl x28, x28, #4\n" + ".inst 0xe0ad812c // st1w { za3v.s[x12] }, p0/Z, [x9, x13, LSL #2]\n" + ".inst 0xc0828990 // mova z16.s, p2/M, za3v.s[x12]\n" "add x12, x12, #0x1\n" - "cmp x12, x19\n" + "cmp x12, x20\n" + "udot z22.s, z17.b, z24.b\n" + "udot z21.s, z18.b, z24.b\n" + "addvl x9, x9, #4\n" + "udot z20.s, z16.b, z24.b\n" "blt 5b\n" - "incb x9\n" - "incb x27\n" - "whilelt p9.b, x27, %x[width]\n" - "whilelt p8.b, x27, %x[width]\n" + "incb x28\n" + "whilelt p9.b, x28, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" + "incb x10\n" "b.any 2b\n" - "st1w { z23.s }, p2, [x28]\n" - "st1w { z22.s }, p2, [x28, #1, MUL VL]\n" - "st1w { z21.s }, p2, [x28, #2, MUL VL]\n" - "st1w { z20.s }, p2, [x28, #3, MUL VL]\n" - "addvl x28, x28, #4\n" - "mov %x[out], x28\n" + "st1w { z23.s }, p2, [x9]\n" + "st1w { z22.s }, p2, [x9, #1, MUL VL]\n" + "st1w { z21.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z20.s }, p2, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "mov %x[out], x9\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p8", "p9", "p10", "p11", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp index 141ab00a52..9579263204 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #if defined(__ARM_FEATURE_SVE) @@ -34,92 +34,92 @@ void interleave_block<4, 1, VLType::SME, false>( __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" + "cntw x15\n" + "cmp %x[height], x15\n" "cntw x14\n" "cntw x13, ALL, MUL #2\n" "cntw x11, ALL, MUL #3\n" - "ptrue p3.s\n" - "cntw x10\n" - "cmp %x[height], x10\n" - "csel x10, %x[height], x10, LT\n" - "sub x10, x10, #0x1\n" - "whilelt p2.s, XZR, %x[height]\n" + "csel x15, %x[height], x15, LT\n" + "mov x10, #0x0\n" + "ptrue p4.s\n" + "sub x15, x15, #0x1\n" + "whilelt p3.s, XZR, %x[height]\n" "whilelt p15.s, x14, %x[height]\n" "whilelt p14.s, x13, %x[height]\n" "whilelt p13.s, x11, %x[height]\n" "mov x9, %x[row_offset]\n" "mov x28, %x[out]\n" - "mov x27, #0x0\n" - "whilelt p12.s, x27, %x[width]\n" - "whilelt p11.s, x27, %x[width]\n" - "whilelt p10.s, x27, %x[width]\n" - "whilelt p9.s, x27, %x[width]\n" - "whilelt p8.s, x27, %x[width]\n" + "whilelt p12.s, x10, %x[width]\n" + "whilelt p11.s, x10, %x[width]\n" + "whilelt p10.s, x10, %x[width]\n" + "whilelt p9.s, x10, %x[width]\n" + "whilelt p8.s, x10, %x[width]\n" "1:" // Width loop - "mov x12, #0x0\n" - "add x26, %x[in], XZR, LSL #3\n" - "add x25, %x[in], x14, LSL #3\n" + "add x27, %x[in], XZR, LSL #3\n" + "add x26, %x[in], x14, LSL #3\n" + "ldr x25, [x27], #0x8\n" "add x24, %x[in], x13, LSL #3\n" - "add x23, %x[in], x11, LSL #3\n" - "ldr x22, [x26], #0x8\n" - "ldr x21, [x25], #0x8\n" - "ldr x20, [x24], #0x8\n" - "ldr x19, [x23], #0x8\n" - "cbz x10, 3f\n" + "add x20, %x[in], x11, LSL #3\n" + "ldr x23, [x26], #0x8\n" + "mov x12, #0x0\n" + "ldr x22, [x24], #0x8\n" + "ldr x21, [x20], #0x8\n" + "cbz x15, 3f\n" "2:" // Loads: Loop - ".inst 0x25306c40 // dup p0.s, p11.s/Z, p2.s[w12]\n" - ".inst 0xe08902c0 // ld1w { za0h.s[x12] }, p0/Z, [x22, x9, LSL #2]\n" - ".inst 0x253069e0 // dup p0.s, p10.s/Z, p15.s[w12]\n" - ".inst 0xe08902a4 // ld1w { za1h.s[x12] }, p0/Z, [x21, x9, LSL #2]\n" - ".inst 0x253065c0 // dup p0.s, p9.s/Z, p14.s[w12]\n" - ".inst 0xe0890288 // ld1w { za2h.s[x12] }, p0/Z, [x20, x9, LSL #2]\n" - ".inst 0x253061a0 // dup p0.s, p8.s/Z, p13.s[w12]\n" - ".inst 0xe089026c // ld1w { za3h.s[x12] }, p0/Z, [x19, x9, LSL #2]\n" - "ldr x22, [x26], #0x8\n" - "ldr x21, [x25], #0x8\n" - "ldr x20, [x24], #0x8\n" - "ldr x19, [x23], #0x8\n" + ".inst 0x25306c60 // psel p0.s, p11.s/Z, p3.s[w12]\n" + ".inst 0x253069e2 // psel p2.s, p10.s/Z, p15.s[w12]\n" + ".inst 0xe0890320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x9, LSL #2]\n" + "ldr x25, [x27], #0x8\n" + ".inst 0x253065c1 // psel p1.s, p9.s/Z, p14.s[w12]\n" + ".inst 0x253061a0 // psel p0.s, p8.s/Z, p13.s[w12]\n" + ".inst 0xe0890ae4 // ld1w { za1h.s[x12] }, p2/Z, [x23, x9, LSL #2]\n" + "ldr x23, [x26], #0x8\n" + ".inst 0xe08906c8 // ld1w { za2h.s[x12] }, p1/Z, [x22, x9, LSL #2]\n" + "ldr x22, [x24], #0x8\n" + ".inst 0xe08902ac // ld1w { za3h.s[x12] }, p0/Z, [x21, x9, LSL #2]\n" "add x12, x12, #0x1\n" - "cmp x12, x10\n" + "cmp x12, x15\n" + "ldr x21, [x20], #0x8\n" "blt 2b\n" "3:" // Loads: Tail - ".inst 0x25306c40 // dup p0.s, p11.s/Z, p2.s[w12]\n" - ".inst 0xe08902c0 // ld1w { za0h.s[x12] }, p0/Z, [x22, x9, LSL #2]\n" - ".inst 0x253069e0 // dup p0.s, p10.s/Z, p15.s[w12]\n" - ".inst 0xe08902a4 // ld1w { za1h.s[x12] }, p0/Z, [x21, x9, LSL #2]\n" - ".inst 0x253065c0 // dup p0.s, p9.s/Z, p14.s[w12]\n" - ".inst 0xe0890288 // ld1w { za2h.s[x12] }, p0/Z, [x20, x9, LSL #2]\n" - ".inst 0x253061a0 // dup p0.s, p8.s/Z, p13.s[w12]\n" - ".inst 0xe089026c // ld1w { za3h.s[x12] }, p0/Z, [x19, x9, LSL #2]\n" + "sub x20, %x[width], x10\n" + ".inst 0x25306c60 // psel p0.s, p11.s/Z, p3.s[w12]\n" + ".inst 0xe0890320 // ld1w { za0h.s[x12] }, p0/Z, [x25, x9, LSL #2]\n" + ".inst 0x253069e0 // psel p0.s, p10.s/Z, p15.s[w12]\n" + ".inst 0x253065c1 // psel p1.s, p9.s/Z, p14.s[w12]\n" + ".inst 0xe08902e4 // ld1w { za1h.s[x12] }, p0/Z, [x23, x9, LSL #2]\n" + ".inst 0x253061a0 // psel p0.s, p8.s/Z, p13.s[w12]\n" + "cmp x20, x14\n" + ".inst 0xe08906c8 // ld1w { za2h.s[x12] }, p1/Z, [x22, x9, LSL #2]\n" + ".inst 0xe08902ac // ld1w { za3h.s[x12] }, p0/Z, [x21, x9, LSL #2]\n" "mov x12, #0x0\n" - "sub x19, %x[width], x27\n" - "cmp x19, x14\n" - "csel x19, x19, x14, LT\n" + "csel x20, x20, x14, LT\n" "4:" // Stores: Loop - ".inst 0x25304d80 // dup p0.s, p3.s/Z, p12.s[w12]\n" + ".inst 0x25305180 // psel p0.s, p4.s/Z, p12.s[w12]\n" ".inst 0xe0bf8380 // st1w { za0v.s[x12] }, p0/Z, [x28, XZR, LSL #2]\n" - ".inst 0x25304d80 // dup p0.s, p3.s/Z, p12.s[w12]\n" - ".inst 0x25304d81 // dup p1.s, p3.s/Z, p12.s[w12]\n" + ".inst 0x25305180 // psel p0.s, p4.s/Z, p12.s[w12]\n" ".inst 0xe0ae8384 // st1w { za1v.s[x12] }, p0/Z, [x28, x14, LSL #2]\n" - ".inst 0x25304d80 // dup p0.s, p3.s/Z, p12.s[w12]\n" + ".inst 0x25305181 // psel p1.s, p4.s/Z, p12.s[w12]\n" + ".inst 0x25305180 // psel p0.s, p4.s/Z, p12.s[w12]\n" ".inst 0xe0ad8788 // st1w { za2v.s[x12] }, p1/Z, [x28, x13, LSL #2]\n" ".inst 0xe0ab838c // st1w { za3v.s[x12] }, p0/Z, [x28, x11, LSL #2]\n" - "addvl x28, x28, #4\n" "add x12, x12, #0x1\n" - "cmp x12, x19\n" + "cmp x12, x20\n" + "addvl x28, x28, #4\n" "blt 4b\n" + "incw x10\n" + "whilelt p12.s, x10, %x[width]\n" + "whilelt p11.s, x10, %x[width]\n" + "whilelt p10.s, x10, %x[width]\n" + "whilelt p9.s, x10, %x[width]\n" + "whilelt p8.s, x10, %x[width]\n" "incw x9\n" - "incw x27\n" - "whilelt p12.s, x27, %x[width]\n" - "whilelt p11.s, x27, %x[width]\n" - "whilelt p10.s, x27, %x[width]\n" - "whilelt p9.s, x27, %x[width]\n" - "whilelt p8.s, x27, %x[width]\n" "b.any 1b\n" "mov %x[out], x28\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) - : "cc", "memory", "p0", "p1", "p2", "p3", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } -- cgit v1.2.1