about | summary | refs | log | tree | commit | diff
path: root/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp')
-rw-r--r-- src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp 283
1 files changed, 142 insertions, 141 deletions
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
index 76828a911e..f8984c451c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -22,14 +22,14 @@
* SOFTWARE.
*/
-#if defined(__aarch64__)
-
#include <cstdint>
#include <cstddef>
#include <cstring>
#include <cmath>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -105,7 +105,7 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -121,42 +121,42 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
"cbz x23, 4f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
- "ldr q25, [x22, x24]\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"uaddl v23.8h, v31.8b, v30.8b\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q28, [x21, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x22, x25]\n"
- "ldr q26, [x21, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "uaddl v17.8h, v25.8b, v24.8b\n"
+ "uaddl2 v16.8h, v25.16b, v24.16b\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddl v17.8h, v25.8b, v24.8b\n"
- "uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x22, x24]\n"
- "add x20, x20, #0x10\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
- "ldr q24, [x21, x24]\n"
+ "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -196,23 +196,23 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"uaddw v1.4s, v1.4s, v16.4h\n"
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "ldr q29, [x22, x26]\n"
- "ldr q27, [x22, x25]\n"
- "uxtl v21.8h, v29.8b\n"
- "uxtl2 v20.8h, v29.16b\n"
- "ldr q25, [x22, x24]\n"
- "uxtl v19.8h, v27.8b\n"
- "uxtl2 v18.8h, v27.16b\n"
- "subs x21, x21, #0x1\n"
- "uxtl v17.8h, v25.8b\n"
- "uxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v23.8h, v16.8b\n"
+ "uxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "uxtl v21.8h, v16.8b\n"
+ "uxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "uxtl v19.8h, v17.8b\n"
+ "uxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
@@ -330,49 +330,49 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
"movi v14.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"cbz x23, 11f\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
- "add x20, x20, #0x10\n"
- "ldr q30, [x21, x27]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldp x22, x21, [x20, #0x0]\n"
- "ldr q31, [x22, x27]\n"
- "ldr q30, [x21, x27]\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"subs x23, x23, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
- "add x20, x20, #0x10\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x22, [x20], #0x8\n"
- "ldr q31, [x22, x27]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1r { v17.4s }, [%x[rescale_ptr]]\n"
@@ -397,9 +397,9 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
"smin v12.4s, v12.4s, v16.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"str q16, [%x[outptr], x27]\n"
"add x27, x27, #0x10\n"
"bge 8b\n"
@@ -411,142 +411,142 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"movi v14.4s, #0x0\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "mov x20, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
- "ldp x22, x21, [x20, #0x0]\n"
- "add x20, x20, #0x10\n"
- "add x22, x22, x27\n"
- "movi v31.16b, #0x0\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
"add x21, x21, x27\n"
+ "movi v31.16b, #0x0\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d31, [x22], #0x8\n"
- "ldr d30, [x21], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
- "ld1 { v30.s }[2], [x21], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "ld1 { v30.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
- "ld1 { v30.h }[6], [x21], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
+ "ld1 { v30.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
- "ld1 { v30.b }[14], [x21], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
+ "ld1 { v30.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
- "ld1 { v30.b }[12], [x21], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
+ "ld1 { v30.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
- "ld1 { v30.h }[4], [x21], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
+ "ld1 { v30.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
- "ld1 { v30.b }[10], [x21], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
+ "ld1 { v30.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 2 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
- "ld1 { v30.b }[8], [x21], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
+ "ld1 { v30.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 2 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s31, [x22], #0x4\n"
- "ldr s30, [x21], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "ldr s30, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
- "ld1 { v30.b }[6], [x21], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
+ "ld1 { v30.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
- "ld1 { v30.b }[4], [x21], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
+ "ld1 { v30.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h31, [x22], #0x2\n"
- "ldr h30, [x21], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
+ "ldr h30, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
- "ld1 { v30.b }[2], [x21], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
+ "ld1 { v30.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 2 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b31, [x22], #0x1\n"
- "ldr b30, [x21], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
+ "ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
"subs x23, x23, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x21, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "ldr x22, [x20], #0x8\n"
- "add x22, x22, x27\n"
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d31, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v31.h }[6], [x22], #0x2\n"
+ "ld1 { v31.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[14], [x22], #0x1\n"
+ "ld1 { v31.b }[14], [x21], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[12], [x22], #0x1\n"
+ "ld1 { v31.b }[12], [x21], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v31.h }[4], [x22], #0x2\n"
+ "ld1 { v31.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[10], [x22], #0x1\n"
+ "ld1 { v31.b }[10], [x21], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[8], [x22], #0x1\n"
+ "ld1 { v31.b }[8], [x21], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s31, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[6], [x22], #0x1\n"
+ "ld1 { v31.b }[6], [x21], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[4], [x22], #0x1\n"
+ "ld1 { v31.b }[4], [x21], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h31, [x22], #0x2\n"
+ "ldr h31, [x21], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v31.b }[2], [x22], #0x1\n"
+ "ld1 { v31.b }[2], [x21], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b31, [x22], #0x1\n"
+ "ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "subs x21, x21, #0x1\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uxtl v17.8h, v31.8b\n"
+ "uxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"ld1r { v17.4s }, [%x[rescale_ptr]]\n"
@@ -569,9 +569,9 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
"smin v12.4s, v12.4s, v16.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -626,4 +626,5 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
} // namespace pooling
} // namespace arm_conv
+
#endif // defined(__aarch64__)