aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2020-03-10 15:33:57 +0000
committerGeorgios Pinitas <georgios.pinitas@arm.com>2020-03-10 18:00:14 +0000
commitce3a7b27f80960e88415bb6cabbb75de2239cea8 (patch)
treee7d6021996a62632c08f6cce81f73467754530e1
parenta26e166829f4d4c48864b1b7243e4e267373d0fd (diff)
downloadComputeLibrary-ce3a7b27f80960e88415bb6cabbb75de2239cea8.tar.gz
COMPMID-3259: Fix scalar register allocation
The AArch64 ABI reserves X18 for platform ABIs; replace all references to X18 with a different register that doesn't have a special purpose. Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com> Change-Id: Ia9e059d44c5edda216bea169d0418bb7a8c4311b Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2863 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Nikhil Raj Arm <nikhil.raj@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp138
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp70
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp344
-rw-r--r--src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp42
4 files changed, 297 insertions, 297 deletions
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
index 2508ec7aeb..4661373e12 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
@@ -54,7 +54,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"add x16, x15, #64\n"
"add x17, x15, %[input_col_stride1]\n"
"add x23, x22, %[input_row_stride]\n"
- "add x18, x17, #64\n"
+ "add x9, x17, #64\n"
"add x25, x24, %[output_row_stride]\n"
"add x26, %[output_col_stride1], %[output_col_stride1]\n"
"and x27, %[n_channels], #3\n"
@@ -133,7 +133,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v13.4s, v19.4s, v4.4s\n"
"prfm pldl1keep, [x20, x16]\n"
"fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x9]\n"
"fmla v10.4s, v19.4s, v7.4s\n"
"prfm pldl1keep, [x23, x19]\n"
"fmla v12.4s, v19.4s, v5.4s\n"
@@ -143,13 +143,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v17.4s, v30.4s, v3.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x18]\n"
+ "prfm pldl1keep, [x20, x9]\n"
"fmla v15.4s, v30.4s, v4.4s\n"
"prfm pldl1keep, [x23, x14]\n"
"fmla v12.4s, v30.4s, v7.4s\n"
"prfm pldl1keep, [x22, x16]\n"
"fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x18]\n"
+ "prfm pldl1keep, [x21, x9]\n"
"fmla v11.4s, v30.4s, v16.4s\n"
"ldr q21, [x21, x15]\n"
"fmla v15.4s, v29.4s, v6.4s\n"
@@ -159,9 +159,9 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v10.4s, v28.4s, v2.4s\n"
"ldr q19, [x23, x13]\n"
"fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x18]\n"
+ "prfm pldl1keep, [x22, x9]\n"
"fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x18]\n"
+ "prfm pldl1keep, [x23, x9]\n"
"fmla v10.4s, v24.4s, v4.4s\n"
"add %[wbptr], %[wbptr], #160\n"
"fmla v9.4s, v24.4s, v5.4s\n"
@@ -305,7 +305,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v13.4s, v19.4s, v4.4s\n"
"prfm pldl1keep, [x20, x16]\n"
"fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x9]\n"
"fmla v10.4s, v19.4s, v7.4s\n"
"prfm pldl1keep, [x23, x19]\n"
"fmla v12.4s, v19.4s, v5.4s\n"
@@ -315,13 +315,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v17.4s, v30.4s, v3.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x18]\n"
+ "prfm pldl1keep, [x20, x9]\n"
"fmla v15.4s, v30.4s, v4.4s\n"
"prfm pldl1keep, [x23, x14]\n"
"fmla v12.4s, v30.4s, v7.4s\n"
"prfm pldl1keep, [x22, x16]\n"
"fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x18]\n"
+ "prfm pldl1keep, [x21, x9]\n"
"fmla v11.4s, v30.4s, v16.4s\n"
"ldr q21, [x21, x15]\n"
"fmla v15.4s, v29.4s, v6.4s\n"
@@ -331,9 +331,9 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v10.4s, v28.4s, v2.4s\n"
"ldr q19, [x23, x13]\n"
"fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x18]\n"
+ "prfm pldl1keep, [x22, x9]\n"
"fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x18]\n"
+ "prfm pldl1keep, [x23, x9]\n"
"fmla v10.4s, v24.4s, v4.4s\n"
"add %[wbptr], %[wbptr], #160\n"
"fmla v9.4s, v24.4s, v5.4s\n"
@@ -479,7 +479,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v13.4s, v19.4s, v4.4s\n"
"prfm pldl1keep, [x20, x16]\n"
"fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x9]\n"
"fmla v10.4s, v19.4s, v7.4s\n"
"prfm pldl1keep, [x23, x19]\n"
"fmla v12.4s, v19.4s, v5.4s\n"
@@ -489,13 +489,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v17.4s, v30.4s, v3.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x18]\n"
+ "prfm pldl1keep, [x20, x9]\n"
"fmla v15.4s, v30.4s, v4.4s\n"
"prfm pldl1keep, [x23, x14]\n"
"fmla v12.4s, v30.4s, v7.4s\n"
"prfm pldl1keep, [x22, x16]\n"
"fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x18]\n"
+ "prfm pldl1keep, [x21, x9]\n"
"fmla v11.4s, v30.4s, v16.4s\n"
"ldr s21, [x21, x15]\n"
"fmla v15.4s, v29.4s, v6.4s\n"
@@ -505,9 +505,9 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v10.4s, v28.4s, v2.4s\n"
"ldr s19, [x23, x13]\n"
"fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x18]\n"
+ "prfm pldl1keep, [x22, x9]\n"
"fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x18]\n"
+ "prfm pldl1keep, [x23, x9]\n"
"fmla v10.4s, v24.4s, v4.4s\n"
"add %[wbptr], %[wbptr], #40\n"
"fmla v9.4s, v24.4s, v5.4s\n"
@@ -651,7 +651,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v13.4s, v19.4s, v4.4s\n"
"prfm pldl1keep, [x20, x16]\n"
"fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x9]\n"
"fmla v10.4s, v19.4s, v7.4s\n"
"prfm pldl1keep, [x23, x19]\n"
"fmla v12.4s, v19.4s, v5.4s\n"
@@ -661,13 +661,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v17.4s, v30.4s, v3.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x18]\n"
+ "prfm pldl1keep, [x20, x9]\n"
"fmla v15.4s, v30.4s, v4.4s\n"
"prfm pldl1keep, [x23, x14]\n"
"fmla v12.4s, v30.4s, v7.4s\n"
"prfm pldl1keep, [x22, x16]\n"
"fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x18]\n"
+ "prfm pldl1keep, [x21, x9]\n"
"fmla v11.4s, v30.4s, v16.4s\n"
"ldr s21, [x21, x15]\n"
"fmla v15.4s, v29.4s, v6.4s\n"
@@ -677,9 +677,9 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v10.4s, v28.4s, v2.4s\n"
"ldr s19, [x23, x13]\n"
"fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x18]\n"
+ "prfm pldl1keep, [x22, x9]\n"
"fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x18]\n"
+ "prfm pldl1keep, [x23, x9]\n"
"fmla v10.4s, v24.4s, v4.4s\n"
"add %[wbptr], %[wbptr], #40\n"
"fmla v9.4s, v24.4s, v5.4s\n"
@@ -754,7 +754,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"7:\n"
: [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
: [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
);
}
@@ -780,9 +780,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"add x26, x16, %[input_col_stride1]\n"
"add x13, x22, %[input_row_stride]\n"
"add x20, x26, #64\n"
- "add x18, x26, %[input_col_stride1]\n"
+ "add x9, x26, %[input_col_stride1]\n"
"add x24, x13, %[input_row_stride]\n"
- "add x15, x18, #64\n"
+ "add x15, x9, #64\n"
"add x14, x21, %[output_row_stride]\n"
"add x19, %[output_col_stride1], %[output_col_stride1]\n"
"and x27, %[n_channels], #3\n"
@@ -854,7 +854,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v31.4s, v0.4s\n"
"prfm pldl1keep, [x25, x20]\n"
"fmla v2.4s, v31.4s, v15.4s\n"
- "ldr q20, [%[inptr0], x18]\n"
+ "ldr q20, [%[inptr0], x9]\n"
"fmla v1.4s, v28.4s, v11.4s\n"
"prfm pldl1keep, [%[inptr0], x15]\n"
"fmla v7.4s, v28.4s, v16.4s\n"
@@ -886,7 +886,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v29.4s, v13.4s\n"
"add %[wbptr], %[wbptr], #160\n"
"fmla v2.4s, v29.4s, v0.4s\n"
- "ldr q22, [x25, x18]\n"
+ "ldr q22, [x25, x9]\n"
"fmla v7.4s, v30.4s, v11.4s\n"
"ldr q21, [x24, x16]\n"
"fmla v1.4s, v25.4s, v10.4s\n"
@@ -912,7 +912,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v6.4s, v26.4s, v0.4s\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"fmla v5.4s, v26.4s, v15.4s\n"
- "ldr q26, [x22, x18]\n"
+ "ldr q26, [x22, x9]\n"
"fmla v3.4s, v23.4s, v17.4s\n"
"ldr q18, [x24, x26]\n"
"fmla v9.4s, v23.4s, v13.4s\n"
@@ -920,11 +920,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v23.4s, v12.4s\n"
"prfm pldl1keep, [x22, #64]\n"
"fmla v8.4s, v23.4s, v0.4s\n"
- "ldr q23, [x13, x18]\n"
+ "ldr q23, [x13, x9]\n"
"fmla v7.4s, v28.4s, v10.4s\n"
"prfm pldl1keep, [x22, x17]\n"
"fmla v2.4s, v20.4s, v13.4s\n"
- "ldr q25, [x24, x18]\n"
+ "ldr q25, [x24, x9]\n"
"fmla v6.4s, v28.4s, v11.4s\n"
"ldr q20, [%[wbptr]]\n"
"fmla v1.4s, v27.4s, v14.4s\n"
@@ -1036,7 +1036,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v31.4s, v0.4s\n"
"prfm pldl1keep, [x25, x20]\n"
"fmla v2.4s, v31.4s, v15.4s\n"
- "ldr q20, [%[inptr0], x18]\n"
+ "ldr q20, [%[inptr0], x9]\n"
"fmla v1.4s, v28.4s, v11.4s\n"
"prfm pldl1keep, [%[inptr0], x15]\n"
"fmla v7.4s, v28.4s, v16.4s\n"
@@ -1068,7 +1068,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v29.4s, v13.4s\n"
"add %[wbptr], %[wbptr], #160\n"
"fmla v2.4s, v29.4s, v0.4s\n"
- "ldr q22, [x25, x18]\n"
+ "ldr q22, [x25, x9]\n"
"fmla v7.4s, v30.4s, v11.4s\n"
"ldr q21, [x24, x16]\n"
"fmla v1.4s, v25.4s, v10.4s\n"
@@ -1088,7 +1088,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v6.4s, v26.4s, v0.4s\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"fmla v5.4s, v26.4s, v15.4s\n"
- "ldr q26, [x22, x18]\n"
+ "ldr q26, [x22, x9]\n"
"fmla v3.4s, v23.4s, v17.4s\n"
"ldr q18, [x24, x26]\n"
"fmla v9.4s, v23.4s, v13.4s\n"
@@ -1096,9 +1096,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v23.4s, v12.4s\n"
"fmla v8.4s, v23.4s, v0.4s\n"
"fmla v7.4s, v28.4s, v10.4s\n"
- "ldr q23, [x13, x18]\n"
+ "ldr q23, [x13, x9]\n"
"fmla v6.4s, v28.4s, v11.4s\n"
- "ldr q25, [x24, x18]\n"
+ "ldr q25, [x24, x9]\n"
"fmla v2.4s, v20.4s, v13.4s\n"
"add x13, x13, #16\n"
"fmla v1.4s, v27.4s, v14.4s\n"
@@ -1220,7 +1220,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v31.4s, v0.4s\n"
"prfm pldl1keep, [x25, x20]\n"
"fmla v2.4s, v31.4s, v15.4s\n"
- "ldr s20, [%[inptr0], x18]\n"
+ "ldr s20, [%[inptr0], x9]\n"
"fmla v1.4s, v28.4s, v11.4s\n"
"prfm pldl1keep, [%[inptr0], x15]\n"
"fmla v7.4s, v28.4s, v16.4s\n"
@@ -1252,7 +1252,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v29.4s, v13.4s\n"
"add %[wbptr], %[wbptr], #40\n"
"fmla v2.4s, v29.4s, v0.4s\n"
- "ldr s22, [x25, x18]\n"
+ "ldr s22, [x25, x9]\n"
"fmla v7.4s, v30.4s, v11.4s\n"
"ldr s21, [x24, x16]\n"
"fmla v1.4s, v25.4s, v10.4s\n"
@@ -1278,7 +1278,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v6.4s, v26.4s, v0.4s\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"fmla v5.4s, v26.4s, v15.4s\n"
- "ldr s26, [x22, x18]\n"
+ "ldr s26, [x22, x9]\n"
"fmla v3.4s, v23.4s, v17.4s\n"
"ldr s18, [x24, x26]\n"
"fmla v9.4s, v23.4s, v13.4s\n"
@@ -1286,11 +1286,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v23.4s, v12.4s\n"
"prfm pldl1keep, [x22, #64]\n"
"fmla v8.4s, v23.4s, v0.4s\n"
- "ldr s23, [x13, x18]\n"
+ "ldr s23, [x13, x9]\n"
"fmla v7.4s, v28.4s, v10.4s\n"
"prfm pldl1keep, [x22, x17]\n"
"fmla v2.4s, v20.4s, v13.4s\n"
- "ldr s25, [x24, x18]\n"
+ "ldr s25, [x24, x9]\n"
"fmla v6.4s, v28.4s, v11.4s\n"
"ldr s20, [%[wbptr]]\n"
"fmla v1.4s, v27.4s, v14.4s\n"
@@ -1402,7 +1402,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v31.4s, v0.4s\n"
"prfm pldl1keep, [x25, x20]\n"
"fmla v2.4s, v31.4s, v15.4s\n"
- "ldr s20, [%[inptr0], x18]\n"
+ "ldr s20, [%[inptr0], x9]\n"
"fmla v1.4s, v28.4s, v11.4s\n"
"prfm pldl1keep, [%[inptr0], x15]\n"
"fmla v7.4s, v28.4s, v16.4s\n"
@@ -1434,7 +1434,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v29.4s, v13.4s\n"
"add %[wbptr], %[wbptr], #40\n"
"fmla v2.4s, v29.4s, v0.4s\n"
- "ldr s22, [x25, x18]\n"
+ "ldr s22, [x25, x9]\n"
"fmla v7.4s, v30.4s, v11.4s\n"
"ldr s21, [x24, x16]\n"
"fmla v1.4s, v25.4s, v10.4s\n"
@@ -1454,7 +1454,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v6.4s, v26.4s, v0.4s\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"fmla v5.4s, v26.4s, v15.4s\n"
- "ldr s26, [x22, x18]\n"
+ "ldr s26, [x22, x9]\n"
"fmla v3.4s, v23.4s, v17.4s\n"
"ldr s18, [x24, x26]\n"
"fmla v9.4s, v23.4s, v13.4s\n"
@@ -1462,9 +1462,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v23.4s, v12.4s\n"
"fmla v8.4s, v23.4s, v0.4s\n"
"fmla v7.4s, v28.4s, v10.4s\n"
- "ldr s23, [x13, x18]\n"
+ "ldr s23, [x13, x9]\n"
"fmla v6.4s, v28.4s, v11.4s\n"
- "ldr s25, [x24, x18]\n"
+ "ldr s25, [x24, x9]\n"
"fmla v2.4s, v20.4s, v13.4s\n"
"add x13, x13, #4\n"
"fmla v1.4s, v27.4s, v14.4s\n"
@@ -1522,7 +1522,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"7:\n"
: [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
: [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
);
}
@@ -1541,11 +1541,11 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
{
__asm __volatile(
"add x17, %[inptr0], %[input_row_stride]\n"
- "add x18, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x9, %[input_col_stride1], %[input_col_stride1]\n"
"add x25, %[outptr0], %[output_row_stride]\n"
"add x14, x17, %[input_row_stride]\n"
- "add x22, x18, #64\n"
- "add x15, x18, %[input_col_stride1]\n"
+ "add x22, x9, #64\n"
+ "add x15, x9, %[input_col_stride1]\n"
"add x21, x14, %[input_row_stride]\n"
"add x16, x15, #64\n"
"add x24, x15, %[input_col_stride1]\n"
@@ -1583,7 +1583,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"ldr q26, [%[inptr0], %[input_col_stride1]]\n"
"ldr q20, [x14]\n"
"ldr q22, [x17, %[input_col_stride1]]\n"
- "ldr q28, [%[inptr0], x18]\n"
+ "ldr q28, [%[inptr0], x9]\n"
"ldr q23, [x21]\n"
"fmla v8.4s, v27.4s, v14.4s\n"
"ldr q18, [x14, %[input_col_stride1]]\n"
@@ -1598,7 +1598,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"beq 3f\n"
"2:\n"
"fmla v5.4s, v27.4s, v17.4s\n"
- "ldr q27, [x17, x18]\n"
+ "ldr q27, [x17, x9]\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"ldr q30, [%[inptr0], x15]\n"
"fmla v7.4s, v26.4s, v17.4s\n"
@@ -1608,7 +1608,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v8.4s, v20.4s, v11.4s\n"
"prfm pldl1keep, [x17, x22]\n"
"fmla v2.4s, v20.4s, v17.4s\n"
- "ldr q29, [x14, x18]\n"
+ "ldr q29, [x14, x9]\n"
"fmla v5.4s, v22.4s, v16.4s\n"
"prfm pldl1keep, [%[inptr0], x16]\n"
"fmla v8.4s, v22.4s, v13.4s\n"
@@ -1638,7 +1638,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v4.4s, v18.4s, v14.4s\n"
"prfm pldl1keep, [x26, x22]\n"
"fmla v1.4s, v18.4s, v17.4s\n"
- "ldr q25, [x21, x18]\n"
+ "ldr q25, [x21, x9]\n"
"fmla v8.4s, v27.4s, v12.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v5.4s, v27.4s, v15.4s\n"
@@ -1656,7 +1656,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v6.4s, v30.4s, v16.4s\n"
"ldr q26, [x17, x24]\n"
"fmla v2.4s, v31.4s, v11.4s\n"
- "ldr q20, [x26, x18]\n"
+ "ldr q20, [x26, x9]\n"
"fmla v5.4s, v24.4s, v10.4s\n"
"prfm pldl1keep, [%[wbptr], #64]\n"
"fmla v4.4s, v24.4s, v11.4s\n"
@@ -1754,7 +1754,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmin v5.4s, v5.4s, v29.4s\n"
"ldr q12, [%[wbptr], #96]\n"
"fmax v4.4s, v4.4s, v30.4s\n"
- "ldr q28, [%[inptr0], x18]\n"
+ "ldr q28, [%[inptr0], x9]\n"
"str q5, [x25]\n"
"fmax v3.4s, v3.4s, v30.4s\n"
"fmin v4.4s, v4.4s, v29.4s\n"
@@ -1790,7 +1790,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"bne 2b\n"
"3:\n"
"fmla v5.4s, v27.4s, v17.4s\n"
- "ldr q27, [x17, x18]\n"
+ "ldr q27, [x17, x9]\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"ldr q30, [%[inptr0], x15]\n"
"fmla v7.4s, v26.4s, v17.4s\n"
@@ -1800,7 +1800,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v8.4s, v20.4s, v11.4s\n"
"prfm pldl1keep, [x17, x22]\n"
"fmla v2.4s, v20.4s, v17.4s\n"
- "ldr q29, [x14, x18]\n"
+ "ldr q29, [x14, x9]\n"
"fmla v5.4s, v22.4s, v16.4s\n"
"prfm pldl1keep, [%[inptr0], x16]\n"
"fmla v8.4s, v22.4s, v13.4s\n"
@@ -1830,7 +1830,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v4.4s, v18.4s, v14.4s\n"
"prfm pldl1keep, [x26, x22]\n"
"fmla v1.4s, v18.4s, v17.4s\n"
- "ldr q25, [x21, x18]\n"
+ "ldr q25, [x21, x9]\n"
"fmla v8.4s, v27.4s, v12.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v5.4s, v27.4s, v15.4s\n"
@@ -1848,7 +1848,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v6.4s, v30.4s, v16.4s\n"
"ldr q26, [x17, x24]\n"
"fmla v2.4s, v31.4s, v11.4s\n"
- "ldr q20, [x26, x18]\n"
+ "ldr q20, [x26, x9]\n"
"fmla v5.4s, v24.4s, v10.4s\n"
"prfm pldl1keep, [%[wbptr], #64]\n"
"fmla v4.4s, v24.4s, v11.4s\n"
@@ -1969,7 +1969,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"ldr s26, [%[inptr0], %[input_col_stride1]]\n"
"ldr s20, [x14]\n"
"ldr s22, [x17, %[input_col_stride1]]\n"
- "ldr s28, [%[inptr0], x18]\n"
+ "ldr s28, [%[inptr0], x9]\n"
"fmla v8.4s, v27.4s, v14.4s\n"
"ldr s23, [x21]\n"
"ldr s18, [x14, %[input_col_stride1]]\n"
@@ -1984,7 +1984,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"beq 6f\n"
"5:\n"
"fmla v5.4s, v27.4s, v17.4s\n"
- "ldr s27, [x17, x18]\n"
+ "ldr s27, [x17, x9]\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"ldr s30, [%[inptr0], x15]\n"
"fmla v7.4s, v26.4s, v17.4s\n"
@@ -1994,7 +1994,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v8.4s, v20.4s, v11.4s\n"
"prfm pldl1keep, [x17, x22]\n"
"fmla v2.4s, v20.4s, v17.4s\n"
- "ldr s29, [x14, x18]\n"
+ "ldr s29, [x14, x9]\n"
"fmla v5.4s, v22.4s, v16.4s\n"
"prfm pldl1keep, [%[inptr0], x16]\n"
"fmla v8.4s, v22.4s, v13.4s\n"
@@ -2024,7 +2024,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v4.4s, v18.4s, v14.4s\n"
"prfm pldl1keep, [x26, x22]\n"
"fmla v1.4s, v18.4s, v17.4s\n"
- "ldr s25, [x21, x18]\n"
+ "ldr s25, [x21, x9]\n"
"fmla v8.4s, v27.4s, v12.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v5.4s, v27.4s, v15.4s\n"
@@ -2042,7 +2042,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v6.4s, v30.4s, v16.4s\n"
"ldr s26, [x17, x24]\n"
"fmla v2.4s, v31.4s, v11.4s\n"
- "ldr s20, [x26, x18]\n"
+ "ldr s20, [x26, x9]\n"
"fmla v5.4s, v24.4s, v10.4s\n"
"prfm pldl1keep, [%[wbptr], #64]\n"
"fmla v4.4s, v24.4s, v11.4s\n"
@@ -2140,7 +2140,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmin v5.4s, v5.4s, v29.4s\n"
"ldr s12, [%[wbptr], #24]\n"
"fmax v4.4s, v4.4s, v30.4s\n"
- "ldr s28, [%[inptr0], x18]\n"
+ "ldr s28, [%[inptr0], x9]\n"
"str s5, [x25]\n"
"fmax v3.4s, v3.4s, v30.4s\n"
"fmin v4.4s, v4.4s, v29.4s\n"
@@ -2176,7 +2176,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"bne 5b\n"
"6:\n"
"fmla v5.4s, v27.4s, v17.4s\n"
- "ldr s27, [x17, x18]\n"
+ "ldr s27, [x17, x9]\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"ldr s30, [%[inptr0], x15]\n"
"fmla v7.4s, v26.4s, v17.4s\n"
@@ -2186,7 +2186,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v8.4s, v20.4s, v11.4s\n"
"prfm pldl1keep, [x17, x22]\n"
"fmla v2.4s, v20.4s, v17.4s\n"
- "ldr s29, [x14, x18]\n"
+ "ldr s29, [x14, x9]\n"
"fmla v5.4s, v22.4s, v16.4s\n"
"prfm pldl1keep, [%[inptr0], x16]\n"
"fmla v8.4s, v22.4s, v13.4s\n"
@@ -2216,7 +2216,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v4.4s, v18.4s, v14.4s\n"
"prfm pldl1keep, [x26, x22]\n"
"fmla v1.4s, v18.4s, v17.4s\n"
- "ldr s25, [x21, x18]\n"
+ "ldr s25, [x21, x9]\n"
"fmla v8.4s, v27.4s, v12.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v5.4s, v27.4s, v15.4s\n"
@@ -2234,7 +2234,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v6.4s, v30.4s, v16.4s\n"
"ldr s26, [x17, x24]\n"
"fmla v2.4s, v31.4s, v11.4s\n"
- "ldr s20, [x26, x18]\n"
+ "ldr s20, [x26, x9]\n"
"fmla v5.4s, v24.4s, v10.4s\n"
"prfm pldl1keep, [%[wbptr], #64]\n"
"fmla v4.4s, v24.4s, v11.4s\n"
@@ -2330,7 +2330,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"7:\n"
: [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
: [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
);
}
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
index adc6969fcb..eb2b37a5b0 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
@@ -53,10 +53,10 @@ void Conv::execute_tile<ActivationFunction::None>(
"add x17, x16, %[input_row_stride]\n"
"add x28, x27, %[input_col_stride1]\n"
"add x23, %[output_col_stride1], %[output_col_stride1]\n"
- "add x18, x17, %[input_row_stride]\n"
+ "add x9, x17, %[input_row_stride]\n"
"add x13, x28, %[input_col_stride1]\n"
"and x24, %[n_channels], #3\n"
- "add x19, x18, %[input_row_stride]\n"
+ "add x19, x9, %[input_row_stride]\n"
"add x14, x13, %[input_col_stride1]\n"
"lsr x25, %[n_channels], #2\n"
"add x20, x19, %[input_row_stride]\n"
@@ -99,7 +99,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v15.4s, v19.4s, v6.4s\n"
"add %[wbptr], %[wbptr], #160\n"
"fmla v17.4s, v28.4s, v12.4s\n"
- "ldr q25, [x18]\n"
+ "ldr q25, [x9]\n"
"fmla v16.4s, v30.4s, v12.4s\n"
"ldr q24, [x17, %[input_col_stride1]]\n"
"fmla v15.4s, v21.4s, v12.4s\n"
@@ -117,7 +117,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v15.4s, v23.4s, v9.4s\n"
"ldr q30, [x19]\n"
"fmla v17.4s, v20.4s, v11.4s\n"
- "ldr q29, [x18, %[input_col_stride1]]\n"
+ "ldr q29, [x9, %[input_col_stride1]]\n"
"fmla v16.4s, v24.4s, v11.4s\n"
"ldr q28, [x17, x26]\n"
"fmla v4.4s, v23.4s, v6.4s\n"
@@ -135,7 +135,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v16.4s, v29.4s, v8.4s\n"
"ldr q22, [x19, %[input_col_stride1]]\n"
"fmla v17.4s, v21.4s, v10.4s\n"
- "ldr q26, [x18, x26]\n"
+ "ldr q26, [x9, x26]\n"
"fmla v2.4s, v29.4s, v14.4s\n"
"ldr q20, [x17, x27]\n"
"fmla v16.4s, v28.4s, v10.4s\n"
@@ -153,7 +153,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v2.4s, v18.4s, v9.4s\n"
"ldr q17, [x19, x26]\n"
"fmla v5.4s, v19.4s, v14.4s\n"
- "ldr q18, [x18, x27]\n"
+ "ldr q18, [x9, x27]\n"
"fmla v16.4s, v26.4s, v7.4s\n"
"ldr q25, [x17, x28]\n"
"fmla v2.4s, v22.4s, v11.4s\n"
@@ -175,7 +175,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"str q15, [%[outptr0], %[output_col_stride1]]\n"
"fmla v2.4s, v28.4s, v8.4s\n"
"fmla v5.4s, v30.4s, v11.4s\n"
- "ldr q29, [x18, x28]\n"
+ "ldr q29, [x9, x28]\n"
"fmla v1.4s, v17.4s, v12.4s\n"
"ldr q27, [x17, x13]\n"
"fmla v2.4s, v17.4s, v10.4s\n"
@@ -185,7 +185,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v4.4s, v18.4s, v8.4s\n"
"ldr q20, [x19, x28]\n"
"fmla v1.4s, v18.4s, v14.4s\n"
- "ldr q17, [x18, x13]\n"
+ "ldr q17, [x9, x13]\n"
"fmla v3.4s, v25.4s, v12.4s\n"
"ldr q18, [x17, x14]\n"
"fmla v4.4s, v25.4s, v10.4s\n"
@@ -197,7 +197,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v2.4s, v23.4s, v7.4s\n"
"add x17, x17, #16\n"
"fmla v5.4s, v21.4s, v10.4s\n"
- "ldr q21, [x18, x14]\n"
+ "ldr q21, [x9, x14]\n"
"fmla v1.4s, v23.4s, v9.4s\n"
"ldr q23, [x20, x13]\n"
"str q2, [x22]\n"
@@ -241,7 +241,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"str q3, [x21, x23]\n"
"mov v16.16b, v27.16b\n"
"mov v15.16b, v27.16b\n"
- "add x18, x18, #16\n"
+ "add x9, x9, #16\n"
"fmla v0.4s, v21.4s, v13.4s\n"
"ldr q11, [%[wbptr], #80]\n"
"mov v2.16b, v27.16b\n"
@@ -272,7 +272,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v15.4s, v19.4s, v6.4s\n"
"add %[wbptr], %[wbptr], #160\n"
"fmla v17.4s, v28.4s, v12.4s\n"
- "ldr q25, [x18]\n"
+ "ldr q25, [x9]\n"
"fmla v16.4s, v30.4s, v12.4s\n"
"ldr q24, [x17, %[input_col_stride1]]\n"
"fmla v15.4s, v21.4s, v12.4s\n"
@@ -290,7 +290,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v15.4s, v23.4s, v9.4s\n"
"fmla v4.4s, v23.4s, v6.4s\n"
"fmla v17.4s, v20.4s, v11.4s\n"
- "ldr q29, [x18, %[input_col_stride1]]\n"
+ "ldr q29, [x9, %[input_col_stride1]]\n"
"fmla v16.4s, v24.4s, v11.4s\n"
"ldr q28, [x17, x26]\n"
"fmla v15.4s, v26.4s, v11.4s\n"
@@ -308,7 +308,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v16.4s, v29.4s, v8.4s\n"
"fmla v4.4s, v28.4s, v12.4s\n"
"fmla v17.4s, v21.4s, v10.4s\n"
- "ldr q26, [x18, x26]\n"
+ "ldr q26, [x9, x26]\n"
"fmla v2.4s, v29.4s, v14.4s\n"
"ldr q20, [x17, x27]\n"
"fmla v16.4s, v28.4s, v10.4s\n"
@@ -326,7 +326,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v5.4s, v19.4s, v14.4s\n"
"ldr q17, [x19, x26]\n"
"fmla v2.4s, v22.4s, v11.4s\n"
- "ldr q18, [x18, x27]\n"
+ "ldr q18, [x9, x27]\n"
"fmla v16.4s, v26.4s, v7.4s\n"
"ldr q25, [x17, x28]\n"
"fmla v4.4s, v26.4s, v9.4s\n"
@@ -346,7 +346,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"str q15, [%[outptr0], %[output_col_stride1]]\n"
"fmla v3.4s, v27.4s, v6.4s\n"
"fmla v5.4s, v30.4s, v11.4s\n"
- "ldr q29, [x18, x28]\n"
+ "ldr q29, [x9, x28]\n"
"fmla v2.4s, v28.4s, v8.4s\n"
"ldr q27, [x17, x13]\n"
"fmla v1.4s, v17.4s, v12.4s\n"
@@ -356,7 +356,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v2.4s, v17.4s, v10.4s\n"
"ldr q20, [x19, x28]\n"
"fmla v4.4s, v18.4s, v8.4s\n"
- "ldr q17, [x18, x13]\n"
+ "ldr q17, [x9, x13]\n"
"fmla v1.4s, v18.4s, v14.4s\n"
"ldr q18, [x17, x14]\n"
"fmla v3.4s, v25.4s, v12.4s\n"
@@ -370,7 +370,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v2.4s, v23.4s, v7.4s\n"
"fmla v1.4s, v23.4s, v9.4s\n"
"fmla v5.4s, v21.4s, v10.4s\n"
- "ldr q21, [x18, x14]\n"
+ "ldr q21, [x9, x14]\n"
"fmla v4.4s, v29.4s, v7.4s\n"
"ldr q23, [x20, x13]\n"
"str q2, [x22]\n"
@@ -382,7 +382,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v1.4s, v29.4s, v13.4s\n"
"ldr q25, [x20, x14]\n"
"fmla v3.4s, v27.4s, v11.4s\n"
- "add x18, x18, #16\n"
+ "add x9, x9, #16\n"
"fmla v5.4s, v28.4s, v7.4s\n"
"add x19, x19, #16\n"
"fmla v1.4s, v26.4s, v8.4s\n"
@@ -447,7 +447,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v15.4s, v19.4s, v6.4s\n"
"add %[wbptr], %[wbptr], #40\n"
"fmla v17.4s, v28.4s, v12.4s\n"
- "ldr s25, [x18]\n"
+ "ldr s25, [x9]\n"
"fmla v16.4s, v30.4s, v12.4s\n"
"ldr s24, [x17, %[input_col_stride1]]\n"
"fmla v15.4s, v21.4s, v12.4s\n"
@@ -465,7 +465,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v15.4s, v23.4s, v9.4s\n"
"ldr s30, [x19]\n"
"fmla v17.4s, v20.4s, v11.4s\n"
- "ldr s29, [x18, %[input_col_stride1]]\n"
+ "ldr s29, [x9, %[input_col_stride1]]\n"
"fmla v16.4s, v24.4s, v11.4s\n"
"ldr s28, [x17, x26]\n"
"fmla v4.4s, v23.4s, v6.4s\n"
@@ -483,7 +483,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v16.4s, v29.4s, v8.4s\n"
"ldr s22, [x19, %[input_col_stride1]]\n"
"fmla v17.4s, v21.4s, v10.4s\n"
- "ldr s26, [x18, x26]\n"
+ "ldr s26, [x9, x26]\n"
"fmla v2.4s, v29.4s, v14.4s\n"
"ldr s20, [x17, x27]\n"
"fmla v16.4s, v28.4s, v10.4s\n"
@@ -501,7 +501,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v2.4s, v18.4s, v9.4s\n"
"ldr s17, [x19, x26]\n"
"fmla v5.4s, v19.4s, v14.4s\n"
- "ldr s18, [x18, x27]\n"
+ "ldr s18, [x9, x27]\n"
"fmla v16.4s, v26.4s, v7.4s\n"
"ldr s25, [x17, x28]\n"
"fmla v2.4s, v22.4s, v11.4s\n"
@@ -523,7 +523,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"str s15, [%[outptr0], %[output_col_stride1]]\n"
"fmla v2.4s, v28.4s, v8.4s\n"
"fmla v5.4s, v30.4s, v11.4s\n"
- "ldr s29, [x18, x28]\n"
+ "ldr s29, [x9, x28]\n"
"fmla v1.4s, v17.4s, v12.4s\n"
"ldr s27, [x17, x13]\n"
"fmla v2.4s, v17.4s, v10.4s\n"
@@ -533,7 +533,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v4.4s, v18.4s, v8.4s\n"
"ldr s20, [x19, x28]\n"
"fmla v1.4s, v18.4s, v14.4s\n"
- "ldr s17, [x18, x13]\n"
+ "ldr s17, [x9, x13]\n"
"fmla v3.4s, v25.4s, v12.4s\n"
"ldr s18, [x17, x14]\n"
"fmla v4.4s, v25.4s, v10.4s\n"
@@ -545,7 +545,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v2.4s, v23.4s, v7.4s\n"
"add x17, x17, #4\n"
"fmla v5.4s, v21.4s, v10.4s\n"
- "ldr s21, [x18, x14]\n"
+ "ldr s21, [x9, x14]\n"
"fmla v1.4s, v23.4s, v9.4s\n"
"ldr s23, [x20, x13]\n"
"str s2, [x22]\n"
@@ -589,7 +589,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"str s3, [x21, x23]\n"
"mov v16.16b, v27.16b\n"
"mov v15.16b, v27.16b\n"
- "add x18, x18, #4\n"
+ "add x9, x9, #4\n"
"fmla v0.4s, v21.4s, v13.4s\n"
"ldr s11, [%[wbptr], #20]\n"
"mov v2.16b, v27.16b\n"
@@ -620,7 +620,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v15.4s, v19.4s, v6.4s\n"
"add %[wbptr], %[wbptr], #40\n"
"fmla v17.4s, v28.4s, v12.4s\n"
- "ldr s25, [x18]\n"
+ "ldr s25, [x9]\n"
"fmla v16.4s, v30.4s, v12.4s\n"
"ldr s24, [x17, %[input_col_stride1]]\n"
"fmla v15.4s, v21.4s, v12.4s\n"
@@ -638,7 +638,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v15.4s, v23.4s, v9.4s\n"
"fmla v4.4s, v23.4s, v6.4s\n"
"fmla v17.4s, v20.4s, v11.4s\n"
- "ldr s29, [x18, %[input_col_stride1]]\n"
+ "ldr s29, [x9, %[input_col_stride1]]\n"
"fmla v16.4s, v24.4s, v11.4s\n"
"ldr s28, [x17, x26]\n"
"fmla v15.4s, v26.4s, v11.4s\n"
@@ -656,7 +656,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v16.4s, v29.4s, v8.4s\n"
"fmla v4.4s, v28.4s, v12.4s\n"
"fmla v17.4s, v21.4s, v10.4s\n"
- "ldr s26, [x18, x26]\n"
+ "ldr s26, [x9, x26]\n"
"fmla v2.4s, v29.4s, v14.4s\n"
"ldr s20, [x17, x27]\n"
"fmla v16.4s, v28.4s, v10.4s\n"
@@ -674,7 +674,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v5.4s, v19.4s, v14.4s\n"
"ldr s17, [x19, x26]\n"
"fmla v2.4s, v22.4s, v11.4s\n"
- "ldr s18, [x18, x27]\n"
+ "ldr s18, [x9, x27]\n"
"fmla v16.4s, v26.4s, v7.4s\n"
"ldr s25, [x17, x28]\n"
"fmla v4.4s, v26.4s, v9.4s\n"
@@ -694,7 +694,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"str s15, [%[outptr0], %[output_col_stride1]]\n"
"fmla v3.4s, v27.4s, v6.4s\n"
"fmla v5.4s, v30.4s, v11.4s\n"
- "ldr s29, [x18, x28]\n"
+ "ldr s29, [x9, x28]\n"
"fmla v2.4s, v28.4s, v8.4s\n"
"ldr s27, [x17, x13]\n"
"fmla v1.4s, v17.4s, v12.4s\n"
@@ -704,7 +704,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v2.4s, v17.4s, v10.4s\n"
"ldr s20, [x19, x28]\n"
"fmla v4.4s, v18.4s, v8.4s\n"
- "ldr s17, [x18, x13]\n"
+ "ldr s17, [x9, x13]\n"
"fmla v1.4s, v18.4s, v14.4s\n"
"ldr s18, [x17, x14]\n"
"fmla v3.4s, v25.4s, v12.4s\n"
@@ -718,7 +718,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v2.4s, v23.4s, v7.4s\n"
"fmla v1.4s, v23.4s, v9.4s\n"
"fmla v5.4s, v21.4s, v10.4s\n"
- "ldr s21, [x18, x14]\n"
+ "ldr s21, [x9, x14]\n"
"fmla v4.4s, v29.4s, v7.4s\n"
"ldr s23, [x20, x13]\n"
"str s2, [x22]\n"
@@ -730,7 +730,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v1.4s, v29.4s, v13.4s\n"
"ldr s25, [x20, x14]\n"
"fmla v3.4s, v27.4s, v11.4s\n"
- "add x18, x18, #4\n"
+ "add x9, x9, #4\n"
"fmla v5.4s, v28.4s, v7.4s\n"
"add x19, x19, #4\n"
"fmla v1.4s, v26.4s, v8.4s\n"
@@ -759,7 +759,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"7:\n"
: [wbptr] "+r" (weight_bias_ptr), [inptr0] "+r" (input), [outptr0] "+r" (output)
: [n_channels] "r" ((long long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x13", "x14", "memory"
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x13", "x14", "memory"
);
}
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
index a583615c99..d95332b828 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
@@ -51,7 +51,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"add x16, x15, #64\n"
"add x17, x15, %[input_col_stride1]\n"
"add x10, x9, %[input_row_stride]\n"
- "add x18, x17, #64\n"
+ "add x7, x17, #64\n"
"add x19, x17, %[input_col_stride1]\n"
"add x11, x10, %[input_row_stride]\n"
"add x20, x19, #64\n"
@@ -111,7 +111,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v23.4s, v28.4s, v12.4s\n"
"ldr q22, [x8, x15]\n"
"fmla v24.4s, v30.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x7]\n"
"fmla v17.4s, v30.4s, v11.4s\n"
"ldr q29, [%[inptr0], x17]\n"
"fmla v23.4s, v25.4s, v9.4s\n"
@@ -123,7 +123,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v23.4s, v26.4s, v11.4s\n"
"prfm pldl1keep, [x9, x16]\n"
"fmla v24.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x18]\n"
+ "prfm pldl1keep, [x8, x7]\n"
"fmla v17.4s, v26.4s, v8.4s\n"
"prfm pldl1keep, [%[inptr0], x20]\n"
"fmla v16.4s, v26.4s, v12.4s\n"
@@ -137,7 +137,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v23.4s, v15.4s, v6.4s\n"
"prfm pldl1keep, [x10, x16]\n"
"fmla v20.4s, v15.4s, v9.4s\n"
- "prfm pldl1keep, [x9, x18]\n"
+ "prfm pldl1keep, [x9, x7]\n"
"fmla v0.4s, v15.4s, v12.4s\n"
"ldr q21, [x8, x17]\n"
"fmla v17.4s, v18.4s, v5.4s\n"
@@ -149,7 +149,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v20.4s, v18.4s, v11.4s\n"
"prfm pldl1keep, [x11, x16]\n"
"fmla v16.4s, v18.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x18]\n"
+ "prfm pldl1keep, [x10, x7]\n"
"fmla v1.4s, v18.4s, v12.4s\n"
"ldr q27, [%[inptr0], x19]\n"
"fmla v17.4s, v22.4s, v7.4s\n"
@@ -159,7 +159,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v24.4s, v22.4s, v8.4s\n"
"prfm pldl1keep, [x12, x16]\n"
"fmla v16.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x18]\n"
+ "prfm pldl1keep, [x11, x7]\n"
"fmla v13.4s, v22.4s, v9.4s\n"
"prfm pldl1keep, [x10, x20]\n"
"fmla v2.4s, v22.4s, v12.4s\n"
@@ -167,7 +167,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v24.4s, v29.4s, v10.4s\n"
"prfm pldl1keep, [x9, x22]\n"
"fmla v13.4s, v29.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x18]\n"
+ "prfm pldl1keep, [x12, x7]\n"
"fmla v3.4s, v29.4s, v12.4s\n"
"ldr q22, [x11, %[input_col_stride1]]\n"
"fmla v20.4s, v25.4s, v6.4s\n"
@@ -384,7 +384,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v23.4s, v28.4s, v12.4s\n"
"ldr q22, [x8, x15]\n"
"fmla v24.4s, v30.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x7]\n"
"fmla v17.4s, v30.4s, v11.4s\n"
"ldr q29, [%[inptr0], x17]\n"
"fmla v23.4s, v25.4s, v9.4s\n"
@@ -396,7 +396,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v23.4s, v26.4s, v11.4s\n"
"prfm pldl1keep, [x9, x16]\n"
"fmla v24.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x18]\n"
+ "prfm pldl1keep, [x8, x7]\n"
"fmla v17.4s, v26.4s, v8.4s\n"
"prfm pldl1keep, [%[inptr0], x20]\n"
"fmla v16.4s, v26.4s, v12.4s\n"
@@ -410,7 +410,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v23.4s, v15.4s, v6.4s\n"
"prfm pldl1keep, [x10, x16]\n"
"fmla v20.4s, v15.4s, v9.4s\n"
- "prfm pldl1keep, [x9, x18]\n"
+ "prfm pldl1keep, [x9, x7]\n"
"fmla v0.4s, v15.4s, v12.4s\n"
"ldr q21, [x8, x17]\n"
"fmla v17.4s, v18.4s, v5.4s\n"
@@ -422,7 +422,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v20.4s, v18.4s, v11.4s\n"
"prfm pldl1keep, [x11, x16]\n"
"fmla v16.4s, v18.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x18]\n"
+ "prfm pldl1keep, [x10, x7]\n"
"fmla v1.4s, v18.4s, v12.4s\n"
"ldr q27, [%[inptr0], x19]\n"
"fmla v17.4s, v22.4s, v7.4s\n"
@@ -432,7 +432,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v24.4s, v22.4s, v8.4s\n"
"prfm pldl1keep, [x12, x16]\n"
"fmla v16.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x18]\n"
+ "prfm pldl1keep, [x11, x7]\n"
"fmla v13.4s, v22.4s, v9.4s\n"
"prfm pldl1keep, [x10, x20]\n"
"fmla v2.4s, v22.4s, v12.4s\n"
@@ -440,7 +440,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v24.4s, v29.4s, v10.4s\n"
"prfm pldl1keep, [x9, x22]\n"
"fmla v13.4s, v29.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x18]\n"
+ "prfm pldl1keep, [x12, x7]\n"
"fmla v3.4s, v29.4s, v12.4s\n"
"ldr q22, [x11, %[input_col_stride1]]\n"
"fmla v20.4s, v25.4s, v6.4s\n"
@@ -659,7 +659,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v23.4s, v28.4s, v12.4s\n"
"ldr s22, [x8, x15]\n"
"fmla v24.4s, v30.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x7]\n"
"fmla v17.4s, v30.4s, v11.4s\n"
"ldr s29, [%[inptr0], x17]\n"
"fmla v23.4s, v25.4s, v9.4s\n"
@@ -671,7 +671,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v23.4s, v26.4s, v11.4s\n"
"prfm pldl1keep, [x9, x16]\n"
"fmla v24.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x18]\n"
+ "prfm pldl1keep, [x8, x7]\n"
"fmla v17.4s, v26.4s, v8.4s\n"
"prfm pldl1keep, [%[inptr0], x20]\n"
"fmla v16.4s, v26.4s, v12.4s\n"
@@ -685,7 +685,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v23.4s, v15.4s, v6.4s\n"
"prfm pldl1keep, [x10, x16]\n"
"fmla v20.4s, v15.4s, v9.4s\n"
- "prfm pldl1keep, [x9, x18]\n"
+ "prfm pldl1keep, [x9, x7]\n"
"fmla v0.4s, v15.4s, v12.4s\n"
"ldr s21, [x8, x17]\n"
"fmla v17.4s, v18.4s, v5.4s\n"
@@ -697,7 +697,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v20.4s, v18.4s, v11.4s\n"
"prfm pldl1keep, [x11, x16]\n"
"fmla v16.4s, v18.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x18]\n"
+ "prfm pldl1keep, [x10, x7]\n"
"fmla v1.4s, v18.4s, v12.4s\n"
"ldr s27, [%[inptr0], x19]\n"
"fmla v17.4s, v22.4s, v7.4s\n"
@@ -707,7 +707,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v24.4s, v22.4s, v8.4s\n"
"prfm pldl1keep, [x12, x16]\n"
"fmla v16.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x18]\n"
+ "prfm pldl1keep, [x11, x7]\n"
"fmla v13.4s, v22.4s, v9.4s\n"
"prfm pldl1keep, [x10, x20]\n"
"fmla v2.4s, v22.4s, v12.4s\n"
@@ -715,7 +715,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v24.4s, v29.4s, v10.4s\n"
"prfm pldl1keep, [x9, x22]\n"
"fmla v13.4s, v29.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x18]\n"
+ "prfm pldl1keep, [x12, x7]\n"
"fmla v3.4s, v29.4s, v12.4s\n"
"ldr s22, [x11, %[input_col_stride1]]\n"
"fmla v20.4s, v25.4s, v6.4s\n"
@@ -932,7 +932,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v23.4s, v28.4s, v12.4s\n"
"ldr s22, [x8, x15]\n"
"fmla v24.4s, v30.4s, v12.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x7]\n"
"fmla v17.4s, v30.4s, v11.4s\n"
"ldr s29, [%[inptr0], x17]\n"
"fmla v23.4s, v25.4s, v9.4s\n"
@@ -944,7 +944,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v23.4s, v26.4s, v11.4s\n"
"prfm pldl1keep, [x9, x16]\n"
"fmla v24.4s, v26.4s, v9.4s\n"
- "prfm pldl1keep, [x8, x18]\n"
+ "prfm pldl1keep, [x8, x7]\n"
"fmla v17.4s, v26.4s, v8.4s\n"
"prfm pldl1keep, [%[inptr0], x20]\n"
"fmla v16.4s, v26.4s, v12.4s\n"
@@ -958,7 +958,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v23.4s, v15.4s, v6.4s\n"
"prfm pldl1keep, [x10, x16]\n"
"fmla v20.4s, v15.4s, v9.4s\n"
- "prfm pldl1keep, [x9, x18]\n"
+ "prfm pldl1keep, [x9, x7]\n"
"fmla v0.4s, v15.4s, v12.4s\n"
"ldr s21, [x8, x17]\n"
"fmla v17.4s, v18.4s, v5.4s\n"
@@ -970,7 +970,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v20.4s, v18.4s, v11.4s\n"
"prfm pldl1keep, [x11, x16]\n"
"fmla v16.4s, v18.4s, v9.4s\n"
- "prfm pldl1keep, [x10, x18]\n"
+ "prfm pldl1keep, [x10, x7]\n"
"fmla v1.4s, v18.4s, v12.4s\n"
"ldr s27, [%[inptr0], x19]\n"
"fmla v17.4s, v22.4s, v7.4s\n"
@@ -980,7 +980,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v24.4s, v22.4s, v8.4s\n"
"prfm pldl1keep, [x12, x16]\n"
"fmla v16.4s, v22.4s, v11.4s\n"
- "prfm pldl1keep, [x11, x18]\n"
+ "prfm pldl1keep, [x11, x7]\n"
"fmla v13.4s, v22.4s, v9.4s\n"
"prfm pldl1keep, [x10, x20]\n"
"fmla v2.4s, v22.4s, v12.4s\n"
@@ -988,7 +988,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v24.4s, v29.4s, v10.4s\n"
"prfm pldl1keep, [x9, x22]\n"
"fmla v13.4s, v29.4s, v11.4s\n"
- "prfm pldl1keep, [x12, x18]\n"
+ "prfm pldl1keep, [x12, x7]\n"
"fmla v3.4s, v29.4s, v12.4s\n"
"ldr s22, [x11, %[input_col_stride1]]\n"
"fmla v20.4s, v25.4s, v6.4s\n"
@@ -1163,7 +1163,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"7:\n"
: [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
: [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
);
}
@@ -1204,16 +1204,16 @@ void Conv::execute_tile<ActivationFunction::None>(
"mov v2.16b, v13.16b\n"
"ldr q4, [%[wbptr], #144]\n"
"ldr q29, [x17, x27]\n"
- "ldr x18, [%[inptrs], 48]\n"
+ "ldr x7, [%[inptrs], 48]\n"
"fmla v18.4s, v29.4s, v12.4s\n"
"ldr x17, [%[inptrs], 8]\n"
- "ldr q27, [x18, x27]\n"
+ "ldr q27, [x7, x27]\n"
"ldr x19, [%[inptrs], 96]\n"
"ldr q28, [x17, x27]\n"
- "ldr x18, [%[inptrs], 56]\n"
+ "ldr x7, [%[inptrs], 56]\n"
"ldr q25, [x19, x27]\n"
"ldr x17, [%[inptrs], 16]\n"
- "ldr q16, [x18, x27]\n"
+ "ldr q16, [x7, x27]\n"
"ldr x20, [%[inptrs], 144]\n"
"ldr q15, [x17, x27]\n"
"ldr x19, [%[inptrs], 104]\n"
@@ -1223,11 +1223,11 @@ void Conv::execute_tile<ActivationFunction::None>(
"beq 3f\n"
"2:\n"
"mov v3.16b, v13.16b\n"
- "ldr x18, [%[inptrs], 64]\n"
+ "ldr x7, [%[inptrs], 64]\n"
"fmla v18.4s, v27.4s, v9.4s\n"
"ldr x17, [%[inptrs], 24]\n"
"fmla v22.4s, v27.4s, v12.4s\n"
- "ldr q30, [x18, x27]\n"
+ "ldr q30, [x7, x27]\n"
"fmla v23.4s, v28.4s, v12.4s\n"
"ldr x21, [%[inptrs], 192]\n"
"fmla v19.4s, v25.4s, v12.4s\n"
@@ -1237,7 +1237,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v22.4s, v25.4s, v9.4s\n"
"ldr x19, [%[inptrs], 112]\n"
"fmla v23.4s, v16.4s, v9.4s\n"
- "ldr x18, [%[inptrs], 72]\n"
+ "ldr x7, [%[inptrs], 72]\n"
"fmla v17.4s, v16.4s, v12.4s\n"
"ldr x17, [%[inptrs], 32]\n"
"fmla v18.4s, v25.4s, v6.4s\n"
@@ -1257,11 +1257,11 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v0.4s, v21.4s, v12.4s\n"
"ldr q21, [x19, x27]\n"
"fmla v18.4s, v15.4s, v10.4s\n"
- "ldr q20, [x18, x27]\n"
+ "ldr q20, [x7, x27]\n"
"fmla v22.4s, v29.4s, v8.4s\n"
"ldr x19, [%[inptrs], 120]\n"
"fmla v23.4s, v29.4s, v6.4s\n"
- "ldr x18, [%[inptrs], 80]\n"
+ "ldr x7, [%[inptrs], 80]\n"
"fmla v19.4s, v29.4s, v11.4s\n"
"ldr x25, [%[outptrs], 64]\n"
"fmla v18.4s, v29.4s, v5.4s\n"
@@ -1321,13 +1321,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v1.4s, v30.4s, v6.4s\n"
"fmla v16.4s, v30.4s, v9.4s\n"
"fmla v3.4s, v26.4s, v11.4s\n"
- "ldr q29, [x18, x27]\n"
+ "ldr q29, [x7, x27]\n"
"fmla v15.4s, v21.4s, v12.4s\n"
"ldr q27, [x17, x27]\n"
"fmla v0.4s, v30.4s, v8.4s\n"
"ldr q28, [x22, x27]\n"
"fmla v22.4s, v24.4s, v4.4s\n"
- "ldr x18, [%[inptrs], 88]\n"
+ "ldr x7, [%[inptrs], 88]\n"
"fmla v19.4s, v24.4s, v7.4s\n"
"ldr x22, [%[inptrs], 256]\n"
"fmla v17.4s, v24.4s, v5.4s\n"
@@ -1365,13 +1365,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v0.4s, v28.4s, v5.4s\n"
"ldr x19, [%[inptrs], 136]\n"
"fmla v16.4s, v28.4s, v6.4s\n"
- "ldr q26, [x18, x27]\n"
+ "ldr q26, [x7, x27]\n"
"fmla v3.4s, v27.4s, v10.4s\n"
"ldr q23, [x22, x27]\n"
"fmla v19.4s, v22.4s, v4.4s\n"
"ldr x22, [%[inptrs], 264]\n"
"fmla v0.4s, v22.4s, v7.4s\n"
- "ldr x18, [%[inptrs], 48]\n"
+ "ldr x7, [%[inptrs], 48]\n"
"fmla v1.4s, v22.4s, v5.4s\n"
"fmla v16.4s, v22.4s, v8.4s\n"
"fmla v15.4s, v22.4s, v6.4s\n"
@@ -1435,7 +1435,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v2.4s, v28.4s, v4.4s\n"
"ldr q29, [x17, x27]\n"
"fmla v15.4s, v28.4s, v7.4s\n"
- "ldr q27, [x18, x27]\n"
+ "ldr q27, [x7, x27]\n"
"fmla v18.4s, v28.4s, v5.4s\n"
"ldr x25, [%[outptrs], 80]\n"
"fmla v21.4s, v28.4s, v10.4s\n"
@@ -1455,13 +1455,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"str q3, [x23, x28]\n"
"fmla v21.4s, v30.4s, v5.4s\n"
"fmla v20.4s, v30.4s, v6.4s\n"
- "ldr x18, [%[inptrs], 56]\n"
+ "ldr x7, [%[inptrs], 56]\n"
"fmla v15.4s, v19.4s, v4.4s\n"
"ldr x17, [%[inptrs], 16]\n"
"str q16, [x26, x28]\n"
"fmla v24.4s, v19.4s, v5.4s\n"
"fmla v21.4s, v19.4s, v7.4s\n"
- "ldr q16, [x18, x27]\n"
+ "ldr q16, [x7, x27]\n"
"fmla v20.4s, v19.4s, v8.4s\n"
"ldr q6, [%[wbptr], #112]\n"
"str q15, [x25, x28]\n"
@@ -1504,11 +1504,11 @@ void Conv::execute_tile<ActivationFunction::None>(
"bne 2b\n"
"3:\n"
"mov v3.16b, v13.16b\n"
- "ldr x18, [%[inptrs], 64]\n"
+ "ldr x7, [%[inptrs], 64]\n"
"fmla v18.4s, v27.4s, v9.4s\n"
"ldr x17, [%[inptrs], 24]\n"
"fmla v22.4s, v27.4s, v12.4s\n"
- "ldr q30, [x18, x27]\n"
+ "ldr q30, [x7, x27]\n"
"fmla v23.4s, v28.4s, v12.4s\n"
"ldr x21, [%[inptrs], 192]\n"
"fmla v19.4s, v25.4s, v12.4s\n"
@@ -1518,7 +1518,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v22.4s, v25.4s, v9.4s\n"
"ldr x19, [%[inptrs], 112]\n"
"fmla v23.4s, v16.4s, v9.4s\n"
- "ldr x18, [%[inptrs], 72]\n"
+ "ldr x7, [%[inptrs], 72]\n"
"fmla v17.4s, v16.4s, v12.4s\n"
"ldr x17, [%[inptrs], 32]\n"
"fmla v18.4s, v25.4s, v6.4s\n"
@@ -1538,11 +1538,11 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v0.4s, v21.4s, v12.4s\n"
"ldr q21, [x19, x27]\n"
"fmla v18.4s, v15.4s, v10.4s\n"
- "ldr q20, [x18, x27]\n"
+ "ldr q20, [x7, x27]\n"
"fmla v22.4s, v29.4s, v8.4s\n"
"ldr x19, [%[inptrs], 120]\n"
"fmla v23.4s, v29.4s, v6.4s\n"
- "ldr x18, [%[inptrs], 80]\n"
+ "ldr x7, [%[inptrs], 80]\n"
"fmla v19.4s, v29.4s, v11.4s\n"
"ldr x25, [%[outptrs], 64]\n"
"fmla v18.4s, v29.4s, v5.4s\n"
@@ -1598,13 +1598,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v18.4s, v20.4s, v12.4s\n"
"ldr q25, [x19, x27]\n"
"fmla v0.4s, v27.4s, v6.4s\n"
- "ldr q29, [x18, x27]\n"
+ "ldr q29, [x7, x27]\n"
"fmla v14.4s, v26.4s, v10.4s\n"
"ldr x19, [%[inptrs], 128]\n"
"fmla v3.4s, v26.4s, v11.4s\n"
"ldr q27, [x17, x27]\n"
"fmla v19.4s, v30.4s, v5.4s\n"
- "ldr x18, [%[inptrs], 88]\n"
+ "ldr x7, [%[inptrs], 88]\n"
"fmla v0.4s, v30.4s, v8.4s\n"
"fmla v1.4s, v30.4s, v6.4s\n"
"fmla v16.4s, v30.4s, v9.4s\n"
@@ -1646,7 +1646,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v18.4s, v29.4s, v11.4s\n"
"ldr q31, [x19, x27]\n"
"fmla v16.4s, v28.4s, v6.4s\n"
- "ldr q26, [x18, x27]\n"
+ "ldr q26, [x7, x27]\n"
"fmla v19.4s, v22.4s, v4.4s\n"
"ldr x19, [%[inptrs], 136]\n"
"fmla v3.4s, v27.4s, v10.4s\n"
@@ -1767,31 +1767,31 @@ void Conv::execute_tile<ActivationFunction::None>(
"mov v2.16b, v13.16b\n"
"ldr s4, [%[wbptr], #36]\n"
"ldr x17, [%[inptrs], 0]\n"
- "ldr x18, [%[inptrs], 48]\n"
+ "ldr x7, [%[inptrs], 48]\n"
"ldr x19, [%[inptrs], 96]\n"
"ldr x20, [%[inptrs], 144]\n"
"subs x15, x15, #1\n"
"ldr s29, [x17, x27]\n"
"fmla v18.4s, v29.4s, v12.4s\n"
- "ldr s27, [x18, x27]\n"
+ "ldr s27, [x7, x27]\n"
"ldr s25, [x19, x27]\n"
"ldr x17, [%[inptrs], 8]\n"
"ldr s21, [x20, x27]\n"
- "ldr x18, [%[inptrs], 56]\n"
+ "ldr x7, [%[inptrs], 56]\n"
"ldr s28, [x17, x27]\n"
"ldr x19, [%[inptrs], 104]\n"
- "ldr s16, [x18, x27]\n"
+ "ldr s16, [x7, x27]\n"
"ldr x17, [%[inptrs], 16]\n"
"ldr s29, [x19, x27]\n"
"ldr s15, [x17, x27]\n"
"beq 6f\n"
"5:\n"
"mov v3.16b, v13.16b\n"
- "ldr x18, [%[inptrs], 64]\n"
+ "ldr x7, [%[inptrs], 64]\n"
"fmla v18.4s, v27.4s, v9.4s\n"
"ldr x17, [%[inptrs], 24]\n"
"fmla v22.4s, v27.4s, v12.4s\n"
- "ldr s30, [x18, x27]\n"
+ "ldr s30, [x7, x27]\n"
"fmla v23.4s, v28.4s, v12.4s\n"
"ldr x21, [%[inptrs], 192]\n"
"fmla v19.4s, v25.4s, v12.4s\n"
@@ -1801,7 +1801,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v22.4s, v25.4s, v9.4s\n"
"ldr x19, [%[inptrs], 112]\n"
"fmla v23.4s, v16.4s, v9.4s\n"
- "ldr x18, [%[inptrs], 72]\n"
+ "ldr x7, [%[inptrs], 72]\n"
"fmla v17.4s, v16.4s, v12.4s\n"
"ldr x17, [%[inptrs], 32]\n"
"fmla v18.4s, v25.4s, v6.4s\n"
@@ -1821,11 +1821,11 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v0.4s, v21.4s, v12.4s\n"
"ldr s21, [x19, x27]\n"
"fmla v18.4s, v15.4s, v10.4s\n"
- "ldr s20, [x18, x27]\n"
+ "ldr s20, [x7, x27]\n"
"fmla v22.4s, v29.4s, v8.4s\n"
"ldr x19, [%[inptrs], 120]\n"
"fmla v23.4s, v29.4s, v6.4s\n"
- "ldr x18, [%[inptrs], 80]\n"
+ "ldr x7, [%[inptrs], 80]\n"
"fmla v19.4s, v29.4s, v11.4s\n"
"ldr x25, [%[outptrs], 64]\n"
"fmla v18.4s, v29.4s, v5.4s\n"
@@ -1885,13 +1885,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v1.4s, v30.4s, v6.4s\n"
"fmla v16.4s, v30.4s, v9.4s\n"
"fmla v3.4s, v26.4s, v11.4s\n"
- "ldr s29, [x18, x27]\n"
+ "ldr s29, [x7, x27]\n"
"fmla v15.4s, v21.4s, v12.4s\n"
"ldr s27, [x17, x27]\n"
"fmla v0.4s, v30.4s, v8.4s\n"
"ldr s28, [x22, x27]\n"
"fmla v22.4s, v24.4s, v4.4s\n"
- "ldr x18, [%[inptrs], 88]\n"
+ "ldr x7, [%[inptrs], 88]\n"
"fmla v19.4s, v24.4s, v7.4s\n"
"ldr x22, [%[inptrs], 256]\n"
"fmla v17.4s, v24.4s, v5.4s\n"
@@ -1929,13 +1929,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v0.4s, v28.4s, v5.4s\n"
"ldr x19, [%[inptrs], 136]\n"
"fmla v16.4s, v28.4s, v6.4s\n"
- "ldr s26, [x18, x27]\n"
+ "ldr s26, [x7, x27]\n"
"fmla v3.4s, v27.4s, v10.4s\n"
"ldr s23, [x22, x27]\n"
"fmla v19.4s, v22.4s, v4.4s\n"
"ldr x22, [%[inptrs], 264]\n"
"fmla v0.4s, v22.4s, v7.4s\n"
- "ldr x18, [%[inptrs], 48]\n"
+ "ldr x7, [%[inptrs], 48]\n"
"fmla v1.4s, v22.4s, v5.4s\n"
"fmla v16.4s, v22.4s, v8.4s\n"
"fmla v15.4s, v22.4s, v6.4s\n"
@@ -1999,7 +1999,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v2.4s, v28.4s, v4.4s\n"
"ldr s29, [x17, x27]\n"
"fmla v15.4s, v28.4s, v7.4s\n"
- "ldr s27, [x18, x27]\n"
+ "ldr s27, [x7, x27]\n"
"fmla v18.4s, v28.4s, v5.4s\n"
"ldr x25, [%[outptrs], 80]\n"
"fmla v21.4s, v28.4s, v10.4s\n"
@@ -2019,13 +2019,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"str s3, [x23, x28]\n"
"fmla v21.4s, v30.4s, v5.4s\n"
"fmla v20.4s, v30.4s, v6.4s\n"
- "ldr x18, [%[inptrs], 56]\n"
+ "ldr x7, [%[inptrs], 56]\n"
"fmla v15.4s, v19.4s, v4.4s\n"
"ldr x17, [%[inptrs], 16]\n"
"str s16, [x26, x28]\n"
"fmla v24.4s, v19.4s, v5.4s\n"
"fmla v21.4s, v19.4s, v7.4s\n"
- "ldr s16, [x18, x27]\n"
+ "ldr s16, [x7, x27]\n"
"fmla v20.4s, v19.4s, v8.4s\n"
"ldr s6, [%[wbptr], #28]\n"
"str s15, [x25, x28]\n"
@@ -2068,11 +2068,11 @@ void Conv::execute_tile<ActivationFunction::None>(
"bne 5b\n"
"6:\n"
"mov v3.16b, v13.16b\n"
- "ldr x18, [%[inptrs], 64]\n"
+ "ldr x7, [%[inptrs], 64]\n"
"fmla v18.4s, v27.4s, v9.4s\n"
"ldr x17, [%[inptrs], 24]\n"
"fmla v22.4s, v27.4s, v12.4s\n"
- "ldr s30, [x18, x27]\n"
+ "ldr s30, [x7, x27]\n"
"fmla v23.4s, v28.4s, v12.4s\n"
"ldr x21, [%[inptrs], 192]\n"
"fmla v19.4s, v25.4s, v12.4s\n"
@@ -2082,7 +2082,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v22.4s, v25.4s, v9.4s\n"
"ldr x19, [%[inptrs], 112]\n"
"fmla v23.4s, v16.4s, v9.4s\n"
- "ldr x18, [%[inptrs], 72]\n"
+ "ldr x7, [%[inptrs], 72]\n"
"fmla v17.4s, v16.4s, v12.4s\n"
"ldr x17, [%[inptrs], 32]\n"
"fmla v18.4s, v25.4s, v6.4s\n"
@@ -2102,11 +2102,11 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v0.4s, v21.4s, v12.4s\n"
"ldr s21, [x19, x27]\n"
"fmla v18.4s, v15.4s, v10.4s\n"
- "ldr s20, [x18, x27]\n"
+ "ldr s20, [x7, x27]\n"
"fmla v22.4s, v29.4s, v8.4s\n"
"ldr x19, [%[inptrs], 120]\n"
"fmla v23.4s, v29.4s, v6.4s\n"
- "ldr x18, [%[inptrs], 80]\n"
+ "ldr x7, [%[inptrs], 80]\n"
"fmla v19.4s, v29.4s, v11.4s\n"
"ldr x25, [%[outptrs], 64]\n"
"fmla v18.4s, v29.4s, v5.4s\n"
@@ -2162,13 +2162,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v18.4s, v20.4s, v12.4s\n"
"ldr s25, [x19, x27]\n"
"fmla v0.4s, v27.4s, v6.4s\n"
- "ldr s29, [x18, x27]\n"
+ "ldr s29, [x7, x27]\n"
"fmla v14.4s, v26.4s, v10.4s\n"
"ldr x19, [%[inptrs], 128]\n"
"fmla v3.4s, v26.4s, v11.4s\n"
"ldr s27, [x17, x27]\n"
"fmla v19.4s, v30.4s, v5.4s\n"
- "ldr x18, [%[inptrs], 88]\n"
+ "ldr x7, [%[inptrs], 88]\n"
"fmla v0.4s, v30.4s, v8.4s\n"
"fmla v1.4s, v30.4s, v6.4s\n"
"fmla v16.4s, v30.4s, v9.4s\n"
@@ -2210,7 +2210,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v18.4s, v29.4s, v11.4s\n"
"ldr s31, [x19, x27]\n"
"fmla v16.4s, v28.4s, v6.4s\n"
- "ldr s26, [x18, x27]\n"
+ "ldr s26, [x7, x27]\n"
"fmla v19.4s, v22.4s, v4.4s\n"
"ldr x19, [%[inptrs], 136]\n"
"fmla v3.4s, v27.4s, v10.4s\n"
@@ -2312,7 +2312,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"7:\n"
: [wbptr] "+r" (weight_bias_ptr)
: [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
);
}
@@ -2345,7 +2345,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"add x27, x10, %[input_row_stride]\n"
"add x15, x14, #64\n"
"add x17, x16, %[output_row_stride]\n"
- "add x18, x17, %[output_row_stride]\n"
+ "add x7, x17, %[output_row_stride]\n"
"add x19, %[output_col_stride1], %[output_col_stride1]\n"
"and x21, %[n_channels], #3\n"
"add x20, x19, %[output_col_stride1]\n"
@@ -2649,13 +2649,13 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmax v0.4s, v0.4s, v29.4s\n"
"str q25, [x17, x20]\n"
"fmax v18.4s, v18.4s, v29.4s\n"
- "str q0, [x18]\n"
+ "str q0, [x7]\n"
"fmax v23.4s, v23.4s, v29.4s\n"
- "str q18, [x18, %[output_col_stride1]]\n"
+ "str q18, [x7, %[output_col_stride1]]\n"
"fmax v24.4s, v24.4s, v29.4s\n"
- "str q23, [x18, x19]\n"
+ "str q23, [x7, x19]\n"
"mov v7.16b, v21.16b\n"
- "str q24, [x18, x20]\n"
+ "str q24, [x7, x20]\n"
"mov v3.16b, v21.16b\n"
"mov v6.16b, v21.16b\n"
"ldr q9, [%[wbptr], #128]\n"
@@ -2684,7 +2684,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v18.4s, v17.4s\n"
"add x17, x17, #16\n"
"fmla v15.4s, v18.4s, v20.4s\n"
- "add x18, x18, #16\n"
+ "add x7, x7, #16\n"
"fmla v7.4s, v23.4s, v14.4s\n"
"fmla v3.4s, v27.4s, v14.4s\n"
"fmla v7.4s, v18.4s, v10.4s\n"
@@ -2923,15 +2923,15 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmax v0.4s, v0.4s, v29.4s\n"
"str q25, [x17, x20]\n"
"fmax v18.4s, v18.4s, v29.4s\n"
- "str q0, [x18]\n"
+ "str q0, [x7]\n"
"fmax v23.4s, v23.4s, v29.4s\n"
- "str q18, [x18, %[output_col_stride1]]\n"
+ "str q18, [x7, %[output_col_stride1]]\n"
"fmax v24.4s, v24.4s, v29.4s\n"
- "str q23, [x18, x19]\n"
+ "str q23, [x7, x19]\n"
"add x16, x16, #16\n"
- "str q24, [x18, x20]\n"
+ "str q24, [x7, x20]\n"
"add x17, x17, #16\n"
- "add x18, x18, #16\n"
+ "add x7, x7, #16\n"
"4:\n"
"cbz x21, 7f\n"
"ldr s21, [%[wbptr]]\n"
@@ -3231,13 +3231,13 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmax v0.4s, v0.4s, v29.4s\n"
"str s25, [x17, x20]\n"
"fmax v18.4s, v18.4s, v29.4s\n"
- "str s0, [x18]\n"
+ "str s0, [x7]\n"
"fmax v23.4s, v23.4s, v29.4s\n"
- "str s18, [x18, %[output_col_stride1]]\n"
+ "str s18, [x7, %[output_col_stride1]]\n"
"fmax v24.4s, v24.4s, v29.4s\n"
- "str s23, [x18, x19]\n"
+ "str s23, [x7, x19]\n"
"mov v7.16b, v21.16b\n"
- "str s24, [x18, x20]\n"
+ "str s24, [x7, x20]\n"
"mov v3.16b, v21.16b\n"
"mov v6.16b, v21.16b\n"
"ldr s9, [%[wbptr], #32]\n"
@@ -3266,7 +3266,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v18.4s, v17.4s\n"
"add x17, x17, #4\n"
"fmla v15.4s, v18.4s, v20.4s\n"
- "add x18, x18, #4\n"
+ "add x7, x7, #4\n"
"fmla v7.4s, v23.4s, v14.4s\n"
"fmla v3.4s, v27.4s, v14.4s\n"
"fmla v7.4s, v18.4s, v10.4s\n"
@@ -3505,19 +3505,19 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmax v0.4s, v0.4s, v29.4s\n"
"str s25, [x17, x20]\n"
"fmax v18.4s, v18.4s, v29.4s\n"
- "str s0, [x18]\n"
+ "str s0, [x7]\n"
"fmax v23.4s, v23.4s, v29.4s\n"
- "str s18, [x18, %[output_col_stride1]]\n"
+ "str s18, [x7, %[output_col_stride1]]\n"
"fmax v24.4s, v24.4s, v29.4s\n"
- "str s23, [x18, x19]\n"
+ "str s23, [x7, x19]\n"
"add x16, x16, #4\n"
- "str s24, [x18, x20]\n"
+ "str s24, [x7, x20]\n"
"add x17, x17, #4\n"
- "add x18, x18, #4\n"
+ "add x7, x7, #4\n"
"7:\n"
: [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
: [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
);
}
@@ -3570,11 +3570,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v26.4s, v19.4s\n"
"ldr x25, [%[inptrs], 16]\n"
"ldr q29, [x17, x27]\n"
- "ldr x18, [%[inptrs], 144]\n"
+ "ldr x7, [%[inptrs], 144]\n"
"ldr x24, [%[inptrs], 104]\n"
"subs x26, x26, #1\n"
"ldr q30, [x25, x27]\n"
- "ldr q27, [x18, x27]\n"
+ "ldr q27, [x7, x27]\n"
"ldr q21, [x24, x27]\n"
"fmla v2.4s, v31.4s, v9.4s\n"
"beq 3f\n"
@@ -3588,7 +3588,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v28.4s, v5.4s\n"
"ldr x15, [%[inptrs], 192]\n"
"fmla v16.4s, v28.4s, v19.4s\n"
- "ldr x18, [%[inptrs], 152]\n"
+ "ldr x7, [%[inptrs], 152]\n"
"fmla v13.4s, v28.4s, v22.4s\n"
"ldr q26, [x25, x27]\n"
"fmla v18.4s, v29.4s, v19.4s\n"
@@ -3604,9 +3604,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v30.4s, v8.4s\n"
"ldr x15, [%[inptrs], 200]\n"
"fmla v17.4s, v30.4s, v22.4s\n"
- "ldr q29, [x18, x27]\n"
+ "ldr q29, [x7, x27]\n"
"fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x18, [%[inptrs], 160]\n"
+ "ldr x7, [%[inptrs], 160]\n"
"fmla v13.4s, v27.4s, v19.4s\n"
"ldr x20, [%[outptrs], 0]\n"
"fmla v14.4s, v27.4s, v22.4s\n"
@@ -3668,9 +3668,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"mov v20.16b, v25.16b\n"
"fmla v15.4s, v24.4s, v9.4s\n"
"fmla v21.4s, v24.4s, v22.4s\n"
- "ldr q27, [x18, x27]\n"
+ "ldr q27, [x7, x27]\n"
"fmla v1.4s, v23.4s, v9.4s\n"
- "ldr x18, [%[inptrs], 168]\n"
+ "ldr x7, [%[inptrs], 168]\n"
"fmla v17.4s, v23.4s, v8.4s\n"
"ldr q30, [x24, x27]\n"
"fmla v13.4s, v26.4s, v4.4s\n"
@@ -3712,11 +3712,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v10.4s, v26.4s, v5.4s\n"
"ldr q31, [x15, x27]\n"
"fmla v1.4s, v25.4s, v8.4s\n"
- "ldr q29, [x18, x27]\n"
+ "ldr q29, [x7, x27]\n"
"fmla v13.4s, v31.4s, v3.4s\n"
"ldr x15, [%[inptrs], 216]\n"
"fmla v14.4s, v31.4s, v6.4s\n"
- "ldr x18, [%[inptrs], 176]\n"
+ "ldr x7, [%[inptrs], 176]\n"
"fmla v12.4s, v31.4s, v4.4s\n"
"fmla v10.4s, v31.4s, v7.4s\n"
"fmla v11.4s, v31.4s, v5.4s\n"
@@ -3748,11 +3748,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v1.4s, v25.4s, v6.4s\n"
"fmla v10.4s, v29.4s, v4.4s\n"
"fmla v21.4s, v25.4s, v8.4s\n"
- "ldr q27, [x18, x27]\n"
+ "ldr q27, [x7, x27]\n"
"fmla v20.4s, v29.4s, v5.4s\n"
"ldr q26, [x24, x27]\n"
"fmla v12.4s, v22.4s, v3.4s\n"
- "ldr x18, [%[inptrs], 184]\n"
+ "ldr x7, [%[inptrs], 184]\n"
"fmla v10.4s, v22.4s, v6.4s\n"
"ldr x24, [%[inptrs], 96]\n"
"fmla v11.4s, v22.4s, v4.4s\n"
@@ -3770,7 +3770,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v24.4s, v27.4s, v7.4s\n"
"fmla v23.4s, v27.4s, v9.4s\n"
"fmla v1.4s, v26.4s, v3.4s\n"
- "ldr q22, [x18, x27]\n"
+ "ldr q22, [x7, x27]\n"
"fmla v21.4s, v26.4s, v6.4s\n"
"ldr q19, [x16, x27]\n"
"fmla v10.4s, v25.4s, v3.4s\n"
@@ -3778,7 +3778,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v24.4s, v26.4s, v8.4s\n"
"ldr q28, [x15, x27]\n"
"fmla v20.4s, v25.4s, v4.4s\n"
- "ldr x18, [%[inptrs], 144]\n"
+ "ldr x7, [%[inptrs], 144]\n"
"fmla v23.4s, v25.4s, v5.4s\n"
"ldr q30, [x16, x27]\n"
"fmla v11.4s, v31.4s, v3.4s\n"
@@ -3862,7 +3862,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"mov v15.16b, v25.16b\n"
"ldr x21, [%[outptrs], 56]\n"
"fmla v2.4s, v26.4s, v19.4s\n"
- "ldr q27, [x18, x27]\n"
+ "ldr q27, [x7, x27]\n"
"str q21, [x21, x28]\n"
"ldr x22, [%[outptrs], 80]\n"
"ldr q21, [x24, x27]\n"
@@ -3886,7 +3886,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v18.4s, v31.4s, v22.4s\n"
"ldr q23, [x17, x27]\n"
"fmla v2.4s, v28.4s, v5.4s\n"
- "ldr x18, [%[inptrs], 152]\n"
+ "ldr x7, [%[inptrs], 152]\n"
"fmla v16.4s, v28.4s, v19.4s\n"
"ldr x24, [%[inptrs], 112]\n"
"fmla v13.4s, v28.4s, v22.4s\n"
@@ -3904,9 +3904,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v30.4s, v8.4s\n"
"ldr x20, [%[outptrs], 0]\n"
"fmla v17.4s, v30.4s, v22.4s\n"
- "ldr q29, [x18, x27]\n"
+ "ldr q29, [x7, x27]\n"
"fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x18, [%[inptrs], 160]\n"
+ "ldr x7, [%[inptrs], 160]\n"
"fmla v13.4s, v27.4s, v19.4s\n"
"ldr x21, [%[outptrs], 32]\n"
"fmla v14.4s, v27.4s, v22.4s\n"
@@ -3964,11 +3964,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v17.4s, v24.4s, v7.4s\n"
"fmla v21.4s, v24.4s, v22.4s\n"
"fmla v15.4s, v24.4s, v9.4s\n"
- "ldr q27, [x18, x27]\n"
+ "ldr q27, [x7, x27]\n"
"fmla v14.4s, v30.4s, v5.4s\n"
"ldr q30, [x24, x27]\n"
"fmla v1.4s, v23.4s, v9.4s\n"
- "ldr x18, [%[inptrs], 168]\n"
+ "ldr x7, [%[inptrs], 168]\n"
"fmla v17.4s, v23.4s, v8.4s\n"
"ldr q31, [x17, x27]\n"
"fmla v13.4s, v26.4s, v4.4s\n"
@@ -4008,11 +4008,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v14.4s, v26.4s, v4.4s\n"
"ldr x15, [%[inptrs], 216]\n"
"fmla v10.4s, v26.4s, v5.4s\n"
- "ldr q29, [x18, x27]\n"
+ "ldr q29, [x7, x27]\n"
"fmla v1.4s, v25.4s, v8.4s\n"
"ldr q28, [x24, x27]\n"
"fmla v13.4s, v31.4s, v3.4s\n"
- "ldr x18, [%[inptrs], 176]\n"
+ "ldr x7, [%[inptrs], 176]\n"
"fmla v14.4s, v31.4s, v6.4s\n"
"ldr x24, [%[inptrs], 136]\n"
"fmla v12.4s, v31.4s, v4.4s\n"
@@ -4040,9 +4040,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v21.4s, v28.4s, v7.4s\n"
"fmla v24.4s, v28.4s, v9.4s\n"
"fmla v14.4s, v29.4s, v3.4s\n"
- "ldr q27, [x18, x27]\n"
+ "ldr q27, [x7, x27]\n"
"fmla v1.4s, v25.4s, v6.4s\n"
- "ldr x18, [%[inptrs], 184]\n"
+ "ldr x7, [%[inptrs], 184]\n"
"fmla v10.4s, v29.4s, v4.4s\n"
"fmla v20.4s, v29.4s, v5.4s\n"
"fmla v21.4s, v25.4s, v8.4s\n"
@@ -4058,7 +4058,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v15.4s, v27.4s, v3.4s\n"
"ldr q31, [x15, x27]\n"
"fmla v11.4s, v27.4s, v6.4s\n"
- "ldr q22, [x18, x27]\n"
+ "ldr q22, [x7, x27]\n"
"fmla v21.4s, v27.4s, v4.4s\n"
"ldr x15, [%[inptrs], 232]\n"
"fmla v20.4s, v27.4s, v8.4s\n"
@@ -4157,14 +4157,14 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"ldr x25, [%[inptrs], 0]\n"
"ldr x17, [%[inptrs], 48]\n"
"ldr x24, [%[inptrs], 96]\n"
- "ldr x18, [%[inptrs], 144]\n"
+ "ldr x7, [%[inptrs], 144]\n"
"subs x19, x19, #1\n"
"ldr s27, [x25, x27]\n"
"fmla v2.4s, v27.4s, v22.4s\n"
"ldr s26, [x17, x27]\n"
"fmla v16.4s, v26.4s, v22.4s\n"
"ldr s28, [x24, x27]\n"
- "ldr s27, [x18, x27]\n"
+ "ldr s27, [x7, x27]\n"
"ldr x25, [%[inptrs], 8]\n"
"ldr x17, [%[inptrs], 56]\n"
"ldr x24, [%[inptrs], 104]\n"
@@ -4186,7 +4186,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v28.4s, v5.4s\n"
"ldr x15, [%[inptrs], 192]\n"
"fmla v16.4s, v28.4s, v19.4s\n"
- "ldr x18, [%[inptrs], 152]\n"
+ "ldr x7, [%[inptrs], 152]\n"
"fmla v13.4s, v28.4s, v22.4s\n"
"ldr s26, [x25, x27]\n"
"fmla v18.4s, v29.4s, v19.4s\n"
@@ -4202,9 +4202,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v30.4s, v8.4s\n"
"ldr x15, [%[inptrs], 200]\n"
"fmla v17.4s, v30.4s, v22.4s\n"
- "ldr s29, [x18, x27]\n"
+ "ldr s29, [x7, x27]\n"
"fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x18, [%[inptrs], 160]\n"
+ "ldr x7, [%[inptrs], 160]\n"
"fmla v13.4s, v27.4s, v19.4s\n"
"ldr x20, [%[outptrs], 0]\n"
"fmla v14.4s, v27.4s, v22.4s\n"
@@ -4266,9 +4266,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"mov v20.16b, v25.16b\n"
"fmla v15.4s, v24.4s, v9.4s\n"
"fmla v21.4s, v24.4s, v22.4s\n"
- "ldr s27, [x18, x27]\n"
+ "ldr s27, [x7, x27]\n"
"fmla v1.4s, v23.4s, v9.4s\n"
- "ldr x18, [%[inptrs], 168]\n"
+ "ldr x7, [%[inptrs], 168]\n"
"fmla v17.4s, v23.4s, v8.4s\n"
"ldr s30, [x24, x27]\n"
"fmla v13.4s, v26.4s, v4.4s\n"
@@ -4310,11 +4310,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v10.4s, v26.4s, v5.4s\n"
"ldr s31, [x15, x27]\n"
"fmla v1.4s, v25.4s, v8.4s\n"
- "ldr s29, [x18, x27]\n"
+ "ldr s29, [x7, x27]\n"
"fmla v13.4s, v31.4s, v3.4s\n"
"ldr x15, [%[inptrs], 216]\n"
"fmla v14.4s, v31.4s, v6.4s\n"
- "ldr x18, [%[inptrs], 176]\n"
+ "ldr x7, [%[inptrs], 176]\n"
"fmla v12.4s, v31.4s, v4.4s\n"
"fmla v10.4s, v31.4s, v7.4s\n"
"fmla v11.4s, v31.4s, v5.4s\n"
@@ -4346,11 +4346,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v1.4s, v25.4s, v6.4s\n"
"fmla v10.4s, v29.4s, v4.4s\n"
"fmla v21.4s, v25.4s, v8.4s\n"
- "ldr s27, [x18, x27]\n"
+ "ldr s27, [x7, x27]\n"
"fmla v20.4s, v29.4s, v5.4s\n"
"ldr s26, [x24, x27]\n"
"fmla v12.4s, v22.4s, v3.4s\n"
- "ldr x18, [%[inptrs], 184]\n"
+ "ldr x7, [%[inptrs], 184]\n"
"fmla v10.4s, v22.4s, v6.4s\n"
"ldr x24, [%[inptrs], 96]\n"
"fmla v11.4s, v22.4s, v4.4s\n"
@@ -4368,7 +4368,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v24.4s, v27.4s, v7.4s\n"
"fmla v23.4s, v27.4s, v9.4s\n"
"fmla v1.4s, v26.4s, v3.4s\n"
- "ldr s22, [x18, x27]\n"
+ "ldr s22, [x7, x27]\n"
"fmla v21.4s, v26.4s, v6.4s\n"
"ldr s19, [x16, x27]\n"
"fmla v10.4s, v25.4s, v3.4s\n"
@@ -4376,7 +4376,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v24.4s, v26.4s, v8.4s\n"
"ldr s28, [x15, x27]\n"
"fmla v20.4s, v25.4s, v4.4s\n"
- "ldr x18, [%[inptrs], 144]\n"
+ "ldr x7, [%[inptrs], 144]\n"
"fmla v23.4s, v25.4s, v5.4s\n"
"ldr s30, [x16, x27]\n"
"fmla v11.4s, v31.4s, v3.4s\n"
@@ -4460,7 +4460,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"mov v15.16b, v25.16b\n"
"ldr x21, [%[outptrs], 56]\n"
"fmla v2.4s, v26.4s, v19.4s\n"
- "ldr s27, [x18, x27]\n"
+ "ldr s27, [x7, x27]\n"
"str s21, [x21, x28]\n"
"ldr x22, [%[outptrs], 80]\n"
"ldr s21, [x24, x27]\n"
@@ -4484,7 +4484,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v18.4s, v31.4s, v22.4s\n"
"ldr s23, [x17, x27]\n"
"fmla v2.4s, v28.4s, v5.4s\n"
- "ldr x18, [%[inptrs], 152]\n"
+ "ldr x7, [%[inptrs], 152]\n"
"fmla v16.4s, v28.4s, v19.4s\n"
"ldr x24, [%[inptrs], 112]\n"
"fmla v13.4s, v28.4s, v22.4s\n"
@@ -4502,9 +4502,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v30.4s, v8.4s\n"
"ldr x20, [%[outptrs], 0]\n"
"fmla v17.4s, v30.4s, v22.4s\n"
- "ldr s29, [x18, x27]\n"
+ "ldr s29, [x7, x27]\n"
"fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x18, [%[inptrs], 160]\n"
+ "ldr x7, [%[inptrs], 160]\n"
"fmla v13.4s, v27.4s, v19.4s\n"
"ldr x21, [%[outptrs], 32]\n"
"fmla v14.4s, v27.4s, v22.4s\n"
@@ -4562,11 +4562,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v17.4s, v24.4s, v7.4s\n"
"fmla v21.4s, v24.4s, v22.4s\n"
"fmla v15.4s, v24.4s, v9.4s\n"
- "ldr s27, [x18, x27]\n"
+ "ldr s27, [x7, x27]\n"
"fmla v14.4s, v30.4s, v5.4s\n"
"ldr s30, [x24, x27]\n"
"fmla v1.4s, v23.4s, v9.4s\n"
- "ldr x18, [%[inptrs], 168]\n"
+ "ldr x7, [%[inptrs], 168]\n"
"fmla v17.4s, v23.4s, v8.4s\n"
"ldr s31, [x17, x27]\n"
"fmla v13.4s, v26.4s, v4.4s\n"
@@ -4606,11 +4606,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v14.4s, v26.4s, v4.4s\n"
"ldr x15, [%[inptrs], 216]\n"
"fmla v10.4s, v26.4s, v5.4s\n"
- "ldr s29, [x18, x27]\n"
+ "ldr s29, [x7, x27]\n"
"fmla v1.4s, v25.4s, v8.4s\n"
"ldr s28, [x24, x27]\n"
"fmla v13.4s, v31.4s, v3.4s\n"
- "ldr x18, [%[inptrs], 176]\n"
+ "ldr x7, [%[inptrs], 176]\n"
"fmla v14.4s, v31.4s, v6.4s\n"
"ldr x24, [%[inptrs], 136]\n"
"fmla v12.4s, v31.4s, v4.4s\n"
@@ -4638,9 +4638,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v21.4s, v28.4s, v7.4s\n"
"fmla v24.4s, v28.4s, v9.4s\n"
"fmla v14.4s, v29.4s, v3.4s\n"
- "ldr s27, [x18, x27]\n"
+ "ldr s27, [x7, x27]\n"
"fmla v1.4s, v25.4s, v6.4s\n"
- "ldr x18, [%[inptrs], 184]\n"
+ "ldr x7, [%[inptrs], 184]\n"
"fmla v10.4s, v29.4s, v4.4s\n"
"fmla v20.4s, v29.4s, v5.4s\n"
"fmla v21.4s, v25.4s, v8.4s\n"
@@ -4656,7 +4656,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v15.4s, v27.4s, v3.4s\n"
"ldr s31, [x15, x27]\n"
"fmla v11.4s, v27.4s, v6.4s\n"
- "ldr s22, [x18, x27]\n"
+ "ldr s22, [x7, x27]\n"
"fmla v21.4s, v27.4s, v4.4s\n"
"ldr x15, [%[inptrs], 232]\n"
"fmla v20.4s, v27.4s, v8.4s\n"
@@ -4734,7 +4734,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"7:\n"
: [wbptr] "+r" (weight_bias_ptr)
: [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
);
}
@@ -4762,7 +4762,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"add x21, x19, #64\n"
"add x17, x19, %[input_col_stride1]\n"
"add x22, x20, %[input_row_stride]\n"
- "add x18, x17, #64\n"
+ "add x7, x17, #64\n"
"add x11, x17, %[input_col_stride1]\n"
"add x23, x22, %[input_row_stride]\n"
"add x12, x11, #64\n"
@@ -4844,7 +4844,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v14.4s, v27.4s, v20.4s\n"
"ldr q26, [x20, %[input_col_stride1]]\n"
"fmla v12.4s, v22.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x7]\n"
"fmla v8.4s, v22.4s, v4.4s\n"
"prfm pldl1keep, [x23, #64]\n"
"fmla v11.4s, v22.4s, v2.4s\n"
@@ -4856,7 +4856,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v15.4s, v22.4s, v20.4s\n"
"ldr q30, [x9, x13]\n"
"fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x24, x18]\n"
+ "prfm pldl1keep, [x24, x7]\n"
"fmla v8.4s, v21.4s, v5.4s\n"
"prfm pldl1keep, [%[inptr0], x12]\n"
"fmla v11.4s, v21.4s, v4.4s\n"
@@ -4868,7 +4868,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v17.4s, v21.4s, v20.4s\n"
"ldr q22, [x24, x19]\n"
"fmla v11.4s, v25.4s, v5.4s\n"
- "prfm pldl1keep, [x9, x18]\n"
+ "prfm pldl1keep, [x9, x7]\n"
"fmla v10.4s, v25.4s, v6.4s\n"
"prfm pldl1keep, [x24, x12]\n"
"fmla v9.4s, v25.4s, v20.4s\n"
@@ -4880,19 +4880,19 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v8.4s, v26.4s, v1.4s\n"
"prfm pldl1keep, [x22, x21]\n"
"fmla v16.4s, v26.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x18]\n"
+ "prfm pldl1keep, [x20, x7]\n"
"fmla v7.4s, v26.4s, v2.4s\n"
"prfm pldl1keep, [x9, x12]\n"
"fmla v14.4s, v26.4s, v6.4s\n"
"prfm pldl1keep, [x23, x21]\n"
"fmla v15.4s, v26.4s, v19.4s\n"
- "prfm pldl1keep, [x22, x18]\n"
+ "prfm pldl1keep, [x22, x7]\n"
"fmla v13.4s, v26.4s, v20.4s\n"
"ldr q26, [x22, %[input_col_stride1]]\n"
"fmla v12.4s, v30.4s, v0.4s\n"
"prfm pldl1keep, [x20, x12]\n"
"fmla v8.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x23, x18]\n"
+ "prfm pldl1keep, [x23, x7]\n"
"fmla v11.4s, v30.4s, v1.4s\n"
"prfm pldl1keep, [x22, x12]\n"
"fmla v16.4s, v30.4s, v5.4s\n"
@@ -5151,7 +5151,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v14.4s, v27.4s, v20.4s\n"
"ldr q26, [x20, %[input_col_stride1]]\n"
"fmla v12.4s, v22.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x7]\n"
"fmla v8.4s, v22.4s, v4.4s\n"
"prfm pldl1keep, [x23, #64]\n"
"fmla v11.4s, v22.4s, v2.4s\n"
@@ -5163,7 +5163,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v15.4s, v22.4s, v20.4s\n"
"ldr q30, [x9, x13]\n"
"fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x24, x18]\n"
+ "prfm pldl1keep, [x24, x7]\n"
"fmla v8.4s, v21.4s, v5.4s\n"
"prfm pldl1keep, [%[inptr0], x12]\n"
"fmla v11.4s, v21.4s, v4.4s\n"
@@ -5175,7 +5175,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v17.4s, v21.4s, v20.4s\n"
"ldr q22, [x24, x19]\n"
"fmla v11.4s, v25.4s, v5.4s\n"
- "prfm pldl1keep, [x9, x18]\n"
+ "prfm pldl1keep, [x9, x7]\n"
"fmla v10.4s, v25.4s, v6.4s\n"
"prfm pldl1keep, [x24, x12]\n"
"fmla v9.4s, v25.4s, v20.4s\n"
@@ -5187,19 +5187,19 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v8.4s, v26.4s, v1.4s\n"
"prfm pldl1keep, [x22, x21]\n"
"fmla v16.4s, v26.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x18]\n"
+ "prfm pldl1keep, [x20, x7]\n"
"fmla v7.4s, v26.4s, v2.4s\n"
"prfm pldl1keep, [x9, x12]\n"
"fmla v14.4s, v26.4s, v6.4s\n"
"prfm pldl1keep, [x23, x21]\n"
"fmla v15.4s, v26.4s, v19.4s\n"
- "prfm pldl1keep, [x22, x18]\n"
+ "prfm pldl1keep, [x22, x7]\n"
"fmla v13.4s, v26.4s, v20.4s\n"
"ldr q26, [x22, %[input_col_stride1]]\n"
"fmla v12.4s, v30.4s, v0.4s\n"
"prfm pldl1keep, [x20, x12]\n"
"fmla v8.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x23, x18]\n"
+ "prfm pldl1keep, [x23, x7]\n"
"fmla v11.4s, v30.4s, v1.4s\n"
"prfm pldl1keep, [x22, x12]\n"
"fmla v16.4s, v30.4s, v5.4s\n"
@@ -5460,7 +5460,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v14.4s, v27.4s, v20.4s\n"
"ldr s26, [x20, %[input_col_stride1]]\n"
"fmla v12.4s, v22.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x7]\n"
"fmla v8.4s, v22.4s, v4.4s\n"
"prfm pldl1keep, [x23, #64]\n"
"fmla v11.4s, v22.4s, v2.4s\n"
@@ -5472,7 +5472,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v15.4s, v22.4s, v20.4s\n"
"ldr s30, [x9, x13]\n"
"fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x24, x18]\n"
+ "prfm pldl1keep, [x24, x7]\n"
"fmla v8.4s, v21.4s, v5.4s\n"
"prfm pldl1keep, [%[inptr0], x12]\n"
"fmla v11.4s, v21.4s, v4.4s\n"
@@ -5484,7 +5484,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v17.4s, v21.4s, v20.4s\n"
"ldr s22, [x24, x19]\n"
"fmla v11.4s, v25.4s, v5.4s\n"
- "prfm pldl1keep, [x9, x18]\n"
+ "prfm pldl1keep, [x9, x7]\n"
"fmla v10.4s, v25.4s, v6.4s\n"
"prfm pldl1keep, [x24, x12]\n"
"fmla v9.4s, v25.4s, v20.4s\n"
@@ -5496,19 +5496,19 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v8.4s, v26.4s, v1.4s\n"
"prfm pldl1keep, [x22, x21]\n"
"fmla v16.4s, v26.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x18]\n"
+ "prfm pldl1keep, [x20, x7]\n"
"fmla v7.4s, v26.4s, v2.4s\n"
"prfm pldl1keep, [x9, x12]\n"
"fmla v14.4s, v26.4s, v6.4s\n"
"prfm pldl1keep, [x23, x21]\n"
"fmla v15.4s, v26.4s, v19.4s\n"
- "prfm pldl1keep, [x22, x18]\n"
+ "prfm pldl1keep, [x22, x7]\n"
"fmla v13.4s, v26.4s, v20.4s\n"
"ldr s26, [x22, %[input_col_stride1]]\n"
"fmla v12.4s, v30.4s, v0.4s\n"
"prfm pldl1keep, [x20, x12]\n"
"fmla v8.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x23, x18]\n"
+ "prfm pldl1keep, [x23, x7]\n"
"fmla v11.4s, v30.4s, v1.4s\n"
"prfm pldl1keep, [x22, x12]\n"
"fmla v16.4s, v30.4s, v5.4s\n"
@@ -5767,7 +5767,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v14.4s, v27.4s, v20.4s\n"
"ldr s26, [x20, %[input_col_stride1]]\n"
"fmla v12.4s, v22.4s, v1.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x7]\n"
"fmla v8.4s, v22.4s, v4.4s\n"
"prfm pldl1keep, [x23, #64]\n"
"fmla v11.4s, v22.4s, v2.4s\n"
@@ -5779,7 +5779,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v15.4s, v22.4s, v20.4s\n"
"ldr s30, [x9, x13]\n"
"fmla v12.4s, v21.4s, v3.4s\n"
- "prfm pldl1keep, [x24, x18]\n"
+ "prfm pldl1keep, [x24, x7]\n"
"fmla v8.4s, v21.4s, v5.4s\n"
"prfm pldl1keep, [%[inptr0], x12]\n"
"fmla v11.4s, v21.4s, v4.4s\n"
@@ -5791,7 +5791,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v17.4s, v21.4s, v20.4s\n"
"ldr s22, [x24, x19]\n"
"fmla v11.4s, v25.4s, v5.4s\n"
- "prfm pldl1keep, [x9, x18]\n"
+ "prfm pldl1keep, [x9, x7]\n"
"fmla v10.4s, v25.4s, v6.4s\n"
"prfm pldl1keep, [x24, x12]\n"
"fmla v9.4s, v25.4s, v20.4s\n"
@@ -5803,19 +5803,19 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v8.4s, v26.4s, v1.4s\n"
"prfm pldl1keep, [x22, x21]\n"
"fmla v16.4s, v26.4s, v4.4s\n"
- "prfm pldl1keep, [x20, x18]\n"
+ "prfm pldl1keep, [x20, x7]\n"
"fmla v7.4s, v26.4s, v2.4s\n"
"prfm pldl1keep, [x9, x12]\n"
"fmla v14.4s, v26.4s, v6.4s\n"
"prfm pldl1keep, [x23, x21]\n"
"fmla v15.4s, v26.4s, v19.4s\n"
- "prfm pldl1keep, [x22, x18]\n"
+ "prfm pldl1keep, [x22, x7]\n"
"fmla v13.4s, v26.4s, v20.4s\n"
"ldr s26, [x22, %[input_col_stride1]]\n"
"fmla v12.4s, v30.4s, v0.4s\n"
"prfm pldl1keep, [x20, x12]\n"
"fmla v8.4s, v30.4s, v3.4s\n"
- "prfm pldl1keep, [x23, x18]\n"
+ "prfm pldl1keep, [x23, x7]\n"
"fmla v11.4s, v30.4s, v1.4s\n"
"prfm pldl1keep, [x22, x12]\n"
"fmla v16.4s, v30.4s, v5.4s\n"
@@ -6007,7 +6007,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"7:\n"
: [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
: [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
);
}
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
index 908fc8292a..e4aad76d97 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
@@ -44,9 +44,9 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
__asm__ __volatile__(
"ldr q0, [%[pcoeffs]]\n"
"add x25, %[inptr0], %[input_row_stride]\n"
- "add x18, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x9, %[input_col_stride1], %[input_col_stride1]\n"
"add x16, x25, %[input_row_stride]\n"
- "add x19, x18, %[input_col_stride1]\n"
+ "add x19, x9, %[input_col_stride1]\n"
"add x26, x16, %[input_row_stride]\n"
"add x20, x19, %[input_col_stride1]\n"
"add x17, x26, %[input_row_stride]\n"
@@ -65,7 +65,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"blt 2f\n"
"1:\n"
"ldr q8, [%[inptr0], x20]\n"
- "ldr q2, [%[inptr0], x18]\n"
+ "ldr q2, [%[inptr0], x9]\n"
"mov v14.16b, v8.16b\n"
"ldr q9, [%[inptr0]]\n"
"mov v10.16b, v8.16b\n"
@@ -77,7 +77,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"fmls v10.4s, v12.4s, v0.s[2]\n"
"ldr q5, [x16, x20]\n"
"fmls v14.4s, v2.4s, v0.s[3]\n"
- "ldr q20, [x16, x18]\n"
+ "ldr q20, [x16, x9]\n"
"fmla v9.4s, v12.4s, v0.s[2]\n"
"ldr q3, [x16]\n"
"fmls v10.4s, v2.4s, v0.s[2]\n"
@@ -89,7 +89,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"fadd v10.4s, v10.4s, v4.4s\n"
"ldr q17, [x17, x20]\n"
"fmls v7.4s, v12.4s, v0.s[1]\n"
- "ldr q15, [x17, x18]\n"
+ "ldr q15, [x17, x9]\n"
"fsub v9.4s, v9.4s, v4.4s\n"
"ldr q19, [x17]\n"
"mov v8.16b, v8.16b\n"
@@ -180,7 +180,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"mov v25.16b, v19.16b\n"
"ldr q11, [x25, x20]\n"
"mov v10.16b, v11.16b\n"
- "ldr q23, [x25, x18]\n"
+ "ldr q23, [x25, x9]\n"
"mov v9.16b, v11.16b\n"
"ldr q7, [x25]\n"
"fmla v10.4s, v7.4s, v0.s[2]\n"
@@ -192,7 +192,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"fmls v10.4s, v23.4s, v0.s[3]\n"
"ldr q30, [x26, x20]\n"
"fmls v9.4s, v21.4s, v0.s[2]\n"
- "ldr q29, [x26, x18]\n"
+ "ldr q29, [x26, x9]\n"
"fmla v7.4s, v21.4s, v0.s[2]\n"
"ldr q22, [x26]\n"
"fmls v8.4s, v21.4s, v0.s[1]\n"
@@ -360,7 +360,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"add x14, x14, #16\n"
"ldr q2, [x27, x20]\n"
"mov v4.16b, v2.16b\n"
- "ldr q17, [x27, x18]\n"
+ "ldr q17, [x27, x9]\n"
"mov v12.16b, v2.16b\n"
"ldr q18, [x27]\n"
"fmla v4.4s, v18.4s, v0.s[2]\n"
@@ -420,7 +420,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"blt 3f\n"
"ldr d8, [%[inptr0], x20]\n"
"mov v14.16b, v8.16b\n"
- "ldr d2, [%[inptr0], x18]\n"
+ "ldr d2, [%[inptr0], x9]\n"
"mov v10.16b, v8.16b\n"
"ldr d9, [%[inptr0]]\n"
"fmla v14.4s, v9.4s, v0.s[2]\n"
@@ -432,7 +432,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"fmls v14.4s, v2.4s, v0.s[3]\n"
"ldr d5, [x16, x20]\n"
"fmls v10.4s, v12.4s, v0.s[2]\n"
- "ldr d20, [x16, x18]\n"
+ "ldr d20, [x16, x9]\n"
"fmla v9.4s, v12.4s, v0.s[2]\n"
"ldr d3, [x16]\n"
"fmls v7.4s, v12.4s, v0.s[1]\n"
@@ -444,7 +444,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"fsub v7.4s, v7.4s, v2.4s\n"
"ldr d17, [x17, x20]\n"
"fadd v10.4s, v10.4s, v4.4s\n"
- "ldr d15, [x17, x18]\n"
+ "ldr d15, [x17, x9]\n"
"fsub v9.4s, v9.4s, v4.4s\n"
"ldr d19, [x17]\n"
"fmla v7.4s, v4.4s, v0.s[1]\n"
@@ -534,7 +534,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"mov v25.16b, v19.16b\n"
"ldr d11, [x25, x20]\n"
"mov v10.16b, v11.16b\n"
- "ldr d23, [x25, x18]\n"
+ "ldr d23, [x25, x9]\n"
"mov v9.16b, v11.16b\n"
"ldr d7, [x25]\n"
"fmla v10.4s, v7.4s, v0.s[2]\n"
@@ -546,7 +546,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"fmls v10.4s, v23.4s, v0.s[3]\n"
"ldr d30, [x26, x20]\n"
"fmls v9.4s, v21.4s, v0.s[2]\n"
- "ldr d29, [x26, x18]\n"
+ "ldr d29, [x26, x9]\n"
"fmla v7.4s, v21.4s, v0.s[2]\n"
"ldr d22, [x26]\n"
"fmls v8.4s, v21.4s, v0.s[1]\n"
@@ -714,7 +714,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"add x14, x14, #8\n"
"ldr d2, [x27, x20]\n"
"mov v4.16b, v2.16b\n"
- "ldr d17, [x27, x18]\n"
+ "ldr d17, [x27, x9]\n"
"mov v12.16b, v2.16b\n"
"ldr d18, [x27]\n"
"fmla v4.4s, v18.4s, v0.s[2]\n"
@@ -771,7 +771,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"cbz %w[n_channels], 4f\n"
"ldr s8, [%[inptr0], x20]\n"
"mov v14.16b, v8.16b\n"
- "ldr s2, [%[inptr0], x18]\n"
+ "ldr s2, [%[inptr0], x9]\n"
"mov v10.16b, v8.16b\n"
"ldr s9, [%[inptr0]]\n"
"fmla v14.4s, v9.4s, v0.s[2]\n"
@@ -783,7 +783,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"fmls v14.4s, v2.4s, v0.s[3]\n"
"ldr s5, [x16, x20]\n"
"fmls v10.4s, v12.4s, v0.s[2]\n"
- "ldr s20, [x16, x18]\n"
+ "ldr s20, [x16, x9]\n"
"fmla v9.4s, v12.4s, v0.s[2]\n"
"ldr s3, [x16]\n"
"fmls v7.4s, v12.4s, v0.s[1]\n"
@@ -795,7 +795,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"fsub v7.4s, v7.4s, v2.4s\n"
"ldr s17, [x17, x20]\n"
"fadd v10.4s, v10.4s, v4.4s\n"
- "ldr s15, [x17, x18]\n"
+ "ldr s15, [x17, x9]\n"
"fsub v9.4s, v9.4s, v4.4s\n"
"ldr s19, [x17]\n"
"fmla v7.4s, v4.4s, v0.s[1]\n"
@@ -885,7 +885,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"mov v25.16b, v19.16b\n"
"ldr s11, [x25, x20]\n"
"mov v10.16b, v11.16b\n"
- "ldr s23, [x25, x18]\n"
+ "ldr s23, [x25, x9]\n"
"mov v9.16b, v11.16b\n"
"ldr s7, [x25]\n"
"fmla v10.4s, v7.4s, v0.s[2]\n"
@@ -897,7 +897,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"fmls v10.4s, v23.4s, v0.s[3]\n"
"ldr s30, [x26, x20]\n"
"fmls v9.4s, v21.4s, v0.s[2]\n"
- "ldr s29, [x26, x18]\n"
+ "ldr s29, [x26, x9]\n"
"fmla v7.4s, v21.4s, v0.s[2]\n"
"ldr s22, [x26]\n"
"fmls v8.4s, v21.4s, v0.s[1]\n"
@@ -1065,7 +1065,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
"add x14, x14, #4\n"
"ldr s2, [x27, x20]\n"
"mov v4.16b, v2.16b\n"
- "ldr s17, [x27, x18]\n"
+ "ldr s17, [x27, x9]\n"
"mov v12.16b, v2.16b\n"
"ldr s18, [x27]\n"
"fmla v4.4s, v18.4s, v0.s[2]\n"
@@ -1129,7 +1129,7 @@ void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile
: "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8",
- "v9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19",
+ "v9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x9", "x19",
"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
);
}