aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
diff options
context:
space:
mode:
author Georgios Pinitas <georgios.pinitas@arm.com> 2020-03-10 15:33:57 +0000
committer Georgios Pinitas <georgios.pinitas@arm.com> 2020-03-10 18:00:14 +0000
commit ce3a7b27f80960e88415bb6cabbb75de2239cea8 (patch)
tree e7d6021996a62632c08f6cce81f73467754530e1 /src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
parent a26e166829f4d4c48864b1b7243e4e267373d0fd (diff)
download ComputeLibrary-ce3a7b27f80960e88415bb6cabbb75de2239cea8.tar.gz
COMPMID-3259: Fix scalar register allocation
The AArch64 ABI reserves X18 for platform ABIs. Replace all references to X18 with a different register which doesn't have a special purpose. Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com> Change-Id: Ia9e059d44c5edda216bea169d0418bb7a8c4311b Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2863 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Nikhil Raj Arm <nikhil.raj@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp')
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp138
1 files changed, 69 insertions, 69 deletions
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
index 2508ec7aeb..4661373e12 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
@@ -54,7 +54,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"add x16, x15, #64\n"
"add x17, x15, %[input_col_stride1]\n"
"add x23, x22, %[input_row_stride]\n"
- "add x18, x17, #64\n"
+ "add x9, x17, #64\n"
"add x25, x24, %[output_row_stride]\n"
"add x26, %[output_col_stride1], %[output_col_stride1]\n"
"and x27, %[n_channels], #3\n"
@@ -133,7 +133,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v13.4s, v19.4s, v4.4s\n"
"prfm pldl1keep, [x20, x16]\n"
"fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x9]\n"
"fmla v10.4s, v19.4s, v7.4s\n"
"prfm pldl1keep, [x23, x19]\n"
"fmla v12.4s, v19.4s, v5.4s\n"
@@ -143,13 +143,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v17.4s, v30.4s, v3.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x18]\n"
+ "prfm pldl1keep, [x20, x9]\n"
"fmla v15.4s, v30.4s, v4.4s\n"
"prfm pldl1keep, [x23, x14]\n"
"fmla v12.4s, v30.4s, v7.4s\n"
"prfm pldl1keep, [x22, x16]\n"
"fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x18]\n"
+ "prfm pldl1keep, [x21, x9]\n"
"fmla v11.4s, v30.4s, v16.4s\n"
"ldr q21, [x21, x15]\n"
"fmla v15.4s, v29.4s, v6.4s\n"
@@ -159,9 +159,9 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v10.4s, v28.4s, v2.4s\n"
"ldr q19, [x23, x13]\n"
"fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x18]\n"
+ "prfm pldl1keep, [x22, x9]\n"
"fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x18]\n"
+ "prfm pldl1keep, [x23, x9]\n"
"fmla v10.4s, v24.4s, v4.4s\n"
"add %[wbptr], %[wbptr], #160\n"
"fmla v9.4s, v24.4s, v5.4s\n"
@@ -305,7 +305,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v13.4s, v19.4s, v4.4s\n"
"prfm pldl1keep, [x20, x16]\n"
"fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x9]\n"
"fmla v10.4s, v19.4s, v7.4s\n"
"prfm pldl1keep, [x23, x19]\n"
"fmla v12.4s, v19.4s, v5.4s\n"
@@ -315,13 +315,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v17.4s, v30.4s, v3.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x18]\n"
+ "prfm pldl1keep, [x20, x9]\n"
"fmla v15.4s, v30.4s, v4.4s\n"
"prfm pldl1keep, [x23, x14]\n"
"fmla v12.4s, v30.4s, v7.4s\n"
"prfm pldl1keep, [x22, x16]\n"
"fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x18]\n"
+ "prfm pldl1keep, [x21, x9]\n"
"fmla v11.4s, v30.4s, v16.4s\n"
"ldr q21, [x21, x15]\n"
"fmla v15.4s, v29.4s, v6.4s\n"
@@ -331,9 +331,9 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v10.4s, v28.4s, v2.4s\n"
"ldr q19, [x23, x13]\n"
"fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x18]\n"
+ "prfm pldl1keep, [x22, x9]\n"
"fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x18]\n"
+ "prfm pldl1keep, [x23, x9]\n"
"fmla v10.4s, v24.4s, v4.4s\n"
"add %[wbptr], %[wbptr], #160\n"
"fmla v9.4s, v24.4s, v5.4s\n"
@@ -479,7 +479,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v13.4s, v19.4s, v4.4s\n"
"prfm pldl1keep, [x20, x16]\n"
"fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x9]\n"
"fmla v10.4s, v19.4s, v7.4s\n"
"prfm pldl1keep, [x23, x19]\n"
"fmla v12.4s, v19.4s, v5.4s\n"
@@ -489,13 +489,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v17.4s, v30.4s, v3.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x18]\n"
+ "prfm pldl1keep, [x20, x9]\n"
"fmla v15.4s, v30.4s, v4.4s\n"
"prfm pldl1keep, [x23, x14]\n"
"fmla v12.4s, v30.4s, v7.4s\n"
"prfm pldl1keep, [x22, x16]\n"
"fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x18]\n"
+ "prfm pldl1keep, [x21, x9]\n"
"fmla v11.4s, v30.4s, v16.4s\n"
"ldr s21, [x21, x15]\n"
"fmla v15.4s, v29.4s, v6.4s\n"
@@ -505,9 +505,9 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v10.4s, v28.4s, v2.4s\n"
"ldr s19, [x23, x13]\n"
"fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x18]\n"
+ "prfm pldl1keep, [x22, x9]\n"
"fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x18]\n"
+ "prfm pldl1keep, [x23, x9]\n"
"fmla v10.4s, v24.4s, v4.4s\n"
"add %[wbptr], %[wbptr], #40\n"
"fmla v9.4s, v24.4s, v5.4s\n"
@@ -651,7 +651,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v13.4s, v19.4s, v4.4s\n"
"prfm pldl1keep, [x20, x16]\n"
"fmla v15.4s, v19.4s, v2.4s\n"
- "prfm pldl1keep, [%[inptr0], x18]\n"
+ "prfm pldl1keep, [%[inptr0], x9]\n"
"fmla v10.4s, v19.4s, v7.4s\n"
"prfm pldl1keep, [x23, x19]\n"
"fmla v12.4s, v19.4s, v5.4s\n"
@@ -661,13 +661,13 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v17.4s, v30.4s, v3.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v13.4s, v30.4s, v6.4s\n"
- "prfm pldl1keep, [x20, x18]\n"
+ "prfm pldl1keep, [x20, x9]\n"
"fmla v15.4s, v30.4s, v4.4s\n"
"prfm pldl1keep, [x23, x14]\n"
"fmla v12.4s, v30.4s, v7.4s\n"
"prfm pldl1keep, [x22, x16]\n"
"fmla v14.4s, v30.4s, v5.4s\n"
- "prfm pldl1keep, [x21, x18]\n"
+ "prfm pldl1keep, [x21, x9]\n"
"fmla v11.4s, v30.4s, v16.4s\n"
"ldr s21, [x21, x15]\n"
"fmla v15.4s, v29.4s, v6.4s\n"
@@ -677,9 +677,9 @@ void Conv::execute_tile<ActivationFunction::None>(
"fmla v10.4s, v28.4s, v2.4s\n"
"ldr s19, [x23, x13]\n"
"fmla v13.4s, v24.4s, v1.4s\n"
- "prfm pldl1keep, [x22, x18]\n"
+ "prfm pldl1keep, [x22, x9]\n"
"fmla v12.4s, v24.4s, v2.4s\n"
- "prfm pldl1keep, [x23, x18]\n"
+ "prfm pldl1keep, [x23, x9]\n"
"fmla v10.4s, v24.4s, v4.4s\n"
"add %[wbptr], %[wbptr], #40\n"
"fmla v9.4s, v24.4s, v5.4s\n"
@@ -754,7 +754,7 @@ void Conv::execute_tile<ActivationFunction::None>(
"7:\n"
: [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
: [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
);
}
@@ -780,9 +780,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"add x26, x16, %[input_col_stride1]\n"
"add x13, x22, %[input_row_stride]\n"
"add x20, x26, #64\n"
- "add x18, x26, %[input_col_stride1]\n"
+ "add x9, x26, %[input_col_stride1]\n"
"add x24, x13, %[input_row_stride]\n"
- "add x15, x18, #64\n"
+ "add x15, x9, #64\n"
"add x14, x21, %[output_row_stride]\n"
"add x19, %[output_col_stride1], %[output_col_stride1]\n"
"and x27, %[n_channels], #3\n"
@@ -854,7 +854,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v31.4s, v0.4s\n"
"prfm pldl1keep, [x25, x20]\n"
"fmla v2.4s, v31.4s, v15.4s\n"
- "ldr q20, [%[inptr0], x18]\n"
+ "ldr q20, [%[inptr0], x9]\n"
"fmla v1.4s, v28.4s, v11.4s\n"
"prfm pldl1keep, [%[inptr0], x15]\n"
"fmla v7.4s, v28.4s, v16.4s\n"
@@ -886,7 +886,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v29.4s, v13.4s\n"
"add %[wbptr], %[wbptr], #160\n"
"fmla v2.4s, v29.4s, v0.4s\n"
- "ldr q22, [x25, x18]\n"
+ "ldr q22, [x25, x9]\n"
"fmla v7.4s, v30.4s, v11.4s\n"
"ldr q21, [x24, x16]\n"
"fmla v1.4s, v25.4s, v10.4s\n"
@@ -912,7 +912,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v6.4s, v26.4s, v0.4s\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"fmla v5.4s, v26.4s, v15.4s\n"
- "ldr q26, [x22, x18]\n"
+ "ldr q26, [x22, x9]\n"
"fmla v3.4s, v23.4s, v17.4s\n"
"ldr q18, [x24, x26]\n"
"fmla v9.4s, v23.4s, v13.4s\n"
@@ -920,11 +920,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v23.4s, v12.4s\n"
"prfm pldl1keep, [x22, #64]\n"
"fmla v8.4s, v23.4s, v0.4s\n"
- "ldr q23, [x13, x18]\n"
+ "ldr q23, [x13, x9]\n"
"fmla v7.4s, v28.4s, v10.4s\n"
"prfm pldl1keep, [x22, x17]\n"
"fmla v2.4s, v20.4s, v13.4s\n"
- "ldr q25, [x24, x18]\n"
+ "ldr q25, [x24, x9]\n"
"fmla v6.4s, v28.4s, v11.4s\n"
"ldr q20, [%[wbptr]]\n"
"fmla v1.4s, v27.4s, v14.4s\n"
@@ -1036,7 +1036,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v31.4s, v0.4s\n"
"prfm pldl1keep, [x25, x20]\n"
"fmla v2.4s, v31.4s, v15.4s\n"
- "ldr q20, [%[inptr0], x18]\n"
+ "ldr q20, [%[inptr0], x9]\n"
"fmla v1.4s, v28.4s, v11.4s\n"
"prfm pldl1keep, [%[inptr0], x15]\n"
"fmla v7.4s, v28.4s, v16.4s\n"
@@ -1068,7 +1068,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v29.4s, v13.4s\n"
"add %[wbptr], %[wbptr], #160\n"
"fmla v2.4s, v29.4s, v0.4s\n"
- "ldr q22, [x25, x18]\n"
+ "ldr q22, [x25, x9]\n"
"fmla v7.4s, v30.4s, v11.4s\n"
"ldr q21, [x24, x16]\n"
"fmla v1.4s, v25.4s, v10.4s\n"
@@ -1088,7 +1088,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v6.4s, v26.4s, v0.4s\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"fmla v5.4s, v26.4s, v15.4s\n"
- "ldr q26, [x22, x18]\n"
+ "ldr q26, [x22, x9]\n"
"fmla v3.4s, v23.4s, v17.4s\n"
"ldr q18, [x24, x26]\n"
"fmla v9.4s, v23.4s, v13.4s\n"
@@ -1096,9 +1096,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v23.4s, v12.4s\n"
"fmla v8.4s, v23.4s, v0.4s\n"
"fmla v7.4s, v28.4s, v10.4s\n"
- "ldr q23, [x13, x18]\n"
+ "ldr q23, [x13, x9]\n"
"fmla v6.4s, v28.4s, v11.4s\n"
- "ldr q25, [x24, x18]\n"
+ "ldr q25, [x24, x9]\n"
"fmla v2.4s, v20.4s, v13.4s\n"
"add x13, x13, #16\n"
"fmla v1.4s, v27.4s, v14.4s\n"
@@ -1220,7 +1220,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v31.4s, v0.4s\n"
"prfm pldl1keep, [x25, x20]\n"
"fmla v2.4s, v31.4s, v15.4s\n"
- "ldr s20, [%[inptr0], x18]\n"
+ "ldr s20, [%[inptr0], x9]\n"
"fmla v1.4s, v28.4s, v11.4s\n"
"prfm pldl1keep, [%[inptr0], x15]\n"
"fmla v7.4s, v28.4s, v16.4s\n"
@@ -1252,7 +1252,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v29.4s, v13.4s\n"
"add %[wbptr], %[wbptr], #40\n"
"fmla v2.4s, v29.4s, v0.4s\n"
- "ldr s22, [x25, x18]\n"
+ "ldr s22, [x25, x9]\n"
"fmla v7.4s, v30.4s, v11.4s\n"
"ldr s21, [x24, x16]\n"
"fmla v1.4s, v25.4s, v10.4s\n"
@@ -1278,7 +1278,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v6.4s, v26.4s, v0.4s\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"fmla v5.4s, v26.4s, v15.4s\n"
- "ldr s26, [x22, x18]\n"
+ "ldr s26, [x22, x9]\n"
"fmla v3.4s, v23.4s, v17.4s\n"
"ldr s18, [x24, x26]\n"
"fmla v9.4s, v23.4s, v13.4s\n"
@@ -1286,11 +1286,11 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v23.4s, v12.4s\n"
"prfm pldl1keep, [x22, #64]\n"
"fmla v8.4s, v23.4s, v0.4s\n"
- "ldr s23, [x13, x18]\n"
+ "ldr s23, [x13, x9]\n"
"fmla v7.4s, v28.4s, v10.4s\n"
"prfm pldl1keep, [x22, x17]\n"
"fmla v2.4s, v20.4s, v13.4s\n"
- "ldr s25, [x24, x18]\n"
+ "ldr s25, [x24, x9]\n"
"fmla v6.4s, v28.4s, v11.4s\n"
"ldr s20, [%[wbptr]]\n"
"fmla v1.4s, v27.4s, v14.4s\n"
@@ -1402,7 +1402,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v31.4s, v0.4s\n"
"prfm pldl1keep, [x25, x20]\n"
"fmla v2.4s, v31.4s, v15.4s\n"
- "ldr s20, [%[inptr0], x18]\n"
+ "ldr s20, [%[inptr0], x9]\n"
"fmla v1.4s, v28.4s, v11.4s\n"
"prfm pldl1keep, [%[inptr0], x15]\n"
"fmla v7.4s, v28.4s, v16.4s\n"
@@ -1434,7 +1434,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v3.4s, v29.4s, v13.4s\n"
"add %[wbptr], %[wbptr], #40\n"
"fmla v2.4s, v29.4s, v0.4s\n"
- "ldr s22, [x25, x18]\n"
+ "ldr s22, [x25, x9]\n"
"fmla v7.4s, v30.4s, v11.4s\n"
"ldr s21, [x24, x16]\n"
"fmla v1.4s, v25.4s, v10.4s\n"
@@ -1454,7 +1454,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v6.4s, v26.4s, v0.4s\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"fmla v5.4s, v26.4s, v15.4s\n"
- "ldr s26, [x22, x18]\n"
+ "ldr s26, [x22, x9]\n"
"fmla v3.4s, v23.4s, v17.4s\n"
"ldr s18, [x24, x26]\n"
"fmla v9.4s, v23.4s, v13.4s\n"
@@ -1462,9 +1462,9 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"fmla v2.4s, v23.4s, v12.4s\n"
"fmla v8.4s, v23.4s, v0.4s\n"
"fmla v7.4s, v28.4s, v10.4s\n"
- "ldr s23, [x13, x18]\n"
+ "ldr s23, [x13, x9]\n"
"fmla v6.4s, v28.4s, v11.4s\n"
- "ldr s25, [x24, x18]\n"
+ "ldr s25, [x24, x9]\n"
"fmla v2.4s, v20.4s, v13.4s\n"
"add x13, x13, #4\n"
"fmla v1.4s, v27.4s, v14.4s\n"
@@ -1522,7 +1522,7 @@ void Conv::execute_tile<ActivationFunction::ReLU>(
"7:\n"
: [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
: [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
);
}
@@ -1541,11 +1541,11 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
{
__asm __volatile(
"add x17, %[inptr0], %[input_row_stride]\n"
- "add x18, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x9, %[input_col_stride1], %[input_col_stride1]\n"
"add x25, %[outptr0], %[output_row_stride]\n"
"add x14, x17, %[input_row_stride]\n"
- "add x22, x18, #64\n"
- "add x15, x18, %[input_col_stride1]\n"
+ "add x22, x9, #64\n"
+ "add x15, x9, %[input_col_stride1]\n"
"add x21, x14, %[input_row_stride]\n"
"add x16, x15, #64\n"
"add x24, x15, %[input_col_stride1]\n"
@@ -1583,7 +1583,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"ldr q26, [%[inptr0], %[input_col_stride1]]\n"
"ldr q20, [x14]\n"
"ldr q22, [x17, %[input_col_stride1]]\n"
- "ldr q28, [%[inptr0], x18]\n"
+ "ldr q28, [%[inptr0], x9]\n"
"ldr q23, [x21]\n"
"fmla v8.4s, v27.4s, v14.4s\n"
"ldr q18, [x14, %[input_col_stride1]]\n"
@@ -1598,7 +1598,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"beq 3f\n"
"2:\n"
"fmla v5.4s, v27.4s, v17.4s\n"
- "ldr q27, [x17, x18]\n"
+ "ldr q27, [x17, x9]\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"ldr q30, [%[inptr0], x15]\n"
"fmla v7.4s, v26.4s, v17.4s\n"
@@ -1608,7 +1608,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v8.4s, v20.4s, v11.4s\n"
"prfm pldl1keep, [x17, x22]\n"
"fmla v2.4s, v20.4s, v17.4s\n"
- "ldr q29, [x14, x18]\n"
+ "ldr q29, [x14, x9]\n"
"fmla v5.4s, v22.4s, v16.4s\n"
"prfm pldl1keep, [%[inptr0], x16]\n"
"fmla v8.4s, v22.4s, v13.4s\n"
@@ -1638,7 +1638,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v4.4s, v18.4s, v14.4s\n"
"prfm pldl1keep, [x26, x22]\n"
"fmla v1.4s, v18.4s, v17.4s\n"
- "ldr q25, [x21, x18]\n"
+ "ldr q25, [x21, x9]\n"
"fmla v8.4s, v27.4s, v12.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v5.4s, v27.4s, v15.4s\n"
@@ -1656,7 +1656,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v6.4s, v30.4s, v16.4s\n"
"ldr q26, [x17, x24]\n"
"fmla v2.4s, v31.4s, v11.4s\n"
- "ldr q20, [x26, x18]\n"
+ "ldr q20, [x26, x9]\n"
"fmla v5.4s, v24.4s, v10.4s\n"
"prfm pldl1keep, [%[wbptr], #64]\n"
"fmla v4.4s, v24.4s, v11.4s\n"
@@ -1754,7 +1754,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmin v5.4s, v5.4s, v29.4s\n"
"ldr q12, [%[wbptr], #96]\n"
"fmax v4.4s, v4.4s, v30.4s\n"
- "ldr q28, [%[inptr0], x18]\n"
+ "ldr q28, [%[inptr0], x9]\n"
"str q5, [x25]\n"
"fmax v3.4s, v3.4s, v30.4s\n"
"fmin v4.4s, v4.4s, v29.4s\n"
@@ -1790,7 +1790,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"bne 2b\n"
"3:\n"
"fmla v5.4s, v27.4s, v17.4s\n"
- "ldr q27, [x17, x18]\n"
+ "ldr q27, [x17, x9]\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"ldr q30, [%[inptr0], x15]\n"
"fmla v7.4s, v26.4s, v17.4s\n"
@@ -1800,7 +1800,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v8.4s, v20.4s, v11.4s\n"
"prfm pldl1keep, [x17, x22]\n"
"fmla v2.4s, v20.4s, v17.4s\n"
- "ldr q29, [x14, x18]\n"
+ "ldr q29, [x14, x9]\n"
"fmla v5.4s, v22.4s, v16.4s\n"
"prfm pldl1keep, [%[inptr0], x16]\n"
"fmla v8.4s, v22.4s, v13.4s\n"
@@ -1830,7 +1830,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v4.4s, v18.4s, v14.4s\n"
"prfm pldl1keep, [x26, x22]\n"
"fmla v1.4s, v18.4s, v17.4s\n"
- "ldr q25, [x21, x18]\n"
+ "ldr q25, [x21, x9]\n"
"fmla v8.4s, v27.4s, v12.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v5.4s, v27.4s, v15.4s\n"
@@ -1848,7 +1848,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v6.4s, v30.4s, v16.4s\n"
"ldr q26, [x17, x24]\n"
"fmla v2.4s, v31.4s, v11.4s\n"
- "ldr q20, [x26, x18]\n"
+ "ldr q20, [x26, x9]\n"
"fmla v5.4s, v24.4s, v10.4s\n"
"prfm pldl1keep, [%[wbptr], #64]\n"
"fmla v4.4s, v24.4s, v11.4s\n"
@@ -1969,7 +1969,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"ldr s26, [%[inptr0], %[input_col_stride1]]\n"
"ldr s20, [x14]\n"
"ldr s22, [x17, %[input_col_stride1]]\n"
- "ldr s28, [%[inptr0], x18]\n"
+ "ldr s28, [%[inptr0], x9]\n"
"fmla v8.4s, v27.4s, v14.4s\n"
"ldr s23, [x21]\n"
"ldr s18, [x14, %[input_col_stride1]]\n"
@@ -1984,7 +1984,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"beq 6f\n"
"5:\n"
"fmla v5.4s, v27.4s, v17.4s\n"
- "ldr s27, [x17, x18]\n"
+ "ldr s27, [x17, x9]\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"ldr s30, [%[inptr0], x15]\n"
"fmla v7.4s, v26.4s, v17.4s\n"
@@ -1994,7 +1994,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v8.4s, v20.4s, v11.4s\n"
"prfm pldl1keep, [x17, x22]\n"
"fmla v2.4s, v20.4s, v17.4s\n"
- "ldr s29, [x14, x18]\n"
+ "ldr s29, [x14, x9]\n"
"fmla v5.4s, v22.4s, v16.4s\n"
"prfm pldl1keep, [%[inptr0], x16]\n"
"fmla v8.4s, v22.4s, v13.4s\n"
@@ -2024,7 +2024,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v4.4s, v18.4s, v14.4s\n"
"prfm pldl1keep, [x26, x22]\n"
"fmla v1.4s, v18.4s, v17.4s\n"
- "ldr s25, [x21, x18]\n"
+ "ldr s25, [x21, x9]\n"
"fmla v8.4s, v27.4s, v12.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v5.4s, v27.4s, v15.4s\n"
@@ -2042,7 +2042,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v6.4s, v30.4s, v16.4s\n"
"ldr s26, [x17, x24]\n"
"fmla v2.4s, v31.4s, v11.4s\n"
- "ldr s20, [x26, x18]\n"
+ "ldr s20, [x26, x9]\n"
"fmla v5.4s, v24.4s, v10.4s\n"
"prfm pldl1keep, [%[wbptr], #64]\n"
"fmla v4.4s, v24.4s, v11.4s\n"
@@ -2140,7 +2140,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmin v5.4s, v5.4s, v29.4s\n"
"ldr s12, [%[wbptr], #24]\n"
"fmax v4.4s, v4.4s, v30.4s\n"
- "ldr s28, [%[inptr0], x18]\n"
+ "ldr s28, [%[inptr0], x9]\n"
"str s5, [x25]\n"
"fmax v3.4s, v3.4s, v30.4s\n"
"fmin v4.4s, v4.4s, v29.4s\n"
@@ -2176,7 +2176,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"bne 5b\n"
"6:\n"
"fmla v5.4s, v27.4s, v17.4s\n"
- "ldr s27, [x17, x18]\n"
+ "ldr s27, [x17, x9]\n"
"fmla v8.4s, v26.4s, v16.4s\n"
"ldr s30, [%[inptr0], x15]\n"
"fmla v7.4s, v26.4s, v17.4s\n"
@@ -2186,7 +2186,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v8.4s, v20.4s, v11.4s\n"
"prfm pldl1keep, [x17, x22]\n"
"fmla v2.4s, v20.4s, v17.4s\n"
- "ldr s29, [x14, x18]\n"
+ "ldr s29, [x14, x9]\n"
"fmla v5.4s, v22.4s, v16.4s\n"
"prfm pldl1keep, [%[inptr0], x16]\n"
"fmla v8.4s, v22.4s, v13.4s\n"
@@ -2216,7 +2216,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v4.4s, v18.4s, v14.4s\n"
"prfm pldl1keep, [x26, x22]\n"
"fmla v1.4s, v18.4s, v17.4s\n"
- "ldr s25, [x21, x18]\n"
+ "ldr s25, [x21, x9]\n"
"fmla v8.4s, v27.4s, v12.4s\n"
"prfm pldl1keep, [x21, x16]\n"
"fmla v5.4s, v27.4s, v15.4s\n"
@@ -2234,7 +2234,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"fmla v6.4s, v30.4s, v16.4s\n"
"ldr s26, [x17, x24]\n"
"fmla v2.4s, v31.4s, v11.4s\n"
- "ldr s20, [x26, x18]\n"
+ "ldr s20, [x26, x9]\n"
"fmla v5.4s, v24.4s, v10.4s\n"
"prfm pldl1keep, [%[wbptr], #64]\n"
"fmla v4.4s, v24.4s, v11.4s\n"
@@ -2330,7 +2330,7 @@ void Conv::execute_tile<ActivationFunction::ReLU6>(
"7:\n"
: [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
: [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
);
}