From ce3a7b27f80960e88415bb6cabbb75de2239cea8 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 10 Mar 2020 15:33:57 +0000 Subject: COMPMID-3259: Fix scalar register allocation The Aarch64 ABI reserves X18 for platform ABIs, replace all references to X18 with a different register which doesn't have a special purpose. Signed-off-by: Georgios Pinitas Change-Id: Ia9e059d44c5edda216bea169d0418bb7a8c4311b Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2863 Tested-by: Arm Jenkins Reviewed-by: Nikhil Raj Arm Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- .../depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp | 138 ++++++++++----------- 1 file changed, 69 insertions(+), 69 deletions(-) (limited to 'src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp') diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp index 2508ec7aeb..4661373e12 100644 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp @@ -54,7 +54,7 @@ void Conv::execute_tile( "add x16, x15, #64\n" "add x17, x15, %[input_col_stride1]\n" "add x23, x22, %[input_row_stride]\n" - "add x18, x17, #64\n" + "add x9, x17, #64\n" "add x25, x24, %[output_row_stride]\n" "add x26, %[output_col_stride1], %[output_col_stride1]\n" "and x27, %[n_channels], #3\n" @@ -133,7 +133,7 @@ void Conv::execute_tile( "fmla v13.4s, v19.4s, v4.4s\n" "prfm pldl1keep, [x20, x16]\n" "fmla v15.4s, v19.4s, v2.4s\n" - "prfm pldl1keep, [%[inptr0], x18]\n" + "prfm pldl1keep, [%[inptr0], x9]\n" "fmla v10.4s, v19.4s, v7.4s\n" "prfm pldl1keep, [x23, x19]\n" "fmla v12.4s, v19.4s, v5.4s\n" @@ -143,13 +143,13 @@ void Conv::execute_tile( "fmla v17.4s, v30.4s, v3.4s\n" "prfm pldl1keep, [x21, x16]\n" "fmla v13.4s, v30.4s, v6.4s\n" - "prfm pldl1keep, [x20, x18]\n" + "prfm pldl1keep, [x20, x9]\n" "fmla v15.4s, v30.4s, v4.4s\n" "prfm pldl1keep, [x23, x14]\n" "fmla v12.4s, v30.4s, v7.4s\n" "prfm pldl1keep, [x22, x16]\n" "fmla v14.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x21, x18]\n" + "prfm pldl1keep, [x21, x9]\n" "fmla v11.4s, v30.4s, v16.4s\n" "ldr q21, [x21, x15]\n" "fmla v15.4s, v29.4s, v6.4s\n" @@ -159,9 +159,9 @@ void Conv::execute_tile( "fmla v10.4s, v28.4s, v2.4s\n" "ldr q19, [x23, x13]\n" "fmla v13.4s, v24.4s, v1.4s\n" - "prfm pldl1keep, [x22, x18]\n" + "prfm pldl1keep, [x22, x9]\n" "fmla v12.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x18]\n" + "prfm pldl1keep, [x23, x9]\n" "fmla v10.4s, v24.4s, v4.4s\n" "add %[wbptr], %[wbptr], #160\n" "fmla v9.4s, v24.4s, v5.4s\n" @@ -305,7 +305,7 @@ void Conv::execute_tile( "fmla v13.4s, v19.4s, v4.4s\n" "prfm pldl1keep, [x20, x16]\n" "fmla v15.4s, v19.4s, v2.4s\n" - "prfm pldl1keep, [%[inptr0], x18]\n" + "prfm pldl1keep, [%[inptr0], x9]\n" "fmla v10.4s, v19.4s, v7.4s\n" "prfm pldl1keep, [x23, x19]\n" "fmla v12.4s, v19.4s, v5.4s\n" @@ -315,13 +315,13 @@ void Conv::execute_tile( "fmla v17.4s, v30.4s, v3.4s\n" "prfm pldl1keep, [x21, x16]\n" "fmla v13.4s, v30.4s, v6.4s\n" - "prfm pldl1keep, [x20, x18]\n" + "prfm pldl1keep, [x20, x9]\n" "fmla v15.4s, v30.4s, v4.4s\n" "prfm pldl1keep, [x23, x14]\n" "fmla v12.4s, v30.4s, v7.4s\n" "prfm pldl1keep, [x22, x16]\n" "fmla v14.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x21, x18]\n" + "prfm pldl1keep, [x21, x9]\n" "fmla v11.4s, v30.4s, v16.4s\n" "ldr q21, [x21, x15]\n" "fmla v15.4s, v29.4s, v6.4s\n" @@ -331,9 +331,9 @@ void Conv::execute_tile( "fmla v10.4s, v28.4s, v2.4s\n" "ldr q19, [x23, x13]\n" "fmla v13.4s, v24.4s, v1.4s\n" - "prfm pldl1keep, [x22, x18]\n" + "prfm pldl1keep, [x22, x9]\n" "fmla v12.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x18]\n" + "prfm pldl1keep, [x23, x9]\n" "fmla v10.4s, v24.4s, v4.4s\n" "add %[wbptr], %[wbptr], #160\n" "fmla v9.4s, v24.4s, v5.4s\n" @@ -479,7 +479,7 @@ void Conv::execute_tile( "fmla v13.4s, v19.4s, v4.4s\n" "prfm pldl1keep, [x20, x16]\n" "fmla v15.4s, v19.4s, v2.4s\n" - "prfm pldl1keep, [%[inptr0], x18]\n" + "prfm pldl1keep, [%[inptr0], x9]\n" "fmla v10.4s, v19.4s, v7.4s\n" "prfm pldl1keep, [x23, x19]\n" "fmla v12.4s, v19.4s, v5.4s\n" @@ -489,13 +489,13 @@ void Conv::execute_tile( "fmla v17.4s, v30.4s, v3.4s\n" "prfm pldl1keep, [x21, x16]\n" "fmla v13.4s, v30.4s, v6.4s\n" - "prfm pldl1keep, [x20, x18]\n" + "prfm pldl1keep, [x20, x9]\n" "fmla v15.4s, v30.4s, v4.4s\n" "prfm pldl1keep, [x23, x14]\n" "fmla v12.4s, v30.4s, v7.4s\n" "prfm pldl1keep, [x22, x16]\n" "fmla v14.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x21, x18]\n" + "prfm pldl1keep, [x21, x9]\n" "fmla v11.4s, v30.4s, v16.4s\n" "ldr s21, [x21, x15]\n" "fmla v15.4s, v29.4s, v6.4s\n" @@ -505,9 +505,9 @@ void Conv::execute_tile( "fmla v10.4s, v28.4s, v2.4s\n" "ldr s19, [x23, x13]\n" "fmla v13.4s, v24.4s, v1.4s\n" - "prfm pldl1keep, [x22, x18]\n" + "prfm pldl1keep, [x22, x9]\n" "fmla v12.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x18]\n" + "prfm pldl1keep, [x23, x9]\n" "fmla v10.4s, v24.4s, v4.4s\n" "add %[wbptr], %[wbptr], #40\n" "fmla v9.4s, v24.4s, v5.4s\n" @@ -651,7 +651,7 @@ void Conv::execute_tile( "fmla v13.4s, v19.4s, v4.4s\n" "prfm pldl1keep, [x20, x16]\n" "fmla v15.4s, v19.4s, v2.4s\n" - "prfm pldl1keep, [%[inptr0], x18]\n" + "prfm pldl1keep, [%[inptr0], x9]\n" "fmla v10.4s, v19.4s, v7.4s\n" "prfm pldl1keep, [x23, x19]\n" "fmla v12.4s, v19.4s, v5.4s\n" @@ -661,13 +661,13 @@ void Conv::execute_tile( "fmla v17.4s, v30.4s, v3.4s\n" "prfm pldl1keep, [x21, x16]\n" "fmla v13.4s, v30.4s, v6.4s\n" - "prfm pldl1keep, [x20, x18]\n" + "prfm pldl1keep, [x20, x9]\n" "fmla v15.4s, v30.4s, v4.4s\n" "prfm pldl1keep, [x23, x14]\n" "fmla v12.4s, v30.4s, v7.4s\n" "prfm pldl1keep, [x22, x16]\n" "fmla v14.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x21, x18]\n" + "prfm pldl1keep, [x21, x9]\n" "fmla v11.4s, v30.4s, v16.4s\n" "ldr s21, [x21, x15]\n" "fmla v15.4s, v29.4s, v6.4s\n" @@ -677,9 +677,9 @@ void Conv::execute_tile( "fmla v10.4s, v28.4s, v2.4s\n" "ldr s19, [x23, x13]\n" "fmla v13.4s, v24.4s, v1.4s\n" - "prfm pldl1keep, [x22, x18]\n" + "prfm pldl1keep, [x22, x9]\n" "fmla v12.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x18]\n" + "prfm pldl1keep, [x23, x9]\n" "fmla v10.4s, v24.4s, v4.4s\n" "add %[wbptr], %[wbptr], #40\n" "fmla v9.4s, v24.4s, v5.4s\n" @@ -754,7 +754,7 @@ void Conv::execute_tile( "7:\n" : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr) : [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" ); } @@ -780,9 +780,9 @@ void Conv::execute_tile( "add x26, x16, %[input_col_stride1]\n" "add x13, x22, %[input_row_stride]\n" "add x20, x26, #64\n" - "add x18, x26, %[input_col_stride1]\n" + "add x9, x26, %[input_col_stride1]\n" "add x24, x13, %[input_row_stride]\n" - "add x15, x18, #64\n" + "add x15, x9, #64\n" "add x14, x21, %[output_row_stride]\n" "add x19, %[output_col_stride1], %[output_col_stride1]\n" "and x27, %[n_channels], #3\n" @@ -854,7 +854,7 @@ void Conv::execute_tile( "fmla v3.4s, v31.4s, v0.4s\n" "prfm pldl1keep, [x25, x20]\n" "fmla v2.4s, v31.4s, v15.4s\n" - "ldr q20, [%[inptr0], x18]\n" + "ldr q20, [%[inptr0], x9]\n" "fmla v1.4s, v28.4s, v11.4s\n" "prfm pldl1keep, [%[inptr0], x15]\n" "fmla v7.4s, v28.4s, v16.4s\n" @@ -886,7 +886,7 @@ void Conv::execute_tile( "fmla v3.4s, v29.4s, v13.4s\n" "add %[wbptr], %[wbptr], #160\n" "fmla v2.4s, v29.4s, v0.4s\n" - "ldr q22, [x25, x18]\n" + "ldr q22, [x25, x9]\n" "fmla v7.4s, v30.4s, v11.4s\n" "ldr q21, [x24, x16]\n" "fmla v1.4s, v25.4s, v10.4s\n" @@ -912,7 +912,7 @@ void Conv::execute_tile( "fmla v6.4s, v26.4s, v0.4s\n" "fmla v8.4s, v26.4s, v16.4s\n" "fmla v5.4s, v26.4s, v15.4s\n" - "ldr q26, [x22, x18]\n" + "ldr q26, [x22, x9]\n" "fmla v3.4s, v23.4s, v17.4s\n" "ldr q18, [x24, x26]\n" "fmla v9.4s, v23.4s, v13.4s\n" @@ -920,11 +920,11 @@ void Conv::execute_tile( "fmla v2.4s, v23.4s, v12.4s\n" "prfm pldl1keep, [x22, #64]\n" "fmla v8.4s, v23.4s, v0.4s\n" - "ldr q23, [x13, x18]\n" + "ldr q23, [x13, x9]\n" "fmla v7.4s, v28.4s, v10.4s\n" "prfm pldl1keep, [x22, x17]\n" "fmla v2.4s, v20.4s, v13.4s\n" - "ldr q25, [x24, x18]\n" + "ldr q25, [x24, x9]\n" "fmla v6.4s, v28.4s, v11.4s\n" "ldr q20, [%[wbptr]]\n" "fmla v1.4s, v27.4s, v14.4s\n" @@ -1036,7 +1036,7 @@ void Conv::execute_tile( "fmla v3.4s, v31.4s, v0.4s\n" "prfm pldl1keep, [x25, x20]\n" "fmla v2.4s, v31.4s, v15.4s\n" - "ldr q20, [%[inptr0], x18]\n" + "ldr q20, [%[inptr0], x9]\n" "fmla v1.4s, v28.4s, v11.4s\n" "prfm pldl1keep, [%[inptr0], x15]\n" "fmla v7.4s, v28.4s, v16.4s\n" @@ -1068,7 +1068,7 @@ void Conv::execute_tile( "fmla v3.4s, v29.4s, v13.4s\n" "add %[wbptr], %[wbptr], #160\n" "fmla v2.4s, v29.4s, v0.4s\n" - "ldr q22, [x25, x18]\n" + "ldr q22, [x25, x9]\n" "fmla v7.4s, v30.4s, v11.4s\n" "ldr q21, [x24, x16]\n" "fmla v1.4s, v25.4s, v10.4s\n" @@ -1088,7 +1088,7 @@ void Conv::execute_tile( "fmla v6.4s, v26.4s, v0.4s\n" "fmla v8.4s, v26.4s, v16.4s\n" "fmla v5.4s, v26.4s, v15.4s\n" - "ldr q26, [x22, x18]\n" + "ldr q26, [x22, x9]\n" "fmla v3.4s, v23.4s, v17.4s\n" "ldr q18, [x24, x26]\n" "fmla v9.4s, v23.4s, v13.4s\n" @@ -1096,9 +1096,9 @@ void Conv::execute_tile( "fmla v2.4s, v23.4s, v12.4s\n" "fmla v8.4s, v23.4s, v0.4s\n" "fmla v7.4s, v28.4s, v10.4s\n" - "ldr q23, [x13, x18]\n" + "ldr q23, [x13, x9]\n" "fmla v6.4s, v28.4s, v11.4s\n" - "ldr q25, [x24, x18]\n" + "ldr q25, [x24, x9]\n" "fmla v2.4s, v20.4s, v13.4s\n" "add x13, x13, #16\n" "fmla v1.4s, v27.4s, v14.4s\n" @@ -1220,7 +1220,7 @@ void Conv::execute_tile( "fmla v3.4s, v31.4s, v0.4s\n" "prfm pldl1keep, [x25, x20]\n" "fmla v2.4s, v31.4s, v15.4s\n" - "ldr s20, [%[inptr0], x18]\n" + "ldr s20, [%[inptr0], x9]\n" "fmla v1.4s, v28.4s, v11.4s\n" "prfm pldl1keep, [%[inptr0], x15]\n" "fmla v7.4s, v28.4s, v16.4s\n" @@ -1252,7 +1252,7 @@ void Conv::execute_tile( "fmla v3.4s, v29.4s, v13.4s\n" "add %[wbptr], %[wbptr], #40\n" "fmla v2.4s, v29.4s, v0.4s\n" - "ldr s22, [x25, x18]\n" + "ldr s22, [x25, x9]\n" "fmla v7.4s, v30.4s, v11.4s\n" "ldr s21, [x24, x16]\n" "fmla v1.4s, v25.4s, v10.4s\n" @@ -1278,7 +1278,7 @@ void Conv::execute_tile( "fmla v6.4s, v26.4s, v0.4s\n" "fmla v8.4s, v26.4s, v16.4s\n" "fmla v5.4s, v26.4s, v15.4s\n" - "ldr s26, [x22, x18]\n" + "ldr s26, [x22, x9]\n" "fmla v3.4s, v23.4s, v17.4s\n" "ldr s18, [x24, x26]\n" "fmla v9.4s, v23.4s, v13.4s\n" @@ -1286,11 +1286,11 @@ void Conv::execute_tile( "fmla v2.4s, v23.4s, v12.4s\n" "prfm pldl1keep, [x22, #64]\n" "fmla v8.4s, v23.4s, v0.4s\n" - "ldr s23, [x13, x18]\n" + "ldr s23, [x13, x9]\n" "fmla v7.4s, v28.4s, v10.4s\n" "prfm pldl1keep, [x22, x17]\n" "fmla v2.4s, v20.4s, v13.4s\n" - "ldr s25, [x24, x18]\n" + "ldr s25, [x24, x9]\n" "fmla v6.4s, v28.4s, v11.4s\n" "ldr s20, [%[wbptr]]\n" "fmla v1.4s, v27.4s, v14.4s\n" @@ -1402,7 +1402,7 @@ void Conv::execute_tile( "fmla v3.4s, v31.4s, v0.4s\n" "prfm pldl1keep, [x25, x20]\n" "fmla v2.4s, v31.4s, v15.4s\n" - "ldr s20, [%[inptr0], x18]\n" + "ldr s20, [%[inptr0], x9]\n" "fmla v1.4s, v28.4s, v11.4s\n" "prfm pldl1keep, [%[inptr0], x15]\n" "fmla v7.4s, v28.4s, v16.4s\n" @@ -1434,7 +1434,7 @@ void Conv::execute_tile( "fmla v3.4s, v29.4s, v13.4s\n" "add %[wbptr], %[wbptr], #40\n" "fmla v2.4s, v29.4s, v0.4s\n" - "ldr s22, [x25, x18]\n" + "ldr s22, [x25, x9]\n" "fmla v7.4s, v30.4s, v11.4s\n" "ldr s21, [x24, x16]\n" "fmla v1.4s, v25.4s, v10.4s\n" @@ -1454,7 +1454,7 @@ void Conv::execute_tile( "fmla v6.4s, v26.4s, v0.4s\n" "fmla v8.4s, v26.4s, v16.4s\n" "fmla v5.4s, v26.4s, v15.4s\n" - "ldr s26, [x22, x18]\n" + "ldr s26, [x22, x9]\n" "fmla v3.4s, v23.4s, v17.4s\n" "ldr s18, [x24, x26]\n" "fmla v9.4s, v23.4s, v13.4s\n" @@ -1462,9 +1462,9 @@ void Conv::execute_tile( "fmla v2.4s, v23.4s, v12.4s\n" "fmla v8.4s, v23.4s, v0.4s\n" "fmla v7.4s, v28.4s, v10.4s\n" - "ldr s23, [x13, x18]\n" + "ldr s23, [x13, x9]\n" "fmla v6.4s, v28.4s, v11.4s\n" - "ldr s25, [x24, x18]\n" + "ldr s25, [x24, x9]\n" "fmla v2.4s, v20.4s, v13.4s\n" "add x13, x13, #4\n" "fmla v1.4s, v27.4s, v14.4s\n" @@ -1522,7 +1522,7 @@ void Conv::execute_tile( "7:\n" : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input) : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" ); } @@ -1541,11 +1541,11 @@ void Conv::execute_tile( { __asm __volatile( "add x17, %[inptr0], %[input_row_stride]\n" - "add x18, %[input_col_stride1], %[input_col_stride1]\n" + "add x9, %[input_col_stride1], %[input_col_stride1]\n" "add x25, %[outptr0], %[output_row_stride]\n" "add x14, x17, %[input_row_stride]\n" - "add x22, x18, #64\n" - "add x15, x18, %[input_col_stride1]\n" + "add x22, x9, #64\n" + "add x15, x9, %[input_col_stride1]\n" "add x21, x14, %[input_row_stride]\n" "add x16, x15, #64\n" "add x24, x15, %[input_col_stride1]\n" @@ -1583,7 +1583,7 @@ void Conv::execute_tile( "ldr q26, [%[inptr0], %[input_col_stride1]]\n" "ldr q20, [x14]\n" "ldr q22, [x17, %[input_col_stride1]]\n" - "ldr q28, [%[inptr0], x18]\n" + "ldr q28, [%[inptr0], x9]\n" "ldr q23, [x21]\n" "fmla v8.4s, v27.4s, v14.4s\n" "ldr q18, [x14, %[input_col_stride1]]\n" @@ -1598,7 +1598,7 @@ void Conv::execute_tile( "beq 3f\n" "2:\n" "fmla v5.4s, v27.4s, v17.4s\n" - "ldr q27, [x17, x18]\n" + "ldr q27, [x17, x9]\n" "fmla v8.4s, v26.4s, v16.4s\n" "ldr q30, [%[inptr0], x15]\n" "fmla v7.4s, v26.4s, v17.4s\n" @@ -1608,7 +1608,7 @@ void Conv::execute_tile( "fmla v8.4s, v20.4s, v11.4s\n" "prfm pldl1keep, [x17, x22]\n" "fmla v2.4s, v20.4s, v17.4s\n" - "ldr q29, [x14, x18]\n" + "ldr q29, [x14, x9]\n" "fmla v5.4s, v22.4s, v16.4s\n" "prfm pldl1keep, [%[inptr0], x16]\n" "fmla v8.4s, v22.4s, v13.4s\n" @@ -1638,7 +1638,7 @@ void Conv::execute_tile( "fmla v4.4s, v18.4s, v14.4s\n" "prfm pldl1keep, [x26, x22]\n" "fmla v1.4s, v18.4s, v17.4s\n" - "ldr q25, [x21, x18]\n" + "ldr q25, [x21, x9]\n" "fmla v8.4s, v27.4s, v12.4s\n" "prfm pldl1keep, [x21, x16]\n" "fmla v5.4s, v27.4s, v15.4s\n" @@ -1656,7 +1656,7 @@ void Conv::execute_tile( "fmla v6.4s, v30.4s, v16.4s\n" "ldr q26, [x17, x24]\n" "fmla v2.4s, v31.4s, v11.4s\n" - "ldr q20, [x26, x18]\n" + "ldr q20, [x26, x9]\n" "fmla v5.4s, v24.4s, v10.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v4.4s, v24.4s, v11.4s\n" @@ -1754,7 +1754,7 @@ void Conv::execute_tile( "fmin v5.4s, v5.4s, v29.4s\n" "ldr q12, [%[wbptr], #96]\n" "fmax v4.4s, v4.4s, v30.4s\n" - "ldr q28, [%[inptr0], x18]\n" + "ldr q28, [%[inptr0], x9]\n" "str q5, [x25]\n" "fmax v3.4s, v3.4s, v30.4s\n" "fmin v4.4s, v4.4s, v29.4s\n" @@ -1790,7 +1790,7 @@ void Conv::execute_tile( "bne 2b\n" "3:\n" "fmla v5.4s, v27.4s, v17.4s\n" - "ldr q27, [x17, x18]\n" + "ldr q27, [x17, x9]\n" "fmla v8.4s, v26.4s, v16.4s\n" "ldr q30, [%[inptr0], x15]\n" "fmla v7.4s, v26.4s, v17.4s\n" @@ -1800,7 +1800,7 @@ void Conv::execute_tile( "fmla v8.4s, v20.4s, v11.4s\n" "prfm pldl1keep, [x17, x22]\n" "fmla v2.4s, v20.4s, v17.4s\n" - "ldr q29, [x14, x18]\n" + "ldr q29, [x14, x9]\n" "fmla v5.4s, v22.4s, v16.4s\n" "prfm pldl1keep, [%[inptr0], x16]\n" "fmla v8.4s, v22.4s, v13.4s\n" @@ -1830,7 +1830,7 @@ void Conv::execute_tile( "fmla v4.4s, v18.4s, v14.4s\n" "prfm pldl1keep, [x26, x22]\n" "fmla v1.4s, v18.4s, v17.4s\n" - "ldr q25, [x21, x18]\n" + "ldr q25, [x21, x9]\n" "fmla v8.4s, v27.4s, v12.4s\n" "prfm pldl1keep, [x21, x16]\n" "fmla v5.4s, v27.4s, v15.4s\n" @@ -1848,7 +1848,7 @@ void Conv::execute_tile( "fmla v6.4s, v30.4s, v16.4s\n" "ldr q26, [x17, x24]\n" "fmla v2.4s, v31.4s, v11.4s\n" - "ldr q20, [x26, x18]\n" + "ldr q20, [x26, x9]\n" "fmla v5.4s, v24.4s, v10.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v4.4s, v24.4s, v11.4s\n" @@ -1969,7 +1969,7 @@ void Conv::execute_tile( "ldr s26, [%[inptr0], %[input_col_stride1]]\n" "ldr s20, [x14]\n" "ldr s22, [x17, %[input_col_stride1]]\n" - "ldr s28, [%[inptr0], x18]\n" + "ldr s28, [%[inptr0], x9]\n" "fmla v8.4s, v27.4s, v14.4s\n" "ldr s23, [x21]\n" "ldr s18, [x14, %[input_col_stride1]]\n" @@ -1984,7 +1984,7 @@ void Conv::execute_tile( "beq 6f\n" "5:\n" "fmla v5.4s, v27.4s, v17.4s\n" - "ldr s27, [x17, x18]\n" + "ldr s27, [x17, x9]\n" "fmla v8.4s, v26.4s, v16.4s\n" "ldr s30, [%[inptr0], x15]\n" "fmla v7.4s, v26.4s, v17.4s\n" @@ -1994,7 +1994,7 @@ void Conv::execute_tile( "fmla v8.4s, v20.4s, v11.4s\n" "prfm pldl1keep, [x17, x22]\n" "fmla v2.4s, v20.4s, v17.4s\n" - "ldr s29, [x14, x18]\n" + "ldr s29, [x14, x9]\n" "fmla v5.4s, v22.4s, v16.4s\n" "prfm pldl1keep, [%[inptr0], x16]\n" "fmla v8.4s, v22.4s, v13.4s\n" @@ -2024,7 +2024,7 @@ void Conv::execute_tile( "fmla v4.4s, v18.4s, v14.4s\n" "prfm pldl1keep, [x26, x22]\n" "fmla v1.4s, v18.4s, v17.4s\n" - "ldr s25, [x21, x18]\n" + "ldr s25, [x21, x9]\n" "fmla v8.4s, v27.4s, v12.4s\n" "prfm pldl1keep, [x21, x16]\n" "fmla v5.4s, v27.4s, v15.4s\n" @@ -2042,7 +2042,7 @@ void Conv::execute_tile( "fmla v6.4s, v30.4s, v16.4s\n" "ldr s26, [x17, x24]\n" "fmla v2.4s, v31.4s, v11.4s\n" - "ldr s20, [x26, x18]\n" + "ldr s20, [x26, x9]\n" "fmla v5.4s, v24.4s, v10.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v4.4s, v24.4s, v11.4s\n" @@ -2140,7 +2140,7 @@ void Conv::execute_tile( "fmin v5.4s, v5.4s, v29.4s\n" "ldr s12, [%[wbptr], #24]\n" "fmax v4.4s, v4.4s, v30.4s\n" - "ldr s28, [%[inptr0], x18]\n" + "ldr s28, [%[inptr0], x9]\n" "str s5, [x25]\n" "fmax v3.4s, v3.4s, v30.4s\n" "fmin v4.4s, v4.4s, v29.4s\n" @@ -2176,7 +2176,7 @@ void Conv::execute_tile( "bne 5b\n" "6:\n" "fmla v5.4s, v27.4s, v17.4s\n" - "ldr s27, [x17, x18]\n" + "ldr s27, [x17, x9]\n" "fmla v8.4s, v26.4s, v16.4s\n" "ldr s30, [%[inptr0], x15]\n" "fmla v7.4s, v26.4s, v17.4s\n" @@ -2186,7 +2186,7 @@ void Conv::execute_tile( "fmla v8.4s, v20.4s, v11.4s\n" "prfm pldl1keep, [x17, x22]\n" "fmla v2.4s, v20.4s, v17.4s\n" - "ldr s29, [x14, x18]\n" + "ldr s29, [x14, x9]\n" "fmla v5.4s, v22.4s, v16.4s\n" "prfm pldl1keep, [%[inptr0], x16]\n" "fmla v8.4s, v22.4s, v13.4s\n" @@ -2216,7 +2216,7 @@ void Conv::execute_tile( "fmla v4.4s, v18.4s, v14.4s\n" "prfm pldl1keep, [x26, x22]\n" "fmla v1.4s, v18.4s, v17.4s\n" - "ldr s25, [x21, x18]\n" + "ldr s25, [x21, x9]\n" "fmla v8.4s, v27.4s, v12.4s\n" "prfm pldl1keep, [x21, x16]\n" "fmla v5.4s, v27.4s, v15.4s\n" @@ -2234,7 +2234,7 @@ void Conv::execute_tile( "fmla v6.4s, v30.4s, v16.4s\n" "ldr s26, [x17, x24]\n" "fmla v2.4s, v31.4s, v11.4s\n" - "ldr s20, [x26, x18]\n" + "ldr s20, [x26, x9]\n" "fmla v5.4s, v24.4s, v10.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v4.4s, v24.4s, v11.4s\n" @@ -2330,7 +2330,7 @@ void Conv::execute_tile( "7:\n" : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr) : [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" ); } -- cgit v1.2.1