From ce3a7b27f80960e88415bb6cabbb75de2239cea8 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 10 Mar 2020 15:33:57 +0000 Subject: COMPMID-3259: Fix scalar register allocation The Aarch64 ABI reserves X18 for platform ABIs, replace all references to X18 with a different register which doesn't have a special purpose. Signed-off-by: Georgios Pinitas Change-Id: Ia9e059d44c5edda216bea169d0418bb7a8c4311b Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2863 Tested-by: Arm Jenkins Reviewed-by: Nikhil Raj Arm Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- .../depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp | 344 ++++++++++----------- 1 file changed, 172 insertions(+), 172 deletions(-) (limited to 'src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp') diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp index a583615c99..d95332b828 100644 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp @@ -51,7 +51,7 @@ void Conv::execute_tile( "add x16, x15, #64\n" "add x17, x15, %[input_col_stride1]\n" "add x10, x9, %[input_row_stride]\n" - "add x18, x17, #64\n" + "add x7, x17, #64\n" "add x19, x17, %[input_col_stride1]\n" "add x11, x10, %[input_row_stride]\n" "add x20, x19, #64\n" @@ -111,7 +111,7 @@ void Conv::execute_tile( "fmla v23.4s, v28.4s, v12.4s\n" "ldr q22, [x8, x15]\n" "fmla v24.4s, v30.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x18]\n" + "prfm pldl1keep, [%[inptr0], x7]\n" "fmla v17.4s, v30.4s, v11.4s\n" "ldr q29, [%[inptr0], x17]\n" "fmla v23.4s, v25.4s, v9.4s\n" @@ -123,7 +123,7 @@ void Conv::execute_tile( "fmla v23.4s, v26.4s, v11.4s\n" "prfm pldl1keep, [x9, x16]\n" "fmla v24.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [x8, x18]\n" + "prfm pldl1keep, [x8, x7]\n" "fmla v17.4s, v26.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x20]\n" "fmla v16.4s, v26.4s, v12.4s\n" @@ -137,7 +137,7 @@ void Conv::execute_tile( "fmla v23.4s, v15.4s, v6.4s\n" "prfm pldl1keep, [x10, x16]\n" "fmla v20.4s, v15.4s, v9.4s\n" - "prfm pldl1keep, [x9, x18]\n" + "prfm pldl1keep, [x9, x7]\n" "fmla v0.4s, v15.4s, v12.4s\n" "ldr q21, [x8, x17]\n" "fmla v17.4s, v18.4s, v5.4s\n" @@ -149,7 +149,7 @@ void Conv::execute_tile( "fmla v20.4s, v18.4s, v11.4s\n" "prfm pldl1keep, [x11, x16]\n" "fmla v16.4s, v18.4s, v9.4s\n" - "prfm pldl1keep, [x10, x18]\n" + "prfm pldl1keep, [x10, x7]\n" "fmla v1.4s, v18.4s, v12.4s\n" "ldr q27, [%[inptr0], x19]\n" "fmla v17.4s, v22.4s, v7.4s\n" @@ -159,7 +159,7 @@ void Conv::execute_tile( "fmla v24.4s, v22.4s, v8.4s\n" "prfm pldl1keep, [x12, x16]\n" "fmla v16.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x11, x18]\n" + "prfm pldl1keep, [x11, x7]\n" "fmla v13.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [x10, x20]\n" "fmla v2.4s, v22.4s, v12.4s\n" @@ -167,7 +167,7 @@ void Conv::execute_tile( "fmla v24.4s, v29.4s, v10.4s\n" "prfm pldl1keep, [x9, x22]\n" "fmla v13.4s, v29.4s, v11.4s\n" - "prfm pldl1keep, [x12, x18]\n" + "prfm pldl1keep, [x12, x7]\n" "fmla v3.4s, v29.4s, v12.4s\n" "ldr q22, [x11, %[input_col_stride1]]\n" "fmla v20.4s, v25.4s, v6.4s\n" @@ -384,7 +384,7 @@ void Conv::execute_tile( "fmla v23.4s, v28.4s, v12.4s\n" "ldr q22, [x8, x15]\n" "fmla v24.4s, v30.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x18]\n" + "prfm pldl1keep, [%[inptr0], x7]\n" "fmla v17.4s, v30.4s, v11.4s\n" "ldr q29, [%[inptr0], x17]\n" "fmla v23.4s, v25.4s, v9.4s\n" @@ -396,7 +396,7 @@ void Conv::execute_tile( "fmla v23.4s, v26.4s, v11.4s\n" "prfm pldl1keep, [x9, x16]\n" "fmla v24.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [x8, x18]\n" + "prfm pldl1keep, [x8, x7]\n" "fmla v17.4s, v26.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x20]\n" "fmla v16.4s, v26.4s, v12.4s\n" @@ -410,7 +410,7 @@ void Conv::execute_tile( "fmla v23.4s, v15.4s, v6.4s\n" "prfm pldl1keep, [x10, x16]\n" "fmla v20.4s, v15.4s, v9.4s\n" - "prfm pldl1keep, [x9, x18]\n" + "prfm pldl1keep, [x9, x7]\n" "fmla v0.4s, v15.4s, v12.4s\n" "ldr q21, [x8, x17]\n" "fmla v17.4s, v18.4s, v5.4s\n" @@ -422,7 +422,7 @@ void Conv::execute_tile( "fmla v20.4s, v18.4s, v11.4s\n" "prfm pldl1keep, [x11, x16]\n" "fmla v16.4s, v18.4s, v9.4s\n" - "prfm pldl1keep, [x10, x18]\n" + "prfm pldl1keep, [x10, x7]\n" "fmla v1.4s, v18.4s, v12.4s\n" "ldr q27, [%[inptr0], x19]\n" "fmla v17.4s, v22.4s, v7.4s\n" @@ -432,7 +432,7 @@ void Conv::execute_tile( "fmla v24.4s, v22.4s, v8.4s\n" "prfm pldl1keep, [x12, x16]\n" "fmla v16.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x11, x18]\n" + "prfm pldl1keep, [x11, x7]\n" "fmla v13.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [x10, x20]\n" "fmla v2.4s, v22.4s, v12.4s\n" @@ -440,7 +440,7 @@ void Conv::execute_tile( "fmla v24.4s, v29.4s, v10.4s\n" "prfm pldl1keep, [x9, x22]\n" "fmla v13.4s, v29.4s, v11.4s\n" - "prfm pldl1keep, [x12, x18]\n" + "prfm pldl1keep, [x12, x7]\n" "fmla v3.4s, v29.4s, v12.4s\n" "ldr q22, [x11, %[input_col_stride1]]\n" "fmla v20.4s, v25.4s, v6.4s\n" @@ -659,7 +659,7 @@ void Conv::execute_tile( "fmla v23.4s, v28.4s, v12.4s\n" "ldr s22, [x8, x15]\n" "fmla v24.4s, v30.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x18]\n" + "prfm pldl1keep, [%[inptr0], x7]\n" "fmla v17.4s, v30.4s, v11.4s\n" "ldr s29, [%[inptr0], x17]\n" "fmla v23.4s, v25.4s, v9.4s\n" @@ -671,7 +671,7 @@ void Conv::execute_tile( "fmla v23.4s, v26.4s, v11.4s\n" "prfm pldl1keep, [x9, x16]\n" "fmla v24.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [x8, x18]\n" + "prfm pldl1keep, [x8, x7]\n" "fmla v17.4s, v26.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x20]\n" "fmla v16.4s, v26.4s, v12.4s\n" @@ -685,7 +685,7 @@ void Conv::execute_tile( "fmla v23.4s, v15.4s, v6.4s\n" "prfm pldl1keep, [x10, x16]\n" "fmla v20.4s, v15.4s, v9.4s\n" - "prfm pldl1keep, [x9, x18]\n" + "prfm pldl1keep, [x9, x7]\n" "fmla v0.4s, v15.4s, v12.4s\n" "ldr s21, [x8, x17]\n" "fmla v17.4s, v18.4s, v5.4s\n" @@ -697,7 +697,7 @@ void Conv::execute_tile( "fmla v20.4s, v18.4s, v11.4s\n" "prfm pldl1keep, [x11, x16]\n" "fmla v16.4s, v18.4s, v9.4s\n" - "prfm pldl1keep, [x10, x18]\n" + "prfm pldl1keep, [x10, x7]\n" "fmla v1.4s, v18.4s, v12.4s\n" "ldr s27, [%[inptr0], x19]\n" "fmla v17.4s, v22.4s, v7.4s\n" @@ -707,7 +707,7 @@ void Conv::execute_tile( "fmla v24.4s, v22.4s, v8.4s\n" "prfm pldl1keep, [x12, x16]\n" "fmla v16.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x11, x18]\n" + "prfm pldl1keep, [x11, x7]\n" "fmla v13.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [x10, x20]\n" "fmla v2.4s, v22.4s, v12.4s\n" @@ -715,7 +715,7 @@ void Conv::execute_tile( "fmla v24.4s, v29.4s, v10.4s\n" "prfm pldl1keep, [x9, x22]\n" "fmla v13.4s, v29.4s, v11.4s\n" - "prfm pldl1keep, [x12, x18]\n" + "prfm pldl1keep, [x12, x7]\n" "fmla v3.4s, v29.4s, v12.4s\n" "ldr s22, [x11, %[input_col_stride1]]\n" "fmla v20.4s, v25.4s, v6.4s\n" @@ -932,7 +932,7 @@ void Conv::execute_tile( "fmla v23.4s, v28.4s, v12.4s\n" "ldr s22, [x8, x15]\n" "fmla v24.4s, v30.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x18]\n" + "prfm pldl1keep, [%[inptr0], x7]\n" "fmla v17.4s, v30.4s, v11.4s\n" "ldr s29, [%[inptr0], x17]\n" "fmla v23.4s, v25.4s, v9.4s\n" @@ -944,7 +944,7 @@ void Conv::execute_tile( "fmla v23.4s, v26.4s, v11.4s\n" "prfm pldl1keep, [x9, x16]\n" "fmla v24.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [x8, x18]\n" + "prfm pldl1keep, [x8, x7]\n" "fmla v17.4s, v26.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x20]\n" "fmla v16.4s, v26.4s, v12.4s\n" @@ -958,7 +958,7 @@ void Conv::execute_tile( "fmla v23.4s, v15.4s, v6.4s\n" "prfm pldl1keep, [x10, x16]\n" "fmla v20.4s, v15.4s, v9.4s\n" - "prfm pldl1keep, [x9, x18]\n" + "prfm pldl1keep, [x9, x7]\n" "fmla v0.4s, v15.4s, v12.4s\n" "ldr s21, [x8, x17]\n" "fmla v17.4s, v18.4s, v5.4s\n" @@ -970,7 +970,7 @@ void Conv::execute_tile( "fmla v20.4s, v18.4s, v11.4s\n" "prfm pldl1keep, [x11, x16]\n" "fmla v16.4s, v18.4s, v9.4s\n" - "prfm pldl1keep, [x10, x18]\n" + "prfm pldl1keep, [x10, x7]\n" "fmla v1.4s, v18.4s, v12.4s\n" "ldr s27, [%[inptr0], x19]\n" "fmla v17.4s, v22.4s, v7.4s\n" @@ -980,7 +980,7 @@ void Conv::execute_tile( "fmla v24.4s, v22.4s, v8.4s\n" "prfm pldl1keep, [x12, x16]\n" "fmla v16.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x11, x18]\n" + "prfm pldl1keep, [x11, x7]\n" "fmla v13.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [x10, x20]\n" "fmla v2.4s, v22.4s, v12.4s\n" @@ -988,7 +988,7 @@ void Conv::execute_tile( "fmla v24.4s, v29.4s, v10.4s\n" "prfm pldl1keep, [x9, x22]\n" "fmla v13.4s, v29.4s, v11.4s\n" - "prfm pldl1keep, [x12, x18]\n" + "prfm pldl1keep, [x12, x7]\n" "fmla v3.4s, v29.4s, v12.4s\n" "ldr s22, [x11, %[input_col_stride1]]\n" "fmla v20.4s, v25.4s, v6.4s\n" @@ -1163,7 +1163,7 @@ void Conv::execute_tile( "7:\n" : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input) : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" ); } @@ -1204,16 +1204,16 @@ void Conv::execute_tile( "mov v2.16b, v13.16b\n" "ldr q4, [%[wbptr], #144]\n" "ldr q29, [x17, x27]\n" - "ldr x18, [%[inptrs], 48]\n" + "ldr x7, [%[inptrs], 48]\n" "fmla v18.4s, v29.4s, v12.4s\n" "ldr x17, [%[inptrs], 8]\n" - "ldr q27, [x18, x27]\n" + "ldr q27, [x7, x27]\n" "ldr x19, [%[inptrs], 96]\n" "ldr q28, [x17, x27]\n" - "ldr x18, [%[inptrs], 56]\n" + "ldr x7, [%[inptrs], 56]\n" "ldr q25, [x19, x27]\n" "ldr x17, [%[inptrs], 16]\n" - "ldr q16, [x18, x27]\n" + "ldr q16, [x7, x27]\n" "ldr x20, [%[inptrs], 144]\n" "ldr q15, [x17, x27]\n" "ldr x19, [%[inptrs], 104]\n" @@ -1223,11 +1223,11 @@ void Conv::execute_tile( "beq 3f\n" "2:\n" "mov v3.16b, v13.16b\n" - "ldr x18, [%[inptrs], 64]\n" + "ldr x7, [%[inptrs], 64]\n" "fmla v18.4s, v27.4s, v9.4s\n" "ldr x17, [%[inptrs], 24]\n" "fmla v22.4s, v27.4s, v12.4s\n" - "ldr q30, [x18, x27]\n" + "ldr q30, [x7, x27]\n" "fmla v23.4s, v28.4s, v12.4s\n" "ldr x21, [%[inptrs], 192]\n" "fmla v19.4s, v25.4s, v12.4s\n" @@ -1237,7 +1237,7 @@ void Conv::execute_tile( "fmla v22.4s, v25.4s, v9.4s\n" "ldr x19, [%[inptrs], 112]\n" "fmla v23.4s, v16.4s, v9.4s\n" - "ldr x18, [%[inptrs], 72]\n" + "ldr x7, [%[inptrs], 72]\n" "fmla v17.4s, v16.4s, v12.4s\n" "ldr x17, [%[inptrs], 32]\n" "fmla v18.4s, v25.4s, v6.4s\n" @@ -1257,11 +1257,11 @@ void Conv::execute_tile( "fmla v0.4s, v21.4s, v12.4s\n" "ldr q21, [x19, x27]\n" "fmla v18.4s, v15.4s, v10.4s\n" - "ldr q20, [x18, x27]\n" + "ldr q20, [x7, x27]\n" "fmla v22.4s, v29.4s, v8.4s\n" "ldr x19, [%[inptrs], 120]\n" "fmla v23.4s, v29.4s, v6.4s\n" - "ldr x18, [%[inptrs], 80]\n" + "ldr x7, [%[inptrs], 80]\n" "fmla v19.4s, v29.4s, v11.4s\n" "ldr x25, [%[outptrs], 64]\n" "fmla v18.4s, v29.4s, v5.4s\n" @@ -1321,13 +1321,13 @@ void Conv::execute_tile( "fmla v1.4s, v30.4s, v6.4s\n" "fmla v16.4s, v30.4s, v9.4s\n" "fmla v3.4s, v26.4s, v11.4s\n" - "ldr q29, [x18, x27]\n" + "ldr q29, [x7, x27]\n" "fmla v15.4s, v21.4s, v12.4s\n" "ldr q27, [x17, x27]\n" "fmla v0.4s, v30.4s, v8.4s\n" "ldr q28, [x22, x27]\n" "fmla v22.4s, v24.4s, v4.4s\n" - "ldr x18, [%[inptrs], 88]\n" + "ldr x7, [%[inptrs], 88]\n" "fmla v19.4s, v24.4s, v7.4s\n" "ldr x22, [%[inptrs], 256]\n" "fmla v17.4s, v24.4s, v5.4s\n" @@ -1365,13 +1365,13 @@ void Conv::execute_tile( "fmla v0.4s, v28.4s, v5.4s\n" "ldr x19, [%[inptrs], 136]\n" "fmla v16.4s, v28.4s, v6.4s\n" - "ldr q26, [x18, x27]\n" + "ldr q26, [x7, x27]\n" "fmla v3.4s, v27.4s, v10.4s\n" "ldr q23, [x22, x27]\n" "fmla v19.4s, v22.4s, v4.4s\n" "ldr x22, [%[inptrs], 264]\n" "fmla v0.4s, v22.4s, v7.4s\n" - "ldr x18, [%[inptrs], 48]\n" + "ldr x7, [%[inptrs], 48]\n" "fmla v1.4s, v22.4s, v5.4s\n" "fmla v16.4s, v22.4s, v8.4s\n" "fmla v15.4s, v22.4s, v6.4s\n" @@ -1435,7 +1435,7 @@ void Conv::execute_tile( "fmla v2.4s, v28.4s, v4.4s\n" "ldr q29, [x17, x27]\n" "fmla v15.4s, v28.4s, v7.4s\n" - "ldr q27, [x18, x27]\n" + "ldr q27, [x7, x27]\n" "fmla v18.4s, v28.4s, v5.4s\n" "ldr x25, [%[outptrs], 80]\n" "fmla v21.4s, v28.4s, v10.4s\n" @@ -1455,13 +1455,13 @@ void Conv::execute_tile( "str q3, [x23, x28]\n" "fmla v21.4s, v30.4s, v5.4s\n" "fmla v20.4s, v30.4s, v6.4s\n" - "ldr x18, [%[inptrs], 56]\n" + "ldr x7, [%[inptrs], 56]\n" "fmla v15.4s, v19.4s, v4.4s\n" "ldr x17, [%[inptrs], 16]\n" "str q16, [x26, x28]\n" "fmla v24.4s, v19.4s, v5.4s\n" "fmla v21.4s, v19.4s, v7.4s\n" - "ldr q16, [x18, x27]\n" + "ldr q16, [x7, x27]\n" "fmla v20.4s, v19.4s, v8.4s\n" "ldr q6, [%[wbptr], #112]\n" "str q15, [x25, x28]\n" @@ -1504,11 +1504,11 @@ void Conv::execute_tile( "bne 2b\n" "3:\n" "mov v3.16b, v13.16b\n" - "ldr x18, [%[inptrs], 64]\n" + "ldr x7, [%[inptrs], 64]\n" "fmla v18.4s, v27.4s, v9.4s\n" "ldr x17, [%[inptrs], 24]\n" "fmla v22.4s, v27.4s, v12.4s\n" - "ldr q30, [x18, x27]\n" + "ldr q30, [x7, x27]\n" "fmla v23.4s, v28.4s, v12.4s\n" "ldr x21, [%[inptrs], 192]\n" "fmla v19.4s, v25.4s, v12.4s\n" @@ -1518,7 +1518,7 @@ void Conv::execute_tile( "fmla v22.4s, v25.4s, v9.4s\n" "ldr x19, [%[inptrs], 112]\n" "fmla v23.4s, v16.4s, v9.4s\n" - "ldr x18, [%[inptrs], 72]\n" + "ldr x7, [%[inptrs], 72]\n" "fmla v17.4s, v16.4s, v12.4s\n" "ldr x17, [%[inptrs], 32]\n" "fmla v18.4s, v25.4s, v6.4s\n" @@ -1538,11 +1538,11 @@ void Conv::execute_tile( "fmla v0.4s, v21.4s, v12.4s\n" "ldr q21, [x19, x27]\n" "fmla v18.4s, v15.4s, v10.4s\n" - "ldr q20, [x18, x27]\n" + "ldr q20, [x7, x27]\n" "fmla v22.4s, v29.4s, v8.4s\n" "ldr x19, [%[inptrs], 120]\n" "fmla v23.4s, v29.4s, v6.4s\n" - "ldr x18, [%[inptrs], 80]\n" + "ldr x7, [%[inptrs], 80]\n" "fmla v19.4s, v29.4s, v11.4s\n" "ldr x25, [%[outptrs], 64]\n" "fmla v18.4s, v29.4s, v5.4s\n" @@ -1598,13 +1598,13 @@ void Conv::execute_tile( "fmla v18.4s, v20.4s, v12.4s\n" "ldr q25, [x19, x27]\n" "fmla v0.4s, v27.4s, v6.4s\n" - "ldr q29, [x18, x27]\n" + "ldr q29, [x7, x27]\n" "fmla v14.4s, v26.4s, v10.4s\n" "ldr x19, [%[inptrs], 128]\n" "fmla v3.4s, v26.4s, v11.4s\n" "ldr q27, [x17, x27]\n" "fmla v19.4s, v30.4s, v5.4s\n" - "ldr x18, [%[inptrs], 88]\n" + "ldr x7, [%[inptrs], 88]\n" "fmla v0.4s, v30.4s, v8.4s\n" "fmla v1.4s, v30.4s, v6.4s\n" "fmla v16.4s, v30.4s, v9.4s\n" @@ -1646,7 +1646,7 @@ void Conv::execute_tile( "fmla v18.4s, v29.4s, v11.4s\n" "ldr q31, [x19, x27]\n" "fmla v16.4s, v28.4s, v6.4s\n" - "ldr q26, [x18, x27]\n" + "ldr q26, [x7, x27]\n" "fmla v19.4s, v22.4s, v4.4s\n" "ldr x19, [%[inptrs], 136]\n" "fmla v3.4s, v27.4s, v10.4s\n" @@ -1767,31 +1767,31 @@ void Conv::execute_tile( "mov v2.16b, v13.16b\n" "ldr s4, [%[wbptr], #36]\n" "ldr x17, [%[inptrs], 0]\n" - "ldr x18, [%[inptrs], 48]\n" + "ldr x7, [%[inptrs], 48]\n" "ldr x19, [%[inptrs], 96]\n" "ldr x20, [%[inptrs], 144]\n" "subs x15, x15, #1\n" "ldr s29, [x17, x27]\n" "fmla v18.4s, v29.4s, v12.4s\n" - "ldr s27, [x18, x27]\n" + "ldr s27, [x7, x27]\n" "ldr s25, [x19, x27]\n" "ldr x17, [%[inptrs], 8]\n" "ldr s21, [x20, x27]\n" - "ldr x18, [%[inptrs], 56]\n" + "ldr x7, [%[inptrs], 56]\n" "ldr s28, [x17, x27]\n" "ldr x19, [%[inptrs], 104]\n" - "ldr s16, [x18, x27]\n" + "ldr s16, [x7, x27]\n" "ldr x17, [%[inptrs], 16]\n" "ldr s29, [x19, x27]\n" "ldr s15, [x17, x27]\n" "beq 6f\n" "5:\n" "mov v3.16b, v13.16b\n" - "ldr x18, [%[inptrs], 64]\n" + "ldr x7, [%[inptrs], 64]\n" "fmla v18.4s, v27.4s, v9.4s\n" "ldr x17, [%[inptrs], 24]\n" "fmla v22.4s, v27.4s, v12.4s\n" - "ldr s30, [x18, x27]\n" + "ldr s30, [x7, x27]\n" "fmla v23.4s, v28.4s, v12.4s\n" "ldr x21, [%[inptrs], 192]\n" "fmla v19.4s, v25.4s, v12.4s\n" @@ -1801,7 +1801,7 @@ void Conv::execute_tile( "fmla v22.4s, v25.4s, v9.4s\n" "ldr x19, [%[inptrs], 112]\n" "fmla v23.4s, v16.4s, v9.4s\n" - "ldr x18, [%[inptrs], 72]\n" + "ldr x7, [%[inptrs], 72]\n" "fmla v17.4s, v16.4s, v12.4s\n" "ldr x17, [%[inptrs], 32]\n" "fmla v18.4s, v25.4s, v6.4s\n" @@ -1821,11 +1821,11 @@ void Conv::execute_tile( "fmla v0.4s, v21.4s, v12.4s\n" "ldr s21, [x19, x27]\n" "fmla v18.4s, v15.4s, v10.4s\n" - "ldr s20, [x18, x27]\n" + "ldr s20, [x7, x27]\n" "fmla v22.4s, v29.4s, v8.4s\n" "ldr x19, [%[inptrs], 120]\n" "fmla v23.4s, v29.4s, v6.4s\n" - "ldr x18, [%[inptrs], 80]\n" + "ldr x7, [%[inptrs], 80]\n" "fmla v19.4s, v29.4s, v11.4s\n" "ldr x25, [%[outptrs], 64]\n" "fmla v18.4s, v29.4s, v5.4s\n" @@ -1885,13 +1885,13 @@ void Conv::execute_tile( "fmla v1.4s, v30.4s, v6.4s\n" "fmla v16.4s, v30.4s, v9.4s\n" "fmla v3.4s, v26.4s, v11.4s\n" - "ldr s29, [x18, x27]\n" + "ldr s29, [x7, x27]\n" "fmla v15.4s, v21.4s, v12.4s\n" "ldr s27, [x17, x27]\n" "fmla v0.4s, v30.4s, v8.4s\n" "ldr s28, [x22, x27]\n" "fmla v22.4s, v24.4s, v4.4s\n" - "ldr x18, [%[inptrs], 88]\n" + "ldr x7, [%[inptrs], 88]\n" "fmla v19.4s, v24.4s, v7.4s\n" "ldr x22, [%[inptrs], 256]\n" "fmla v17.4s, v24.4s, v5.4s\n" @@ -1929,13 +1929,13 @@ void Conv::execute_tile( "fmla v0.4s, v28.4s, v5.4s\n" "ldr x19, [%[inptrs], 136]\n" "fmla v16.4s, v28.4s, v6.4s\n" - "ldr s26, [x18, x27]\n" + "ldr s26, [x7, x27]\n" "fmla v3.4s, v27.4s, v10.4s\n" "ldr s23, [x22, x27]\n" "fmla v19.4s, v22.4s, v4.4s\n" "ldr x22, [%[inptrs], 264]\n" "fmla v0.4s, v22.4s, v7.4s\n" - "ldr x18, [%[inptrs], 48]\n" + "ldr x7, [%[inptrs], 48]\n" "fmla v1.4s, v22.4s, v5.4s\n" "fmla v16.4s, v22.4s, v8.4s\n" "fmla v15.4s, v22.4s, v6.4s\n" @@ -1999,7 +1999,7 @@ void Conv::execute_tile( "fmla v2.4s, v28.4s, v4.4s\n" "ldr s29, [x17, x27]\n" "fmla v15.4s, v28.4s, v7.4s\n" - "ldr s27, [x18, x27]\n" + "ldr s27, [x7, x27]\n" "fmla v18.4s, v28.4s, v5.4s\n" "ldr x25, [%[outptrs], 80]\n" "fmla v21.4s, v28.4s, v10.4s\n" @@ -2019,13 +2019,13 @@ void Conv::execute_tile( "str s3, [x23, x28]\n" "fmla v21.4s, v30.4s, v5.4s\n" "fmla v20.4s, v30.4s, v6.4s\n" - "ldr x18, [%[inptrs], 56]\n" + "ldr x7, [%[inptrs], 56]\n" "fmla v15.4s, v19.4s, v4.4s\n" "ldr x17, [%[inptrs], 16]\n" "str s16, [x26, x28]\n" "fmla v24.4s, v19.4s, v5.4s\n" "fmla v21.4s, v19.4s, v7.4s\n" - "ldr s16, [x18, x27]\n" + "ldr s16, [x7, x27]\n" "fmla v20.4s, v19.4s, v8.4s\n" "ldr s6, [%[wbptr], #28]\n" "str s15, [x25, x28]\n" @@ -2068,11 +2068,11 @@ void Conv::execute_tile( "bne 5b\n" "6:\n" "mov v3.16b, v13.16b\n" - "ldr x18, [%[inptrs], 64]\n" + "ldr x7, [%[inptrs], 64]\n" "fmla v18.4s, v27.4s, v9.4s\n" "ldr x17, [%[inptrs], 24]\n" "fmla v22.4s, v27.4s, v12.4s\n" - "ldr s30, [x18, x27]\n" + "ldr s30, [x7, x27]\n" "fmla v23.4s, v28.4s, v12.4s\n" "ldr x21, [%[inptrs], 192]\n" "fmla v19.4s, v25.4s, v12.4s\n" @@ -2082,7 +2082,7 @@ void Conv::execute_tile( "fmla v22.4s, v25.4s, v9.4s\n" "ldr x19, [%[inptrs], 112]\n" "fmla v23.4s, v16.4s, v9.4s\n" - "ldr x18, [%[inptrs], 72]\n" + "ldr x7, [%[inptrs], 72]\n" "fmla v17.4s, v16.4s, v12.4s\n" "ldr x17, [%[inptrs], 32]\n" "fmla v18.4s, v25.4s, v6.4s\n" @@ -2102,11 +2102,11 @@ void Conv::execute_tile( "fmla v0.4s, v21.4s, v12.4s\n" "ldr s21, [x19, x27]\n" "fmla v18.4s, v15.4s, v10.4s\n" - "ldr s20, [x18, x27]\n" + "ldr s20, [x7, x27]\n" "fmla v22.4s, v29.4s, v8.4s\n" "ldr x19, [%[inptrs], 120]\n" "fmla v23.4s, v29.4s, v6.4s\n" - "ldr x18, [%[inptrs], 80]\n" + "ldr x7, [%[inptrs], 80]\n" "fmla v19.4s, v29.4s, v11.4s\n" "ldr x25, [%[outptrs], 64]\n" "fmla v18.4s, v29.4s, v5.4s\n" @@ -2162,13 +2162,13 @@ void Conv::execute_tile( "fmla v18.4s, v20.4s, v12.4s\n" "ldr s25, [x19, x27]\n" "fmla v0.4s, v27.4s, v6.4s\n" - "ldr s29, [x18, x27]\n" + "ldr s29, [x7, x27]\n" "fmla v14.4s, v26.4s, v10.4s\n" "ldr x19, [%[inptrs], 128]\n" "fmla v3.4s, v26.4s, v11.4s\n" "ldr s27, [x17, x27]\n" "fmla v19.4s, v30.4s, v5.4s\n" - "ldr x18, [%[inptrs], 88]\n" + "ldr x7, [%[inptrs], 88]\n" "fmla v0.4s, v30.4s, v8.4s\n" "fmla v1.4s, v30.4s, v6.4s\n" "fmla v16.4s, v30.4s, v9.4s\n" @@ -2210,7 +2210,7 @@ void Conv::execute_tile( "fmla v18.4s, v29.4s, v11.4s\n" "ldr s31, [x19, x27]\n" "fmla v16.4s, v28.4s, v6.4s\n" - "ldr s26, [x18, x27]\n" + "ldr s26, [x7, x27]\n" "fmla v19.4s, v22.4s, v4.4s\n" "ldr x19, [%[inptrs], 136]\n" "fmla v3.4s, v27.4s, v10.4s\n" @@ -2312,7 +2312,7 @@ void Conv::execute_tile( "7:\n" : [wbptr] "+r" (weight_bias_ptr) : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" ); } @@ -2345,7 +2345,7 @@ void Conv::execute_tile( "add x27, x10, %[input_row_stride]\n" "add x15, x14, #64\n" "add x17, x16, %[output_row_stride]\n" - "add x18, x17, %[output_row_stride]\n" + "add x7, x17, %[output_row_stride]\n" "add x19, %[output_col_stride1], %[output_col_stride1]\n" "and x21, %[n_channels], #3\n" "add x20, x19, %[output_col_stride1]\n" @@ -2649,13 +2649,13 @@ void Conv::execute_tile( "fmax v0.4s, v0.4s, v29.4s\n" "str q25, [x17, x20]\n" "fmax v18.4s, v18.4s, v29.4s\n" - "str q0, [x18]\n" + "str q0, [x7]\n" "fmax v23.4s, v23.4s, v29.4s\n" - "str q18, [x18, %[output_col_stride1]]\n" + "str q18, [x7, %[output_col_stride1]]\n" "fmax v24.4s, v24.4s, v29.4s\n" - "str q23, [x18, x19]\n" + "str q23, [x7, x19]\n" "mov v7.16b, v21.16b\n" - "str q24, [x18, x20]\n" + "str q24, [x7, x20]\n" "mov v3.16b, v21.16b\n" "mov v6.16b, v21.16b\n" "ldr q9, [%[wbptr], #128]\n" @@ -2684,7 +2684,7 @@ void Conv::execute_tile( "fmla v3.4s, v18.4s, v17.4s\n" "add x17, x17, #16\n" "fmla v15.4s, v18.4s, v20.4s\n" - "add x18, x18, #16\n" + "add x7, x7, #16\n" "fmla v7.4s, v23.4s, v14.4s\n" "fmla v3.4s, v27.4s, v14.4s\n" "fmla v7.4s, v18.4s, v10.4s\n" @@ -2923,15 +2923,15 @@ void Conv::execute_tile( "fmax v0.4s, v0.4s, v29.4s\n" "str q25, [x17, x20]\n" "fmax v18.4s, v18.4s, v29.4s\n" - "str q0, [x18]\n" + "str q0, [x7]\n" "fmax v23.4s, v23.4s, v29.4s\n" - "str q18, [x18, %[output_col_stride1]]\n" + "str q18, [x7, %[output_col_stride1]]\n" "fmax v24.4s, v24.4s, v29.4s\n" - "str q23, [x18, x19]\n" + "str q23, [x7, x19]\n" "add x16, x16, #16\n" - "str q24, [x18, x20]\n" + "str q24, [x7, x20]\n" "add x17, x17, #16\n" - "add x18, x18, #16\n" + "add x7, x7, #16\n" "4:\n" "cbz x21, 7f\n" "ldr s21, [%[wbptr]]\n" @@ -3231,13 +3231,13 @@ void Conv::execute_tile( "fmax v0.4s, v0.4s, v29.4s\n" "str s25, [x17, x20]\n" "fmax v18.4s, v18.4s, v29.4s\n" - "str s0, [x18]\n" + "str s0, [x7]\n" "fmax v23.4s, v23.4s, v29.4s\n" - "str s18, [x18, %[output_col_stride1]]\n" + "str s18, [x7, %[output_col_stride1]]\n" "fmax v24.4s, v24.4s, v29.4s\n" - "str s23, [x18, x19]\n" + "str s23, [x7, x19]\n" "mov v7.16b, v21.16b\n" - "str s24, [x18, x20]\n" + "str s24, [x7, x20]\n" "mov v3.16b, v21.16b\n" "mov v6.16b, v21.16b\n" "ldr s9, [%[wbptr], #32]\n" @@ -3266,7 +3266,7 @@ void Conv::execute_tile( "fmla v3.4s, v18.4s, v17.4s\n" "add x17, x17, #4\n" "fmla v15.4s, v18.4s, v20.4s\n" - "add x18, x18, #4\n" + "add x7, x7, #4\n" "fmla v7.4s, v23.4s, v14.4s\n" "fmla v3.4s, v27.4s, v14.4s\n" "fmla v7.4s, v18.4s, v10.4s\n" @@ -3505,19 +3505,19 @@ void Conv::execute_tile( "fmax v0.4s, v0.4s, v29.4s\n" "str s25, [x17, x20]\n" "fmax v18.4s, v18.4s, v29.4s\n" - "str s0, [x18]\n" + "str s0, [x7]\n" "fmax v23.4s, v23.4s, v29.4s\n" - "str s18, [x18, %[output_col_stride1]]\n" + "str s18, [x7, %[output_col_stride1]]\n" "fmax v24.4s, v24.4s, v29.4s\n" - "str s23, [x18, x19]\n" + "str s23, [x7, x19]\n" "add x16, x16, #4\n" - "str s24, [x18, x20]\n" + "str s24, [x7, x20]\n" "add x17, x17, #4\n" - "add x18, x18, #4\n" + "add x7, x7, #4\n" "7:\n" : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input) : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" ); } @@ -3570,11 +3570,11 @@ void Conv::execute_tile( "fmla v2.4s, v26.4s, v19.4s\n" "ldr x25, [%[inptrs], 16]\n" "ldr q29, [x17, x27]\n" - "ldr x18, [%[inptrs], 144]\n" + "ldr x7, [%[inptrs], 144]\n" "ldr x24, [%[inptrs], 104]\n" "subs x26, x26, #1\n" "ldr q30, [x25, x27]\n" - "ldr q27, [x18, x27]\n" + "ldr q27, [x7, x27]\n" "ldr q21, [x24, x27]\n" "fmla v2.4s, v31.4s, v9.4s\n" "beq 3f\n" @@ -3588,7 +3588,7 @@ void Conv::execute_tile( "fmla v2.4s, v28.4s, v5.4s\n" "ldr x15, [%[inptrs], 192]\n" "fmla v16.4s, v28.4s, v19.4s\n" - "ldr x18, [%[inptrs], 152]\n" + "ldr x7, [%[inptrs], 152]\n" "fmla v13.4s, v28.4s, v22.4s\n" "ldr q26, [x25, x27]\n" "fmla v18.4s, v29.4s, v19.4s\n" @@ -3604,9 +3604,9 @@ void Conv::execute_tile( "fmla v2.4s, v30.4s, v8.4s\n" "ldr x15, [%[inptrs], 200]\n" "fmla v17.4s, v30.4s, v22.4s\n" - "ldr q29, [x18, x27]\n" + "ldr q29, [x7, x27]\n" "fmla v16.4s, v27.4s, v5.4s\n" - "ldr x18, [%[inptrs], 160]\n" + "ldr x7, [%[inptrs], 160]\n" "fmla v13.4s, v27.4s, v19.4s\n" "ldr x20, [%[outptrs], 0]\n" "fmla v14.4s, v27.4s, v22.4s\n" @@ -3668,9 +3668,9 @@ void Conv::execute_tile( "mov v20.16b, v25.16b\n" "fmla v15.4s, v24.4s, v9.4s\n" "fmla v21.4s, v24.4s, v22.4s\n" - "ldr q27, [x18, x27]\n" + "ldr q27, [x7, x27]\n" "fmla v1.4s, v23.4s, v9.4s\n" - "ldr x18, [%[inptrs], 168]\n" + "ldr x7, [%[inptrs], 168]\n" "fmla v17.4s, v23.4s, v8.4s\n" "ldr q30, [x24, x27]\n" "fmla v13.4s, v26.4s, v4.4s\n" @@ -3712,11 +3712,11 @@ void Conv::execute_tile( "fmla v10.4s, v26.4s, v5.4s\n" "ldr q31, [x15, x27]\n" "fmla v1.4s, v25.4s, v8.4s\n" - "ldr q29, [x18, x27]\n" + "ldr q29, [x7, x27]\n" "fmla v13.4s, v31.4s, v3.4s\n" "ldr x15, [%[inptrs], 216]\n" "fmla v14.4s, v31.4s, v6.4s\n" - "ldr x18, [%[inptrs], 176]\n" + "ldr x7, [%[inptrs], 176]\n" "fmla v12.4s, v31.4s, v4.4s\n" "fmla v10.4s, v31.4s, v7.4s\n" "fmla v11.4s, v31.4s, v5.4s\n" @@ -3748,11 +3748,11 @@ void Conv::execute_tile( "fmla v1.4s, v25.4s, v6.4s\n" "fmla v10.4s, v29.4s, v4.4s\n" "fmla v21.4s, v25.4s, v8.4s\n" - "ldr q27, [x18, x27]\n" + "ldr q27, [x7, x27]\n" "fmla v20.4s, v29.4s, v5.4s\n" "ldr q26, [x24, x27]\n" "fmla v12.4s, v22.4s, v3.4s\n" - "ldr x18, [%[inptrs], 184]\n" + "ldr x7, [%[inptrs], 184]\n" "fmla v10.4s, v22.4s, v6.4s\n" "ldr x24, [%[inptrs], 96]\n" "fmla v11.4s, v22.4s, v4.4s\n" @@ -3770,7 +3770,7 @@ void Conv::execute_tile( "fmla v24.4s, v27.4s, v7.4s\n" "fmla v23.4s, v27.4s, v9.4s\n" "fmla v1.4s, v26.4s, v3.4s\n" - "ldr q22, [x18, x27]\n" + "ldr q22, [x7, x27]\n" "fmla v21.4s, v26.4s, v6.4s\n" "ldr q19, [x16, x27]\n" "fmla v10.4s, v25.4s, v3.4s\n" @@ -3778,7 +3778,7 @@ void Conv::execute_tile( "fmla v24.4s, v26.4s, v8.4s\n" "ldr q28, [x15, x27]\n" "fmla v20.4s, v25.4s, v4.4s\n" - "ldr x18, [%[inptrs], 144]\n" + "ldr x7, [%[inptrs], 144]\n" "fmla v23.4s, v25.4s, v5.4s\n" "ldr q30, [x16, x27]\n" "fmla v11.4s, v31.4s, v3.4s\n" @@ -3862,7 +3862,7 @@ void Conv::execute_tile( "mov v15.16b, v25.16b\n" "ldr x21, [%[outptrs], 56]\n" "fmla v2.4s, v26.4s, v19.4s\n" - "ldr q27, [x18, x27]\n" + "ldr q27, [x7, x27]\n" "str q21, [x21, x28]\n" "ldr x22, [%[outptrs], 80]\n" "ldr q21, [x24, x27]\n" @@ -3886,7 +3886,7 @@ void Conv::execute_tile( "fmla v18.4s, v31.4s, v22.4s\n" "ldr q23, [x17, x27]\n" "fmla v2.4s, v28.4s, v5.4s\n" - "ldr x18, [%[inptrs], 152]\n" + "ldr x7, [%[inptrs], 152]\n" "fmla v16.4s, v28.4s, v19.4s\n" "ldr x24, [%[inptrs], 112]\n" "fmla v13.4s, v28.4s, v22.4s\n" @@ -3904,9 +3904,9 @@ void Conv::execute_tile( "fmla v2.4s, v30.4s, v8.4s\n" "ldr x20, [%[outptrs], 0]\n" "fmla v17.4s, v30.4s, v22.4s\n" - "ldr q29, [x18, x27]\n" + "ldr q29, [x7, x27]\n" "fmla v16.4s, v27.4s, v5.4s\n" - "ldr x18, [%[inptrs], 160]\n" + "ldr x7, [%[inptrs], 160]\n" "fmla v13.4s, v27.4s, v19.4s\n" "ldr x21, [%[outptrs], 32]\n" "fmla v14.4s, v27.4s, v22.4s\n" @@ -3964,11 +3964,11 @@ void Conv::execute_tile( "fmla v17.4s, v24.4s, v7.4s\n" "fmla v21.4s, v24.4s, v22.4s\n" "fmla v15.4s, v24.4s, v9.4s\n" - "ldr q27, [x18, x27]\n" + "ldr q27, [x7, x27]\n" "fmla v14.4s, v30.4s, v5.4s\n" "ldr q30, [x24, x27]\n" "fmla v1.4s, v23.4s, v9.4s\n" - "ldr x18, [%[inptrs], 168]\n" + "ldr x7, [%[inptrs], 168]\n" "fmla v17.4s, v23.4s, v8.4s\n" "ldr q31, [x17, x27]\n" "fmla v13.4s, v26.4s, v4.4s\n" @@ -4008,11 +4008,11 @@ void Conv::execute_tile( "fmla v14.4s, v26.4s, v4.4s\n" "ldr x15, [%[inptrs], 216]\n" "fmla v10.4s, v26.4s, v5.4s\n" - "ldr q29, [x18, x27]\n" + "ldr q29, [x7, x27]\n" "fmla v1.4s, v25.4s, v8.4s\n" "ldr q28, [x24, x27]\n" "fmla v13.4s, v31.4s, v3.4s\n" - "ldr x18, [%[inptrs], 176]\n" + "ldr x7, [%[inptrs], 176]\n" "fmla v14.4s, v31.4s, v6.4s\n" "ldr x24, [%[inptrs], 136]\n" "fmla v12.4s, v31.4s, v4.4s\n" @@ -4040,9 +4040,9 @@ void Conv::execute_tile( "fmla v21.4s, v28.4s, v7.4s\n" "fmla v24.4s, v28.4s, v9.4s\n" "fmla v14.4s, v29.4s, v3.4s\n" - "ldr q27, [x18, x27]\n" + "ldr q27, [x7, x27]\n" "fmla v1.4s, v25.4s, v6.4s\n" - "ldr x18, [%[inptrs], 184]\n" + "ldr x7, [%[inptrs], 184]\n" "fmla v10.4s, v29.4s, v4.4s\n" "fmla v20.4s, v29.4s, v5.4s\n" "fmla v21.4s, v25.4s, v8.4s\n" @@ -4058,7 +4058,7 @@ void Conv::execute_tile( "fmla v15.4s, v27.4s, v3.4s\n" "ldr q31, [x15, x27]\n" "fmla v11.4s, v27.4s, v6.4s\n" - "ldr q22, [x18, x27]\n" + "ldr q22, [x7, x27]\n" "fmla v21.4s, v27.4s, v4.4s\n" "ldr x15, [%[inptrs], 232]\n" "fmla v20.4s, v27.4s, v8.4s\n" @@ -4157,14 +4157,14 @@ void Conv::execute_tile( "ldr x25, [%[inptrs], 0]\n" "ldr x17, [%[inptrs], 48]\n" "ldr x24, [%[inptrs], 96]\n" - "ldr x18, [%[inptrs], 144]\n" + "ldr x7, [%[inptrs], 144]\n" "subs x19, x19, #1\n" "ldr s27, [x25, x27]\n" "fmla v2.4s, v27.4s, v22.4s\n" "ldr s26, [x17, x27]\n" "fmla v16.4s, v26.4s, v22.4s\n" "ldr s28, [x24, x27]\n" - "ldr s27, [x18, x27]\n" + "ldr s27, [x7, x27]\n" "ldr x25, [%[inptrs], 8]\n" "ldr x17, [%[inptrs], 56]\n" "ldr x24, [%[inptrs], 104]\n" @@ -4186,7 +4186,7 @@ void Conv::execute_tile( "fmla v2.4s, v28.4s, v5.4s\n" "ldr x15, [%[inptrs], 192]\n" "fmla v16.4s, v28.4s, v19.4s\n" - "ldr x18, [%[inptrs], 152]\n" + "ldr x7, [%[inptrs], 152]\n" "fmla v13.4s, v28.4s, v22.4s\n" "ldr s26, [x25, x27]\n" "fmla v18.4s, v29.4s, v19.4s\n" @@ -4202,9 +4202,9 @@ void Conv::execute_tile( "fmla v2.4s, v30.4s, v8.4s\n" "ldr x15, [%[inptrs], 200]\n" "fmla v17.4s, v30.4s, v22.4s\n" - "ldr s29, [x18, x27]\n" + "ldr s29, [x7, x27]\n" "fmla v16.4s, v27.4s, v5.4s\n" - "ldr x18, [%[inptrs], 160]\n" + "ldr x7, [%[inptrs], 160]\n" "fmla v13.4s, v27.4s, v19.4s\n" "ldr x20, [%[outptrs], 0]\n" "fmla v14.4s, v27.4s, v22.4s\n" @@ -4266,9 +4266,9 @@ void Conv::execute_tile( "mov v20.16b, v25.16b\n" "fmla v15.4s, v24.4s, v9.4s\n" "fmla v21.4s, v24.4s, v22.4s\n" - "ldr s27, [x18, x27]\n" + "ldr s27, [x7, x27]\n" "fmla v1.4s, v23.4s, v9.4s\n" - "ldr x18, [%[inptrs], 168]\n" + "ldr x7, [%[inptrs], 168]\n" "fmla v17.4s, v23.4s, v8.4s\n" "ldr s30, [x24, x27]\n" "fmla v13.4s, v26.4s, v4.4s\n" @@ -4310,11 +4310,11 @@ void Conv::execute_tile( "fmla v10.4s, v26.4s, v5.4s\n" "ldr s31, [x15, x27]\n" "fmla v1.4s, v25.4s, v8.4s\n" - "ldr s29, [x18, x27]\n" + "ldr s29, [x7, x27]\n" "fmla v13.4s, v31.4s, v3.4s\n" "ldr x15, [%[inptrs], 216]\n" "fmla v14.4s, v31.4s, v6.4s\n" - "ldr x18, [%[inptrs], 176]\n" + "ldr x7, [%[inptrs], 176]\n" "fmla v12.4s, v31.4s, v4.4s\n" "fmla v10.4s, v31.4s, v7.4s\n" "fmla v11.4s, v31.4s, v5.4s\n" @@ -4346,11 +4346,11 @@ void Conv::execute_tile( "fmla v1.4s, v25.4s, v6.4s\n" "fmla v10.4s, v29.4s, v4.4s\n" "fmla v21.4s, v25.4s, v8.4s\n" - "ldr s27, [x18, x27]\n" + "ldr s27, [x7, x27]\n" "fmla v20.4s, v29.4s, v5.4s\n" "ldr s26, [x24, x27]\n" "fmla v12.4s, v22.4s, v3.4s\n" - "ldr x18, [%[inptrs], 184]\n" + "ldr x7, [%[inptrs], 184]\n" "fmla v10.4s, v22.4s, v6.4s\n" "ldr x24, [%[inptrs], 96]\n" "fmla v11.4s, v22.4s, v4.4s\n" @@ -4368,7 +4368,7 @@ void Conv::execute_tile( "fmla v24.4s, v27.4s, v7.4s\n" "fmla v23.4s, v27.4s, v9.4s\n" "fmla v1.4s, v26.4s, v3.4s\n" - "ldr s22, [x18, x27]\n" + "ldr s22, [x7, x27]\n" "fmla v21.4s, v26.4s, v6.4s\n" "ldr s19, [x16, x27]\n" "fmla v10.4s, v25.4s, v3.4s\n" @@ -4376,7 +4376,7 @@ void Conv::execute_tile( "fmla v24.4s, v26.4s, v8.4s\n" "ldr s28, [x15, x27]\n" "fmla v20.4s, v25.4s, v4.4s\n" - "ldr x18, [%[inptrs], 144]\n" + "ldr x7, [%[inptrs], 144]\n" "fmla v23.4s, v25.4s, v5.4s\n" "ldr s30, [x16, x27]\n" "fmla v11.4s, v31.4s, v3.4s\n" @@ -4460,7 +4460,7 @@ void Conv::execute_tile( "mov v15.16b, v25.16b\n" "ldr x21, [%[outptrs], 56]\n" "fmla v2.4s, v26.4s, v19.4s\n" - "ldr s27, [x18, x27]\n" + "ldr s27, [x7, x27]\n" "str s21, [x21, x28]\n" "ldr x22, [%[outptrs], 80]\n" "ldr s21, [x24, x27]\n" @@ -4484,7 +4484,7 @@ void Conv::execute_tile( "fmla v18.4s, v31.4s, v22.4s\n" "ldr s23, [x17, x27]\n" "fmla v2.4s, v28.4s, v5.4s\n" - "ldr x18, [%[inptrs], 152]\n" + "ldr x7, [%[inptrs], 152]\n" "fmla v16.4s, v28.4s, v19.4s\n" "ldr x24, [%[inptrs], 112]\n" "fmla v13.4s, v28.4s, v22.4s\n" @@ -4502,9 +4502,9 @@ void Conv::execute_tile( "fmla v2.4s, v30.4s, v8.4s\n" "ldr x20, [%[outptrs], 0]\n" "fmla v17.4s, v30.4s, v22.4s\n" - "ldr s29, [x18, x27]\n" + "ldr s29, [x7, x27]\n" "fmla v16.4s, v27.4s, v5.4s\n" - "ldr x18, [%[inptrs], 160]\n" + "ldr x7, [%[inptrs], 160]\n" "fmla v13.4s, v27.4s, v19.4s\n" "ldr x21, [%[outptrs], 32]\n" "fmla v14.4s, v27.4s, v22.4s\n" @@ -4562,11 +4562,11 @@ void Conv::execute_tile( "fmla v17.4s, v24.4s, v7.4s\n" "fmla v21.4s, v24.4s, v22.4s\n" "fmla v15.4s, v24.4s, v9.4s\n" - "ldr s27, [x18, x27]\n" + "ldr s27, [x7, x27]\n" "fmla v14.4s, v30.4s, v5.4s\n" "ldr s30, [x24, x27]\n" "fmla v1.4s, v23.4s, v9.4s\n" - "ldr x18, [%[inptrs], 168]\n" + "ldr x7, [%[inptrs], 168]\n" "fmla v17.4s, v23.4s, v8.4s\n" "ldr s31, [x17, x27]\n" "fmla v13.4s, v26.4s, v4.4s\n" @@ -4606,11 +4606,11 @@ void Conv::execute_tile( "fmla v14.4s, v26.4s, v4.4s\n" "ldr x15, [%[inptrs], 216]\n" "fmla v10.4s, v26.4s, v5.4s\n" - "ldr s29, [x18, x27]\n" + "ldr s29, [x7, x27]\n" "fmla v1.4s, v25.4s, v8.4s\n" "ldr s28, [x24, x27]\n" "fmla v13.4s, v31.4s, v3.4s\n" - "ldr x18, [%[inptrs], 176]\n" + "ldr x7, [%[inptrs], 176]\n" "fmla v14.4s, v31.4s, v6.4s\n" "ldr x24, [%[inptrs], 136]\n" "fmla v12.4s, v31.4s, v4.4s\n" @@ -4638,9 +4638,9 @@ void Conv::execute_tile( "fmla v21.4s, v28.4s, v7.4s\n" "fmla v24.4s, v28.4s, v9.4s\n" "fmla v14.4s, v29.4s, v3.4s\n" - "ldr s27, [x18, x27]\n" + "ldr s27, [x7, x27]\n" "fmla v1.4s, v25.4s, v6.4s\n" - "ldr x18, [%[inptrs], 184]\n" + "ldr x7, [%[inptrs], 184]\n" "fmla v10.4s, v29.4s, v4.4s\n" "fmla v20.4s, v29.4s, v5.4s\n" "fmla v21.4s, v25.4s, v8.4s\n" @@ -4656,7 +4656,7 @@ void Conv::execute_tile( "fmla v15.4s, v27.4s, v3.4s\n" "ldr s31, [x15, x27]\n" "fmla v11.4s, v27.4s, v6.4s\n" - "ldr s22, [x18, x27]\n" + "ldr s22, [x7, x27]\n" "fmla v21.4s, v27.4s, v4.4s\n" "ldr x15, [%[inptrs], 232]\n" "fmla v20.4s, v27.4s, v8.4s\n" @@ -4734,7 +4734,7 @@ void Conv::execute_tile( "7:\n" : [wbptr] "+r" (weight_bias_ptr) : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" ); } @@ -4762,7 +4762,7 @@ void Conv::execute_tile( "add x21, x19, #64\n" "add x17, x19, %[input_col_stride1]\n" "add x22, x20, %[input_row_stride]\n" - "add x18, x17, #64\n" + "add x7, x17, #64\n" "add x11, x17, %[input_col_stride1]\n" "add x23, x22, %[input_row_stride]\n" "add x12, x11, #64\n" @@ -4844,7 +4844,7 @@ void Conv::execute_tile( "fmla v14.4s, v27.4s, v20.4s\n" "ldr q26, [x20, %[input_col_stride1]]\n" "fmla v12.4s, v22.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x18]\n" + "prfm pldl1keep, [%[inptr0], x7]\n" "fmla v8.4s, v22.4s, v4.4s\n" "prfm pldl1keep, [x23, #64]\n" "fmla v11.4s, v22.4s, v2.4s\n" @@ -4856,7 +4856,7 @@ void Conv::execute_tile( "fmla v15.4s, v22.4s, v20.4s\n" "ldr q30, [x9, x13]\n" "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x24, x18]\n" + "prfm pldl1keep, [x24, x7]\n" "fmla v8.4s, v21.4s, v5.4s\n" "prfm pldl1keep, [%[inptr0], x12]\n" "fmla v11.4s, v21.4s, v4.4s\n" @@ -4868,7 +4868,7 @@ void Conv::execute_tile( "fmla v17.4s, v21.4s, v20.4s\n" "ldr q22, [x24, x19]\n" "fmla v11.4s, v25.4s, v5.4s\n" - "prfm pldl1keep, [x9, x18]\n" + "prfm pldl1keep, [x9, x7]\n" "fmla v10.4s, v25.4s, v6.4s\n" "prfm pldl1keep, [x24, x12]\n" "fmla v9.4s, v25.4s, v20.4s\n" @@ -4880,19 +4880,19 @@ void Conv::execute_tile( "fmla v8.4s, v26.4s, v1.4s\n" "prfm pldl1keep, [x22, x21]\n" "fmla v16.4s, v26.4s, v4.4s\n" - "prfm pldl1keep, [x20, x18]\n" + "prfm pldl1keep, [x20, x7]\n" "fmla v7.4s, v26.4s, v2.4s\n" "prfm pldl1keep, [x9, x12]\n" "fmla v14.4s, v26.4s, v6.4s\n" "prfm pldl1keep, [x23, x21]\n" "fmla v15.4s, v26.4s, v19.4s\n" - "prfm pldl1keep, [x22, x18]\n" + "prfm pldl1keep, [x22, x7]\n" "fmla v13.4s, v26.4s, v20.4s\n" "ldr q26, [x22, %[input_col_stride1]]\n" "fmla v12.4s, v30.4s, v0.4s\n" "prfm pldl1keep, [x20, x12]\n" "fmla v8.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x23, x18]\n" + "prfm pldl1keep, [x23, x7]\n" "fmla v11.4s, v30.4s, v1.4s\n" "prfm pldl1keep, [x22, x12]\n" "fmla v16.4s, v30.4s, v5.4s\n" @@ -5151,7 +5151,7 @@ void Conv::execute_tile( "fmla v14.4s, v27.4s, v20.4s\n" "ldr q26, [x20, %[input_col_stride1]]\n" "fmla v12.4s, v22.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x18]\n" + "prfm pldl1keep, [%[inptr0], x7]\n" "fmla v8.4s, v22.4s, v4.4s\n" "prfm pldl1keep, [x23, #64]\n" "fmla v11.4s, v22.4s, v2.4s\n" @@ -5163,7 +5163,7 @@ void Conv::execute_tile( "fmla v15.4s, v22.4s, v20.4s\n" "ldr q30, [x9, x13]\n" "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x24, x18]\n" + "prfm pldl1keep, [x24, x7]\n" "fmla v8.4s, v21.4s, v5.4s\n" "prfm pldl1keep, [%[inptr0], x12]\n" "fmla v11.4s, v21.4s, v4.4s\n" @@ -5175,7 +5175,7 @@ void Conv::execute_tile( "fmla v17.4s, v21.4s, v20.4s\n" "ldr q22, [x24, x19]\n" "fmla v11.4s, v25.4s, v5.4s\n" - "prfm pldl1keep, [x9, x18]\n" + "prfm pldl1keep, [x9, x7]\n" "fmla v10.4s, v25.4s, v6.4s\n" "prfm pldl1keep, [x24, x12]\n" "fmla v9.4s, v25.4s, v20.4s\n" @@ -5187,19 +5187,19 @@ void Conv::execute_tile( "fmla v8.4s, v26.4s, v1.4s\n" "prfm pldl1keep, [x22, x21]\n" "fmla v16.4s, v26.4s, v4.4s\n" - "prfm pldl1keep, [x20, x18]\n" + "prfm pldl1keep, [x20, x7]\n" "fmla v7.4s, v26.4s, v2.4s\n" "prfm pldl1keep, [x9, x12]\n" "fmla v14.4s, v26.4s, v6.4s\n" "prfm pldl1keep, [x23, x21]\n" "fmla v15.4s, v26.4s, v19.4s\n" - "prfm pldl1keep, [x22, x18]\n" + "prfm pldl1keep, [x22, x7]\n" "fmla v13.4s, v26.4s, v20.4s\n" "ldr q26, [x22, %[input_col_stride1]]\n" "fmla v12.4s, v30.4s, v0.4s\n" "prfm pldl1keep, [x20, x12]\n" "fmla v8.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x23, x18]\n" + "prfm pldl1keep, [x23, x7]\n" "fmla v11.4s, v30.4s, v1.4s\n" "prfm pldl1keep, [x22, x12]\n" "fmla v16.4s, v30.4s, v5.4s\n" @@ -5460,7 +5460,7 @@ void Conv::execute_tile( "fmla v14.4s, v27.4s, v20.4s\n" "ldr s26, [x20, %[input_col_stride1]]\n" "fmla v12.4s, v22.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x18]\n" + "prfm pldl1keep, [%[inptr0], x7]\n" "fmla v8.4s, v22.4s, v4.4s\n" "prfm pldl1keep, [x23, #64]\n" "fmla v11.4s, v22.4s, v2.4s\n" @@ -5472,7 +5472,7 @@ void Conv::execute_tile( "fmla v15.4s, v22.4s, v20.4s\n" "ldr s30, [x9, x13]\n" "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x24, x18]\n" + "prfm pldl1keep, [x24, x7]\n" "fmla v8.4s, v21.4s, v5.4s\n" "prfm pldl1keep, [%[inptr0], x12]\n" "fmla v11.4s, v21.4s, v4.4s\n" @@ -5484,7 +5484,7 @@ void Conv::execute_tile( "fmla v17.4s, v21.4s, v20.4s\n" "ldr s22, [x24, x19]\n" "fmla v11.4s, v25.4s, v5.4s\n" - "prfm pldl1keep, [x9, x18]\n" + "prfm pldl1keep, [x9, x7]\n" "fmla v10.4s, v25.4s, v6.4s\n" "prfm pldl1keep, [x24, x12]\n" "fmla v9.4s, v25.4s, v20.4s\n" @@ -5496,19 +5496,19 @@ void Conv::execute_tile( "fmla v8.4s, v26.4s, v1.4s\n" "prfm pldl1keep, [x22, x21]\n" "fmla v16.4s, v26.4s, v4.4s\n" - "prfm pldl1keep, [x20, x18]\n" + "prfm pldl1keep, [x20, x7]\n" "fmla v7.4s, v26.4s, v2.4s\n" "prfm pldl1keep, [x9, x12]\n" "fmla v14.4s, v26.4s, v6.4s\n" "prfm pldl1keep, [x23, x21]\n" "fmla v15.4s, v26.4s, v19.4s\n" - "prfm pldl1keep, [x22, x18]\n" + "prfm pldl1keep, [x22, x7]\n" "fmla v13.4s, v26.4s, v20.4s\n" "ldr s26, [x22, %[input_col_stride1]]\n" "fmla v12.4s, v30.4s, v0.4s\n" "prfm pldl1keep, [x20, x12]\n" "fmla v8.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x23, x18]\n" + "prfm pldl1keep, [x23, x7]\n" "fmla v11.4s, v30.4s, v1.4s\n" "prfm pldl1keep, [x22, x12]\n" "fmla v16.4s, v30.4s, v5.4s\n" @@ -5767,7 +5767,7 @@ void Conv::execute_tile( "fmla v14.4s, v27.4s, v20.4s\n" "ldr s26, [x20, %[input_col_stride1]]\n" "fmla v12.4s, v22.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x18]\n" + "prfm pldl1keep, [%[inptr0], x7]\n" "fmla v8.4s, v22.4s, v4.4s\n" "prfm pldl1keep, [x23, #64]\n" "fmla v11.4s, v22.4s, v2.4s\n" @@ -5779,7 +5779,7 @@ void Conv::execute_tile( "fmla v15.4s, v22.4s, v20.4s\n" "ldr s30, [x9, x13]\n" "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x24, x18]\n" + "prfm pldl1keep, [x24, x7]\n" "fmla v8.4s, v21.4s, v5.4s\n" "prfm pldl1keep, [%[inptr0], x12]\n" "fmla v11.4s, v21.4s, v4.4s\n" @@ -5791,7 +5791,7 @@ void Conv::execute_tile( "fmla v17.4s, v21.4s, v20.4s\n" "ldr s22, [x24, x19]\n" "fmla v11.4s, v25.4s, v5.4s\n" - "prfm pldl1keep, [x9, x18]\n" + "prfm pldl1keep, [x9, x7]\n" "fmla v10.4s, v25.4s, v6.4s\n" "prfm pldl1keep, [x24, x12]\n" "fmla v9.4s, v25.4s, v20.4s\n" @@ -5803,19 +5803,19 @@ void Conv::execute_tile( "fmla v8.4s, v26.4s, v1.4s\n" "prfm pldl1keep, [x22, x21]\n" "fmla v16.4s, v26.4s, v4.4s\n" - "prfm pldl1keep, [x20, x18]\n" + "prfm pldl1keep, [x20, x7]\n" "fmla v7.4s, v26.4s, v2.4s\n" "prfm pldl1keep, [x9, x12]\n" "fmla v14.4s, v26.4s, v6.4s\n" "prfm pldl1keep, [x23, x21]\n" "fmla v15.4s, v26.4s, v19.4s\n" - "prfm pldl1keep, [x22, x18]\n" + "prfm pldl1keep, [x22, x7]\n" "fmla v13.4s, v26.4s, v20.4s\n" "ldr s26, [x22, %[input_col_stride1]]\n" "fmla v12.4s, v30.4s, v0.4s\n" "prfm pldl1keep, [x20, x12]\n" "fmla v8.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x23, x18]\n" + "prfm pldl1keep, [x23, x7]\n" "fmla v11.4s, v30.4s, v1.4s\n" "prfm pldl1keep, [x22, x12]\n" "fmla v16.4s, v30.4s, v5.4s\n" @@ -6007,7 +6007,7 @@ void Conv::execute_tile( "7:\n" : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr) : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" ); } -- cgit v1.2.1