From 74921eee924625426429044decefe3673561b174 Mon Sep 17 00:00:00 2001 From: Michael Tyler Date: Wed, 12 Apr 2023 17:43:17 +0100 Subject: Update CPU kernel implementations and guard directives Resolves COMPMID-6023 Change-Id: I868975d14c4f98af6716726feda22405a6a4c891 Signed-off-by: Michael Tyler Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9686 Tested-by: Arm Jenkins Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- .../transforms/a64_transpose_interleave_128.hpp | 7 +- .../transforms/a64_transpose_interleave_12_1x4.hpp | 5 +- .../transforms/a64_transpose_interleave_12_1x8.hpp | 6 +- .../transforms/a64_transpose_interleave_12_2x2.hpp | 7 +- .../transforms/a64_transpose_interleave_12_2x4.hpp | 7 +- .../a64_transpose_interleave_12_2x4_fp32bf16.hpp | 6 +- .../a64_transpose_interleave_12_s8s16.hpp | 7 +- .../a64_transpose_interleave_12_u8u16.hpp | 7 +- .../transforms/a64_transpose_interleave_16.hpp | 5 +- .../transforms/a64_transpose_interleave_16_1x4.hpp | 5 +- .../transforms/a64_transpose_interleave_16_1x8.hpp | 6 +- .../transforms/a64_transpose_interleave_16_2x2.hpp | 7 +- .../transforms/a64_transpose_interleave_16_2x4.hpp | 7 +- .../a64_transpose_interleave_16_2x4_fp32bf16.hpp | 7 +- .../transforms/a64_transpose_interleave_24.hpp | 6 +- .../a64_transpose_interleave_24_2x4_fp32bf16.hpp | 6 +- .../a64_transpose_interleave_24_bf16fp32.hpp | 7 +- .../a64_transpose_interleave_24_fp16fp32.hpp | 6 +- .../transforms/a64_transpose_interleave_32_1x4.hpp | 5 +- .../transforms/a64_transpose_interleave_32_2x2.hpp | 7 +- .../transforms/a64_transpose_interleave_48.hpp | 6 +- .../transforms/a64_transpose_interleave_4_1x16.hpp | 6 +- .../transforms/a64_transpose_interleave_4_1x4.hpp | 5 +- .../transforms/a64_transpose_interleave_64.hpp | 6 +- .../transforms/a64_transpose_interleave_96.hpp | 6 +- .../transforms/sme_transpose_interleave_16VL.hpp | 6 +- .../sme_transpose_interleave_16VL_1x4.hpp | 6 +- 
.../sme_transpose_interleave_16VL_2x2.hpp | 6 +- .../sme_transpose_interleave_16VL_2x2_fp32bf16.hpp | 6 +- .../transforms/sme_transpose_interleave_1VL.hpp | 6 +- .../sme_transpose_interleave_1VL_1x4.hpp | 6 +- .../sme_transpose_interleave_1VL_2x2.hpp | 5 +- .../sme_transpose_interleave_1VL_2x2_fp32bf16.hpp | 6 +- .../transforms/sme_transpose_interleave_2VL.hpp | 6 +- .../sme_transpose_interleave_2VL_1x4.hpp | 6 +- .../sme_transpose_interleave_2VL_2x2.hpp | 6 +- .../sme_transpose_interleave_2VL_2x2_fp32bf16.hpp | 6 +- .../transforms/sme_transpose_interleave_4VL.hpp | 6 +- .../sme_transpose_interleave_4VL_1x4.hpp | 6 +- .../sme_transpose_interleave_4VL_2x2.hpp | 6 +- .../sme_transpose_interleave_4VL_2x2_fp32bf16.hpp | 6 +- .../transforms/sme_transpose_interleave_8VL.hpp | 208 +++++++++++++++++++++ .../sme_transpose_interleave_8VL_1x4.hpp | 143 ++++++++++++++ .../sme_transpose_interleave_8VL_2x2.hpp | 132 +++++++++++++ .../sve_transpose_interleave_12VL_2x4_fp32bf16.hpp | 6 +- .../transforms/sve_transpose_interleave_1VL.hpp | 8 +- .../sve_transpose_interleave_1VL_1x4.hpp | 6 +- .../transforms/sve_transpose_interleave_3VL.hpp | 8 +- .../sve_transpose_interleave_3VL_1x4.hpp | 7 +- .../sve_transpose_interleave_3VL_2x2.hpp | 7 +- .../transforms/sve_transpose_interleave_4VL.hpp | 8 +- .../sve_transpose_interleave_4VL_1x4.hpp | 6 +- .../sve_transpose_interleave_4VL_2x2.hpp | 8 +- .../sve_transpose_interleave_6VL_1x8.hpp | 6 +- .../sve_transpose_interleave_6VL_2x4.hpp | 8 +- .../sve_transpose_interleave_6VL_2x4_fp32bf16.hpp | 6 +- .../sve_transpose_interleave_6VL_4x2.hpp | 7 +- .../transforms/sve_transpose_interleave_8VL.hpp | 7 +- .../sve_transpose_interleave_8VL_1x4.hpp | 6 +- .../sve_transpose_interleave_8VL_1x8.hpp | 6 +- .../sve_transpose_interleave_8VL_2x2.hpp | 7 +- .../sve_transpose_interleave_8VL_2x4.hpp | 7 +- .../sve_transpose_interleave_8VL_2x4_fp32bf16.hpp | 6 +- 63 files changed, 662 insertions(+), 201 deletions(-) create mode 100644 
src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp (limited to 'src/core/NEON/kernels/arm_gemm/transforms') diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp index e6186984e8..8574d89226 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -193,7 +193,6 @@ void a64_transpose_interleave_128(uint32_t *out, const uint32_t *in, size_t widt "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -264,7 +263,6 @@ void a64_transpose_interleave_128(uint32_t *out, const uint32_t *in, size_t widt "add %x[out], %x[out], #0x80\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25" @@ -286,4 +284,5 @@ void Transform<32, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp index 6d97f71c7d..cdf1f98608 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp +++ 
b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -427,4 +427,5 @@ void Transform<12, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp index 96d132b74f..da0809d4d6 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -39,7 +39,6 @@ void a64_transpose_interleave_12_1x8(uint8_t *out, const uint8_t *in, size_t wid size_t out_stride = 12 * roundup(height, 8) * sizeof(uint8_t); __asm__ __volatile__( - "1:" // Main row loop: Head "mov x9, %x[in]\n" "add x28, x9, %x[in_stride]\n" @@ -332,4 +331,5 @@ void Transform<12, 8, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp index 04af6fd713..cef468e9cc 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -236,7 +236,6 @@ void a64_transpose_interleave_12_2x2(uint16_t *out, const uint16_t *in, size_t w "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x9, %x[in]\n" "mov x20, %x[width]\n" @@ -319,7 +318,6 @@ void a64_transpose_interleave_12_2x2(uint16_t *out, const uint16_t *in, size_t w "add %x[out], %x[out], #0x30\n" "bge 11b\n" "20:" // Done - : [height] "+&r" 
(height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -341,4 +339,5 @@ void Transform<12, 2, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp index e6ddc10e04..4c02d0534d 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -276,7 +276,6 @@ void a64_transpose_interleave_12_2x4(uint16_t *out, const uint16_t *in, size_t w "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x9, %x[in]\n" "add x28, x9, %x[in_stride]\n" @@ -420,7 +419,6 @@ void a64_transpose_interleave_12_2x4(uint16_t *out, const uint16_t *in, size_t w "add %x[out], %x[out], #0x60\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -442,4 +440,5 @@ void Transform<12, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git 
a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp index e487d4d839..2a3208d18d 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -710,7 +710,6 @@ void a64_transpose_interleave_12_2x4_fp32bf16(bfloat16 *out, const float *in, si "add %x[out], %x[out], #0x60\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -731,4 +730,5 @@ void Transform<12, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp index 7938325fa4..4d9d5e7f43 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -182,7 +182,6 @@ void a64_transpose_interleave_12_s8s16(int16_t *out, const int8_t *in, size_t wi "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -251,7 +250,6 @@ void 
a64_transpose_interleave_12_s8s16(int16_t *out, const int8_t *in, size_t wi "add %x[out], %x[out], #0x18\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25" @@ -272,4 +270,5 @@ void Transform<12, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp index 4c66fb2c2f..b0cd7e4ef7 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -182,7 +182,6 @@ void a64_transpose_interleave_12_u8u16(uint16_t *out, const uint8_t *in, size_t "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -251,7 +250,6 @@ void a64_transpose_interleave_12_u8u16(uint16_t *out, const uint8_t *in, size_t "add %x[out], %x[out], #0x18\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25" @@ -272,4 +270,5 @@ void Transform<12, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp 
b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp index f06c167361..0399f8becc 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -137,4 +137,5 @@ void Transform<4, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp index e0ccb368c2..f3a1dde73f 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -327,4 +327,5 @@ void Transform<16, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp index fa45f4fd4d..7c7e91e666 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -39,7 +39,6 @@ void a64_transpose_interleave_16_1x8(uint8_t *out, const uint8_t *in, size_t wid size_t out_stride = 16 * roundup(height, 8) * sizeof(uint8_t); __asm__ __volatile__( - "1:" // Main row loop: Head "mov x9, %x[in]\n" "add x28, x9, %x[in_stride]\n" @@ -288,4 +287,5 @@ void Transform<16, 8, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp 
b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp index 06efa9781e..b4515cbfd4 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -163,7 +163,6 @@ void a64_transpose_interleave_16_2x2(uint16_t *out, const uint16_t *in, size_t w "bge 1b\n" "cbz %x[height], 16f\n" "8:" // Main loop skip - "9:" // Tail row loop: Head "mov x9, %x[in]\n" "mov x20, %x[width]\n" @@ -221,7 +220,6 @@ void a64_transpose_interleave_16_2x2(uint16_t *out, const uint16_t *in, size_t w "add %x[out], %x[out], #0x40\n" "bge 9b\n" "16:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -243,4 +241,5 @@ void Transform<16, 2, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp index dafa53eec3..ac67467240 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -320,7 +320,6 @@ void a64_transpose_interleave_16_2x4(uint16_t *out, const uint16_t *in, size_t w "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x9, %x[in]\n" "add x28, x9, %x[in_stride]\n" @@ -486,7 +485,6 @@ void 
a64_transpose_interleave_16_2x4(uint16_t *out, const uint16_t *in, size_t w "add %x[out], %x[out], #0x80\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -508,4 +506,5 @@ void Transform<16, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp index e012d0920f..b9fe8b126a 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -281,7 +281,6 @@ void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, si "bge 1b\n" "cbz %x[height], 16f\n" "8:" // Main loop skip - "9:" // Tail row loop: Head "mov x9, %x[in]\n" "add x28, x9, %x[in_stride]\n" @@ -423,7 +422,6 @@ void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, si "add %x[out], %x[out], #0x80\n" "bge 9b\n" "16:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", 
"v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -444,4 +442,5 @@ void Transform<16, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp index 20f9d39f4e..46211ad4e4 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -158,7 +158,6 @@ void a64_transpose_interleave_24(uint16_t *out, const uint16_t *in, size_t width "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -268,4 +267,5 @@ void Transform<12, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp index 22d68acd51..1cb7bc4445 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -762,7 +762,6 @@ void a64_transpose_interleave_24_2x4_fp32bf16(bfloat16 *out, const float *in, si "add %x[out], %x[out], #0xc0\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", 
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -783,4 +782,5 @@ void Transform<24, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp index 799a9cd91d..dcaf69d2a8 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -198,7 +198,6 @@ void a64_transpose_interleave_24_bf16fp32(float *out, const bfloat16 *in, size_t "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -271,7 +270,6 @@ void a64_transpose_interleave_24_bf16fp32(float *out, const bfloat16 *in, size_t "add %x[out], %x[out], #0x30\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25" @@ -292,4 +290,5 @@ void Transform<12, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp index 621c5f99ff..966b75664e 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp @@ -24,7 +24,7 @@ #pragma once 
-#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -270,7 +270,6 @@ void a64_transpose_interleave_24_fp16fp32(float *out, const __fp16 *in, size_t w "add %x[out], %x[out], #0x30\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25" @@ -291,4 +290,5 @@ void Transform<12, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp index 5cd7bd0512..4a22675028 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -503,4 +503,5 @@ void Transform<32, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp index 706d7cd359..237536697c 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -280,7 +280,6 @@ void a64_transpose_interleave_32_2x2(uint16_t *out, const uint16_t *in, size_t w "bge 1b\n" "cbz %x[height], 24f\n" "12:" // Main loop skip - "13:" // Tail row loop: Head "mov x25, %x[in]\n" "mov x20, %x[width]\n" @@ -427,7 +426,6 @@ void 
a64_transpose_interleave_32_2x2(uint16_t *out, const uint16_t *in, size_t w "add %x[out], %x[out], #0x80\n" "bge 13b\n" "24:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25" @@ -449,4 +447,5 @@ void Transform<32, 2, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp index b4827525cd..f35752d5a8 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -137,7 +137,6 @@ void a64_transpose_interleave_48(uint16_t *out, const uint16_t *in, size_t width "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -241,4 +240,5 @@ void Transform<6, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp index e1ab14e594..6ef02ac044 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -39,7 +39,6 @@ void 
a64_transpose_interleave_4_1x16(uint8_t *out, const uint8_t *in, size_t wid size_t out_stride = 4 * roundup(height, 16) * sizeof(uint8_t); __asm__ __volatile__( - "1:" // Main row loop: Head "mov x17, %x[in]\n" "add x16, x17, %x[in_stride]\n" @@ -316,4 +315,5 @@ void Transform<4, 16, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp index 8adc69e8b3..5667820865 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -333,4 +333,5 @@ void Transform<4, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp index 07602bdc8d..328274a488 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -145,7 +145,6 @@ void a64_transpose_interleave_64(uint16_t *out, const uint16_t *in, size_t width "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -251,4 +250,5 @@ void Transform<32, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp index a048fbb109..feb469ab0e 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp +++ 
b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -177,7 +177,6 @@ void a64_transpose_interleave_96(uint32_t *out, const uint32_t *in, size_t width "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -265,4 +264,5 @@ void Transform<24, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp index 01921c5ad9..a4d480c405 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -140,4 +139,5 @@ void Transform<16, 1, true, VLType::SME>( ); } -#endif // __ARM_FEATURE_SVE + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp index 6b9b471fdc..552abfc1c6 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -178,4 +177,5 @@ void Transform<16, 4, true, VLType::SME>( ); } -#endif // __ARM_FEATURE_SVE + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp index 96128cf9c2..9c6f5c83a1 100644 --- 
a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -150,4 +149,5 @@ void Transform<16, 2, true, VLType::SME>( ); } -#endif // __ARM_FEATURE_SVE + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp index 080db1c5c1..2756327815 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -197,4 +196,5 @@ void Transform<16, 2, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp index 7e496095f4..a6ddb8fec0 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -210,4 +209,5 @@ void Transform<1, 1, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp index 45d3c0729e..399a52e233 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp +++ 
b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -146,4 +145,5 @@ void Transform<1, 4, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp index 7120d1d33e..6318e29a79 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -206,4 +205,4 @@ void Transform<1, 2, true, VLType::SME>( ); } -#endif +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp index 72e7b0c99a..b90063028d 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -219,4 +218,5 @@ void Transform<1, 2, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp index a057fd514e..f827197ab7 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if 
defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -222,4 +221,5 @@ void Transform<2, 1, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp index 9eb4075677..c471d66e17 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -146,4 +145,5 @@ void Transform<2, 4, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp index 3fc3920500..5f967fa615 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -208,4 +207,5 @@ void Transform<2, 2, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp index 9d402a2d58..f22b833821 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -236,4 +235,5 @@ void Transform<2, 2, true, VLType::SME>( ); } -#endif + +#endif // 
defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp index 362bebbea0..14636e3218 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -185,4 +184,5 @@ void Transform<4, 1, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp index cbcc0b4c8b..2d46a481f3 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -121,4 +120,5 @@ void Transform<4, 4, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp index 9b28578217..002a12479a 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -168,4 +167,5 @@ void Transform<4, 2, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp 
b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp index 8873070019..2a43f34f71 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -186,4 +185,5 @@ void Transform<4, 2, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp new file mode 100644 index 0000000000..be9ad666a9 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#if defined(ARM_COMPUTE_ENABLE_SME) + +namespace { +/* Repack rows of 16-bit elements into column blocks: each column block is eight predicated ld1h vector loads per row (lanes past the remaining width load as zero) stored back as eight full vectors. The main loop handles two rows per pass, the tail loop one row. Successive column blocks are out_stride bytes apart in the output and successive rows are eight vectors apart. width is counted in 16-bit elements (dech), in_stride is a byte offset added to the row pointer. The asm is bracketed by raw-encoded SMSTART/SMSTOP instructions. NOTE(review): with height == 0 the initial "blt 4f" still enters the tail loop once, so callers presumably pass height >= 1 - confirm. */ +void sme_transpose_interleave_8VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 8 * height * sme::get_vector_length<uint8_t>(); + + __asm__ __volatile__( + ".inst 0xd503477f // SMSTART ZA\n" + "cmp %x[height], #0x2\n" + "ptrue p7.b\n" + "blt 4f\n" + "1:" // Main row loop: Head + "mov x25, %x[in]\n" + "add x24, x25, %x[in_stride]\n" + "add %x[in], x24, %x[in_stride]\n" + "mov x23, %x[out]\n" + "sub %x[height], %x[height], #0x2\n" + "mov x22, %x[width]\n" + "2:" // Main row loop: Column loop + "mov x21, x22\n" + "whilelt p0.h, XZR, x21\n" + "ld1h { z31.h }, p0/Z, [x25]\n" + "dech x21\n" + "whilelt p6.h, XZR, x21\n" + "ld1h { z30.h }, p6/Z, [x25, #1, MUL VL]\n" + "dech x21\n" + "whilelt p5.h, XZR, x21\n" + "ld1h { z29.h }, p5/Z, [x25, #2, MUL VL]\n" + "dech x21\n" + "whilelt p4.h, XZR, x21\n" + "ld1h { z28.h }, p4/Z, [x25, #3, MUL VL]\n" + "dech x21\n" + "whilelt p3.h, XZR, x21\n" + "ld1h { z27.h }, p3/Z, [x25, #4, MUL VL]\n" + "dech x21\n" + "whilelt p2.h, XZR, x21\n" + "ld1h { z26.h }, p2/Z, [x25, #5, MUL VL]\n" + "dech x21\n" + "whilelt p1.h, XZR, x21\n" + "ld1h { z25.h }, p1/Z, [x25, #6, MUL VL]\n" + "dech x21\n" + "mov x20, x23\n" + "ld1h { z24.h }, p0/Z, [x24]\n" + "whilelt p0.h, XZR, x21\n" + "dech x22, ALL, MUL #8\n" + "ld1h { z23.h }, p0/Z, [x25, #7, MUL VL]\n" + "ld1h { z22.h }, p6/Z, [x24, #1, MUL VL]\n" + "cmp x22, #0x0\n" + "addvl x25, x25, #8\n" + "ld1h { z21.h }, p5/Z, [x24, #2, MUL VL]\n" + "add x23, x23, %x[out_stride]\n" + "ld1h { z20.h }, p4/Z, [x24, #3, MUL VL]\n" + "ld1h { z19.h }, p3/Z, [x24, #4, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x24, #5, MUL
VL]\n" + "ld1h { z17.h }, p1/Z, [x24, #6, MUL VL]\n" + "ld1h { z16.h }, p0/Z, [x24, #7, MUL VL]\n" + "st1h { z31.h }, p7, [x20]\n" + "addvl x24, x24, #8\n" + "st1h { z30.h }, p7, [x20, #1, MUL VL]\n" + "st1h { z29.h }, p7, [x20, #2, MUL VL]\n" + "st1h { z28.h }, p7, [x20, #3, MUL VL]\n" + "st1h { z27.h }, p7, [x20, #4, MUL VL]\n" + "st1h { z26.h }, p7, [x20, #5, MUL VL]\n" + "st1h { z25.h }, p7, [x20, #6, MUL VL]\n" + "st1h { z23.h }, p7, [x20, #7, MUL VL]\n" + "addvl x20, x20, #16\n" + "st1h { z24.h }, p7, [x20, #-8, MUL VL]\n" + "st1h { z22.h }, p7, [x20, #-7, MUL VL]\n" + "st1h { z21.h }, p7, [x20, #-6, MUL VL]\n" + "st1h { z20.h }, p7, [x20, #-5, MUL VL]\n" + "st1h { z19.h }, p7, [x20, #-4, MUL VL]\n" + "st1h { z18.h }, p7, [x20, #-3, MUL VL]\n" + "st1h { z17.h }, p7, [x20, #-2, MUL VL]\n" + "st1h { z16.h }, p7, [x20, #-1, MUL VL]\n" + "bgt 2b\n" + "3:" // Main row loop: Column loop skip + "cmp %x[height], #0x2\n" + "addvl %x[out], %x[out], #16\n" + "bge 1b\n" + "cbz %x[height], 8f\n" + "4:" // Main loop skip + "5:" // Tail row loop: Head + "mov x25, %x[in]\n" + "add %x[in], x25, %x[in_stride]\n" + "mov x23, %x[out]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x21, %x[width]\n" + "6:" // Tail row loop: Column loop + "mov x20, x21\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z23.h }, p0/Z, [x25]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z22.h }, p0/Z, [x25, #1, MUL VL]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z21.h }, p0/Z, [x25, #2, MUL VL]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z20.h }, p0/Z, [x25, #3, MUL VL]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z19.h }, p0/Z, [x25, #4, MUL VL]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z18.h }, p0/Z, [x25, #5, MUL VL]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z17.h }, p0/Z, [x25, #6, MUL VL]\n" + "dech x20\n" + "dech x21, ALL, MUL #8\n" + "whilelt p0.h, XZR, x20\n" + "cmp x21, #0x0\n" + "ld1h { z16.h }, p0/Z, [x25, #7, MUL
VL]\n" + "st1h { z23.h }, p7, [x23]\n" + "addvl x25, x25, #8\n" + "st1h { z22.h }, p7, [x23, #1, MUL VL]\n" + "st1h { z21.h }, p7, [x23, #2, MUL VL]\n" + "st1h { z20.h }, p7, [x23, #3, MUL VL]\n" + "st1h { z19.h }, p7, [x23, #4, MUL VL]\n" + "st1h { z18.h }, p7, [x23, #5, MUL VL]\n" + "st1h { z17.h }, p7, [x23, #6, MUL VL]\n" + "st1h { z16.h }, p7, [x23, #7, MUL VL]\n" + "add x23, x23, %x[out_stride]\n" + "bgt 6b\n" + "7:" // Tail row loop: Column loop skip + "cmp %x[height], #0x1\n" + "addvl %x[out], %x[out], #8\n" + "bge 5b\n" + "8:" // Done + ".inst 0xd503467f // SMSTOP\n" + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace +/* Transform<8, 1, true, VLType::SME> specialisations for float, bfloat16 and __fp16: each starts at row k0 / column x0 of the input, passes the column count (xmax-x0) converted to 16-bit units via sizeof(T) / 2, the row stride converted to bytes via sizeof(T), and the row count (kmax-k0) to the 16-bit kernel above. */ +template<> +void Transform<8, 1, true, VLType::SME>( + float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sme_transpose_interleave_8VL( + reinterpret_cast<uint16_t *>(out), + reinterpret_cast<const uint16_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(float) / 2, + stride * sizeof(float), + (kmax-k0) + ); +} + +template<> +void Transform<8, 1, true, VLType::SME>( + bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sme_transpose_interleave_8VL( + reinterpret_cast<uint16_t *>(out), + reinterpret_cast<const uint16_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(bfloat16) / 2, + stride * sizeof(bfloat16), + (kmax-k0) + ); +} + +template<> +void Transform<8, 1, true, VLType::SME>( + __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sme_transpose_interleave_8VL( + reinterpret_cast<uint16_t *>(out), +
reinterpret_cast<const uint16_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(__fp16) / 2, + stride * sizeof(__fp16), + (kmax-k0) + ); +} + + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp new file mode 100644 index 0000000000..45d2e24258 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#pragma once + +#if defined(ARM_COMPUTE_ENABLE_SME) + +namespace { +/* Interleave 8-bit data in groups of four rows: per column block, two predicated ld1b vectors are loaded from each of the four rows (lanes past the remaining width load as zero) and merged with two rounds of zip1/zip2 so that the four rows' bytes for one column become adjacent, giving eight stored vectors. When fewer than four rows remain, the missing row pointers are replaced (csel) by pad_row, which is zero-filled whenever height is not a multiple of four. Successive column blocks are out_stride bytes apart and successive row groups eight vectors apart. width and in_stride are both byte quantities. NOTE(review): the row loop tests height only after a pass, so height is presumably >= 1 - confirm; pad_row is alloca(width), so width is presumably small enough for the stack - confirm. */ +void sme_transpose_interleave_8VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height) +{ + uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(uint8_t)); + } + + size_t out_stride = 8 * roundup<size_t>(height, 4) * sme::get_vector_length<uint32_t>(); + + __asm__ __volatile__( + ".inst 0xd503477f // SMSTART ZA\n" + "ptrue p2.b\n" + "1:" // Main row loop: Head + "mov x26, %x[in]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "cmp %x[height], #0x3\n" + "add %x[in], x23, %x[in_stride]\n" + "csel x23, x23, %x[pad_row], GT\n" + "csel x24, x24, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "mov x22, %x[out]\n" + "csel x25, x25, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x4\n" + "mov x21, %x[width]\n" + "2:" // Main row loop: Column loop + "mov x20, x21\n" + "whilelt p1.b, XZR, x20\n" + "ld1b { z19.b }, p1/Z, [x26]\n" + "decb x20\n" + "whilelt p0.b, XZR, x20\n" + "ld1b { z17.b }, p0/Z, [x26, #1, MUL VL]\n" + "ld1b { z18.b }, p1/Z, [x25]\n" + "decw x21, ALL, MUL #8\n" + "cmp x21, #0x0\n" + "ld1b { z21.b }, p0/Z, [x25, #1, MUL VL]\n" + "addvl x26, x26, #2\n" + "addvl x25, x25, #2\n" + "ld1b { z16.b }, p1/Z, [x24]\n" + "zip1 z24.b, z19.b, z16.b\n" + "zip2 z20.b, z19.b, z16.b\n" + "ld1b { z16.b }, p0/Z, [x24, #1, MUL VL]\n" + "zip1 z23.b, z17.b, z16.b\n" + "zip2 z22.b, z17.b, z16.b\n" + "addvl x24, x24, #2\n" + "ld1b { z16.b }, p1/Z, [x23]\n" + "zip1 z17.b, z18.b, z16.b\n" + "zip2 z19.b, z18.b, z16.b\n" + "ld1b { z16.b }, p0/Z, [x23, #1, MUL VL]\n" + "zip1 z18.b, z21.b, z16.b\n" + "zip2 z21.b, z21.b, z16.b\n" + "addvl x23, x23, #2\n" + "zip1 z16.b, z24.b, z17.b\n" + "zip2 z17.b, z24.b, z17.b\n" + "st1b { z16.b }, p2, [x22]\n" + "zip1 z16.b, z20.b, z19.b\n" + "zip2 z20.b, z20.b, z19.b\n" + "st1b { z17.b }, p2, [x22, #1, MUL VL]\n" + "zip1
z19.b, z23.b, z18.b\n" + "zip2 z18.b, z23.b, z18.b\n" + "st1b { z16.b }, p2, [x22, #2, MUL VL]\n" + "zip1 z17.b, z22.b, z21.b\n" + "zip2 z16.b, z22.b, z21.b\n" + "st1b { z20.b }, p2, [x22, #3, MUL VL]\n" + "st1b { z19.b }, p2, [x22, #4, MUL VL]\n" + "st1b { z18.b }, p2, [x22, #5, MUL VL]\n" + "st1b { z17.b }, p2, [x22, #6, MUL VL]\n" + "st1b { z16.b }, p2, [x22, #7, MUL VL]\n" + "add x22, x22, %x[out_stride]\n" + "bgt 2b\n" + "3:" // Main row loop: Column loop skip + "cmp %x[height], #0x1\n" + "addvl %x[out], %x[out], #8\n" + "bge 1b\n" + ".inst 0xd503467f // SMSTOP\n" + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace +/* Transform<8, 4, true, VLType::SME> specialisations for uint8_t and int8_t: each starts at row k0 / column x0 of the input and passes the column count (xmax-x0), the row stride in bytes and the row count (kmax-k0) to the 8-bit kernel above. */ +template<> +void Transform<8, 4, true, VLType::SME>( + uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sme_transpose_interleave_8VL_1x4( + reinterpret_cast<uint8_t *>(out), + reinterpret_cast<const uint8_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(uint8_t) / 1, + stride * sizeof(uint8_t), + (kmax-k0) + ); +} + +template<> +void Transform<8, 4, true, VLType::SME>( + int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sme_transpose_interleave_8VL_1x4( + reinterpret_cast<uint8_t *>(out), + reinterpret_cast<const uint8_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(int8_t) / 1, + stride * sizeof(int8_t), + (kmax-k0) + ); +} + + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp new file mode 100644 index 0000000000..ec7c415e27 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#if defined(ARM_COMPUTE_ENABLE_SME) + +namespace { +/* Interleave 16-bit data in pairs of rows: per column block, four predicated ld1h vectors are loaded from each of the two rows (lanes past the remaining width load as zero) and merged with zip1/zip2 so that the two rows' elements for one column become adjacent, giving eight stored vectors. When only one row remains, the second row pointer is replaced (csel) by pad_row, which is zero-filled whenever height is odd. Successive column blocks are out_stride bytes apart and successive row pairs eight vectors apart. width is counted in 16-bit elements, in_stride is a byte offset added to the row pointer. NOTE(review): the row loop tests height only after a pass, so height is presumably >= 1 - confirm; pad_row is alloca(width * 2), so width is presumably small enough for the stack - confirm. */ +void sme_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t))); + + if (height % 2) { + memset(pad_row, 0, width * sizeof(uint16_t)); + } + + size_t out_stride = 8 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>(); + + __asm__ __volatile__( + ".inst 0xd503477f // SMSTART ZA\n" + "ptrue p4.b\n" + "1:" // Main row loop: Head + "mov x24, %x[in]\n" + "add x23, x24, %x[in_stride]\n" + "cmp %x[height], #0x1\n" + "add %x[in], x23, %x[in_stride]\n" + "mov x22, %x[out]\n" + "csel x23, x23, %x[pad_row], GT\n" + "sub %x[height], %x[height], #0x2\n" + "mov x21, %x[width]\n" + "2:" // Main row loop: Column loop + "mov x20, x21\n" + "whilelt p3.h, XZR, x20\n" + "ld1h { z20.h }, p3/Z, [x24]\n" + "dech x20\n" + "whilelt p2.h, XZR, x20\n" + "ld1h { z19.h }, p2/Z, [x24, #1, MUL VL]\n" + "dech x20\n" + "whilelt p1.h, XZR, x20\n" + "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z24.h }, p0/Z, [x24, #3, MUL VL]\n" + "ld1h { z17.h }, p3/Z, [x23]\n" + "decw x21, ALL, MUL #8\n" + "cmp x21, #0x0\n" + "zip1 z23.h, z20.h, z17.h\n" + "ld1h { z16.h }, p2/Z, [x23, #1, MUL VL]\n" + "addvl x24, x24, #4\n" + "zip2 z22.h, z20.h, z17.h\n" + "zip1 z21.h, z19.h, z16.h\n" + "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n" + "zip2 z20.h, z19.h, z16.h\n" + "zip1 z19.h, z18.h, z17.h\n" + "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "zip2 z18.h, z18.h, z17.h\n" + "zip1 z17.h, z24.h, z16.h\n" + "zip2 z16.h, z24.h, z16.h\n" + "st1h { z23.h }, p4, [x22]\n" + "st1h { z22.h }, p4, [x22, #1, MUL VL]\n" + "st1h { z21.h }, p4, [x22, #2, MUL VL]\n" + "st1h { z20.h }, p4, [x22, #3, MUL VL]\n" + "st1h { z19.h }, p4, [x22, #4, MUL VL]\n" + "st1h { z18.h }, p4, [x22, #5, MUL VL]\n" + "st1h { z17.h }, p4, [x22, #6, MUL VL]\n" + "st1h { z16.h },
p4, [x22, #7, MUL VL]\n" + "add x22, x22, %x[out_stride]\n" + "bgt 2b\n" + "3:" // Main row loop: Column loop skip + "cmp %x[height], #0x1\n" + "addvl %x[out], %x[out], #8\n" + "bge 1b\n" + ".inst 0xd503467f // SMSTOP\n" + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // anonymous namespace +/* Transform<8, 2, true, VLType::SME> specialisations for bfloat16 and __fp16: each starts at row k0 / column x0 of the input, passes the column count (xmax-x0) converted to 16-bit units via sizeof(T) / 2, the row stride converted to bytes via sizeof(T), and the row count (kmax-k0) to the 16-bit kernel above. */ +template<> +void Transform<8, 2, true, VLType::SME>( + bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sme_transpose_interleave_8VL_2x2( + reinterpret_cast<uint16_t *>(out), + reinterpret_cast<const uint16_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(bfloat16) / 2, + stride * sizeof(bfloat16), + (kmax-k0) + ); +} + +template<> +void Transform<8, 2, true, VLType::SME>( + __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sme_transpose_interleave_8VL_2x2( + reinterpret_cast<uint16_t *>(out), + reinterpret_cast<const uint16_t *>(in + k0 * stride + x0), + (xmax-x0) * sizeof(__fp16) / 2, + stride * sizeof(__fp16), + (kmax-k0) + ); +} + + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp index 847718992a..f627fe575f 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if
defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -373,4 +372,5 @@ void Transform<12, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp index 74fce4ddf9..b33c4f6c2d 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -101,7 +100,6 @@ void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t widt "bge 1b\n" "cbz %x[height], 12f\n" "6:" // Main loop skip - "7:" // Tail row loop: Head "mov x21, %x[width]\n" "cntw x20, ALL, MUL #2\n" @@ -138,7 +136,6 @@ void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t widt "addvl %x[out], %x[out], #1\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23" @@ -160,4 +157,5 @@ void Transform<1, 1, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp index a034be5e74..e468787815 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -305,4 +304,5 @@ void Transform<1, 4, true, VLType::SVE>( ); } -#endif + +#endif 
// defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp index 82d4184061..546800fa69 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -93,7 +92,6 @@ void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t widt "bge 1b\n" "cbz %x[height], 8f\n" "4:" // Main loop skip - "5:" // Tail row loop: Head "mov x26, %x[in]\n" "add %x[in], x26, %x[in_stride]\n" @@ -123,7 +121,6 @@ void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t widt "addvl %x[out], %x[out], #3\n" "bge 5b\n" "8:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27" @@ -171,4 +168,5 @@ void Transform<3, 1, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp index ec7095db7b..a44141c109 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -329,7 +328,6 @@ void sve_transpose_interleave_3VL_1x4(uint8_t *out, const uint8_t *in, size_t wi "addvl %x[out], %x[out], #3\n" "bge 7b\n" "12:" // Done - : [height] 
"+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -364,4 +362,5 @@ void Transform<3, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp index 3d14383a64..36a15a16b3 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -292,7 +291,6 @@ void sve_transpose_interleave_3VL_2x2(uint16_t *out, const uint16_t *in, size_t "addvl %x[out], %x[out], #3\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -314,4 +312,5 @@ void Transform<3, 2, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp index 
a39235187f..e661e2698a 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -103,7 +102,6 @@ void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt "bge 1b\n" "cbz %x[height], 8f\n" "4:" // Main loop skip - "5:" // Tail row loop: Head "mov x26, %x[in]\n" "add %x[in], x26, %x[in_stride]\n" @@ -137,7 +135,6 @@ void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt "addvl %x[out], %x[out], #4\n" "bge 5b\n" "8:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -185,4 +182,5 @@ void Transform<4, 1, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp index e3489398d4..03a78f72f1 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -317,4 +316,5 @@ void Transform<4, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp index 9505dc5e6d..b196799cfe 100644 --- 
a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -248,7 +247,6 @@ void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t "bge 1b\n" "cbz %x[height], 12f\n" "6:" // Main loop skip - "7:" // Tail row loop: Head "mov x12, %x[in]\n" "mov x21, %x[width]\n" @@ -323,7 +321,6 @@ void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t "addvl %x[out], %x[out], #4\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -345,4 +342,5 @@ void Transform<4, 2, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp index 982c0545ed..68fe2d0cbe 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -292,4 +291,5 @@ void Transform<6, 8, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp 
b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp index 2b5741a49c..910fc6cb02 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -260,7 +259,6 @@ void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t "bge 1b\n" "cbz %x[height], 12f\n" "6:" // Main loop skip - "7:" // Tail row loop: Head "mov x12, %x[in]\n" "add x11, x12, %x[in_stride]\n" @@ -386,7 +384,6 @@ void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t "addvl %x[out], %x[out], #6\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -408,4 +405,5 @@ void Transform<6, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp index 146da33869..f0f10d2f43 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -235,4 +234,5 @@ void Transform<6, 4, true, VLType::SVE>( ); } 
-#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp index f6fc5e8b84..c638eaacde 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -296,7 +295,6 @@ void sve_transpose_interleave_6VL_4x2(uint32_t *out, const uint32_t *in, size_t "addvl %x[out], %x[out], #6\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -318,4 +316,5 @@ void Transform<6, 2, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp index 07147acd8e..0526bd0596 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -281,7 +280,6 @@ void sve_transpose_interleave_8VL(uint32_t *out, const uint32_t *in, size_t widt "addvl %x[out], %x[out], #8\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] 
"r" (out_stride), [width] "r" (width) : "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -303,4 +301,5 @@ void Transform<8, 1, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp index 3ba50fee60..98f0770d77 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -283,4 +282,5 @@ void Transform<8, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp index 6b5ca38ab1..3fa5292143 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -256,4 +255,5 @@ void Transform<8, 8, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp index 237e9b684f..02977ecf1e 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp +++ 
b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -354,7 +353,6 @@ void sve_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t "addvl %x[out], %x[out], #8\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -376,4 +374,5 @@ void Transform<8, 2, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp index 51cae7dd5a..34799c60a6 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -439,7 +438,6 @@ void sve_transpose_interleave_8VL_2x4(uint16_t *out, const uint16_t *in, size_t "addvl %x[out], %x[out], #8\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", 
"z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -461,4 +459,5 @@ void Transform<8, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp index 4ad882870e..5a48e579ae 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -279,4 +278,5 @@ void Transform<8, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) -- cgit v1.2.1