Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/transforms')
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp | 208
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp | 143
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp | 132
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp | 6
63 files changed, 662 insertions(+), 201 deletions(-)
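
Most of the changes below follow one pattern: plain #ifdef guards become #if defined(...), each closing #endif gains a comment naming the condition it closes, and the SVE/SME transforms are gated on the build options ARM_COMPUTE_ENABLE_SME / ARM_COMPUTE_ENABLE_SVE instead of the compiler macro __ARM_FEATURE_SVE. The skeleton below illustrates the resulting guard style; it is a sketch rather than a file from this patch, and example_transform is a made-up name.

    #pragma once

    #include <cstddef>
    #include <cstdint>

    // Gate on the explicit build option rather than a compiler feature-test
    // macro, and echo the condition on the closing #endif.
    #if defined(ARM_COMPUTE_ENABLE_SME)   // previously: #ifdef __ARM_FEATURE_SVE

    namespace {

    // Kernels in these headers live in an anonymous namespace and are only
    // compiled when the build explicitly enables SME.
    void example_transform(uint16_t *out, const uint16_t *in,
                           size_t width, size_t in_stride, size_t height);

    } // anonymous namespace

    #endif // defined(ARM_COMPUTE_ENABLE_SME)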
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
index e6186984e8..8574d89226 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -193,7 +193,6 @@ void a64_transpose_interleave_128(uint32_t *out, const uint32_t *in, size_t widt
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -264,7 +263,6 @@ void a64_transpose_interleave_128(uint32_t *out, const uint32_t *in, size_t widt
"add %x[out], %x[out], #0x80\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -286,4 +284,5 @@ void Transform<32, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
index 6d97f71c7d..cdf1f98608 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -427,4 +427,5 @@ void Transform<12, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
index 96d132b74f..da0809d4d6 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -39,7 +39,6 @@ void a64_transpose_interleave_12_1x8(uint8_t *out, const uint8_t *in, size_t wid
size_t out_stride = 12 * roundup<size_t>(height, 8) * sizeof(uint8_t);
__asm__ __volatile__(
-
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
"add x28, x9, %x[in_stride]\n"
@@ -332,4 +331,5 @@ void Transform<12, 8, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
index 04af6fd713..cef468e9cc 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -236,7 +236,6 @@ void a64_transpose_interleave_12_2x2(uint16_t *out, const uint16_t *in, size_t w
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x9, %x[in]\n"
"mov x20, %x[width]\n"
@@ -319,7 +318,6 @@ void a64_transpose_interleave_12_2x2(uint16_t *out, const uint16_t *in, size_t w
"add %x[out], %x[out], #0x30\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -341,4 +339,5 @@ void Transform<12, 2, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
index e6ddc10e04..4c02d0534d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -276,7 +276,6 @@ void a64_transpose_interleave_12_2x4(uint16_t *out, const uint16_t *in, size_t w
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x9, %x[in]\n"
"add x28, x9, %x[in_stride]\n"
@@ -420,7 +419,6 @@ void a64_transpose_interleave_12_2x4(uint16_t *out, const uint16_t *in, size_t w
"add %x[out], %x[out], #0x60\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -442,4 +440,5 @@ void Transform<12, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
index e487d4d839..2a3208d18d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -710,7 +710,6 @@ void a64_transpose_interleave_12_2x4_fp32bf16(bfloat16 *out, const float *in, si
"add %x[out], %x[out], #0x60\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -731,4 +730,5 @@ void Transform<12, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
index 7938325fa4..4d9d5e7f43 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -182,7 +182,6 @@ void a64_transpose_interleave_12_s8s16(int16_t *out, const int8_t *in, size_t wi
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -251,7 +250,6 @@ void a64_transpose_interleave_12_s8s16(int16_t *out, const int8_t *in, size_t wi
"add %x[out], %x[out], #0x18\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -272,4 +270,5 @@ void Transform<12, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
index 4c66fb2c2f..b0cd7e4ef7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -182,7 +182,6 @@ void a64_transpose_interleave_12_u8u16(uint16_t *out, const uint8_t *in, size_t
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -251,7 +250,6 @@ void a64_transpose_interleave_12_u8u16(uint16_t *out, const uint8_t *in, size_t
"add %x[out], %x[out], #0x18\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -272,4 +270,5 @@ void Transform<12, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
index f06c167361..0399f8becc 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -137,4 +137,5 @@ void Transform<4, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
index e0ccb368c2..f3a1dde73f 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -327,4 +327,5 @@ void Transform<16, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
index fa45f4fd4d..7c7e91e666 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -39,7 +39,6 @@ void a64_transpose_interleave_16_1x8(uint8_t *out, const uint8_t *in, size_t wid
size_t out_stride = 16 * roundup<size_t>(height, 8) * sizeof(uint8_t);
__asm__ __volatile__(
-
"1:" // Main row loop: Head
"mov x9, %x[in]\n"
"add x28, x9, %x[in_stride]\n"
@@ -288,4 +287,5 @@ void Transform<16, 8, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
index 06efa9781e..b4515cbfd4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -163,7 +163,6 @@ void a64_transpose_interleave_16_2x2(uint16_t *out, const uint16_t *in, size_t w
"bge 1b\n"
"cbz %x[height], 16f\n"
"8:" // Main loop skip
-
"9:" // Tail row loop: Head
"mov x9, %x[in]\n"
"mov x20, %x[width]\n"
@@ -221,7 +220,6 @@ void a64_transpose_interleave_16_2x2(uint16_t *out, const uint16_t *in, size_t w
"add %x[out], %x[out], #0x40\n"
"bge 9b\n"
"16:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -243,4 +241,5 @@ void Transform<16, 2, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
index dafa53eec3..ac67467240 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -320,7 +320,6 @@ void a64_transpose_interleave_16_2x4(uint16_t *out, const uint16_t *in, size_t w
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x9, %x[in]\n"
"add x28, x9, %x[in_stride]\n"
@@ -486,7 +485,6 @@ void a64_transpose_interleave_16_2x4(uint16_t *out, const uint16_t *in, size_t w
"add %x[out], %x[out], #0x80\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -508,4 +506,5 @@ void Transform<16, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
index e012d0920f..b9fe8b126a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -281,7 +281,6 @@ void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, si
"bge 1b\n"
"cbz %x[height], 16f\n"
"8:" // Main loop skip
-
"9:" // Tail row loop: Head
"mov x9, %x[in]\n"
"add x28, x9, %x[in_stride]\n"
@@ -423,7 +422,6 @@ void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, si
"add %x[out], %x[out], #0x80\n"
"bge 9b\n"
"16:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -444,4 +442,5 @@ void Transform<16, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
index 20f9d39f4e..46211ad4e4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -158,7 +158,6 @@ void a64_transpose_interleave_24(uint16_t *out, const uint16_t *in, size_t width
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -268,4 +267,5 @@ void Transform<12, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
index 22d68acd51..1cb7bc4445 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -762,7 +762,6 @@ void a64_transpose_interleave_24_2x4_fp32bf16(bfloat16 *out, const float *in, si
"add %x[out], %x[out], #0xc0\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
@@ -783,4 +782,5 @@ void Transform<24, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
index 799a9cd91d..dcaf69d2a8 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -198,7 +198,6 @@ void a64_transpose_interleave_24_bf16fp32(float *out, const bfloat16 *in, size_t
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -271,7 +270,6 @@ void a64_transpose_interleave_24_bf16fp32(float *out, const bfloat16 *in, size_t
"add %x[out], %x[out], #0x30\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -292,4 +290,5 @@ void Transform<12, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
index 621c5f99ff..966b75664e 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -270,7 +270,6 @@ void a64_transpose_interleave_24_fp16fp32(float *out, const __fp16 *in, size_t w
"add %x[out], %x[out], #0x30\n"
"bge 11b\n"
"20:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -291,4 +290,5 @@ void Transform<12, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
index 5cd7bd0512..4a22675028 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -503,4 +503,5 @@ void Transform<32, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
index 706d7cd359..237536697c 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -280,7 +280,6 @@ void a64_transpose_interleave_32_2x2(uint16_t *out, const uint16_t *in, size_t w
"bge 1b\n"
"cbz %x[height], 24f\n"
"12:" // Main loop skip
-
"13:" // Tail row loop: Head
"mov x25, %x[in]\n"
"mov x20, %x[width]\n"
@@ -427,7 +426,6 @@ void a64_transpose_interleave_32_2x2(uint16_t *out, const uint16_t *in, size_t w
"add %x[out], %x[out], #0x80\n"
"bge 13b\n"
"24:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
@@ -449,4 +447,5 @@ void Transform<32, 2, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
index b4827525cd..f35752d5a8 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -137,7 +137,6 @@ void a64_transpose_interleave_48(uint16_t *out, const uint16_t *in, size_t width
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -241,4 +240,5 @@ void Transform<6, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
index e1ab14e594..6ef02ac044 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -39,7 +39,6 @@ void a64_transpose_interleave_4_1x16(uint8_t *out, const uint8_t *in, size_t wid
size_t out_stride = 4 * roundup<size_t>(height, 16) * sizeof(uint8_t);
__asm__ __volatile__(
-
"1:" // Main row loop: Head
"mov x17, %x[in]\n"
"add x16, x17, %x[in_stride]\n"
@@ -316,4 +315,5 @@ void Transform<4, 16, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
index 8adc69e8b3..5667820865 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -333,4 +333,5 @@ void Transform<4, 4, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
index 07602bdc8d..328274a488 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -145,7 +145,6 @@ void a64_transpose_interleave_64(uint16_t *out, const uint16_t *in, size_t width
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -251,4 +250,5 @@ void Transform<32, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
index a048fbb109..feb469ab0e 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
@@ -24,7 +24,7 @@
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__)
namespace {
@@ -177,7 +177,6 @@ void a64_transpose_interleave_96(uint32_t *out, const uint32_t *in, size_t width
"bge 1b\n"
"cbz %x[height], 20f\n"
"10:" // Main loop skip
-
"11:" // Tail row loop: Head
"mov x20, %x[width]\n"
"mov x25, %x[in]\n"
@@ -265,4 +264,5 @@ void Transform<24, 1, true, VLType::None>(
);
}
-#endif
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
index 01921c5ad9..a4d480c405 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -140,4 +139,5 @@ void Transform<16, 1, true, VLType::SME>(
);
}
-#endif // __ARM_FEATURE_SVE
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
index 6b9b471fdc..552abfc1c6 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -178,4 +177,5 @@ void Transform<16, 4, true, VLType::SME>(
);
}
-#endif // __ARM_FEATURE_SVE
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp
index 96128cf9c2..9c6f5c83a1 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -150,4 +149,5 @@ void Transform<16, 2, true, VLType::SME>(
);
}
-#endif // __ARM_FEATURE_SVE
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
index 080db1c5c1..2756327815 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -197,4 +196,5 @@ void Transform<16, 2, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
index 7e496095f4..a6ddb8fec0 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -210,4 +209,5 @@ void Transform<1, 1, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
index 45d3c0729e..399a52e233 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -146,4 +145,5 @@ void Transform<1, 4, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
index 7120d1d33e..6318e29a79 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -206,4 +205,4 @@ void Transform<1, 2, true, VLType::SME>(
);
}
-#endif
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
index 72e7b0c99a..b90063028d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -219,4 +218,5 @@ void Transform<1, 2, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
index a057fd514e..f827197ab7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -222,4 +221,5 @@ void Transform<2, 1, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
index 9eb4075677..c471d66e17 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -146,4 +145,5 @@ void Transform<2, 4, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
index 3fc3920500..5f967fa615 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -208,4 +207,5 @@ void Transform<2, 2, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
index 9d402a2d58..f22b833821 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -236,4 +235,5 @@ void Transform<2, 2, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
index 362bebbea0..14636e3218 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -185,4 +184,5 @@ void Transform<4, 1, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
index cbcc0b4c8b..2d46a481f3 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -121,4 +120,5 @@ void Transform<4, 4, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
index 9b28578217..002a12479a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -168,4 +167,5 @@ void Transform<4, 2, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
index 8873070019..2a43f34f71 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SME)
namespace {
@@ -186,4 +185,5 @@ void Transform<4, 2, true, VLType::SME>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp
new file mode 100644
index 0000000000..be9ad666a9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 8 * height * sme::get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cmp %x[height], #0x2\n"
+ "ptrue p7.b\n"
+ "blt 4f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "mov x23, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x22, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x21, x22\n"
+ "whilelt p0.h, XZR, x21\n"
+ "ld1h { z31.h }, p0/Z, [x25]\n"
+ "dech x21\n"
+ "whilelt p6.h, XZR, x21\n"
+ "ld1h { z30.h }, p6/Z, [x25, #1, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p5.h, XZR, x21\n"
+ "ld1h { z29.h }, p5/Z, [x25, #2, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p4.h, XZR, x21\n"
+ "ld1h { z28.h }, p4/Z, [x25, #3, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p3.h, XZR, x21\n"
+ "ld1h { z27.h }, p3/Z, [x25, #4, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p2.h, XZR, x21\n"
+ "ld1h { z26.h }, p2/Z, [x25, #5, MUL VL]\n"
+ "dech x21\n"
+ "whilelt p1.h, XZR, x21\n"
+ "ld1h { z25.h }, p1/Z, [x25, #6, MUL VL]\n"
+ "dech x21\n"
+ "mov x20, x23\n"
+ "ld1h { z24.h }, p0/Z, [x24]\n"
+ "whilelt p0.h, XZR, x21\n"
+ "dech x22, ALL, MUL #8\n"
+ "ld1h { z23.h }, p0/Z, [x25, #7, MUL VL]\n"
+ "ld1h { z22.h }, p6/Z, [x24, #1, MUL VL]\n"
+ "cmp x22, #0x0\n"
+ "addvl x25, x25, #8\n"
+ "ld1h { z21.h }, p5/Z, [x24, #2, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "ld1h { z20.h }, p4/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z19.h }, p3/Z, [x24, #4, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x24, #5, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x24, #6, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x24, #7, MUL VL]\n"
+ "st1h { z31.h }, p7, [x20]\n"
+ "addvl x24, x24, #8\n"
+ "st1h { z30.h }, p7, [x20, #1, MUL VL]\n"
+ "st1h { z29.h }, p7, [x20, #2, MUL VL]\n"
+ "st1h { z28.h }, p7, [x20, #3, MUL VL]\n"
+ "st1h { z27.h }, p7, [x20, #4, MUL VL]\n"
+ "st1h { z26.h }, p7, [x20, #5, MUL VL]\n"
+ "st1h { z25.h }, p7, [x20, #6, MUL VL]\n"
+ "st1h { z23.h }, p7, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z24.h }, p7, [x20, #-8, MUL VL]\n"
+ "st1h { z22.h }, p7, [x20, #-7, MUL VL]\n"
+ "st1h { z21.h }, p7, [x20, #-6, MUL VL]\n"
+ "st1h { z20.h }, p7, [x20, #-5, MUL VL]\n"
+ "st1h { z19.h }, p7, [x20, #-4, MUL VL]\n"
+ "st1h { z18.h }, p7, [x20, #-3, MUL VL]\n"
+ "st1h { z17.h }, p7, [x20, #-2, MUL VL]\n"
+ "st1h { z16.h }, p7, [x20, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x2\n"
+ "addvl %x[out], %x[out], #16\n"
+ "bge 1b\n"
+ "cbz %x[height], 8f\n"
+ "4:" // Main loop skip
+ "5:" // Tail row loop: Head
+ "mov x25, %x[in]\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "mov x23, %x[out]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x21, %x[width]\n"
+ "6:" // Tail row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z23.h }, p0/Z, [x25]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z22.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z21.h }, p0/Z, [x25, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z20.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z19.h }, p0/Z, [x25, #4, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z18.h }, p0/Z, [x25, #5, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x25, #6, MUL VL]\n"
+ "dech x20\n"
+ "dech x21, ALL, MUL #8\n"
+ "whilelt p0.h, XZR, x20\n"
+ "cmp x21, #0x0\n"
+ "ld1h { z16.h }, p0/Z, [x25, #7, MUL VL]\n"
+ "st1h { z23.h }, p7, [x23]\n"
+ "addvl x25, x25, #8\n"
+ "st1h { z22.h }, p7, [x23, #1, MUL VL]\n"
+ "st1h { z21.h }, p7, [x23, #2, MUL VL]\n"
+ "st1h { z20.h }, p7, [x23, #3, MUL VL]\n"
+ "st1h { z19.h }, p7, [x23, #4, MUL VL]\n"
+ "st1h { z18.h }, p7, [x23, #5, MUL VL]\n"
+ "st1h { z17.h }, p7, [x23, #6, MUL VL]\n"
+ "st1h { z16.h }, p7, [x23, #7, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bgt 6b\n"
+ "7:" // Tail row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 5b\n"
+ "8:" // Done
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
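
For reference, the new 8VL kernel above rearranges the matrix into column blocks eight vectors wide: each block stores, for every row in turn, one contiguous 8-vector slice of that row, and predicated lanes past the requested width come out as zero. The scalar model below describes that layout only; it is not the SME code, vl_elems (the number of 16-bit lanes per vector, which the real kernel queries at run time) is a parameter of the sketch, and a dense input with in_stride equal to width is assumed.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Scalar sketch of the layout produced by sme_transpose_interleave_8VL.
    // Output is grouped into column blocks of 8*vl_elems elements; within a
    // block, row r occupies one contiguous slice, and lanes past width in the
    // last block are left as zero.
    std::vector<uint16_t> transpose_interleave_8vl_model(const std::vector<uint16_t> &in,
                                                         size_t width, size_t height,
                                                         size_t vl_elems)
    {
        const size_t block   = 8 * vl_elems;                    // elements per row slice
        const size_t nblocks = (width + block - 1) / block;     // column blocks
        std::vector<uint16_t> out(nblocks * height * block, 0);

        for (size_t j = 0; j < nblocks; ++j)                    // column block
            for (size_t r = 0; r < height; ++r)                 // row
                for (size_t k = 0; k < block; ++k) {            // lane within the slice
                    const size_t col = j * block + k;
                    if (col < width)
                        out[(j * height + r) * block + k] = in[r * width + col];
                }
        return out;
    }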
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
new file mode 100644
index 0000000000..45d2e24258
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 4) * sme::get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p2.b\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "mov x22, %x[out]\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p1.b, XZR, x20\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "decb x20\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z17.b }, p0/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
+ "decw x21, ALL, MUL #8\n"
+ "cmp x21, #0x0\n"
+ "ld1b { z21.b }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "addvl x25, x25, #2\n"
+ "ld1b { z16.b }, p1/Z, [x24]\n"
+ "zip1 z24.b, z19.b, z16.b\n"
+ "zip2 z20.b, z19.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x24, #1, MUL VL]\n"
+ "zip1 z23.b, z17.b, z16.b\n"
+ "zip2 z22.b, z17.b, z16.b\n"
+ "addvl x24, x24, #2\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "zip1 z17.b, z18.b, z16.b\n"
+ "zip2 z19.b, z18.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x23, #1, MUL VL]\n"
+ "zip1 z18.b, z21.b, z16.b\n"
+ "zip2 z21.b, z21.b, z16.b\n"
+ "addvl x23, x23, #2\n"
+ "zip1 z16.b, z24.b, z17.b\n"
+ "zip2 z17.b, z24.b, z17.b\n"
+ "st1b { z16.b }, p2, [x22]\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "zip2 z20.b, z20.b, z19.b\n"
+ "st1b { z17.b }, p2, [x22, #1, MUL VL]\n"
+ "zip1 z19.b, z23.b, z18.b\n"
+ "zip2 z18.b, z23.b, z18.b\n"
+ "st1b { z16.b }, p2, [x22, #2, MUL VL]\n"
+ "zip1 z17.b, z22.b, z21.b\n"
+ "zip2 z16.b, z22.b, z21.b\n"
+ "st1b { z20.b }, p2, [x22, #3, MUL VL]\n"
+ "st1b { z19.b }, p2, [x22, #4, MUL VL]\n"
+ "st1b { z18.b }, p2, [x22, #5, MUL VL]\n"
+ "st1b { z17.b }, p2, [x22, #6, MUL VL]\n"
+ "st1b { z16.b }, p2, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 4, true, VLType::SME>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 4, true, VLType::SME>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
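
The 1x4 variant above consumes four rows at a time and interleaves them byte-wise, so each source column contributes four consecutive output bytes (rows 0 to 3, with zero padding from pad_row when height is not a multiple of four). A scalar model of that rearrangement follows; it is a sketch rather than the SME code, assumes a dense input with in_stride equal to width, and uses vl_bytes to stand in for the run-time vector length in bytes.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Scalar sketch of sme_transpose_interleave_8VL_1x4. Each column block is
    // 2*vl_bytes source columns wide; rows are taken in groups of four and
    // zipped so the output holds r0[c], r1[c], r2[c], r3[c] for each column c.
    // Missing rows and columns are padded with zero.
    std::vector<uint8_t> transpose_interleave_8vl_1x4_model(const std::vector<uint8_t> &in,
                                                            size_t width, size_t height,
                                                            size_t vl_bytes)
    {
        const size_t block_cols = 2 * vl_bytes;
        const size_t nblocks    = (width + block_cols - 1) / block_cols;
        const size_t ngroups    = (height + 3) / 4;             // groups of four rows
        std::vector<uint8_t> out(nblocks * ngroups * 4 * block_cols, 0);

        for (size_t j = 0; j < nblocks; ++j)
            for (size_t g = 0; g < ngroups; ++g)
                for (size_t c = 0; c < block_cols; ++c)
                    for (size_t rr = 0; rr < 4; ++rr) {
                        const size_t row = 4 * g + rr;
                        const size_t col = j * block_cols + c;
                        if (row < height && col < width)
                            out[((j * ngroups + g) * block_cols + c) * 4 + rr] =
                                in[row * width + col];
                    }
        return out;
    }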
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
new file mode 100644
index 0000000000..ec7c415e27
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 2) * sme::get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p4.b\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "mov x22, %x[out]\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "whilelt p3.h, XZR, x20\n"
+ "ld1h { z20.h }, p3/Z, [x24]\n"
+ "dech x20\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z19.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "dech x20\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z24.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z17.h }, p3/Z, [x23]\n"
+ "decw x21, ALL, MUL #8\n"
+ "cmp x21, #0x0\n"
+ "zip1 z23.h, z20.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "zip2 z22.h, z20.h, z17.h\n"
+ "zip1 z21.h, z19.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "zip2 z20.h, z19.h, z16.h\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "zip1 z17.h, z24.h, z16.h\n"
+ "zip2 z16.h, z24.h, z16.h\n"
+ "st1h { z23.h }, p4, [x22]\n"
+ "st1h { z22.h }, p4, [x22, #1, MUL VL]\n"
+ "st1h { z21.h }, p4, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p4, [x22, #3, MUL VL]\n"
+ "st1h { z19.h }, p4, [x22, #4, MUL VL]\n"
+ "st1h { z18.h }, p4, [x22, #5, MUL VL]\n"
+ "st1h { z17.h }, p4, [x22, #6, MUL VL]\n"
+ "st1h { z16.h }, p4, [x22, #7, MUL VL]\n"
+ "add x22, x22, %x[out_stride]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 2, true, VLType::SME>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 2, true, VLType::SME>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sme_transpose_interleave_8VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
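
Reference note (not part of the patch): the zip1/zip2 sequence in the kernel above is a pairwise row interleave of 16-bit data. Each pair of rows is merged element-by-element per column chunk, chunks are written out_stride apart, and an odd trailing row is paired with the zeroed pad_row. The sketch below is a minimal scalar model of that pattern; block_width is a stand-in for the VL-dependent chunk the kernel handles per column iteration (four input vectors of halfwords from each row, i.e. eight output vectors), and the helper name is illustrative, not arm_gemm API.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Scalar model of the 2x2 transpose-interleave: column chunks are laid
    // out out_stride apart (outer loop), row pairs follow within a chunk, and
    // each column contributes its row-k then row-k+1 value. Zeros model both
    // pad_row (odd heights) and the predicated-out lanes of the last chunk.
    std::vector<uint16_t> interleave_2x2_ref(const uint16_t *in, size_t width,
                                             size_t in_stride, size_t height,
                                             size_t block_width)
    {
        const size_t padded_height = (height + 1) / 2 * 2;
        const size_t padded_width  = (width + block_width - 1) / block_width * block_width;
        std::vector<uint16_t> out(padded_width * padded_height, 0);

        size_t o = 0;
        for (size_t c = 0; c < width; c += block_width) {          // column chunks
            for (size_t k = 0; k < padded_height; k += 2) {        // row pairs
                for (size_t x = c; x < c + block_width; x++) {     // zip the pair element-wise
                    out[o++] = (x < width && k < height)     ? in[k * in_stride + x]       : 0;
                    out[o++] = (x < width && k + 1 < height) ? in[(k + 1) * in_stride + x] : 0;
                }
            }
        }
        return out;
    }
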
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
index 847718992a..f627fe575f 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -373,4 +372,5 @@ void Transform<12, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
index 74fce4ddf9..b33c4f6c2d 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -101,7 +100,6 @@ void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t widt
"bge 1b\n"
"cbz %x[height], 12f\n"
"6:" // Main loop skip
-
"7:" // Tail row loop: Head
"mov x21, %x[width]\n"
"cntw x20, ALL, MUL #2\n"
@@ -138,7 +136,6 @@ void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t widt
"addvl %x[out], %x[out], #1\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23"
@@ -160,4 +157,5 @@ void Transform<1, 1, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
index a034be5e74..e468787815 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -305,4 +304,5 @@ void Transform<1, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
index 82d4184061..546800fa69 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -93,7 +92,6 @@ void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t widt
"bge 1b\n"
"cbz %x[height], 8f\n"
"4:" // Main loop skip
-
"5:" // Tail row loop: Head
"mov x26, %x[in]\n"
"add %x[in], x26, %x[in_stride]\n"
@@ -123,7 +121,6 @@ void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t widt
"addvl %x[out], %x[out], #3\n"
"bge 5b\n"
"8:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
@@ -171,4 +168,5 @@ void Transform<3, 1, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
index ec7095db7b..a44141c109 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -329,7 +328,6 @@ void sve_transpose_interleave_3VL_1x4(uint8_t *out, const uint8_t *in, size_t wi
"addvl %x[out], %x[out], #3\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -364,4 +362,5 @@ void Transform<3, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
index 3d14383a64..36a15a16b3 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -292,7 +291,6 @@ void sve_transpose_interleave_3VL_2x2(uint16_t *out, const uint16_t *in, size_t
"addvl %x[out], %x[out], #3\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -314,4 +312,5 @@ void Transform<3, 2, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
index a39235187f..e661e2698a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -103,7 +102,6 @@ void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt
"bge 1b\n"
"cbz %x[height], 8f\n"
"4:" // Main loop skip
-
"5:" // Tail row loop: Head
"mov x26, %x[in]\n"
"add %x[in], x26, %x[in_stride]\n"
@@ -137,7 +135,6 @@ void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt
"addvl %x[out], %x[out], #4\n"
"bge 5b\n"
"8:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -185,4 +182,5 @@ void Transform<4, 1, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
index e3489398d4..03a78f72f1 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -317,4 +316,5 @@ void Transform<4, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
index 9505dc5e6d..b196799cfe 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -248,7 +247,6 @@ void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t
"bge 1b\n"
"cbz %x[height], 12f\n"
"6:" // Main loop skip
-
"7:" // Tail row loop: Head
"mov x12, %x[in]\n"
"mov x21, %x[width]\n"
@@ -323,7 +321,6 @@ void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t
"addvl %x[out], %x[out], #4\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -345,4 +342,5 @@ void Transform<4, 2, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
index 982c0545ed..68fe2d0cbe 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -292,4 +291,5 @@ void Transform<6, 8, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
index 2b5741a49c..910fc6cb02 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -260,7 +259,6 @@ void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t
"bge 1b\n"
"cbz %x[height], 12f\n"
"6:" // Main loop skip
-
"7:" // Tail row loop: Head
"mov x12, %x[in]\n"
"add x11, x12, %x[in_stride]\n"
@@ -386,7 +384,6 @@ void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t
"addvl %x[out], %x[out], #6\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -408,4 +405,5 @@ void Transform<6, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
index 146da33869..f0f10d2f43 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -235,4 +234,5 @@ void Transform<6, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
index f6fc5e8b84..c638eaacde 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -296,7 +295,6 @@ void sve_transpose_interleave_6VL_4x2(uint32_t *out, const uint32_t *in, size_t
"addvl %x[out], %x[out], #6\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -318,4 +316,5 @@ void Transform<6, 2, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
index 07147acd8e..0526bd0596 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -281,7 +280,6 @@ void sve_transpose_interleave_8VL(uint32_t *out, const uint32_t *in, size_t widt
"addvl %x[out], %x[out], #8\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
: "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -303,4 +301,5 @@ void Transform<8, 1, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
index 3ba50fee60..98f0770d77 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -283,4 +282,5 @@ void Transform<8, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
index 6b5ca38ab1..3fa5292143 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -256,4 +255,5 @@ void Transform<8, 8, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
index 237e9b684f..02977ecf1e 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -354,7 +353,6 @@ void sve_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t
"addvl %x[out], %x[out], #8\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -376,4 +374,5 @@ void Transform<8, 2, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
index 51cae7dd5a..34799c60a6 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -439,7 +438,6 @@ void sve_transpose_interleave_8VL_2x4(uint16_t *out, const uint16_t *in, size_t
"addvl %x[out], %x[out], #8\n"
"bge 7b\n"
"12:" // Done
-
: [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
: [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -461,4 +459,5 @@ void Transform<8, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
index 4ad882870e..5a48e579ae 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
@@ -24,8 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace {
@@ -279,4 +278,5 @@ void Transform<8, 4, true, VLType::SVE>(
);
}
-#endif
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)