author    Ramy Elgammal <ramy.elgammal@arm.com>    2023-05-19 14:23:37 +0100
committer Ramy Elgammal <ramy.elgammal@arm.com>    2023-06-23 20:06:45 +0000
commit    c952596e70f2fe0073029f053e329a4e930ced8c (patch)
tree      1cf9b1c87c2288d6af436b570802d9cc6e8b30b5
parent    47a50ef12f513cfa8fde6673b8a61ed0f2d0fbaa (diff)
download  ComputeLibrary-c952596e70f2fe0073029f053e329a4e930ced8c.tar.gz
Implement FP32/FP16 MatMul NT/T kernel using the MMUL extension
Resolves: COMPMID-6195
Signed-off-by: ramy.elgammal@arm.com <ramy.elgammal@arm.com>
Change-Id: I8e85fe73308ed84ebb142d6d6d1562b62dddfaa5
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9819
Reviewed-by: SiCong Li <sicong.li@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
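
Note: a minimal host-side sketch of reaching the new NT/T path through the kernel's validate() interface, illustration only and not part of the patch. The helper name and header paths are indicative; shapes follow validate_input_shapes() below, where a transposed RHS is stored as N x K, so its x-dimension carries K.

    // Sketch only: exercises ClMatMulNativeMMULKernel::validate() the same way the tests below do.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/KernelDescriptors.h"          // MatMulKernelInfo (path indicative)
    #include "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h"

    using namespace arm_compute;

    Status validate_nt_t_example() // hypothetical helper, not in the patch
    {
        const unsigned int M = 64, N = 32, K = 48; // K must be a multiple of MMUL_K0

        const TensorInfo lhs(TensorShape(K, M), 1, DataType::F32); // LHS not transposed: M x K
        const TensorInfo rhs(TensorShape(K, N), 1, DataType::F32); // RHS transposed: stored N x K
        TensorInfo       dst;

        // adj_lhs = false, adj_rhs = true, M0 = 4, N0 = 4, K0 = 1 (K0 is fixed to 1)
        const MatMulKernelInfo info(false, true, 4, 4, 1);
        return opencl::kernels::ClMatMulNativeMMULKernel::validate(&lhs, &rhs, &dst, info);
    }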
-rw-r--r--  docs/user_guide/release_version_and_change_log.dox |   3
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul_mmul.cl      | 172
-rw-r--r--  src/gpu/cl/ClKernelLibrary.cpp                     |   1
-rw-r--r--  src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp    |  26
-rw-r--r--  src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h      |   1
-rw-r--r--  tests/validation/CL/MatMulNativeMMULKernel.cpp     |  90
6 files changed, 253 insertions, 40 deletions
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 639f6f6c8b..fefeaa3232 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -44,6 +44,9 @@ v23.08 Public major release
- Deprecate the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose.
Users must no longer link their applications to this library and instead link only to the main `libarm_compute` library for core functionality.
- Various optimizations and bug fixes.
+ - New features
+ - Add new OpenCL™ kernels:
+ - @ref opencl::kernels::ClMatMulNativeMMULKernel support for FP32 and FP16, with batch support
v23.05 Public major release
- New features:
diff --git a/src/core/CL/cl_kernels/common/mat_mul_mmul.cl b/src/core/CL/cl_kernels/common/mat_mul_mmul.cl
index 1d94767b1b..71242062a8 100644
--- a/src/core/CL/cl_kernels/common/mat_mul_mmul.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul_mmul.cl
@@ -33,8 +33,6 @@
* @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
* @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
* @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
- * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
- * @note The dimension K must be passed at compile time using -DK (e.g. -DK=4). K must be a multiple of MMUL_K0
* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_NT)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
* - M0 > 0
@@ -65,13 +63,15 @@
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
* @param[in] M Number of rows in LHS matrix
* @param[in] N Number of columns in RHS matrix
+ * @param[in] K Number of columns in the LHS matrix and rows in the RHS matrix (neither matrix transposed).
*/
__kernel void mat_mul_native_mmul_nt_nt(
TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, BUFFER),
TENSOR3D_T(dst, BUFFER),
const int M,
- const int N)
+ const int N,
+ const int K)
{
#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)
@@ -189,3 +189,169 @@ __kernel void mat_mul_native_mmul_nt_nt(
#undef MMUL_BLOCK_SIZE
}
#endif // defined(MAT_MUL_NATIVE_MMUL_NT_NT)
+
+#if defined(MAT_MUL_NATIVE_MMUL_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS non-transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in LHS matrix
+ * @param[in] N Number of columns in RHS matrix
+ * @param[in] K Number of columns in the LHS matrix and columns in the transposed RHS matrix; must be a multiple of MMUL_K0.
+ */
+__kernel void mat_mul_native_mmul_nt_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)
+
+ const uint x0 = get_global_id(0); // (N / N0) * MMUL_M0
+ const uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
+ const uint z = get_global_id(2); // Batch
+
+ // Get block coordinates
+ const uint block_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint block_y = y0;
+
+ // Get thread coordinates within a block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
+
+ // Starting destination coordinates
+ // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+ // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results.
+ // Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.
+ const uint dst_x_unclamped = thread_x * N0 + block_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + block_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ const uint lhs_x = thread_x;
+ const uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ const uint rhs_x = thread_y;
+ const uint rhs_y = dst_x;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k < K; k += MMUL_K0)
+ {
+ // A tile of M0xK0 but K0 must be set to 1
+ TILE(DATA_TYPE, M0, 1, a);
+ // A tile of N0xK0 but K0 must be set to 1
+ TILE(DATA_TYPE, N0, 1, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[n0].s[0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += MMUL_N0 * sizeof(DATA_TYPE);
+ }
+
+ // Threads "outside" the dst bounds do not write, but they still have to take part in the cooperative arm_matrix_multiply calls above; that is why this early-out must come after the multiplication loop
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for the half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_NT_T)
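
Note: the clamping of dst_x/dst_y above is the subtle part of the cooperative kernel. Below is a host-side C++ sketch, illustration only, that mirrors the kernel's index arithmetic and shows why the last, partially filled MMUL block must clamp its read coordinates. The concrete tile/block sizes and global ids are example values, not taken from the patch.

    // Sketch only: mirrors the work-item -> output-tile mapping of mat_mul_native_mmul_nt_t.
    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const int M0 = 4, N0 = 4, MMUL_M0 = 4, MMUL_N0 = 4; // example tile/block sizes
        const int M = 10, N = 10;                           // deliberately not multiples of a full block
        const int MMUL_BLOCK_SIZE = MMUL_M0 * MMUL_N0;

        const int x0 = 17, y0 = 0; // example global ids (x0 lands in the second, partial block)

        const int block_x   = x0 / MMUL_BLOCK_SIZE;
        const int thread_id = x0 % MMUL_BLOCK_SIZE;
        const int thread_x  = thread_id % MMUL_N0;
        const int thread_y  = thread_id / MMUL_N0;

        // Unclamped coordinates can step past the tensor edge for the last block...
        const int dst_x_unclamped = thread_x * N0 + block_x * N0 * MMUL_N0;
        const int dst_y_unclamped = thread_y * M0 + y0 * M0 * MMUL_M0;

        // ...so reads are clamped; out-of-bound threads still run arm_matrix_multiply
        // and only skip the store, exactly as the kernel's early-out does.
        const int dst_x = std::min(dst_x_unclamped, N - N0);
        const int dst_y = std::min(dst_y_unclamped, M - M0);

        std::printf("unclamped (%d, %d) -> clamped tile origin (%d, %d)\n",
                    dst_x_unclamped, dst_y_unclamped, dst_x, dst_y);
        return 0;
    }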
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index 408f1f7a21..5355cb7402 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -320,6 +320,7 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "l2_normalize_y", "common/l2_normalize.cl" },
{ "l2_normalize_z", "common/l2_normalize.cl" },
{ "mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl" },
+ { "mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl" },
{ "mat_mul_native_nt_nt", "common/mat_mul.cl" },
{ "mat_mul_native_nt_t", "common/mat_mul.cl" },
{ "mat_mul_native_t_nt", "common/mat_mul.cl" },
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
index 32e69cabda..06a0bdee17 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
@@ -60,12 +60,11 @@ inline std::pair<int, int> adjust_m0_n0(int m0, int n0, int m, int n)
Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
{
const bool adj_lhs = matmul_kernel_info.adj_lhs;
- const bool adj_rhs = matmul_kernel_info.adj_rhs;
- const int m0 = matmul_kernel_info.m0;
- const int n0 = matmul_kernel_info.n0;
- const int k0 = matmul_kernel_info.k0;
+ const int m0 = matmul_kernel_info.m0;
+ const int n0 = matmul_kernel_info.n0;
+ const int k0 = matmul_kernel_info.k0;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((adj_lhs || adj_rhs), "adj_lhs and adj_rhs are not supported yet");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((adj_lhs), "adj_lhs is not supported yet");
// Validate M0
ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
@@ -84,7 +83,7 @@ Status validate_input_shapes(const TensorShape &lhs_shape, const TensorShape &rh
{
ARM_COMPUTE_UNUSED(matmul_kernel_info);
const size_t lhs_k = lhs_shape.x();
- const size_t rhs_k = rhs_shape.y();
+ const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y();
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_k != rhs_k, "K dimension in Lhs and Rhs matrices must match.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0);
@@ -177,9 +176,11 @@ void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context
const int m = dst->dimension(1);
const int n = dst->dimension(0);
- const int k = lhs->tensor_shape().x();
- _m = m;
- _n = n;
+ const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+
+ _m = m;
+ _n = n;
+ _k = k;
int m0{};
int n0{};
@@ -199,15 +200,15 @@ void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context
build_opts.add_option_if(lhs->data_type() == DataType::F16, "-DHALF_PRECISION");
build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0));
build_opts.add_option("-DM0_LEFTOVER=" + support::cpp11::to_string(m0_leftover));
build_opts.add_option("-DN0_LEFTOVER=" + support::cpp11::to_string(n0_leftover));
build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
- build_opts.add_option("-DK=" + support::cpp11::to_string(k));
- std::string kernel_name("mat_mul_native_mmul_nt_nt");
+ std::string kernel_name("mat_mul_native_mmul");
+ kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
+ kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));
@@ -250,6 +251,7 @@ void ClMatMulNativeMMULKernel::run_op(ITensorPack &tensors, const Window &window
// Pass m and n at runtime as signed ints, so that the result of any subtraction they appear in remains signed.
_kernel.setArg<cl_int>(idx++, _m);
_kernel.setArg<cl_int>(idx++, _n);
+ _kernel.setArg<cl_int>(idx++, _k);
// LWS_x should be multiple of 16 at least. (32, 2) has been chosen to have more work-items on a single core
// LWS also enforces the order of execution of the work items which improves cache utilization
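
Note: configure() above now derives the kernel name from the transpose flags instead of hard-coding mat_mul_native_mmul_nt_nt. A standalone C++ sketch of that selection, illustration only; the helper name is hypothetical:

    // Sketch only: the same name assembly as in ClMatMulNativeMMULKernel::configure().
    #include <string>

    std::string mmul_kernel_name(bool adj_lhs, bool adj_rhs) // hypothetical helper
    {
        std::string name("mat_mul_native_mmul");
        name += adj_lhs ? "_t" : "_nt";
        name += adj_rhs ? "_t" : "_nt";
        return name; // e.g. mmul_kernel_name(false, true) == "mat_mul_native_mmul_nt_t"
    }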
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
index 26fe08c466..79f675d03b 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
@@ -86,6 +86,7 @@ public:
private:
int _m{ 1 };
int _n{ 1 };
+ int _k{ 1 };
};
} // namespace kernels
} // namespace opencl
diff --git a/tests/validation/CL/MatMulNativeMMULKernel.cpp b/tests/validation/CL/MatMulNativeMMULKernel.cpp
index b33a4fae89..66e20d3c9d 100644
--- a/tests/validation/CL/MatMulNativeMMULKernel.cpp
+++ b/tests/validation/CL/MatMulNativeMMULKernel.cpp
@@ -61,6 +61,7 @@ const auto m0_values_nightly_lhs_nt = framework::dataset::make("M0", { 1, 2, 3,
/** N0 values to test --nightly*/
const auto n0_values_nightly_rhs_nt = framework::dataset::make("N0", { 1, 2, 3, 4, 8, 16 });
+const auto n0_values_nightly_rhs_t = framework::dataset::make("N0", { 1, 2, 3, 4, 8 });
/** K0 value -- Fixed to 1 */
const auto k0_value = framework::dataset::make("K0", { 1 });
@@ -86,15 +87,22 @@ TEST_CASE(SupportedBlockSizes, framework::DatasetMode::ALL)
{ MatMulKernelInfo(false, false, 3, 5, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
{ MatMulKernelInfo(false, false, 3, 6, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
{ MatMulKernelInfo(false, false, 3, 3, 4), false }, // K0 not 1
- { MatMulKernelInfo(false, false, 9, 1, 2), true },
- { MatMulKernelInfo(false, false, 3, 16, 3), true },
- { MatMulKernelInfo(false, false, 7, 3, 4), true },
+ { MatMulKernelInfo(false, false, 9, 1, 1), true },
+ { MatMulKernelInfo(false, false, 3, 16, 1), true },
+ { MatMulKernelInfo(false, false, 7, 3, 1), true },
// Lhs not-transposed, Rhs transposed
- // TODO: COMPMID-6195
+ { MatMulKernelInfo(false, true, 3, 11, 1), false }, // N0 not in {1, 2, 3, 4, 8}
+ { MatMulKernelInfo(false, true, 2, 17, 1), false }, // N0 not in {1, 2, 3, 4, 8}
+ { MatMulKernelInfo(false, true, 4, 5, 1), false }, // N0 not in {1, 2, 3, 4, 8}
+ { MatMulKernelInfo(false, true, 4, 4, 7), false }, // K0 is not 1
+ { MatMulKernelInfo(false, true, 4, 7, 1), false }, // N0 not in {1, 2, 3, 4, 8}
+ { MatMulKernelInfo(false, true, 3, 8, 1), true },
+ { MatMulKernelInfo(false, true, 8, 16, 1), true },
+ { MatMulKernelInfo(false, true, 2, 4, 1), true },
// Lhs transposed, Rhs-not-transposed
// TODO: COMPMID-6196
// Lhs transposed, Rhs-transposed
// TODO: COMPMID-6197
@@ -110,6 +118,7 @@ TEST_CASE(SupportedBlockSizes, framework::DatasetMode::ALL)
{
TensorInfo output_info;
Status status = ClMatMulNativeMMULKernel::validate(&lhs_info, &rhs_info, &output_info, pair.first);
+ ARM_COMPUTE_EXPECT(bool(status) == pair.second, framework::LogLevel::ERRORS);
}
}
else
@@ -149,7 +158,7 @@ TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL)
{
for(bool adj_rhs :
{
- false // TODO: COMPMID-6195, COMPMID-6196, COMPMID-6197
+ false, true
})
{
TensorShape lhs_shape = std::get<0>(tuple);
@@ -240,7 +249,7 @@ TEST_SUITE(FP32)
TEST_SUITE(Buffer)
FIXTURE_DATA_TEST_CASE(RunTiny, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::TinyMatMulMMULDataset(),
framework::dataset::make("TransposeA", { false })),
- framework::dataset::make("TransposeB", { false })),
+ framework::dataset::make("TransposeB", { false, true })),
m0_values_precommit),
n0_values_precommit),
k0_value),
@@ -255,7 +264,7 @@ FIXTURE_DATA_TEST_CASE(RunTiny, CLMatMulNativeMMULKernelFixture<float>, framewor
}
FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulMMULDataset(),
framework::dataset::make("TransposeA", { false })),
- framework::dataset::make("TransposeB", { false })),
+ framework::dataset::make("TransposeB", { false, true })),
m0_values_precommit),
n0_values_precommit),
k0_value),
@@ -268,14 +277,30 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulNativeMMULKernelFixture<float>, framewo
validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
}
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
- framework::dataset::make("TransposeA", { false })),
- framework::dataset::make("TransposeB", { false })),
- m0_values_nightly_lhs_nt),
- n0_values_nightly_rhs_nt),
- k0_value),
- framework::dataset::make("ExportRhsToCLImage", { false })),
- framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunLargeNoTranspose, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
+ framework::dataset::make("TransposeA", { false })),
+ framework::dataset::make("TransposeB", { false })),
+ m0_values_nightly_lhs_nt),
+ n0_values_nightly_rhs_nt),
+ k0_value),
+ framework::dataset::make("ExportRhsToCLImage", { false })),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ if(_device_supports_mmul)
+ {
+ validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+ }
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeRHSTranspose, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
+ framework::dataset::make("TransposeA", { false })),
+ framework::dataset::make("TransposeB", { true })),
+ m0_values_nightly_lhs_nt),
+ n0_values_nightly_rhs_t),
+ k0_value),
+ framework::dataset::make("ExportRhsToCLImage", { false })),
+ framework::dataset::make("DataType", DataType::F32)))
{
// Validate output
if(_device_supports_mmul)
@@ -288,7 +313,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulNativeMMULKernelFixture<float>, framewo
FIXTURE_DATA_TEST_CASE(RunHighDimensional, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(combine(datasets::HighDimensionalMatMulMMULDataset(),
framework::dataset::make("TransposeA", { false })),
- framework::dataset::make("TransposeB", { false })),
+ framework::dataset::make("TransposeB", { false, true })),
framework::dataset::make("M0", { 2 })),
framework::dataset::make("N0", { 2 })),
framework::dataset::make("K0", { 1 })),
@@ -309,7 +334,7 @@ TEST_SUITE(FP16)
TEST_SUITE(Buffer)
FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulNativeMMULKernelFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulMMULDataset(),
framework::dataset::make("TransposeA", { false })),
- framework::dataset::make("TransposeB", { false })),
+ framework::dataset::make("TransposeB", { false, true })),
m0_values_precommit),
n0_values_precommit),
k0_value),
@@ -322,14 +347,29 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulNativeMMULKernelFixture<half>, framewor
validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
}
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulNativeMMULKernelFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
- framework::dataset::make("TransposeA", { false })),
- framework::dataset::make("TransposeB", { false })),
- m0_values_nightly_lhs_nt),
- n0_values_nightly_rhs_nt),
- k0_value),
- framework::dataset::make("ExportRhsToCLImage", { false })),
- framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLargeNoTranspose, CLMatMulNativeMMULKernelFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
+ framework::dataset::make("TransposeA", { false })),
+ framework::dataset::make("TransposeB", { false })),
+ m0_values_nightly_lhs_nt),
+ n0_values_nightly_rhs_nt),
+ k0_value),
+ framework::dataset::make("ExportRhsToCLImage", { false })),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ if(_device_supports_mmul)
+ {
+ validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+ }
+}
+FIXTURE_DATA_TEST_CASE(RunLargeRHSTranspose, CLMatMulNativeMMULKernelFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
+ framework::dataset::make("TransposeA", { false })),
+ framework::dataset::make("TransposeB", { true })),
+ m0_values_nightly_lhs_nt),
+ n0_values_nightly_rhs_t),
+ k0_value),
+ framework::dataset::make("ExportRhsToCLImage", { false })),
+ framework::dataset::make("DataType", DataType::F16)))
{
// Validate output
if(_device_supports_mmul)