aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSiCong Li <sicong.li@arm.com>2020-05-01 12:55:16 +0100
committerSiCong Li <sicong.li@arm.com>2020-05-06 13:45:50 +0000
commit738893e7216aa80bd7af28f3eb200b61def0368f (patch)
treed819eee84f767e24904d98a0e4cdb2ed17310aa6
parent77b8859688c333bca35ebc8ca4d0b2652f480c4a (diff)
downloadComputeLibrary-738893e7216aa80bd7af28f3eb200b61def0368f.tar.gz
COMPMID-3434 Add SIMD support in gemmlowp_mm_native
* Add SIMD support in gemmlowp_mm_native for Midgard arch * Extend the validation test to include the corner case of m0=1, n0=1 and k0=1 Change-Id: I1f5e384b206a20e974932b8aa10cd628c69e5efc Signed-off-by: SiCong Li <sicong.li@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3142 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
-rw-r--r--src/core/CL/cl_kernels/gemmlowp.cl105
-rw-r--r--tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp12
2 files changed, 108 insertions, 9 deletions
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 080a6409eb..dd0928940c 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -140,7 +140,7 @@
ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
})
-/** Specialized macros to perform a a partial matrix multiplication with dimensions M0,N0,K0 */
+/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
#define ARM_MM_K0XN0X1(n0, k0, a, b, c) \
({ \
ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
@@ -199,6 +199,97 @@
(n0, k0, a, b, c); \
})
+/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */
+#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; \
+ })
+#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \
+ c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \
+ })
+#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \
+ })
+#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \
+ })
+#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \
+ c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \
+ c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \
+ c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \
+ })
+#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \
+ c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \
+ c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \
+ c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \
+ c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \
+ c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \
+ c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \
+ c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \
+ })
+/** Specialized macros to perform a a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \
+ })
+#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MUL_N0X, k0) \
+ (VECTOR_ACC_TYPE, (a), b, (c)); \
+ })
+#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \
+ (VECTOR_ACC_TYPE, k0, a, b, c); \
+ })
+
#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM/QASYMM_SIGNED data type.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
@@ -949,11 +1040,15 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
// Load values from RHS matrix
LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+ // Partial matrix multiplication M0,N0,K0
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+ ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
// Transpose the values from RHS matrix
TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE);
- // Partial matrix multiplication M0,N0,K0
ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
// Update the offset
lhs_offset += K0;
@@ -969,11 +1064,15 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
// Load values from RHS matrix
LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+ // Partial matrix multiplication M0,N0,1
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+ ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
// Transpose the values from RHS matrix
TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE);
- // Partial matrix multiplication M0,N0,1
ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
// Update the offset
lhs_offset += 1;
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
index 457d98ff8d..86c8f5a7a8 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,13 +81,13 @@ const auto n0_values_precommit = framework::dataset::make("N0", { 4 });
const auto k0_values_precommit = framework::dataset::make("K0", { 16 });
/** M0 values to test - Nightly */
-const auto m0_values_nightly = framework::dataset::make("M0", 2, 7);
+const auto m0_values_nightly = framework::dataset::make("M0", 1, 2, 7);
/** N0 values to test - Nightly */
-const auto n0_values_nightly = framework::dataset::make("N0", { 2, 3, 4, 8 });
+const auto n0_values_nightly = framework::dataset::make("N0", { 1, 2, 3, 4, 8 });
/** K0 values to test - Nightly */
-const auto k0_values_nightly = framework::dataset::make("K0", { 2, 3, 4, 8, 16 });
+const auto k0_values_nightly = framework::dataset::make("K0", { 1, 2, 3, 4, 8, 16 });
} // namespace
TEST_SUITE(CL)
@@ -105,7 +105,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyNativeFixture, framewor
validate(CLAccessor(_target), _reference);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyNativeFixture, framework::DatasetMode::DISABLED,
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyNativeFixture, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(m_values,
n_values),
k_values),
@@ -132,7 +132,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMLowpMatrixMultiplyNative3DFixture, fram
validate(CLAccessor(_target), _reference);
}
-FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMLowpMatrixMultiplyNative3DFixture, framework::DatasetMode::DISABLED,
+FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMLowpMatrixMultiplyNative3DFixture, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(combine(m_w_values,
m_h_values),
n_values),