From 738893e7216aa80bd7af28f3eb200b61def0368f Mon Sep 17 00:00:00 2001 From: SiCong Li Date: Fri, 1 May 2020 12:55:16 +0100 Subject: COMPMID-3434 Add SIMD support in gemmlowp_mm_native * Add SIMD support in gemmlowp_mm_native for Midgard arch * Extend the validation test to include the corner case of m0=1, n0=1 and k0=1 Change-Id: I1f5e384b206a20e974932b8aa10cd628c69e5efc Signed-off-by: SiCong Li Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3142 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice --- src/core/CL/cl_kernels/gemmlowp.cl | 105 ++++++++++++++++++++- .../validation/CL/GEMMLowpMatrixMultiplyNative.cpp | 12 +-- 2 files changed, 108 insertions(+), 9 deletions(-) diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl index 080a6409eb..dd0928940c 100644 --- a/src/core/CL/cl_kernels/gemmlowp.cl +++ b/src/core/CL/cl_kernels/gemmlowp.cl @@ -140,7 +140,7 @@ ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \ }) -/** Specialized macros to perform a a partial matrix multiplication with dimensions M0,N0,K0 */ +/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */ #define ARM_MM_K0XN0X1(n0, k0, a, b, c) \ ({ \ ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \ @@ -199,6 +199,97 @@ (n0, k0, a, b, c); \ }) +/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */ +#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; \ + }) +#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \ + c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \ + }) +#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \ + }) +#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \ + c += 
CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \ + }) +#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \ + c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \ + c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \ + c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \ + }) +#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \ + c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \ + c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \ + c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \ + c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \ + c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \ + c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \ + c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \ + }) +/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */ +#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \ + }) +#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \ + }) +#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \ + }) +#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \ + }) +#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \ + }) +#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \ + 
ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \ + }) +#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \ + }) +#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \ + }) +#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + CONCAT(ARM_MUL_N0X, k0) \ + (VECTOR_ACC_TYPE, (a), b, (c)); \ + }) +#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \ + ({ \ + CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \ + (VECTOR_ACC_TYPE, k0, a, b, c); \ + }) + #if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) /** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM/QASYMM_SIGNED data type. * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed @@ -949,11 +1040,15 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), // Load values from RHS matrix LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); + // Partial matrix multiplication M0,N0,K0 +#if(GPU_ARCH == GPU_ARCH_MIDGARD) + ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c); +#else // GPU_ARCH == GPU_ARCH_MIDGARD // Transpose the values from RHS matrix TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE); - // Partial matrix multiplication M0,N0,K0 ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD // Update the offset lhs_offset += K0; @@ -969,11 +1064,15 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), // Load values from RHS matrix LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); + // Partial matrix multiplication M0,N0,1 +#if(GPU_ARCH == GPU_ARCH_MIDGARD) + 
ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c); +#else // GPU_ARCH == GPU_ARCH_MIDGARD // Transpose the values from RHS matrix TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE); - // Partial matrix multiplication M0,N0,1 ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD // Update the offset lhs_offset += 1; diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp index 457d98ff8d..86c8f5a7a8 100644 --- a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp +++ b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -81,13 +81,13 @@ const auto n0_values_precommit = framework::dataset::make("N0", { 4 }); const auto k0_values_precommit = framework::dataset::make("K0", { 16 }); /** M0 values to test - Nightly */ -const auto m0_values_nightly = framework::dataset::make("M0", 2, 7); +const auto m0_values_nightly = framework::dataset::make("M0", 1, 2, 7); /** N0 values to test - Nightly */ -const auto n0_values_nightly = framework::dataset::make("N0", { 2, 3, 4, 8 }); +const auto n0_values_nightly = framework::dataset::make("N0", { 1, 2, 3, 4, 8 }); /** K0 values to test - Nightly */ -const auto k0_values_nightly = framework::dataset::make("K0", { 2, 3, 4, 8, 16 }); +const auto k0_values_nightly = framework::dataset::make("K0", { 1, 2, 3, 4, 8, 16 }); } // namespace TEST_SUITE(CL) @@ -105,7 +105,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyNativeFixture, framewor validate(CLAccessor(_target), _reference); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyNativeFixture, framework::DatasetMode::DISABLED, +FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyNativeFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(m_values, n_values), k_values), @@ -132,7 +132,7 @@ 
FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMLowpMatrixMultiplyNative3DFixture, fram validate(CLAccessor(_target), _reference); } -FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMLowpMatrixMultiplyNative3DFixture, framework::DatasetMode::DISABLED, +FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMLowpMatrixMultiplyNative3DFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(m_w_values, m_h_values), n_values), -- cgit v1.2.1