From 3a50166ff71f8379682fe6ece2a94b7a4bb3daa3 Mon Sep 17 00:00:00 2001 From: SiCong Li Date: Fri, 26 Jun 2020 10:02:06 +0100 Subject: COMPMID-3338 COMPMID-3336 COMPMID-3584 COMPMID-3338 Remove store padding in CLGEMMMatrixMultiplyReshapedOnlyRHSKernel COMPMID-3336 Remove store padding in CLGEMMMatrixMultiplyNativeKernel COMPMID-3584 Fix VSTORE to correctly deal with scalar case * Implement STORE_BLOCK_BOUNDARY_AWARE, as part of the COMPMID-3332 investigation, with the following substantial changes: - Separate STORE_BLOCK_PARTIAL, STORE_ROW_PARTIAL and VSTORE_PARTIAL so that this change does not affect kernels not using STORE_BLOCK_BOUNDARY_AWARE. - Revamp vstore_ext_n to vstore_partial_n, and enhance VSTORE_PARTIAL to correctly handle both vector and scalar cases * Remove the store padding (dst tensor) in CLGEMMMatrixMultiplyReshapedOnlyRHSKernel and CLGEMMMatrixMultiplyNativeKernel * Add configuration tests to check no padding is added by the configuration. Signed-off-by: SiCong Li Change-Id: I4f0907867979d8dacedd03b4bcbd2fb19e4f1602 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3522 Comments-Addressed: Arm Jenkins Reviewed-by: Gian Marco Iodice Tested-by: Arm Jenkins --- src/core/CL/cl_kernels/helpers.h | 136 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) (limited to 'src/core/CL/cl_kernels/helpers.h') diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h index f6795663df..9206110f63 100644 --- a/src/core/CL/cl_kernels/helpers.h +++ b/src/core/CL/cl_kernels/helpers.h @@ -255,6 +255,142 @@ #define vload1(OFFSET, PTR) *(OFFSET + PTR) #define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA +/** Extended partial vstore that correctly handles scalar values as well: VSTORE_PARTIAL(size, store_size) pastes its two arguments into the macro name vstore_partial_[size]_[store_size], which the table below resolves.
+ * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops + * @name VSTORE_PARTIAL + * + * @note With this macro, the passed data can be both a vector and a scalar + * @note @p store_size needs to be <= @p size + * eg 1: Valid + * VSTORE_PARTIAL(16, 15) ...; + * eg 2: Invalid + * VSTORE_PARTIAL(4, 7) ...; + * + * @param[in] size The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16 + * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size + * @{ + */ +#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size +#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) + +// Size == 1 (scalar): mapped directly onto vstore1, no component selection involved +#define vstore_partial_1_1 vstore1 +// Size == 2: each store_size maps onto vstore_partial_[store_size] +#define vstore_partial_2_1 vstore_partial_1 +#define vstore_partial_2_2 vstore_partial_2 +// Size == 3: each store_size maps onto vstore_partial_[store_size] +#define vstore_partial_3_1 vstore_partial_1 +#define vstore_partial_3_2 vstore_partial_2 +#define vstore_partial_3_3 vstore_partial_3 +// Size == 4: each store_size maps onto vstore_partial_[store_size] +#define vstore_partial_4_1 vstore_partial_1 +#define vstore_partial_4_2 vstore_partial_2 +#define vstore_partial_4_3 vstore_partial_3 +#define vstore_partial_4_4 vstore_partial_4 +// Size == 8: each store_size maps onto vstore_partial_[store_size] +#define vstore_partial_8_1 vstore_partial_1 +#define vstore_partial_8_2 vstore_partial_2 +#define vstore_partial_8_3 vstore_partial_3 +#define vstore_partial_8_4 vstore_partial_4 +#define vstore_partial_8_5 vstore_partial_5 +#define vstore_partial_8_6 vstore_partial_6 +#define vstore_partial_8_7 vstore_partial_7 +#define vstore_partial_8_8 vstore_partial_8 +// Size == 16: each store_size maps onto vstore_partial_[store_size] +#define vstore_partial_16_1 vstore_partial_1 +#define vstore_partial_16_2 vstore_partial_2 +#define vstore_partial_16_3 vstore_partial_3 +#define vstore_partial_16_4 vstore_partial_4 +#define vstore_partial_16_5 vstore_partial_5 +#define vstore_partial_16_6 vstore_partial_6 +#define vstore_partial_16_7 vstore_partial_7 +#define vstore_partial_16_8 vstore_partial_8 
+#define vstore_partial_16_9 vstore_partial_9
+#define vstore_partial_16_10 vstore_partial_10
+#define vstore_partial_16_11 vstore_partial_11
+#define vstore_partial_16_12 vstore_partial_12
+#define vstore_partial_16_13 vstore_partial_13
+#define vstore_partial_16_14 vstore_partial_14
+#define vstore_partial_16_15 vstore_partial_15
+#define vstore_partial_16_16 vstore_partial_16
+
+/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
+ * @name vstore_partial_n
+ * @note the variants for n == 5, 6, 7 and 9 to 15 expand to several statements, so invoke them inside a braced block
+ * @note @p DATA needs to be a vector not a scalar
+ * @note n needs to be <= the vector width of the input variable @p DATA
+ * eg 1: Valid
+ * vstore_partial_15(var:float16, 0, 0xabcd);
+ * eg 2: Invalid
+ * vstore_partial_7(var:float4, 0, 0xabcd);
+ * @note a remainder of 5 to 7 elements is forwarded as the whole upper half (.s89abcdef) and a remainder of one element goes straight to vstore1, so every component selection has a legal width (1, 2, 3, 4, 8 or 16) and none is applied to a scalar
+ * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty.
+ * @note every other n expands to two or more stores that all reuse @p OFFSET unscaled, so the stored pieces are only contiguous for @p OFFSET == 0
+ * @param[in] DATA   The name of the variable
+ * @param[in] OFFSET Offset in n
+ * @param[in] PTR    The base pointer
+ * @{
+ */
+#define vstore_partial_1(DATA, OFFSET, PTR) \
+    vstore1(DATA.s0, OFFSET, PTR);
+
+#define vstore_partial_2(DATA, OFFSET, PTR) \
+    vstore2(DATA.s01, OFFSET, PTR);
+
+#define vstore_partial_3(DATA, OFFSET, PTR) \
+    vstore3(DATA.s012, OFFSET, PTR);
+
+#define vstore_partial_4(DATA, OFFSET, PTR) \
+    vstore4(DATA.s0123, OFFSET, PTR);
+
+#define vstore_partial_5(DATA, OFFSET, PTR)    \
+    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
+    vstore1(DATA.s4, OFFSET, PTR + 4);
+
+#define vstore_partial_6(DATA, OFFSET, PTR)    \
+    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
+    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
+
+#define vstore_partial_7(DATA, OFFSET, PTR)    \
+    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
+    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
+
+#define vstore_partial_8(DATA, OFFSET, PTR) \
+    vstore8(DATA.s01234567, OFFSET, PTR);
+
+#define vstore_partial_9(DATA, OFFSET, PTR)        \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore1(DATA.s8, OFFSET, PTR + 8);
+
+#define vstore_partial_10(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
+
+#define vstore_partial_11(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
+
+#define vstore_partial_12(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
+
+#define vstore_partial_13(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
+
+#define vstore_partial_14(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
+
+#define vstore_partial_15(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
+
+#define vstore_partial_16(DATA, OFFSET, PTR) \
+    vstore16(DATA, OFFSET, PTR);
+/** @} */ // end of group vstore_partial_n
+/** @} */ // end of group VSTORE_PARTIAL
+
 // Convert built-in functions with _sat modifier are not supported in floating point so we create defines
 // without _sat to overcome this issue
 #define convert_float_sat convert_float
-- cgit v1.2.1