From bde2f3536a33e1d018d03c266cc9cd557f50e3e8 Mon Sep 17 00:00:00 2001
From: Giorgio Arena
Date: Tue, 7 Sep 2021 14:15:28 +0100
Subject: Remove padding from ClGemmMatrixMultiplyReshapedKernel

Create new macros for loading values from memory while staying aware of the
boundaries of the tensor, so that no out-of-bounds reads (and therefore no
page faults) are generated.

Resolves: COMPMID-4447

Signed-off-by: Giorgio Arena
Change-Id: Ia5fd0a5dcb40942bccd5e686307d0055e1a1dd82
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6226
Reviewed-by: Gian Marco Iodice
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 src/core/CL/cl_kernels/helpers.h | 208 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 208 insertions(+)

diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 6cd76373d2..fae15b2347 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -202,6 +202,214 @@
 #define VLOAD_STR(size) vload##size
 #define VLOAD(size) VLOAD_STR(size)
 
+/** Extended partial vload that correctly handles scalar values as well.
+ * Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of load ops
+ * @name VLOAD_PARTIAL
+ *
+ * @note With this macro, the passed data can be both a vector and a scalar
+ * @note @p load_size needs to be <= @p size
+ * eg 1: Valid
+ *   VLOAD_PARTIAL(16, 15) ...;
+ * eg 2: Invalid
+ *   VLOAD_PARTIAL(4, 7) ...;
+ *
+ * @param[in] size      The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
+ * @param[in] load_size The number of lower elements to load. Supported values: 1-16, but has to be <= @p size
+ * @{
+ */
+#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
+#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
+
+#define NO_LOAD(data, offs, ptr) \
+    {                            \
+    }
+
+// Size == 1 (scalar)
+#define vload_partial_1_0 NO_LOAD
+#define vload_partial_1_1 vload1
+#define vload_partial_1_2 NO_LOAD
+#define vload_partial_1_3 NO_LOAD
+#define vload_partial_1_4 NO_LOAD
+#define vload_partial_1_5 NO_LOAD
+#define vload_partial_1_6 NO_LOAD
+#define vload_partial_1_7 NO_LOAD
+#define vload_partial_1_8 NO_LOAD
+#define vload_partial_1_9 NO_LOAD
+#define vload_partial_1_10 NO_LOAD
+#define vload_partial_1_11 NO_LOAD
+#define vload_partial_1_12 NO_LOAD
+#define vload_partial_1_13 NO_LOAD
+#define vload_partial_1_14 NO_LOAD
+#define vload_partial_1_15 NO_LOAD
+#define vload_partial_1_16 NO_LOAD
+// Size == 2
+#define vload_partial_2_0 NO_LOAD
+#define vload_partial_2_1 vload_partial_1
+#define vload_partial_2_2 vload_partial_2
+#define vload_partial_2_3 NO_LOAD
+#define vload_partial_2_4 NO_LOAD
+#define vload_partial_2_5 NO_LOAD
+#define vload_partial_2_6 NO_LOAD
+#define vload_partial_2_7 NO_LOAD
+#define vload_partial_2_8 NO_LOAD
+#define vload_partial_2_9 NO_LOAD
+#define vload_partial_2_10 NO_LOAD
+#define vload_partial_2_11 NO_LOAD
+#define vload_partial_2_12 NO_LOAD
+#define vload_partial_2_13 NO_LOAD
+#define vload_partial_2_14 NO_LOAD
+#define vload_partial_2_15 NO_LOAD
+#define vload_partial_2_16 NO_LOAD
+// Size == 3
+#define vload_partial_3_0 NO_LOAD
+#define vload_partial_3_1 vload_partial_1
+#define vload_partial_3_2 vload_partial_2
+#define vload_partial_3_3 vload_partial_3
+#define vload_partial_3_4 NO_LOAD
+#define vload_partial_3_5 NO_LOAD
+#define vload_partial_3_6 NO_LOAD
+#define vload_partial_3_7 NO_LOAD
+#define vload_partial_3_8 NO_LOAD
+#define vload_partial_3_9 NO_LOAD
+#define vload_partial_3_10 NO_LOAD
+#define vload_partial_3_11 NO_LOAD
+#define vload_partial_3_12 NO_LOAD
+#define vload_partial_3_13 NO_LOAD
+#define vload_partial_3_14 NO_LOAD
+#define vload_partial_3_15 NO_LOAD
+#define vload_partial_3_16 NO_LOAD
+// Size == 4
+#define vload_partial_4_0 NO_LOAD
+#define vload_partial_4_1 vload_partial_1
+#define vload_partial_4_2 vload_partial_2
+#define vload_partial_4_3 vload_partial_3
+#define vload_partial_4_4 vload_partial_4
+#define vload_partial_4_5 NO_LOAD
+#define vload_partial_4_6 NO_LOAD
+#define vload_partial_4_7 NO_LOAD
+#define vload_partial_4_8 NO_LOAD
+#define vload_partial_4_9 NO_LOAD
+#define vload_partial_4_10 NO_LOAD
+#define vload_partial_4_11 NO_LOAD
+#define vload_partial_4_12 NO_LOAD
+#define vload_partial_4_13 NO_LOAD
+#define vload_partial_4_14 NO_LOAD
+#define vload_partial_4_15 NO_LOAD
+#define vload_partial_4_16 NO_LOAD
+// Size == 8
+#define vload_partial_8_0 NO_LOAD
+#define vload_partial_8_1 vload_partial_1
+#define vload_partial_8_2 vload_partial_2
+#define vload_partial_8_3 vload_partial_3
+#define vload_partial_8_4 vload_partial_4
+#define vload_partial_8_5 vload_partial_5
+#define vload_partial_8_6 vload_partial_6
+#define vload_partial_8_7 vload_partial_7
+#define vload_partial_8_8 vload_partial_8
+#define vload_partial_8_9 NO_LOAD
+#define vload_partial_8_10 NO_LOAD
+#define vload_partial_8_11 NO_LOAD
+#define vload_partial_8_12 NO_LOAD
+#define vload_partial_8_13 NO_LOAD
+#define vload_partial_8_14 NO_LOAD
+#define vload_partial_8_15 NO_LOAD
+#define vload_partial_8_16 NO_LOAD
+// Size == 16
+#define vload_partial_16_0 NO_LOAD
+#define vload_partial_16_1 vload_partial_1
+#define vload_partial_16_2 vload_partial_2
+#define vload_partial_16_3 vload_partial_3
+#define vload_partial_16_4 vload_partial_4
+#define vload_partial_16_5 vload_partial_5
+#define vload_partial_16_6 vload_partial_6
+#define vload_partial_16_7 vload_partial_7
+#define vload_partial_16_8 vload_partial_8
+#define vload_partial_16_9 vload_partial_9
+#define vload_partial_16_10 vload_partial_10
+#define vload_partial_16_11 vload_partial_11
+#define vload_partial_16_12 vload_partial_12
+#define vload_partial_16_13 vload_partial_13
+#define vload_partial_16_14 vload_partial_14
+#define vload_partial_16_15 vload_partial_15
+#define vload_partial_16_16 vload_partial_16
+
+/** Partial vload. Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vload ops
+ * @name vload_partial_n
+ *
+ * @note @p DATA needs to be a vector not a scalar
+ * @note n needs to be <= the vector width of the input variable @p DATA
+ * eg 1: Valid
+ *   vload_partial_15(var:float16, 0, 0xabcd);
+ * eg 2: Invalid
+ *   vload_partial_7(var:float4, 0, 0xabcd);
+ *
+ * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vload is invoked, thus there's no performance penalty.
+ *
+ * @param[in] DATA   The name of the variable into which to load the values
+ * @param[in] OFFSET The offset passed on to the underlying vloadn calls
+ * @param[in] PTR    The base pointer
+ * @{
+ */
+#define vload_partial_1(DATA, OFFSET, PTR) \
+    DATA.s0 = vload1(OFFSET, PTR);
+
+#define vload_partial_2(DATA, OFFSET, PTR) \
+    DATA.s01 = vload2(OFFSET, PTR);
+
+#define vload_partial_3(DATA, OFFSET, PTR) \
+    DATA.s012 = vload3(OFFSET, PTR);
+
+#define vload_partial_4(DATA, OFFSET, PTR) \
+    DATA.s0123 = vload4(OFFSET, PTR);
+
+#define vload_partial_5(DATA, OFFSET, PTR) \
+    vload_partial_4(DATA, OFFSET, PTR)     \
+    DATA.s4 = vload1(OFFSET, PTR + 4);
+
+#define vload_partial_6(DATA, OFFSET, PTR) \
+    vload_partial_4(DATA, OFFSET, PTR)     \
+    DATA.s45 = vload2(OFFSET, PTR + 4);
+
+#define vload_partial_7(DATA, OFFSET, PTR) \
+    vload_partial_4(DATA, OFFSET, PTR)     \
+    DATA.s456 = vload3(OFFSET, PTR + 4);
+
+#define vload_partial_8(DATA, OFFSET, PTR) \
+    DATA.s01234567 = vload8(OFFSET, PTR);
+
+#define vload_partial_9(DATA, OFFSET, PTR) \
+    vload_partial_8(DATA, OFFSET, PTR)     \
+    DATA.s8 = vload1(OFFSET, PTR + 8);
+
+#define vload_partial_10(DATA, OFFSET, PTR) \
+    vload_partial_8(DATA, OFFSET, PTR)      \
+    DATA.s89 = vload2(OFFSET, PTR + 8);
+
+#define vload_partial_11(DATA, OFFSET, PTR) \
+    vload_partial_8(DATA, OFFSET, PTR)      \
+    DATA.s89A = vload3(OFFSET, PTR + 8);
+
+#define vload_partial_12(DATA, OFFSET, PTR) \
+    vload_partial_8(DATA, OFFSET, PTR)      \
+    DATA.s89AB = vload4(OFFSET, PTR + 8);
+
+#define vload_partial_13(DATA, OFFSET, PTR) \
+    vload_partial_8(DATA, OFFSET, PTR)      \
+    DATA.s89AB = vload4(OFFSET, PTR + 8);   \
+    DATA.sC = vload1(OFFSET, PTR + 12);
+
+#define vload_partial_14(DATA, OFFSET, PTR) \
+    vload_partial_8(DATA, OFFSET, PTR)      \
+    DATA.s89AB = vload4(OFFSET, PTR + 8);   \
+    DATA.sCD = vload2(OFFSET, PTR + 12);
+
+#define vload_partial_15(DATA, OFFSET, PTR) \
+    vload_partial_8(DATA, OFFSET, PTR)      \
+    DATA.s89AB = vload4(OFFSET, PTR + 8);   \
+    DATA.sCDE = vload3(OFFSET, PTR + 12);
+
+#define vload_partial_16(DATA, OFFSET, PTR) \
+    DATA = vload16(OFFSET, PTR);
+/** @} */ // end of group vload_partial_n
+/** @} */ // end of group VLOAD_PARTIAL
+
 #define PIXEL_UNIT4 1
 #define PIXEL_UNIT8 2
 #define PIXEL_UNIT16 4
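For context, here is a minimal usage sketch of the new macros. It is not part of the patch: the kernel name, variable names and the assumed build options (-DN0=4 -DPARTIAL_N0=3) are illustrative only. A kernel that consumes blocks of N0 values per row can guard the leftover block with VLOAD_PARTIAL so that it never reads past the end of the row, which is the kind of guard that lets a kernel run on unpadded tensors.

    // Illustrative sketch only, not part of ComputeLibrary. Assumes the helpers above are
    // available via helpers.h and that the kernel is built with -DN0=4 -DPARTIAL_N0=3.
    #include "helpers.h"

    __kernel void example_boundary_aware_load(__global const float *src_row,
                                              const int is_leftover_block)
    {
        float4 a0 = (float4)0.0f; // lanes not covered by the partial load stay zero

        if(is_leftover_block)
        {
            // VLOAD_PARTIAL(4, 3) resolves to vload_partial_4_3 -> vload_partial_3,
            // which expands to: a0.s012 = vload3(0, src_row);
            VLOAD_PARTIAL(N0, PARTIAL_N0)
            (a0, 0, src_row);
        }
        else
        {
            a0 = VLOAD(N0)(0, src_row); // full block: one unguarded vector load
        }
        // ... use a0 ...
    }

The two-level VLOAD_PARTIAL / VLOAD_PARTIAL_STR indirection is what allows N0 and PARTIAL_N0 to be expanded to their numeric values before token pasting, so build options can be passed straight through.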