diff options
Diffstat (limited to 'src/core/CL/cl_kernels/helpers.h')
-rw-r--r-- | src/core/CL/cl_kernels/helpers.h | 136 |
1 files changed, 136 insertions, 0 deletions
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index f6795663df..9206110f63 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -255,6 +255,142 @@
 
 #define vload1(OFFSET, PTR) *(OFFSET + PTR)
 #define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+/** Extended partial vstore that correctly handles scalar values as well.
+ * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
+ * @name VSTORE_PARTIAL
+ *
+ * @note With this macro, the passed data can be both a vector and a scalar
+ * @note @p store_size needs to be <= @p size
+ * eg 1: Valid
+ * VSTORE_PARTIAL(16, 15) ...;
+ * eg 2: Invalid
+ * VSTORE_PARTIAL(4, 7) ...;
+ *
+ * @param[in] size       The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
+ * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size
+ * @{
+ */
+#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
+#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
+
+// Size == 1 (scalar)
+#define vstore_partial_1_1 vstore1
+// Size == 2
+#define vstore_partial_2_1 vstore_partial_1
+#define vstore_partial_2_2 vstore_partial_2
+// Size == 3
+#define vstore_partial_3_1 vstore_partial_1
+#define vstore_partial_3_2 vstore_partial_2
+#define vstore_partial_3_3 vstore_partial_3
+// Size == 4
+#define vstore_partial_4_1 vstore_partial_1
+#define vstore_partial_4_2 vstore_partial_2
+#define vstore_partial_4_3 vstore_partial_3
+#define vstore_partial_4_4 vstore_partial_4
+// Size == 8
+#define vstore_partial_8_1 vstore_partial_1
+#define vstore_partial_8_2 vstore_partial_2
+#define vstore_partial_8_3 vstore_partial_3
+#define vstore_partial_8_4 vstore_partial_4
+#define vstore_partial_8_5 vstore_partial_5
+#define vstore_partial_8_6 vstore_partial_6
+#define vstore_partial_8_7 vstore_partial_7
+#define vstore_partial_8_8 vstore_partial_8
+// Size == 16
+#define vstore_partial_16_1 vstore_partial_1
+#define vstore_partial_16_2 vstore_partial_2
+#define vstore_partial_16_3 vstore_partial_3
+#define vstore_partial_16_4 vstore_partial_4
+#define vstore_partial_16_5 vstore_partial_5
+#define vstore_partial_16_6 vstore_partial_6
+#define vstore_partial_16_7 vstore_partial_7
+#define vstore_partial_16_8 vstore_partial_8
+#define vstore_partial_16_9 vstore_partial_9
+#define vstore_partial_16_10 vstore_partial_10
+#define vstore_partial_16_11 vstore_partial_11
+#define vstore_partial_16_12 vstore_partial_12
+#define vstore_partial_16_13 vstore_partial_13
+#define vstore_partial_16_14 vstore_partial_14
+#define vstore_partial_16_15 vstore_partial_15
+#define vstore_partial_16_16 vstore_partial_16
+
+/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
+ * @name vstore_partial_n
+ *
+ * @note @p DATA needs to be a vector not a scalar
+ * @note n needs to be <= the vector width of the input variable @p DATA
+ * eg 1: Valid
+ * vstore_partial_15(var:float16, 0, 0xabcd);
+ * eg 2: Invalid
+ * vstore_partial_7(var:float4, 0, 0xabcd);
+ *
+ * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty.
+ * @note for n == 5, 9 the trailing element is a scalar and is written with vstore1; for n == 13, 14, 15 the upper half is passed down as the whole .s89abcdef swizzle, since OpenCL C has no 5-, 6- or 7-wide vectors.
+ * @param[in] DATA   The name of the variable
+ * @param[in] OFFSET Offset in n. It is forwarded unchanged to every underlying vstore, so for composite n (5-7, 9-15) only 0 gives a contiguous store
+ * @param[in] PTR    The base pointer
+ * @{
+ */
+#define vstore_partial_1(DATA, OFFSET, PTR) \
+    vstore1(DATA.s0, OFFSET, PTR);
+
+#define vstore_partial_2(DATA, OFFSET, PTR) \
+    vstore2(DATA.s01, OFFSET, PTR);
+
+#define vstore_partial_3(DATA, OFFSET, PTR) \
+    vstore3(DATA.s012, OFFSET, PTR);
+
+#define vstore_partial_4(DATA, OFFSET, PTR) \
+    vstore4(DATA.s0123, OFFSET, PTR);
+
+#define vstore_partial_5(DATA, OFFSET, PTR)    \
+    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
+    vstore1(DATA.s4, OFFSET, (PTR) + 4);
+
+#define vstore_partial_6(DATA, OFFSET, PTR)    \
+    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
+    vstore_partial_2(DATA.s45, OFFSET, (PTR) + 4);
+
+#define vstore_partial_7(DATA, OFFSET, PTR)    \
+    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
+    vstore_partial_3(DATA.s456, OFFSET, (PTR) + 4);
+
+#define vstore_partial_8(DATA, OFFSET, PTR) \
+    vstore8(DATA.s01234567, OFFSET, PTR);
+
+#define vstore_partial_9(DATA, OFFSET, PTR)        \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore1(DATA.s8, OFFSET, (PTR) + 8);
+
+#define vstore_partial_10(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_2(DATA.s89, OFFSET, (PTR) + 8);
+
+#define vstore_partial_11(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_3(DATA.s89a, OFFSET, (PTR) + 8);
+
+#define vstore_partial_12(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_4(DATA.s89ab, OFFSET, (PTR) + 8);
+
+#define vstore_partial_13(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_5(DATA.s89abcdef, OFFSET, (PTR) + 8);
+
+#define vstore_partial_14(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_6(DATA.s89abcdef, OFFSET, (PTR) + 8);
+
+#define vstore_partial_15(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_7(DATA.s89abcdef, OFFSET, (PTR) + 8);
+
+#define vstore_partial_16(DATA, OFFSET, PTR) \
+    vstore16(DATA, OFFSET, PTR);
+/** @} */ // end of group vstore_partial_n
+/** @} */ // end of group VSTORE_PARTIAL
+
 // Convert built-in functions with _sat modifier are not supported in floating point so we create defines
 // without _sat to overcome this issue
 #define convert_float_sat convert_float |