path: root/src/core/CL/cl_kernels/helpers.h
author    Michele Di Giorgio <michele.digiorgio@arm.com> 2021-08-26 14:47:55 +0100
committer Michele Di Giorgio <michele.digiorgio@arm.com> 2021-09-01 22:11:21 +0000
commit    50335fd3d0734157382741fcf1bfdaf630c60c4b (patch)
tree      aa6a8e65cbd2245b84d5ca9c4df7ff5f6735e982 /src/core/CL/cl_kernels/helpers.h
parent    e2276cb18f9dffd484bc5d11f65f8526fb32d5bd (diff)
download  ComputeLibrary-50335fd3d0734157382741fcf1bfdaf630c60c4b.tar.gz
Remove padding from ClGemmMatrixMultiplyReshapedKernel
Create new macros for loading values from memory while respecting the
boundaries of the tensor, so that no page faults are generated.

Resolves: COMPMID-4447

Change-Id: If9a455291e395ebd9070ebe5e120b3064d8fab29
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6168
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/helpers.h')
-rw-r--r--  src/core/CL/cl_kernels/helpers.h | 208
1 file changed, 208 insertions(+), 0 deletions(-)
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 6cd76373d2..fae15b2347 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -202,6 +202,214 @@
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)
+/** Extended partial vload that correctly handles scalar values as well.
+ * Load the **lower** @p load_size elements of the given vector while minimising the number of load ops
+ * @name VLOAD_PARTIAL
+ *
+ * @note With this macro, the passed data can be either a vector or a scalar
+ * @note @p load_size needs to be <= @p size
+ * eg 1: Valid
+ * VLOAD_PARTIAL(16, 15) ...;
+ * eg 2: Invalid
+ * VLOAD_PARTIAL(4, 7) ...;
+ *
+ * @param[in] size      The width of the vector to load into. Supported values: 1(scalar), 2, 3, 4, 8, 16
+ * @param[in] load_size The number of lower elements to load. Supported values: 0-16 (0 loads nothing), but must be <= @p size
+ * @{
+ */
+#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
+#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
+
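+// Fallback used by the dispatch tables below for (size, load_size)
+// combinations that are invalid (load_size > size) or that load nothing
+// (load_size == 0): it expands to an empty block, so no load is emitted.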
+#define NO_LOAD(data, offs, ptr) \
+ { \
+ }
+
+// Size == 1 (scalar)
+#define vload_partial_1_0 NO_LOAD
+#define vload_partial_1_1 vload1
+#define vload_partial_1_2 NO_LOAD
+#define vload_partial_1_3 NO_LOAD
+#define vload_partial_1_4 NO_LOAD
+#define vload_partial_1_5 NO_LOAD
+#define vload_partial_1_6 NO_LOAD
+#define vload_partial_1_7 NO_LOAD
+#define vload_partial_1_8 NO_LOAD
+#define vload_partial_1_9 NO_LOAD
+#define vload_partial_1_10 NO_LOAD
+#define vload_partial_1_11 NO_LOAD
+#define vload_partial_1_12 NO_LOAD
+#define vload_partial_1_13 NO_LOAD
+#define vload_partial_1_14 NO_LOAD
+#define vload_partial_1_15 NO_LOAD
+#define vload_partial_1_16 NO_LOAD
+// Size == 2
+#define vload_partial_2_0 NO_LOAD
+#define vload_partial_2_1 vload_partial_1
+#define vload_partial_2_2 vload_partial_2
+#define vload_partial_2_3 NO_LOAD
+#define vload_partial_2_4 NO_LOAD
+#define vload_partial_2_5 NO_LOAD
+#define vload_partial_2_6 NO_LOAD
+#define vload_partial_2_7 NO_LOAD
+#define vload_partial_2_8 NO_LOAD
+#define vload_partial_2_9 NO_LOAD
+#define vload_partial_2_10 NO_LOAD
+#define vload_partial_2_11 NO_LOAD
+#define vload_partial_2_12 NO_LOAD
+#define vload_partial_2_13 NO_LOAD
+#define vload_partial_2_14 NO_LOAD
+#define vload_partial_2_15 NO_LOAD
+#define vload_partial_2_16 NO_LOAD
+// Size == 3
+#define vload_partial_3_0 NO_LOAD
+#define vload_partial_3_1 vload_partial_1
+#define vload_partial_3_2 vload_partial_2
+#define vload_partial_3_3 vload_partial_3
+#define vload_partial_3_4 NO_LOAD
+#define vload_partial_3_5 NO_LOAD
+#define vload_partial_3_6 NO_LOAD
+#define vload_partial_3_7 NO_LOAD
+#define vload_partial_3_8 NO_LOAD
+#define vload_partial_3_9 NO_LOAD
+#define vload_partial_3_10 NO_LOAD
+#define vload_partial_3_11 NO_LOAD
+#define vload_partial_3_12 NO_LOAD
+#define vload_partial_3_13 NO_LOAD
+#define vload_partial_3_14 NO_LOAD
+#define vload_partial_3_15 NO_LOAD
+#define vload_partial_3_16 NO_LOAD
+// Size == 4
+#define vload_partial_4_0 NO_LOAD
+#define vload_partial_4_1 vload_partial_1
+#define vload_partial_4_2 vload_partial_2
+#define vload_partial_4_3 vload_partial_3
+#define vload_partial_4_4 vload_partial_4
+#define vload_partial_4_5 NO_LOAD
+#define vload_partial_4_6 NO_LOAD
+#define vload_partial_4_7 NO_LOAD
+#define vload_partial_4_8 NO_LOAD
+#define vload_partial_4_9 NO_LOAD
+#define vload_partial_4_10 NO_LOAD
+#define vload_partial_4_11 NO_LOAD
+#define vload_partial_4_12 NO_LOAD
+#define vload_partial_4_13 NO_LOAD
+#define vload_partial_4_14 NO_LOAD
+#define vload_partial_4_15 NO_LOAD
+#define vload_partial_4_16 NO_LOAD
+// Size == 8
+#define vload_partial_8_0 NO_LOAD
+#define vload_partial_8_1 vload_partial_1
+#define vload_partial_8_2 vload_partial_2
+#define vload_partial_8_3 vload_partial_3
+#define vload_partial_8_4 vload_partial_4
+#define vload_partial_8_5 vload_partial_5
+#define vload_partial_8_6 vload_partial_6
+#define vload_partial_8_7 vload_partial_7
+#define vload_partial_8_8 vload_partial_8
+#define vload_partial_8_9 NO_LOAD
+#define vload_partial_8_10 NO_LOAD
+#define vload_partial_8_11 NO_LOAD
+#define vload_partial_8_12 NO_LOAD
+#define vload_partial_8_13 NO_LOAD
+#define vload_partial_8_14 NO_LOAD
+#define vload_partial_8_15 NO_LOAD
+#define vload_partial_8_16 NO_LOAD
+// Size == 16
+#define vload_partial_16_0 NO_LOAD
+#define vload_partial_16_1 vload_partial_1
+#define vload_partial_16_2 vload_partial_2
+#define vload_partial_16_3 vload_partial_3
+#define vload_partial_16_4 vload_partial_4
+#define vload_partial_16_5 vload_partial_5
+#define vload_partial_16_6 vload_partial_6
+#define vload_partial_16_7 vload_partial_7
+#define vload_partial_16_8 vload_partial_8
+#define vload_partial_16_9 vload_partial_9
+#define vload_partial_16_10 vload_partial_10
+#define vload_partial_16_11 vload_partial_11
+#define vload_partial_16_12 vload_partial_12
+#define vload_partial_16_13 vload_partial_13
+#define vload_partial_16_14 vload_partial_14
+#define vload_partial_16_15 vload_partial_15
+#define vload_partial_16_16 vload_partial_16
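+
+// A minimal usage sketch (illustrative only; `acc` and `src_ptr` are
+// hypothetical names, not part of this patch). VLOAD_PARTIAL(size, load_size)
+// resolves through the dispatch tables above to the matching vload_partial_n:
+//
+//     float4 acc = 0;
+//     VLOAD_PARTIAL(4, 3)
+//     (acc, 0, src_ptr); // -> vload_partial_3(acc, 0, src_ptr)
+//                        // -> acc.s012 = vload3(0, src_ptr);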
+
+/** Partial vload. Load the **lower** n elements (elements 0 to n-1) of the given vector while minimising the number of vload ops
+ * @name vload_partial_n
+ *
+ * @note @p DATA needs to be a vector, not a scalar
+ * @note n needs to be <= the vector width of the input variable @p DATA
+ * eg 1: Valid
+ * vload_partial_15(var:float16, 0, 0xabcd);
+ * eg 2: Invalid
+ * vload_partial_7(var:float4, 0, 0xabcd);
+ *
+ * @note For n == 1, 2, 3, 4, 8, 16, a single vload is invoked, so there is no performance penalty.
+ *
+ * @param[in] DATA   The name of the variable into which the values are loaded
+ * @param[in] OFFSET The offset argument forwarded to the underlying vload ops
+ * @param[in] PTR    The base pointer
+ * @{
+ */
+#define vload_partial_1(DATA, OFFSET, PTR) \
+ DATA.s0 = vload1(OFFSET, PTR);
+
+#define vload_partial_2(DATA, OFFSET, PTR) \
+ DATA.s01 = vload2(OFFSET, PTR);
+
+#define vload_partial_3(DATA, OFFSET, PTR) \
+ DATA.s012 = vload3(OFFSET, PTR);
+
+#define vload_partial_4(DATA, OFFSET, PTR) \
+ DATA.s0123 = vload4(OFFSET, PTR);
+
+#define vload_partial_5(DATA, OFFSET, PTR) \
+    vload_partial_4(DATA, OFFSET, PTR)     \
+    DATA.s4 = vload1(OFFSET, PTR + 4);
+
+#define vload_partial_6(DATA, OFFSET, PTR) \
+    vload_partial_4(DATA, OFFSET, PTR)     \
+    DATA.s45 = vload2(OFFSET, PTR + 4);
+
+#define vload_partial_7(DATA, OFFSET, PTR) \
+    vload_partial_4(DATA, OFFSET, PTR)     \
+    DATA.s456 = vload3(OFFSET, PTR + 4);
+
+#define vload_partial_8(DATA, OFFSET, PTR) \
+ DATA.s01234567 = vload8(OFFSET, PTR);
+
+#define vload_partial_9(DATA, OFFSET, PTR) \
+    vload_partial_8(DATA, OFFSET, PTR)     \
+    DATA.s8 = vload1(OFFSET, PTR + 8);
+
+#define vload_partial_10(DATA, OFFSET, PTR) \
+    vload_partial_8(DATA, OFFSET, PTR)      \
+    DATA.s89 = vload2(OFFSET, PTR + 8);
+
+#define vload_partial_11(DATA, OFFSET, PTR) \
+    vload_partial_8(DATA, OFFSET, PTR)      \
+    DATA.s89A = vload3(OFFSET, PTR + 8);
+
+#define vload_partial_12(DATA, OFFSET, PTR) \
+    vload_partial_8(DATA, OFFSET, PTR)      \
+    DATA.s89AB = vload4(OFFSET, PTR + 8);
+
+#define vload_partial_13(DATA, OFFSET, PTR) \
+    vload_partial_12(DATA, OFFSET, PTR)     \
+    DATA.sC = vload1(OFFSET, PTR + 12);
+
+#define vload_partial_14(DATA, OFFSET, PTR) \
+    vload_partial_12(DATA, OFFSET, PTR)     \
+    DATA.sCD = vload2(OFFSET, PTR + 12);
+
+#define vload_partial_15(DATA, OFFSET, PTR) \
+    vload_partial_12(DATA, OFFSET, PTR)     \
+    DATA.sCDE = vload3(OFFSET, PTR + 12);
+
+#define vload_partial_16(DATA, OFFSET, PTR) \
+ DATA = vload16(OFFSET, PTR);
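+
+// For reference, a compound case such as vload_partial_7 expands (a sketch of
+// the preprocessor output, assuming OFFSET == 0 and a vector-typed DATA) to:
+//
+//     DATA.s0123 = vload4(0, PTR);
+//     DATA.s456  = vload3(0, PTR + 4);
+//
+// i.e. two vector loads instead of seven scalar ones.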
+/** @} */ // end of group vload_partial_n
+/** @} */ // end of group VLOAD_PARTIAL
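+
+// How these macros avoid out-of-bounds reads (hypothetical sketch; REM is an
+// assumed compile-time macro holding the number of valid trailing elements,
+// with REM <= 8): at the right-hand edge of a tensor, a kernel loads only the
+// valid remainder instead of a full, possibly out-of-bounds, vector:
+//
+//     float8 in0 = 0;
+//     VLOAD_PARTIAL(8, REM)
+//     (in0, 0, (__global float *)src_addr);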
+
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4