author    Gian Marco Iodice <gianmarco.iodice@arm.com>  2020-08-10 21:44:14 +0100
committer Gian Marco Iodice <gianmarco.iodice@arm.com>  2020-08-11 13:26:10 +0000
commit    73cdaac61d3121d4d6556846de259dd734afdccf (patch)
tree      a9d828cd3064d2c53d104495bdaf78e53f0e41f8
parent    f650ea5be66be5c7b16faf5482e793e3a6d2430c (diff)
download  ComputeLibrary-73cdaac61d3121d4d6556846de259dd734afdccf.tar.gz
COMPMID-3335: Remove x/y-axis padding from CLGEMMReshapeLHSMatrixKernel
- Remove the padding requirement for the input tensor of CLGEMMReshapeLHSMatrixKernel
- Add a utility function to load a boundary-aware 2D tensor from a buffer
- Extend the validation suite to check the zero padding requirement

Change-Id: I0ac6b1b517d75fd56998f406e0cce97b40918ce1
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3701
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  src/core/CL/cl_kernels/gemm.cl                        |  90
-rw-r--r--  src/core/CL/cl_kernels/gemm_helpers.h                 | 260
-rw-r--r--  src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp  |  19
-rw-r--r--  tests/validation/CL/GEMMReshapeLHSMatrix.cpp          |  61
4 files changed, 371 insertions, 59 deletions
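
The change replaces compile-time padding with boundary-aware loads: instead of padding the
LHS tensor up to a multiple of the block size, the configure step now derives the leftover
(partial) block sizes from the tensor dimensions and passes them to the kernel. A minimal C
sketch of that arithmetic, with illustrative variable names and sizes:

    /* Partial block sizes are the remainders of the tensor extents divided by
     * the block extents. A value of 0 means the tensor tiles exactly, so every
     * work-item loads a full M0 x K0 block. */
    unsigned int partial_load_m0 = src_height % m0; /* leftover rows    */
    unsigned int partial_load_k0 = src_width % k0;  /* leftover columns */

    /* Example: a 23 x 63 (width x height) input with K0 = 8 and M0 = 4 gives
     * partial_load_k0 = 23 % 8 = 7 and partial_load_m0 = 63 % 4 = 3. */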
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index adb3a1c25d..8f5f8e3d07 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -24,7 +24,7 @@
#include "gemm_helpers.h"
#include "repeat.h"
-#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
@@ -43,13 +43,42 @@
({})
#endif // (SRC_WIDTH % K0)
+#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ ({ \
+ if(y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0) \
+ { \
+ if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
+ { \
+ LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
+ } \
+ else \
+ { \
+ LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
+ } \
+ } \
+ else \
+ { \
+ if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
+ { \
+ LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
+ } \
+ else \
+ { \
+ LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
+ } \
+ } \
+ })
+
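
The new LOAD_TENSOR_BOUNDARY_AWARE_M0XK0 macro above picks one of four load shapes per
work-item, depending on whether the block touches the bottom and/or right edge of the
source tensor. The same selection logic, restated as a hedged scalar C sketch (the macro
itself branches with nested ifs because the load sizes must be compile-time constants):

    /* Effective block shape for the tile at block coordinates (x, y):
     * interior tiles load a full M0 x K0 block; tiles on the bottom edge load only
     * PARTIAL_LOAD_M0 rows; tiles on the right edge load only PARTIAL_LOAD_K0 columns. */
    unsigned int rows = (y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0) ? PARTIAL_LOAD_M0 : M0;
    unsigned int cols = (x * K0 + K0 >= SRC_WIDTH && PARTIAL_LOAD_K0 != 0) ? PARTIAL_LOAD_K0 : K0;
    /* LOAD_TENSOR_M0XN0(rows, cols, ...) then performs the load; elements a partial load
     * leaves untouched keep the zeros set beforehand by REPEAT_VAR_INIT_TO_CONST. */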
/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
* the output matrix unrolling the values.
*
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
* @note Only the following values for M0, K0 and V0 are supported:
* M0: 2,3,4,5,6,7,8
* K0: 2,3,4,8,16
@@ -141,29 +170,10 @@ __kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
// ---------------------------Load input values --------------------------------
// Load values from the LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
- BOUNDARY_CONDITION_X(x, a0);
-#if M0 > 1
- BOUNDARY_CONDITION_X(x, a1);
-#endif // M0 > 1
-#if M0 > 2
- BOUNDARY_CONDITION_X(x, a2);
-#endif // M0 > 2
-#if M0 > 3
- BOUNDARY_CONDITION_X(x, a3);
-#endif // M0 > 3
-#if M0 > 4
- BOUNDARY_CONDITION_X(x, a4);
-#endif // M0 > 4
-#if M0 > 5
- BOUNDARY_CONDITION_X(x, a5);
-#endif // M0 > 5
-#if M0 > 6
- BOUNDARY_CONDITION_X(x, a6);
-#endif // M0 > 6
-#if M0 > 7
- BOUNDARY_CONDITION_X(x, a7);
-#endif // M0 > 7
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
+
+ LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
+
// ---------------------------Store output values ------------------------------
REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
@@ -248,8 +258,11 @@ __kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
*
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
* @note Only the following values for M0, K0 and V0 are supported:
* M0: 2,3,4,5,6,7,8
* K0: 2,3,4,8,16
@@ -340,31 +353,10 @@ __kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
output_ptr += z * (uint)dst_stride_z;
// ---------------------------Load input values --------------------------------
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
+
+ LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
- // Load values from the LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
- BOUNDARY_CONDITION_X(x, a0);
-#if M0 > 1
- BOUNDARY_CONDITION_X(x, a1);
-#endif // M0 > 1
-#if M0 > 2
- BOUNDARY_CONDITION_X(x, a2);
-#endif // M0 > 2
-#if M0 > 3
- BOUNDARY_CONDITION_X(x, a3);
-#endif // M0 > 3
-#if M0 > 4
- BOUNDARY_CONDITION_X(x, a4);
-#endif // M0 > 4
-#if M0 > 5
- BOUNDARY_CONDITION_X(x, a5);
-#endif // M0 > 5
-#if M0 > 6
- BOUNDARY_CONDITION_X(x, a6);
-#endif // M0 > 6
-#if M0 > 7
- BOUNDARY_CONDITION_X(x, a7);
-#endif // M0 > 7
// ---------------------------Transpose and store block -----------------------
TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
@@ -396,7 +388,7 @@ __kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}
-#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
index 5b6ad17ce0..fada0302ff 100644
--- a/src/core/CL/cl_kernels/gemm_helpers.h
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -24,10 +24,268 @@
#include "activation_float_helpers.h"
#include "helpers.h"
+/** Utility macro to access a run of consecutive scalar positions within an OpenCL vector
+ *
+ * Supported offset values are 0, 1, 2, 3, 4, 8, 12 and 16
+ *
+ * @param[in] offset The scalar offset within the vector
+ * @param[in] n0     The number of consecutive elements to access. offset + n0 must be <= 16
+ * @param[in] x      Vector to access
+ * @{
+ */
+#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
+#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
+
+// offset == 0
+#define scalar_access_0_1(x) ((x).s0)
+#define scalar_access_0_2(x) ((x).s01)
+#define scalar_access_0_3(x) ((x).s012)
+#define scalar_access_0_4(x) ((x).s0123)
+#define scalar_access_0_8(x) ((x).s01234567)
+#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
+
+// offset == 1
+#define scalar_access_1_1(x) ((x).s1)
+#define scalar_access_1_2(x) ((x).s12)
+#define scalar_access_1_3(x) ((x).s123)
+#define scalar_access_1_4(x) ((x).s1234)
+#define scalar_access_1_8(x) ((x).s12345678)
+
+// offset == 2
+#define scalar_access_2_1(x) ((x).s2)
+#define scalar_access_2_2(x) ((x).s23)
+#define scalar_access_2_3(x) ((x).s234)
+#define scalar_access_2_4(x) ((x).s2345)
+#define scalar_access_2_8(x) ((x).s23456789)
+
+// offset == 3
+#define scalar_access_3_1(x) ((x).s3)
+#define scalar_access_3_2(x) ((x).s34)
+#define scalar_access_3_3(x) ((x).s345)
+#define scalar_access_3_4(x) ((x).s3456)
+#define scalar_access_3_8(x) ((x).s3456789A)
+
+// offset == 4
+#define scalar_access_4_1(x) ((x).s4)
+#define scalar_access_4_2(x) ((x).s45)
+#define scalar_access_4_3(x) ((x).s456)
+#define scalar_access_4_4(x) ((x).s4567)
+#define scalar_access_4_8(x) ((x).s456789AB)
+
+// offset == 8
+#define scalar_access_8_1(x) ((x).s8)
+#define scalar_access_8_2(x) ((x).s89)
+#define scalar_access_8_3(x) ((x).s89A)
+#define scalar_access_8_4(x) ((x).s89AB)
+#define scalar_access_8_8(x) ((x).s89ABCDEF)
+
+// offset == 12
+#define scalar_access_12_1(x) ((x).sC)
+#define scalar_access_12_2(x) ((x).sCD)
+#define scalar_access_12_3(x) ((x).sCDE)
+#define scalar_access_12_4(x) ((x).sCDEF)
+
+// offset == 16
+#define scalar_access_16_1(x) ((x).sF)
+
+/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) without allocating variables.
+ * @name LOAD_TENSOR_ROW_n
+ *
+ * @param[in] N0 The number of columns to load
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME The basename of the destination variables for the loaded rows
+ * @param[in] PTR The base pointer
+ * @param[in] COL_OFFSET The column vector offset. COL_OFFSET + N0 must be <= 16
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The z-axis offset vector
+ * @{
+ */
+#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ ({})
+
+#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+/** @}*/ // end of group LOAD_TENSOR_ROW_n
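
For a concrete picture of the recursion, LOAD_TENSOR_ROW_3(4, float, a, ptr, 0, stride_y, zin)
unrolls into one vload4 per row, each written into lanes 0..3 of the row variables a0, a1
and a2 (an illustrative expansion, not additional library code):

    SCALAR_ACCESS(0, 4, a0) = VLOAD(4)(0, (__global float *)(ptr + 0 * stride_y + zin0));
    SCALAR_ACCESS(0, 4, a1) = VLOAD(4)(0, (__global float *)(ptr + 1 * stride_y + zin1));
    SCALAR_ACCESS(0, 4, a2) = VLOAD(4)(0, (__global float *)(ptr + 2 * stride_y + zin2));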
+
+/** Load tensor (consecutive rows and columns) with Z offset.
+ * @name LOAD_TENSOR
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of consecutive rows
+ * @param[in] N0 The number of consecutive columns
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME The basename of the result variables
+ * @param[in] PTR The base pointer for the data
+ * @param[in] COL_OFFSET The column vector offset. COL_OFFSET + N0 must be <= 16
+ * @param[in] STRIDE_Y The stride in y-axis direction
+ * @param[in] Z The z-axis offset vector
+ * @{
+ */
+#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+/** @} */ // end of group LOAD_TENSOR
+
+/** Load 2D tensor (consecutive rows and columns) with Z offset.
+ * @name LOAD_TENSOR_M0Xn
+ *
+ * @param[in] M0 The number of rows to load [0-16]
+ * @param[in] N0 The number of columns to load [0-16]
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME The basename of the destination variables for the loaded rows
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The z-axis offset vector
+ * @{
+ */
+#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ ({})
+
+#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
+ LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+ LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+/** @}*/ // end of group LOAD_TENSOR_M0Xn
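
The M0Xn dispatch exists because OpenCL vector loads only come in widths 1, 2, 3, 4, 8 and
16: column counts with no native vload (5, 6, 7 and 9 through 15) are decomposed into a
power-of-two load plus a smaller tail load at the matching column offset. For example, a
7-column load per row resolves to the pair of calls in LOAD_TENSOR_M0X7 above:

    /* lanes 0..3 via one vload4 per row, lanes 4..6 via one vload3 per row */
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);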
+
+/** Load 2D tensor (consecutive rows and columns) with Z offset.
+ * @name LOAD_TENSOR_M0XN0
+ *
+ * @param[in] M0 The number of consecutive rows [0-16]
+ * @param[in] N0 The number of consecutive columns [0-16]
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME The basename of the result variables
+ * @param[in] PTR The base pointer for the data
+ * @param[in] STRIDE_Y The stride in y-axis direction
+ * @param[in] Z The z-axis offset vector
+ * @{
+ */
+#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group LOAD_TENSOR_M0XN0
+
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
* @name LOAD_ROW_n
*
- * @param[in] N0 The number of rows to load
+ * @param[in] N0 The number of columns to load
* @param[in] DATA_TYPE The data type of variables
* @param[in] BASENAME The basename of the destination variables for the loaded rows
* @param[in] PTR The base pointer
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
index 5881576f45..156a657f28 100644
--- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
@@ -88,17 +88,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*input, lhs_info, reinterpret_input_as_3d)));
// Configure window
- // Note: bottom paddings are calculated manually as the input can be reinterpreted as 3D tensor
- // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
- const int m = reinterpret_input_as_3d ? input->tensor_shape()[1] * input->tensor_shape()[2] : input->tensor_shape()[1];
- const int bottom_pad = ceil_to_multiple(m, num_elems_processed_per_iteration_y) - m;
-
Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
Window win_in = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
AccessWindowStatic input_access(input, 0, 0,
- ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration_x),
- input->dimension(1) + bottom_pad);
+ input->dimension(0),
+ input->dimension(1));
AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
window_changed = update_window_and_padding(win_in, input_access) || // window used by the execute_window_loop
@@ -135,17 +130,25 @@ void CLGEMMReshapeLHSMatrixKernel::configure(const CLCompileContext &compile_con
_output = output;
_reinterpret_input_as_3d = reinterpret_input_as_3d;
+ const unsigned int src_w = input->info()->dimension(0);
+ const unsigned int src_h = _reinterpret_input_as_3d ? input->info()->dimension(1) * input->info()->dimension(2) : input->info()->dimension(1);
+ const unsigned int partial_load_m0 = src_h % lhs_info.m0;
+ const unsigned int partial_load_k0 = src_w % lhs_info.k0;
+
// Create build options
CLBuildOptions build_opts;
build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_w));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_h));
build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1)));
build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2)));
build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
+ build_opts.add_option("-DPARTIAL_LOAD_M0=" + support::cpp11::to_string(partial_load_m0));
+ build_opts.add_option("-DPARTIAL_LOAD_K0=" + support::cpp11::to_string(partial_load_k0));
std::string kernel_name("gemm_reshape_lhs_matrix_");
kernel_name += lhs_info.transpose ? "t" : "nt";
diff --git a/tests/validation/CL/GEMMReshapeLHSMatrix.cpp b/tests/validation/CL/GEMMReshapeLHSMatrix.cpp
index 985a8bb11b..d9439f63f1 100644
--- a/tests/validation/CL/GEMMReshapeLHSMatrix.cpp
+++ b/tests/validation/CL/GEMMReshapeLHSMatrix.cpp
@@ -81,10 +81,69 @@ const auto i_values = framework::dataset::make("interleave", { true, false });
/** Transpose values to test */
const auto t_values = framework::dataset::make("transpose", { true, false });
+
+/** Zero padding test */
+bool validate_zero_padding(unsigned int m_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int k0_value, unsigned int v0_value,
+ bool i_value_lhs, bool t_value_lhs, bool input_as_3d, DataType dt)
+{
+ const unsigned int M = m_value;
+ const unsigned int K = k_value;
+ const unsigned int B = b_value;
+
+ GEMMLHSMatrixInfo lhs_info;
+ lhs_info.m0 = m0_value;
+ lhs_info.k0 = k0_value;
+ lhs_info.v0 = v0_value;
+ lhs_info.interleave = i_value_lhs;
+ lhs_info.transpose = t_value_lhs;
+
+ const TensorShape lhs_shape(K, M, B);
+ const TensorShape lhs_shape_reshaped = compute_lhs_reshaped_shape(TensorInfo(lhs_shape, 1, dt), lhs_info, input_as_3d);
+
+ // Create tensors
+ CLTensor lhs = create_tensor<CLTensor>(lhs_shape, dt);
+ CLTensor dst = create_tensor<CLTensor>(lhs_shape_reshaped, dt);
+
+ ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Validate zero-padding
+ CLGEMMReshapeLHSMatrixKernel lhs_reshape;
+
+ lhs_reshape.configure(&lhs, &dst, lhs_info, input_as_3d);
+
+ return lhs.info()->padding().empty();
+}
} // namespace
TEST_SUITE(CL)
TEST_SUITE(GEMMReshapeLHSMatrix)
+
+/** Validate zero padding tests for the LHS input tensor
+ *
+ * A series of validation tests that check the zero padding requirement
+ *
+ * Checks performed in order:
+ * - Case where M and K are smaller than M0 and K0
+ * - Generic test case with batch size = 1
+ * - Generic test case with batch size = 4
+ * - Generic test case with input_as_3d_value = true
+ */
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
+framework::dataset::make("M", { 1, 23, 63, 101 }),
+framework::dataset::make("K", { 1, 47, 29, 27 })),
+framework::dataset::make("B", { 1, 1, 4, 7 })),
+framework::dataset::make("M0", { 4, 2, 4, 8 })),
+framework::dataset::make("K0", { 2, 2, 4, 8 })),
+framework::dataset::make("input_as_3d", { false, false, false, true })),
+m_value, k_value, b_value, m0_value, k0_value, input_as_3d_value)
+{
+ constexpr DataType dt = DataType::F32;
+
+ bool status = validate_zero_padding(m_value, k_value, b_value, m0_value, k0_value, 2, false, false, input_as_3d_value, dt);
+ ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
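
For reference, these dataset combinations deliberately exercise both the full-block and the
partial-block load paths; the leftovers each case produces (worked out by hand from
M % M0 and K % K0) are:

    /* (M, K, M0, K0) -> (M % M0, K % K0)                                 */
    /* (  1,  1, 4, 2) -> (1, 1)  M and K smaller than the block          */
    /* ( 23, 47, 2, 2) -> (1, 1)  generic case, batch size 1              */
    /* ( 63, 29, 4, 4) -> (3, 1)  generic case, batch size 4              */
    /* (101, 27, 8, 8) -> (5, 3)  input reinterpreted as 3D, batch size 7 */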
+
FIXTURE_DATA_TEST_CASE(S32, CLGEMMReshapeLHSMatrixFixture<int>, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape2DShapes(),
b_values),
@@ -172,4 +231,4 @@ TEST_SUITE_END() // GEMMReshapeLHSMatrix
TEST_SUITE_END() // CL
} // namespace validation
} // namespace test
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute