From 088d63aae947efd8bbcfd4d27c1f50a6af79e3b9 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Tue, 11 Aug 2020 14:14:06 +0100
Subject: COMPMID-3337: Remove write paddings in both axes from
 CLGEMMMatrixMultiplyReshapedKernel

- Change the interface of STORE_BLOCK_BOUNDARY_AWARE, passing the conditions
  on Y and X rather than the x/y coordinates. This allows the macro to be
  used with both GEMM reshaped and GEMM reshaped rhs only
- Remove padding from the output tensor of CLGEMMMatrixMultiplyReshapedKernel
- Add tests for validating the zero padding requirement

Change-Id: I13263cc71ce065c5be34ed198def320dd5823495
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3712
Tested-by: Arm Jenkins
Reviewed-by: SiCong Li
Comments-Addressed: Arm Jenkins
---
 src/core/CL/cl_kernels/gemm.cl                     | 68 +++++++++++----
 src/core/CL/cl_kernels/gemm_helpers.h              | 98 +++++++++++-----------
 .../kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp | 21 +++--
 .../CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp  |  2 +-
 tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp | 90 ++++++++++++++++++++
 5 files changed, 204 insertions(+), 75 deletions(-)
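At its core this is a contract change: STORE_BLOCK_BOUNDARY_AWARE no longer
derives the boundary checks from the x/y block coordinates; every call site
now passes two precomputed booleans (cond_y, cond_x), so kernels with
different notions of which block is partial can share one macro. A minimal
C++ model of the new contract (illustrative names only, not the library's
OpenCL macros):

    #include <cstdio>

    // Model of the new STORE_BLOCK_BOUNDARY_AWARE contract: the caller computes
    // the two boundary conditions, the store only picks the block sizes.
    static void store_block_boundary_aware(int m0, int n0, int partial_m0, int partial_n0,
                                           bool cond_y, bool cond_x)
    {
        const int rows = cond_y ? partial_m0 : m0; // partial block in y?
        const int cols = cond_x ? partial_n0 : n0; // partial block in x?
        std::printf("store %dx%d block\n", rows, cols);
    }

    int main()
    {
        // GEMM reshaped convention (see the kernel hunks below): the boundary
        // block in y is the last one, so cond_y checks (y + 1) * M0 >= M.
        const int M = 103, N = 41, M0 = 4, N0 = 16;
        for(int y = 0; y * M0 < M; ++y)
            for(int x = 0; x * N0 < N; ++x)
                store_block_boundary_aware(M0, N0, M % M0, N % N0,
                                           (y + 1) * M0 >= M, (x + 1) * N0 >= N);
    }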
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 8f5f8e3d07..4ad22ec830 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1278,8 +1278,11 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
@@ -1621,8 +1624,11 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
@@ -2014,8 +2020,11 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
@@ -2320,8 +2329,11 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
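The four rhs-only hunks above use cond_y = (y == 0) rather than a last-block
check: with boundary-aware stores, each block's start row is expected to come
from COMPUTE_M0_START_ROW, which shifts every block after the first so that
the partial M0 block is processed first in y. A C++ sketch of that indexing;
the shifted formula is an assumption based on the PARTIAL_STORE_M0 variant of
the macro (only its #else variant appears later in this patch):

    #include <algorithm>
    #include <cstdio>

    // Assumed model of COMPUTE_M0_START_ROW when PARTIAL_STORE_M0 is defined:
    // block 0 handles the leftover rows, later blocks are shifted up.
    static unsigned int compute_m0_start_row(unsigned int y, unsigned int m0, unsigned int partial_m0)
    {
        return static_cast<unsigned int>(
            std::max(0, static_cast<int>(y * m0) - static_cast<int>((m0 - partial_m0) % m0)));
    }

    int main()
    {
        const unsigned int M = 10, M0 = 4, PARTIAL_M0 = M % M0; // 2 leftover rows
        for(unsigned int y = 0; y < (M + M0 - 1) / M0; ++y)     // ceil(M / M0) blocks
        {
            std::printf("block y=%u starts at row %u and stores %u rows\n",
                        y, compute_m0_start_row(y, M0, PARTIAL_M0), y == 0 ? PARTIAL_M0 : M0);
        }
    }

With M = 10 and M0 = 4 this prints blocks starting at rows 0, 2 and 6,
storing 2, 4 and 4 rows: the only partial block is the first one, hence
cond_y = (y == 0).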
@@ -2519,6 +2531,8 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
  * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 2, 3, 4, 5, 6, 7, 8
  *  - N0 = 2, 3, 4, 8, 16
@@ -2755,11 +2769,15 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
 #endif // defined(MIXED_PRECISION)
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
     // Store output block
 #if defined(MIXED_PRECISION)
-    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #else // defined(MIXED_PRECISION)
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #endif // defined(MIXED_PRECISION)
 
 #undef LHS_BLOCK_SIZE
@@ -2791,6 +2809,8 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
  * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 2, 3, 4, 5, 6, 7, 8
  *  - N0 = 4, 8, 16
@@ -3019,11 +3039,15 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
 #endif // defined(MIXED_PRECISION)
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
     // Store output block
 #if defined(MIXED_PRECISION)
-    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #else // defined(MIXED_PRECISION)
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #endif // defined(MIXED_PRECISION)
 
 #undef LHS_BLOCK_SIZE
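In the mixed-precision hunks above, the fused CONVERT_STORE_BLOCK cannot
express a partial store, so the patch splits it: CONVERT_BLOCK first narrows
the whole accumulator block into c_lp, then the same boundary-aware store as
the non-mixed path writes only the valid region. A C++ sketch of that
two-step shape (types simplified; the kernel converts to half):

    #include <cstddef>
    #include <vector>

    // rows/cols are M0/N0 for interior blocks, PARTIAL_STORE_M0/PARTIAL_STORE_N0
    // at the boundary; n0 is the row pitch of the accumulator block.
    static void convert_then_store_block(const std::vector<float> &c, int n0, int rows, int cols,
                                         std::vector<float> &dst, std::size_t dst_stride)
    {
        std::vector<float> c_lp(c.begin(), c.end()); // CONVERT_BLOCK (narrowing in the kernel)
        for(int i = 0; i < rows; ++i)
        {
            for(int j = 0; j < cols; ++j)
            {
                dst[i * dst_stride + j] = c_lp[i * n0 + j]; // boundary-aware partial store
            }
        }
    }

    int main()
    {
        std::vector<float> c(4 * 16, 1.0f), dst(3 * 9, 0.0f);
        convert_then_store_block(c, 16, 3, 9, dst, 9); // boundary block: 3x9 of a 4x16 tile
    }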
@@ -3160,6 +3184,8 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
  * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 2, 3, 4, 8
  *  - N0 = 2, 3, 4, 8, 16
@@ -3480,7 +3506,8 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
 #endif // defined(MIXED_PRECISION)
 #else // defined(BROADCAST_BIAS)
 
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
 
     LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
 
@@ -3506,11 +3533,15 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
 #endif // defined(MIXED_PRECISION)
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
     // Store output block
 #if defined(MIXED_PRECISION)
-    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #else // defined(MIXED_PRECISION)
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #endif // defined(MIXED_PRECISION)
 
 #undef LHS_BLOCK_SIZE
@@ -3538,6 +3569,8 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
  * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 2, 3, 4, 8
  *  - N0 = 4, 8, 16
@@ -3867,11 +3900,15 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
 #endif // defined(MIXED_PRECISION)
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
     // Store output block
 #if defined(MIXED_PRECISION)
-    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #else // defined(MIXED_PRECISION)
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #endif // defined(MIXED_PRECISION)
 
 #undef LHS_BLOCK_SIZE
@@ -4246,8 +4283,11 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
index fada0302ff..6f6edc1bcf 100644
--- a/src/core/CL/cl_kernels/gemm_helpers.h
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -1050,27 +1050,25 @@
  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
  * @param[in] N                Total number of columns. Used to detect if current block is at the boundary in x.
- * @param[in] y                Global id of current block in y. Used to detect if current block is at the boundary in y.
- * @param[in] x                Global id of current block in x. Used to detect if current block is at the boundary in x.
+ * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
+ * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
  */
-#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x) \
-    bool at_y_boundary = y == 0; \
-    bool at_x_boundary = (x + 1) * N0 >= N; \
-    if(!at_y_boundary && !at_x_boundary) \
-    { \
-        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
-    } \
-    else if(at_y_boundary && !at_x_boundary) \
-    { \
-        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
-    } \
-    else if(!at_y_boundary && at_x_boundary) \
-    { \
-        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
-    } \
-    else \
-    { \
-        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
+    { \
+        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    } \
+    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
+    { \
+        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    } \
+    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
+    { \
+        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    } \
+    else \
+    { \
+        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
     }
 /** Store a block that can only be partial in x but not y.
  *
@@ -1090,17 +1088,16 @@
  * @param[in] STRIDE_Y         The stride value in y-axis direction
  * @param[in] Z                The offset in z-axis direction
  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
  * @param[in] N                Total number of columns. Used to detect if current block is at the boundary in x.
- * @param[in] x                Global id of current block in x. Used to detect if current block is at the boundary in x.
+ * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
  */
-#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, x) \
-    bool at_x_boundary = (x + 1) * N0 >= N; \
-    if(!at_x_boundary) \
-    { \
-        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
-    } \
-    else \
-    { \
-        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, PARTIAL_COND_X) \
+    if(!(PARTIAL_COND_X)) \
+    { \
+        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    } \
+    else \
+    { \
+        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
     }
 /** Store a block that can only be partial in y but not x.
  *
@@ -1119,17 +1116,16 @@
  * @param[in] STRIDE_Y         The stride value in y-axis direction
  * @param[in] Z                The offset in z-axis direction
  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
- * @param[in] y                Global id of current block in y. Used to detect if current block is at the boundary in y.
+ * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
  */
-#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, y) \
-    bool at_y_boundary = y == 0; \
-    if(!at_y_boundary) \
-    { \
-        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
-    } \
-    else \
-    { \
-        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
+    if(!(PARTIAL_COND_Y)) \
+    { \
+        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    } \
+    else \
+    { \
+        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
     }
 /** @} */ // end of group STORE_BLOCK_PARTIAL
 
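The hunk that follows specializes STORE_BLOCK_BOUNDARY_AWARE into four
compile-time cases keyed on PARTIAL_STORE_M0/PARTIAL_STORE_N0, so shapes with
no partial blocks pay no branch cost at all. A C++17 analogue of that case
split, with if constexpr standing in for the preprocessor conditionals
(illustrative only):

    #include <cstdio>

    // Compile-time selection mirroring Case1-Case4 below: when a partial size
    // is 0 there is nothing to special-case on that axis.
    template <int M0, int N0, int PARTIAL_M0, int PARTIAL_N0>
    void store_block_boundary_aware(bool cond_y, bool cond_x)
    {
        if constexpr(PARTIAL_M0 == 0 && PARTIAL_N0 == 0)
            std::printf("Case1: full %dx%d store, no boundary branches\n", M0, N0);
        else if constexpr(PARTIAL_M0 > 0 && PARTIAL_N0 == 0)
            std::printf("Case2: %dx%d store\n", cond_y ? PARTIAL_M0 : M0, N0);
        else if constexpr(PARTIAL_M0 == 0 && PARTIAL_N0 > 0)
            std::printf("Case3: %dx%d store\n", M0, cond_x ? PARTIAL_N0 : N0);
        else
            std::printf("Case4: %dx%d store\n", cond_y ? PARTIAL_M0 : M0, cond_x ? PARTIAL_N0 : N0);
    }

    int main()
    {
        store_block_boundary_aware<4, 16, 0, 0>(false, false); // M, N multiples of the block
        store_block_boundary_aware<4, 16, 3, 9>(true, true);   // boundary block of M=103, N=41
    }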
@@ -1788,35 +1784,35 @@
  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
  * @param[in] N                Total number of columns. Used to detect if current block is at the boundary in x.
- * @param[in] y                Global id of current block in y. Used to detect if current block is at the boundary in y.
- * @param[in] x                Global id of current block in x. Used to detect if current block is at the boundary in x.
+ * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
+ * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
  * @{
  */
 #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 // Case1: No partial blocks in either x or y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
     STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 
 #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
 // Case2: Partial blocks in y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x) \
-    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, y)
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
 
 #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
 // Case3: Partial blocks in x
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x) \
-    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, x)
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, PARTIAL_COND_X)
 
 #else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 // Case4: Partial blocks in both x and y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x) \
-    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x)
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X)
 
 #endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 
 #else // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
 
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
     STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 
 #endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
@@ -1845,4 +1841,4 @@
 #define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
     ((uint)(y * M0))
 #endif // defined(PARTIAL_STORE_M0)
-/** @} */ // end of group COMPUTE_M0_START_ROW
\ No newline at end of file
+/** @} */ // end of group COMPUTE_M0_START_ROW
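The host-side change below derives the two new build options directly from
the tensor shapes, so no write padding is needed on the output. A small C++
sketch of the derivation, using one of the shapes from the new tests
(M=103, N=41, m0=4, n0=16):

    #include <cstdio>
    #include <string>

    int main()
    {
        // Mirrors the diff below: partial sizes are plain modulo arithmetic.
        const unsigned int internal_m = 103, n = 41, m0 = 4, n0 = 16;
        const unsigned int partial_store_m0 = internal_m % m0; // 3
        const unsigned int partial_store_n0 = n % n0;          // 9
        const std::string opts = "-DPARTIAL_STORE_M0=" + std::to_string(partial_store_m0) +
                                 " -DPARTIAL_STORE_N0=" + std::to_string(partial_store_n0);
        std::printf("%s\n", opts.c_str()); // -DPARTIAL_STORE_M0=3 -DPARTIAL_STORE_N0=9
    }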
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
index 5a46a1e013..8f20de1ea1 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
@@ -159,23 +159,18 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
 
-    // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
-    // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
-    const unsigned int m          = gemm_info.m;
-    const unsigned int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
     win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
     win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     AccessWindowStatic input0_access(input0, 0, 0,
-                                     ceil_to_multiple(input0->dimension(0), num_elems_processed_per_iteration_y),
+                                     input0->dimension(0),
                                      input0->dimension(1));
     AccessWindowStatic input1_access(input1, 0, 0,
-                                     ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                     input1->dimension(0),
                                      input1->dimension(1));
     AccessWindowStatic output_access(output, 0, 0,
-                                     ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
-                                     output->dimension(1) + bottom_pad);
+                                     output->dimension(0),
+                                     output->dimension(1));
 
     if(input2 != nullptr)
     {
@@ -256,6 +251,12 @@ void CLGEMMMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
     const bool     enable_mixed_precision = gemm_info.fp_mixed_precision;
     const DataType data_type              = input0->info()->data_type();
 
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
+
+    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
@@ -286,6 +287,8 @@ void CLGEMMMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
     build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
     build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
     build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
 
     std::string kernel_name("gemm_mm_reshaped_");
     kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
index e65726b234..cf77c70bfa 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -162,7 +162,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
                                      input0->dimension(0),
                                      input0->dimension(1));
     AccessWindowStatic input1_access(input1, 0, 0,
-                                     ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                     input1->dimension(0),
                                      input1->dimension(1));
     AccessWindowStatic output_access(output, 0, 0,
                                      output->dimension(0),
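The test diff that follows exercises each boundary case exactly once. The
partial sizes each (M, N, M0, N0) dataset row produces can be checked with
the same modulo arithmetic the kernel configuration uses; a C++ sketch:

    #include <cstdio>

    // The (M, N, M0, N0) rows of the new ValidateZeroPadding dataset (below),
    // and the partial block sizes each row exercises.
    int main()
    {
        const unsigned int rows[5][4] = {
            { 24, 48, 4, 4 },   // no partial blocks
            { 64, 29, 8, 4 },   // partial in x only
            { 101, 16, 4, 16 }, // partial in y only
            { 1, 121, 2, 2 },   // partial in both
            { 103, 41, 4, 16 }, // partial_n0 == 9 special case
        };
        for(const auto &r : rows)
            std::printf("M=%u N=%u -> PARTIAL_STORE_M0=%u, PARTIAL_STORE_N0=%u\n",
                        r[0], r[1], r[0] % r[2], r[1] % r[3]);
    }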
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
index 309a967abc..d7853f3ea7 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
@@ -170,11 +170,101 @@ const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", {
 
 /** LHS transposed values */
 const auto lhs_transpose_values = framework::dataset::make("lhs_transpose", { false, true } );
+
+/** Zero padding test */
+bool validate_zero_padding(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value,
+                           unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value,
+                           bool i_value_rhs, bool t_value_rhs, bool export_to_cl_image, bool broadcast_bias, unsigned int depth_output_gemm3d, const ActivationLayerInfo &act_info,
+                           DataType dt_input0, DataType dt_input1, DataType dt_input2, DataType dt_output, float alpha, float beta)
+{
+    const unsigned int M = m_value;
+    const unsigned int N = n_value;
+    const unsigned int K = k_value;
+
+    GEMMLHSMatrixInfo lhs_info;
+    lhs_info.m0 = m0_value;
+    lhs_info.k0 = k0_value;
+
+    GEMMRHSMatrixInfo rhs_info;
+    rhs_info.n0                 = n0_value;
+    rhs_info.k0                 = k0_value;
+    rhs_info.h0                 = h0_value;
+    rhs_info.interleave         = i_value_rhs;
+    rhs_info.transpose          = t_value_rhs;
+    rhs_info.export_to_cl_image = export_to_cl_image;
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = M;
+    kernel_info.n                       = N;
+    kernel_info.k                       = K;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = false;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = act_info;
+
+    const TensorShape lhs_shape(K, M, b_value);
+    const TensorShape rhs_shape(N, K, b_value);
+    const TensorShape lhs_shape_reshaped = compute_lhs_reshaped_shape(TensorInfo(lhs_shape, 1, dt_input0),
+                                                                      lhs_info);
+    const TensorShape rhs_shape_reshaped = compute_rhs_reshaped_shape(TensorInfo(rhs_shape, 1, dt_input1),
+                                                                      rhs_info);
+
+    const TensorShape dst_shape = compute_mm_shape(TensorInfo(lhs_shape_reshaped, 1, dt_input0),
+                                                   TensorInfo(rhs_shape_reshaped, 1, dt_input1),
+                                                   kernel_info);
+
+    const TensorShape bias_shape(N,
+                                 M, // Correct calculation should be: broadcast_bias? 1 : M, it's wrong here on purpose just for validation test
+                                 broadcast_bias? 1 : b_value);
+
+    // Create tensors
+    CLTensor lhs_reshaped = create_tensor<CLTensor>(lhs_shape_reshaped, dt_input0);
+    CLTensor rhs_reshaped = create_tensor<CLTensor>(rhs_shape_reshaped, dt_input1);
+    CLTensor bias         = create_tensor<CLTensor>(bias_shape, dt_input2);
+    CLTensor dst          = create_tensor<CLTensor>(dst_shape, dt_output);
+
+    ARM_COMPUTE_EXPECT(lhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Validate zero-padding
+    CLGEMMMatrixMultiplyReshaped gemm;
+
+    gemm.configure(&lhs_reshaped, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
+
+    // Padding can be added along rhs and bias's X/Y dimension
+    return dst.info()->padding().empty() && lhs_reshaped.info()->padding().empty();
+}
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(GEMMMatrixMultiplyReshaped)
 
+/** Validate zero padding tests
+ *
+ * A series of validation tests to check the zero padding requirement
+ *
+ * Checks performed in order:
+ *     - No partial blocks in both x and y dimensions
+ *     - Partial blocks in x dimension
+ *     - Partial blocks in y dimension
+ *     - Partial blocks in both x and y dimensions
+ *     - Special case: partial_n0 == 9 (vstore1 should be invoked instead of vstore_partial_1)
+ */
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(
+framework::dataset::make("M",  { 24, 64, 101, 1, 103 }),
+framework::dataset::make("N",  { 48, 29, 16, 121, 41 })),
+framework::dataset::make("M0", { 4, 8, 4, 2, 4 })),
+framework::dataset::make("N0", { 4, 4, 16, 2, 16 })),
+m_value, n_value, m0_value, n0_value)
+{
+    constexpr DataType dt = DataType::F32;
+
+    bool status = validate_zero_padding(m_value, n_value, 23, 1, m0_value, n0_value, 4, 1, false, false, false, 0, 0, ActivationLayerInfo(), dt, dt, dt, dt, 1.0f, 1.0f);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
-- 
cgit v1.2.1