diff options
author | SiCong Li <sicong.li@arm.com> | 2020-06-26 10:02:06 +0100 |
---|---|---|
committer | SiCong Li <sicong.li@arm.com> | 2020-07-13 11:17:35 +0000 |
commit | 3a50166ff71f8379682fe6ece2a94b7a4bb3daa3 (patch) | |
tree | f7730d5eb40e7d6e431f77b9c96611538eba0559 /src/core/CL/cl_kernels/gemm.cl | |
parent | a0bf9130a266332eadb69139296c96c66d401e12 (diff) | |
download | ComputeLibrary-3a50166ff71f8379682fe6ece2a94b7a4bb3daa3.tar.gz |
COMPMID-3338 COMPMID-3336 COMPMID-3584
COMPMID-3338 Remove store padding in CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
COMPMID-3336 Remove store padding in CLGEMMMatrixMultiplyNativeKernel
COMPMID-3584 Fix VSTORE to correctly deal with scalar case
* Implement STORE_BLOCK_BOUNDARY_AWARE, as part of the COMPMID-3332
investigation, with the following substantial changes:
- Separate STORE_BLOCK_PARTIAL, STORE_ROW_PARTIAL and VSTORE_PARTIAL
so that this change does not affect kernels not using STORE_BLOCK_BOUNDARY_AWARE.
- Revamp vstore_ext_n to vstore_partial_n, and enhance
VSTORE_PARTIAL to correctly handle both vector and scalar cases
* Remove the store padding (dst tensor) in CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
and CLGEMMMatrixMultiplyNativeKernel
* Add configuration tests to check no padding is added by the
configuration.
Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: I4f0907867979d8dacedd03b4bcbd2fb19e4f1602
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3522
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/gemm.cl')
-rw-r--r-- | src/core/CL/cl_kernels/gemm.cl | 20 |
1 files changed, 15 insertions, 5 deletions
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index 3075739c5e..e3ce6bf0cd 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -1016,6 +1016,8 @@ __kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src), * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) * @note Only the following configurations of M0, N0 and K0 are currently supported: * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 * - N0 = 2, 3, 4, 8, 16 @@ -1286,7 +1288,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), #endif // defined(ACTIVATION_TYPE) // Store output block - STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x); #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X @@ -1308,6 +1310,8 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) * @note Only the following configurations of M0, N0 and K0 are currently supported: * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 * - N0 = 4, 8, 16 @@ -1628,7 +1632,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs), #endif // defined(ACTIVATION_TYPE) // Store output block - STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x); #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X @@ -1725,6 +1729,8 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs), * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) * @note Only the following configurations of M0, N0 and K0 are currently supported: * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 * - N0 = 2, 3, 4, 8, 16 @@ -2020,7 +2026,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), #endif // defined(ACTIVATION_TYPE) // Store output block - STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x); #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X @@ -2042,6 +2048,8 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) * @note Only the following configurations of M0, N0 and K0 are currently supported: * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 * - N0 = 4, 8, 16 @@ -2325,7 +2333,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs), #endif // defined(ACTIVATION_TYPE) // Store output block - STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x); #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X @@ -3983,6 +3991,8 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs), * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2) * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2) + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) * @note Only the following configurations of M0, N0 and K0 are currently supported: * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 * - N0 = 2, 3, 4, 8, 16 @@ -4250,7 +4260,7 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), #endif // defined(ACTIVATION_TYPE) // Store output block - STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x); #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X |