diff options
author | morgolock <pablo.tello@arm.com> | 2020-10-12 14:00:43 +0100 |
---|---|---|
committer | Pablo Marquez <pablo.tello@arm.com> | 2020-10-15 17:14:05 +0000 |
commit | cf343e3798d2a8c2ad2fcac488e4b78e2b5c968d (patch) | |
tree | 52aeb352689b82e34ff98730d52d970e79d3e7ff /src/core/CL/cl_kernels/gemmlowp.cl | |
parent | 3e77c27a07af070677a3a7e34fb3dfc519b7cbd1 (diff) | |
download | ComputeLibrary-cf343e3798d2a8c2ad2fcac488e4b78e2b5c968d.tar.gz |
COMPMID-3719: Remove OpenCL padding: CLGEMMLowpMatrixMultiplyNativeKernel
Change-Id: Iee28abcbba1e7b9e2f3aaa55685936dce815d5a3
Signed-off-by: morgolock <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4141
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/gemmlowp.cl')
-rw-r--r-- | src/core/CL/cl_kernels/gemmlowp.cl | 15 |
1 files changed, 10 insertions, 5 deletions
```diff
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 8405a7beb7..29314ec581 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -992,10 +992,11 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
 #endif // defined(DUMMY_WORK_ITEMS)
 
     // Compute LHS matrix address
-    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
 
     // Compute RHS matrix address
-    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0;
+    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
+
 
 #if defined(MATRIX_B_DEPTH)
     // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
@@ -1074,7 +1075,8 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
         rhs_offset += rhs_stride_y;
     }
 
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
 
     REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
 
@@ -1092,9 +1094,12 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
     dst_addr += z * dst_stride_z;
 
 #endif // defined(REINTERPRET_OUTPUT_AS_3D)
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
 
-    // Convert and store output block
-    CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+
+    // Store output block
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 }
 
 #endif // defined(M0) && defined(N0) && defined(K0) && defined(K)
```