diff options
author | SiCong Li <sicong.li@arm.com> | 2020-10-13 17:00:06 +0100 |
---|---|---|
committer | SiCong Li <sicong.li@arm.com> | 2020-10-22 13:47:59 +0000 |
commit | 40192c1d1b092130dbb6773a56857f354bc7746a (patch) | |
tree | d25772310b78a43dc8e16102fd0dfaefff00817f /src/core/CL/cl_kernels | |
parent | 410bca42f560c87d4860dc5ae7374437ded2cd76 (diff) | |
download | ComputeLibrary-40192c1d1b092130dbb6773a56857f354bc7746a.tar.gz |
COMPMID-3708 Remove OpenCL padding: CLCopyKernel [Patch1]
* Remove padding only for when user-supplied padding is empty
* Vectorize the case where output_window is not null and the output
window is narrow in x (smaller than vec_size_x)
Change-Id: I313089fe309e87e8529ecfd00542fcfa4dc44862
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4193
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r-- | src/core/CL/cl_kernels/copy_tensor.cl | 30 |
1 files changed, 14 insertions, 16 deletions
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl index 0592e07511..95da9a3cd3 100644 --- a/src/core/CL/cl_kernels/copy_tensor.cl +++ b/src/core/CL/cl_kernels/copy_tensor.cl @@ -77,9 +77,14 @@ __kernel void copy_pad_tensor( } #endif // Compile time constants -#if defined(DATA_TYPE) +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) /** Performs a copy of input tensor to the output tensor. * + * @note The following variables must be passed at compile time: + * -# -DDATA_TYPE : Input and output datatypes. + * -# -DVEC_SIZE : The number of elements processed in X dimension + * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE + * * @param[in] in_ptr Pointer to the source tensor. Supported data types: All * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] in_step_x input_stride_x * number of elements along X processed per workitem(in bytes) @@ -104,25 +109,18 @@ __kernel void copy_tensor( Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in); Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); -#if defined(VEC_SIZE) - -#if defined(LAST_ACCESSED_X) - // Check if access on width gets out of bounds - // If it does then shift access vector to access elements within bounds - const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0); + // Boundary-aware access: + // If the there's left-over in width (VEC_SIZE_LEFTOVER > 0): + // Shift all accesses other than the first to avoid accessing out of bounds + const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)VEC_SIZE_LEFTOVER, 0) % VEC_SIZE; in.ptr -= shift * in.stride_x; out.ptr -= shift * out.stride_x; -#endif // defined(LAST_ACCESSED_X) // Load data VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr); + data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr); - // Store result - VSTORE(VEC_SIZE) - (data, 0, (__global DATA_TYPE *)out.ptr); -#else // defined(VEC_SIZE) - *((__global DATA_TYPE *)(out.ptr)) = *((__global DATA_TYPE *)(in.ptr)); -#endif // defined(VEC_SIZE) + // Boundary-aware store + STORE_VECTOR_SELECT(data, DATA_TYPE, (__global DATA_TYPE *)out.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); } -#endif // defined(DATA_TYPE)
\ No newline at end of file +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
\ No newline at end of file |