COMPMID-3708 Remove OpenCL padding: CLCopyKernel [Patch1]

* Remove padding only when the user-supplied padding is empty
* Vectorize the case where output_window is not null and the output window is narrow in x (smaller than vec_size_x)

Change-Id: I313089fe309e87e8529ecfd00542fcfa4dc44862
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4193
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: SiCong Li <sicong.li@arm.com> 2020-10-13 17:00:06 +0100
committer: SiCong Li <sicong.li@arm.com> 2020-10-22 13:47:59 +0000
commit: 40192c1d1b092130dbb6773a56857f354bc7746a (patch)
tree: d25772310b78a43dc8e16102fd0dfaefff00817f /src/core/CL/cl_kernels
parent: 410bca42f560c87d4860dc5ae7374437ded2cd76 (diff)
download: ComputeLibrary-40192c1d1b092130dbb6773a56857f354bc7746a.tar.gz
1 files changed, 14 insertions, 16 deletions
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl
index 0592e07511..95da9a3cd3 100644
--- a/src/core/CL/cl_kernels/copy_tensor.cl
+++ b/src/core/CL/cl_kernels/copy_tensor.cl
@@ -77,9 +77,14 @@ __kernel void copy_pad_tensor(
 }
 #endif // Compile time constants
 
-#if defined(DATA_TYPE)
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
 /** Performs a copy of input tensor to the output tensor.
  *
+ * @note The following variables must be passed at compile time:
+ * -# -DDATA_TYPE        : Input and output datatypes.
+ * -# -DVEC_SIZE         : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
  * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: All
  * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
@@ -104,25 +109,18 @@ __kernel void copy_tensor(
     Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
-#if defined(VEC_SIZE)
-
-#if defined(LAST_ACCESSED_X)
-    // Check if access on width gets out of bounds
-    // If it does then shift access vector to access elements within bounds
-    const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0);
+    // Boundary-aware access:
+    // If there's a left-over in width (VEC_SIZE_LEFTOVER > 0):
+    // Shift all accesses other than the first to avoid accessing out of bounds
+    const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)VEC_SIZE_LEFTOVER, 0) % VEC_SIZE;
     in.ptr -= shift * in.stride_x;
     out.ptr -= shift * out.stride_x;
-#endif // defined(LAST_ACCESSED_X)
 
     // Load data
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+    data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
 
-    // Store result
-    VSTORE(VEC_SIZE)
-    (data, 0, (__global DATA_TYPE *)out.ptr);
-#else  // defined(VEC_SIZE)
-    *((__global DATA_TYPE *)(out.ptr)) = *((__global DATA_TYPE *)(in.ptr));
-#endif // defined(VEC_SIZE)
+    // Boundary-aware store
+    STORE_VECTOR_SELECT(data, DATA_TYPE, (__global DATA_TYPE *)out.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
 }
-#endif // defined(DATA_TYPE)
-\ No newline at end of file
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+\ No newline at end of file
author	SiCong Li <sicong.li@arm.com>	2020-10-13 17:00:06 +0100
committer	SiCong Li <sicong.li@arm.com>	2020-10-22 13:47:59 +0000
commit	40192c1d1b092130dbb6773a56857f354bc7746a (patch)
tree	d25772310b78a43dc8e16102fd0dfaefff00817f /src/core/CL/cl_kernels
parent	410bca42f560c87d4860dc5ae7374437ded2cd76 (diff)
download	ComputeLibrary-40192c1d1b092130dbb6773a56857f354bc7746a.tar.gz