aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/col2im.cl
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/CL/cl_kernels/col2im.cl')
-rw-r--r--src/core/CL/cl_kernels/col2im.cl24
1 files changed, 19 insertions, 5 deletions
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/col2im.cl
index 98bf8d1ed4..5e52127f27 100644
--- a/src/core/CL/cl_kernels/col2im.cl
+++ b/src/core/CL/cl_kernels/col2im.cl
@@ -41,12 +41,15 @@
* @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320
* @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600
* @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4
+ * @note In case of grouping the GROUPING flag must be passed at compile time using -DGROUPING
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
@@ -59,11 +62,14 @@
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void col2im(
- IMAGE_DECLARATION(src),
+ TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
uint dst_stride_w)
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor
+ const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor
VEC_DATA_TYPE(DATA_TYPE, 8)
data = vload8(0, (__global DATA_TYPE *)src.ptr);
@@ -82,8 +88,16 @@ __kernel void col2im(
__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes;
- // Compute output offset
- int idx = (get_global_id(1) / WIDTH_OUTPUT) * dst_stride_y + (get_global_id(1) % WIDTH_OUTPUT) * dst_stride_x + get_global_id(2) * dst_stride_w;
+#if defined(GROUPING)
+ // Compute output offset (batches on 4th dimension, no need to compute manually)
+ int idx = yd * dst_stride_y + xd * dst_stride_x;
+
+ const uint group = get_global_id(2); // group ID
+ x_clamped += group * WIDTH_INPUT;
+#else /* defined(GROUPING) */
+ // Compute output offset (batches on 3rd dimension)
+ int idx = yd * dst_stride_y + xd * dst_stride_x + get_global_id(2) * dst_stride_w;
+#endif /* GROUPING */
// Store value
*((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s0 * dst_stride_z)) = data.s0;
@@ -95,4 +109,4 @@ __kernel void col2im(
*((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s6 * dst_stride_z)) = data.s6;
*((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s7 * dst_stride_z)) = data.s7;
}
-#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) \ No newline at end of file
+#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT)