diff options
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r-- | src/core/CL/cl_kernels/col2im.cl | 24 |
1 files changed, 19 insertions, 5 deletions
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/col2im.cl index 98bf8d1ed4..5e52127f27 100644 --- a/src/core/CL/cl_kernels/col2im.cl +++ b/src/core/CL/cl_kernels/col2im.cl @@ -41,12 +41,15 @@ * @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320 * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600 * @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4 + * @note In case of grouping the GROUPING flag must be passed at compile time using -DGROUPING * * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) @@ -59,11 +62,14 @@ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) */ __kernel void col2im( - IMAGE_DECLARATION(src), + TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), uint dst_stride_w) { - Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + + const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor + const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor VEC_DATA_TYPE(DATA_TYPE, 8) data = vload8(0, (__global DATA_TYPE *)src.ptr); @@ -82,8 +88,16 @@ __kernel void col2im( __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes; - // Compute output offset - int idx = (get_global_id(1) / WIDTH_OUTPUT) * dst_stride_y + (get_global_id(1) % WIDTH_OUTPUT) * dst_stride_x + get_global_id(2) * dst_stride_w; +#if defined(GROUPING) + // Compute output offset (batches on 4th dimension, no need to compute manually) + int idx = yd * dst_stride_y + xd * dst_stride_x; + + const uint group = get_global_id(2); // group ID + x_clamped += group * WIDTH_INPUT; +#else /* defined(GROUPING) */ + // Compute output offset (batches on 3rd dimension) + int idx = yd * dst_stride_y + xd * dst_stride_x + get_global_id(2) * dst_stride_w; +#endif /* GROUPING */ // Store value *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s0 * dst_stride_z)) = data.s0; @@ -95,4 +109,4 @@ __kernel void col2im( *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s6 * dst_stride_z)) = data.s6; *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s7 * dst_stride_z)) = data.s7; } -#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT)
\ No newline at end of file +#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) |