aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/im2col.cl
diff options
context:
space:
mode:
authorPablo Tello <pablo.tello@arm.com>2018-04-04 10:01:14 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:52:54 +0000
commit4a626a7d52e9c4759bdc16b65401a53779dd975f (patch)
tree82e203118f42f9b3c2e538c9b54d779f2a75d3af /src/core/CL/cl_kernels/im2col.cl
parente083771a1f28c34485f0d0054e2645070df98846 (diff)
downloadComputeLibrary-4a626a7d52e9c4759bdc16b65401a53779dd975f.tar.gz
COMPMID-801: NHWC support in CLIm2Col.
And extended tests coverage adding kernel shapes 3x1, 1x5 and 7x7 Change-Id: Ia7c1d4da2368d5f5fbc1a41187f4ac1aca5f150f Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/127727 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/im2col.cl')
-rw-r--r--src/core/CL/cl_kernels/im2col.cl202
1 files changed, 201 insertions, 1 deletions
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index 1e85e1b303..f53ce21d05 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -123,7 +123,207 @@ __kernel void im2col1x1_stridex1_dchw(
}
#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
+#define PTR_TO_VALUE(PTR, DATA_TYPE) *((DATA_TYPE *)(PTR))
+
#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int src_stride_y_int = (int)src_stride_y;
+ const int src_stride_z_int = (int)src_stride_z;
+ const int xc = get_global_id(1); // x coordinate in the convolved tensor
+ const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor
+ const int ch = get_global_id(0); // input feature map
+ const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size
+
+ // Calculate input indices
+ const int xi = xc * STRIDE_X - PAD_LEFT;
+ const int yi = yc * STRIDE_Y - PAD_TOP;
+
+ // Calculate output indices
+ const int xo = ch * KERNEL_HEIGHT * KERNEL_WIDTH;
+ const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+
+ for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+ {
+ const int y0 = yi + yk;
+ if(y0 >= 0 && y0 < SRC_HEIGHT)
+ {
+ int xk;
+ for(xk = 0; xk < KERNEL_WIDTH; xk++)
+ {
+ const int x0 = xi + xk;
+ if(x0 >= 0 && x0 < SRC_WIDTH)
+ {
+ *((__global DATA_TYPE *)output_ptr) = PTR_TO_VALUE(input_ptr + xk * src_stride_y + yk * src_stride_z, DATA_TYPE);
+ }
+ else
+ {
+ *((__global DATA_TYPE *)output_ptr) = PAD_VALUE;
+ }
+ output_ptr += 1 * sizeof(DATA_TYPE);
+ }
+ }
+ else
+ {
+ for(int xk = 0; xk < KERNEL_WIDTH; xk++)
+ {
+ *((__global DATA_TYPE *)output_ptr) = (DATA_TYPE)PAD_VALUE;
+ output_ptr += 1 * dst_stride_x;
+ }
+ }
+ }
+#ifdef HAS_BIAS
+ if(ch == (KERNEL_DEPTH - 1))
+ {
+ *((__global DATA_TYPE *)output_ptr) = 1.0f;
+ output_ptr += 1 * dst_stride_x;
+ }
+#endif // HAS_BIAS
+}
+
+/** This kernel performs a reshaping of the input tensor (with layout NHWC) to a tensor used to perform convolution using GEMM when the kernel size is 3x3
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col3x3_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int src_stride_y_int = (int)src_stride_y;
+ const int src_stride_z_int = (int)src_stride_z;
+ const int xc = get_global_id(1); // x coordinate in the convolved tensor
+ const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor
+ const int ch = get_global_id(0); // input feature map
+ const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size
+
+ // Calculate input indices
+ const int xi = xc * STRIDE_X - PAD_LEFT;
+ const int yi = yc * STRIDE_Y - PAD_TOP;
+
+ // Calculate output indices
+ const int xo = ch * 9; // 3x3
+ const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);
+
+ const int3 y = (int3)yi + (int3)(0, 1, 2);
+ // Guard against reading outside the input buffer, there is no padding in Z so we check if ry is inside the buffer.
+ if(y.s0 >= 0 && y.s0 < SRC_HEIGHT)
+ {
+ row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(
+ PTR_TO_VALUE(input_ptr + 0 * src_stride_y, DATA_TYPE),
+ PTR_TO_VALUE(input_ptr + 1 * src_stride_y, DATA_TYPE),
+ PTR_TO_VALUE(input_ptr + 2 * src_stride_y, DATA_TYPE));
+ }
+
+ if(y.s1 >= 0 && y.s1 < SRC_HEIGHT)
+ {
+ row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(
+ PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 1 * src_stride_z, DATA_TYPE),
+ PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 1 * src_stride_z, DATA_TYPE),
+ PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 1 * src_stride_z, DATA_TYPE));
+ }
+
+ if(y.s2 >= 0 && y.s2 < SRC_HEIGHT)
+ {
+ row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(
+ PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 2 * src_stride_z, DATA_TYPE),
+ PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 2 * src_stride_z, DATA_TYPE),
+ PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 2 * src_stride_z, DATA_TYPE));
+ }
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+ // Put 0 if the value is out-of-bound
+ const int3 x = (int3)xi + (int3)(0, 1, 2);
+ VEC_DATA_TYPE(COND_DATA_TYPE, 3)
+ cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+ row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);
+ row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond0);
+ row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond0);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);
+ *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
+
+#ifdef HAS_BIAS
+ if(ch == (KERNEL_DEPTH - 1))
+ {
+ *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+
/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
@@ -804,4 +1004,4 @@ __kernel void im2col_reduced_dchw(
}
#endif // HAS_BIAS
}
-#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) \ No newline at end of file
+#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)