aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGiorgio Arena <giorgio.arena@arm.com>2021-11-15 17:03:22 +0000
committerGiorgio Arena <giorgio.arena@arm.com>2021-11-17 10:41:30 +0000
commite330fb41d85d7058f74902ce1d47b2dc00b10a52 (patch)
treee40a80f1c081721410615ced57f71192fd158459
parente3b197410842652f0a78d04fe7b2c333cbeabab6 (diff)
downloadComputeLibrary-e330fb41d85d7058f74902ce1d47b2dc00b10a52.tar.gz
Improve start-up timer for ClIm2Col
Resolve COMPMID-4889 Signed-off-by: Giorgio Arena <giorgio.arena@arm.com> Change-Id: I4a88082b13865fdaeaba1b7216503cd640aa54df Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6680 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--src/core/CL/cl_kernels/nhwc/im2col.cl38
-rw-r--r--src/gpu/cl/kernels/ClIm2ColKernel.cpp6
2 files changed, 22 insertions, 22 deletions
diff --git a/src/core/CL/cl_kernels/nhwc/im2col.cl b/src/core/CL/cl_kernels/nhwc/im2col.cl
index ac00c11283..a23e943fab 100644
--- a/src/core/CL/cl_kernels/nhwc/im2col.cl
+++ b/src/core/CL/cl_kernels/nhwc/im2col.cl
@@ -22,23 +22,11 @@
* SOFTWARE.
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
-
-#if ELEMENT_SIZE == 1
-#define COND_DATA_TYPE char
-#elif ELEMENT_SIZE == 2
-#define COND_DATA_TYPE short
-#elif ELEMENT_SIZE == 4
-#define COND_DATA_TYPE int
-#else // ELEMENT_SIZE
-#error "Element size not support"
-#endif // ELEMENT_SIZE
-
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE)
+#define COND_N SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#if defined(IM2COL_3X3) || defined(IM2COL_9X9)
/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
* @name IM2COL1X9_NHWC_STORE
*
@@ -109,7 +97,9 @@
VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
(DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
/** @}*/
+#endif // defined(IM2COL_3X3) || defined(IM2COL_9X9)
+#if defined(IM2COL_3X3)
/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
*
* @note This kernel computes VECTOR_SIZE elements
@@ -269,7 +259,9 @@ __kernel void im2col3x3_nhwc(
}
#endif // HAS_BIAS
}
+#endif // defined(IM2COL_3X3)
+#if defined(IM2COL_9X9)
#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
#define IM2COL1x9(i) \
({ \
@@ -416,7 +408,9 @@ __kernel void im2col9x9_nhwc(
}
#endif // HAS_BIAS
}
+#endif // defined(IM2COL_9X9)
+#if defined(IM2COL_GENERIC)
/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
*
* @note This kernel computes VECTOR_SIZE elements
@@ -463,19 +457,20 @@ __kernel void im2col_generic_nhwc(
const int batch = get_global_id(2); // batch size
// Calculate input indices
- const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
- const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+ const int xi = (yo % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (yo / (int)CONVOLVED_WIDTH) * STRIDE_Y;
// Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+ const int stride_x = ch * sizeof(DATA_TYPE);
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + stride_x + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + stride_x + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
int i = 0;
for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
{
// Clamp yi_coord
int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
- yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
+ yi_coord = clamp(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
// Out-of-bound condition for Y
int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
@@ -484,7 +479,7 @@ __kernel void im2col_generic_nhwc(
{
// Clamp xi_coord
int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
- xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
+ xi_coord = clamp(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
// Out-of-bound condition for X
int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
@@ -528,5 +523,4 @@ __kernel void im2col_generic_nhwc(
}
#endif // HAS_BIAS
}
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
-#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) \ No newline at end of file
+#endif // defined(IM2COL_GENERIC) \ No newline at end of file
diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/gpu/cl/kernels/ClIm2ColKernel.cpp
index c42762b99c..6d1271d248 100644
--- a/src/gpu/cl/kernels/ClIm2ColKernel.cpp
+++ b/src/gpu/cl/kernels/ClIm2ColKernel.cpp
@@ -195,10 +195,16 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D
if(kernel_dims == Size2D(3U, 3U))
{
kernel_name = "im2col3x3_";
+ build_opts.add_option("-DIM2COL_3X3");
}
else if(kernel_dims == Size2D(9U, 9U))
{
kernel_name = "im2col9x9_";
+ build_opts.add_option("-DIM2COL_9X9");
+ }
+ else
+ {
+ build_opts.add_option("-DIM2COL_GENERIC");
}
// Get boundary vector (the first/last vector with potentially a partial vector size) size