aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-10-30 14:53:25 +0000
committerGeorgios Pinitas <georgios.pinitas@arm.com>2018-11-06 10:11:21 +0000
commit3704464b68ddd22739b38354de1721a3db4267b5 (patch)
tree0a8be84710a25a0b08df3c67dc6a3988ed6bc34b /src/core/CL
parent29421bd048ab67f893860e487c1b2b43305529f8 (diff)
downloadComputeLibrary-3704464b68ddd22739b38354de1721a3db4267b5.tar.gz
COMPMID-1703: Collapse the 4th dimensions in CLDepthWiseConvolutionLayer3x3Kernel
Change-Id: Ie274da79b15c03f86dfedc85bb721b3de34a0bb4
Diffstat (limited to 'src/core/CL')
-rw-r--r--src/core/CL/cl_kernels/depthwise_convolution.cl121
-rw-r--r--src/core/CL/cl_kernels/depthwise_convolution_quantized.cl120
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp22
3 files changed, 178 insertions, 85 deletions
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index 97b46c47cf..bfaa92be10 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -147,12 +147,12 @@ inline float2 convolution3x3(
/** This OpenCL kernel computes the depthwise convolution 3x3
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
@@ -271,12 +271,12 @@ __kernel void depthwise_convolution_3x3(
/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
* stride_x and stride_y are equal to 1
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
@@ -372,12 +372,12 @@ __kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
* stride_x and stride_y are equal to 2
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
@@ -419,8 +419,8 @@ __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
float2 pixels1 = 0.0f;
// Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
+ const int channel = get_global_id(2) % DST_CHANNELS;
+ const int batch = get_global_id(2) / DST_CHANNELS;
// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
__global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
@@ -753,14 +753,14 @@ inline half4 convolution3x3_f16(
/** This OpenCL kernel computes the depthwise convolution 3x3
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -826,14 +826,14 @@ __kernel void depthwise_convolution_3x3_f16(
/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
* when both stride_x and stride_y are equal to 1
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -930,14 +930,14 @@ __kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
* when both stride_x and stride_y are equal to 2
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1043,14 +1043,16 @@ __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: FP32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: FP32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1058,6 +1060,8 @@ __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -1074,8 +1078,8 @@ __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
__kernel void depthwise_convolution_3x3_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
@@ -1084,11 +1088,20 @@ __kernel void depthwise_convolution_3x3_nhwc(
{
int x = get_global_id(0); // channels
int y = get_global_id(1); // spatial coordinate x
+#if defined(DST_DEPTH)
+ int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+ int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+#if defined(DST_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
int z_coord = 0;
int4 offset = 0;
@@ -1158,9 +1171,14 @@ __kernel void depthwise_convolution_3x3_nhwc(
acc += bias_values;
#endif // defined(HAS_BIAS)
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+#if defined(DST_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
+#else /* defined(DST_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
+#endif /* defined(DST_DEPTH) */
+
VSTORE(VEC_SIZE)
- (acc, 0, (__global DATA_TYPE *)(dst.ptr));
+ (acc, 0, (__global DATA_TYPE *)(dst_addr));
}
#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
@@ -1174,14 +1192,16 @@ __kernel void depthwise_convolution_3x3_nhwc(
* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: FP32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: FP32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1189,6 +1209,8 @@ __kernel void depthwise_convolution_3x3_nhwc(
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -1205,8 +1227,8 @@ __kernel void depthwise_convolution_3x3_nhwc(
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
__kernel void depthwise_convolution_3x3_nhwc_stride1(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
@@ -1215,11 +1237,20 @@ __kernel void depthwise_convolution_3x3_nhwc_stride1(
{
int x = get_global_id(0); // channels
int y = get_global_id(1); // spatial coordinate x
+#if defined(DST_DEPTH)
+ int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+ int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+#if defined(DST_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
int z_coord = 0;
int4 offset = 0;
@@ -1340,7 +1371,11 @@ __kernel void depthwise_convolution_3x3_nhwc_stride1(
acc3 += bias_values;
#endif // defined(HAS_BIAS)
+#if defined(DST_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
+#endif /* defined(DST_DEPTH) */
VSTORE(VEC_SIZE)
(acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 421c8b6aab..5a732b4863 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -87,14 +87,14 @@
/** This function computes the depthwise convolution quantized.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -319,14 +319,14 @@ __kernel void depthwise_convolution_3x3_quantized_nchw(
#endif /* CONV_STRIDE_X */
/** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -644,14 +644,16 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -659,6 +661,8 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -675,8 +679,8 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
* @param[in] max_offset Max offset for the input tensor
*/
__kernel void depthwise_convolution_3x3_quantized_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
@@ -685,11 +689,20 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
{
const int x = get_global_id(0); // channels
const int y = get_global_id(1); // spatial coordinate x
- const int z = get_global_id(2); // spatial coordinate y
+#if defined(DST_DEPTH)
+ int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+ int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
+ int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+#if defined(DST_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
int z_coord = 0;
int4 offset = 0;
@@ -799,9 +812,14 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR);
res = CLAMP(res, (VEC_UCHAR)0, (VEC_UCHAR)255);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+#if defined(DST_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
+#else /* defined(DST_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
+#endif /* defined(DST_DEPTH) */
+
VSTORE(VEC_SIZE)
- (ACTIVATION_FUNC(res), 0, dst.ptr);
+ (ACTIVATION_FUNC(res), 0, dst_addr);
}
#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
@@ -815,14 +833,16 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -830,6 +850,8 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -847,8 +869,8 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
*/
__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
@@ -857,11 +879,20 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
{
int x = get_global_id(0);
int y = get_global_id(1);
- int z = get_global_id(2);
+#if defined(DST_DEPTH)
+ int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+ int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
+ int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+#if defined(DST_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
int z_coord = 0;
int4 offset = 0;
@@ -1050,7 +1081,11 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);
+#if defined(DST_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
+#endif /* defined(DST_DEPTH) */
VSTORE(VEC_SIZE)
(ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
@@ -1080,14 +1115,16 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
* @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication.
* If not, the quantization will be performed using a fixed point multiplication
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1095,6 +1132,8 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -1112,8 +1151,8 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
*/
__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
@@ -1122,11 +1161,20 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
{
int x = get_global_id(0);
int y = get_global_id(1);
- int z = get_global_id(2);
+#if defined(DST_DEPTH)
+ int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+ int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
+ int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+#if defined(DST_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
int z_coord = 0;
int4 offset = 0;
@@ -1249,7 +1297,11 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
+#if defined(DST_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
+#endif /* defined(DST_DEPTH) */
VSTORE(VEC_SIZE)
(ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index 93d96dad1b..d76f5495f1 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -245,6 +245,8 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input,
build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_stride_x));
build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
}
+ build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
+ "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
// Create kernel
std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
@@ -291,8 +293,12 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- Window win = window;
- win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)), 1));
+ // Collapse window
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ const size_t total_batches = _input->info()->tensor_shape().total_size_upper(3);
+
+ Window win = window_collapsed;
+ win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)) * total_batches, 1));
// Create input window and adjust
Window win_in = win;
@@ -301,10 +307,10 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com
ARM_COMPUTE_ERROR_ON((win_in.y().step() < window.y().step()) || (win_in.z().step() < window.z().step()));
- Window slice_in = win_in.first_slice_window_3D();
- Window slice_out = win.first_slice_window_3D();
+ Window slice_in = win_in.first_slice_window_4D();
+ Window slice_out = win.first_slice_window_4D();
- unsigned int idx = 3 * num_arguments_per_3D_tensor();
+ unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor();
if(_biases != nullptr)
{
@@ -321,11 +327,11 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com
do
{
unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
add_3D_tensor_argument(idx, _weights, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
}
- while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+ while(win.slide_window_slice_4D(slice_out) && win_in.slide_window_slice_4D(slice_in));
}