From 1d08a310b7316f2b731e60ac36dc68989d15b546 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 3 Jan 2018 12:29:22 +0000 Subject: COMPMID-765: Collapse execution window in CL kernels. Updated following kernels to collapse their execution window and reduce number of kernel enqueues: -CLArithmeticAddition -CLArithmeticSubtraction -CLPixelWiseMultiplication Change-Id: I13d503515a20fa9be1401ead1e27e9bbc6627975 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114878 Reviewed-by: Anthony Barbier Tested-by: Anthony Barbier --- src/core/CL/cl_kernels/arithmetic_op.cl | 88 +++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 38 deletions(-) (limited to 'src/core/CL/cl_kernels') diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl index 03414105e6..12963473c5 100644 --- a/src/core/CL/cl_kernels/arithmetic_op.cl +++ b/src/core/CL/cl_kernels/arithmetic_op.cl @@ -35,40 +35,46 @@ #define SUB(x, y) (x) - (y) #endif /* SATURATE */ -/** This function add two images. +/** This function adds two tensors. * * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16/F16/F32 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/QS8/QS16/S16/F16/F32 + * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8/QS8 (only if @p in1_ptr is QS8), QS16 (only if @p in1_ptr is QS16), S16/F16/F32 - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8/QS8 (only if @p in1_ptr is QS8), QS16 (only if @p in1_ptr is QS16), S16/F16/F32 + * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8 (only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32 + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor */ __kernel void arithmetic_add( - IMAGE_DECLARATION(in1), - IMAGE_DECLARATION(in2), - IMAGE_DECLARATION(out)) + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) { // Get pixels pointer - Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); - Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); - Image out = CONVERT_TO_IMAGE_STRUCT(out); + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); // Load values VEC_DATA_TYPE(DATA_TYPE_OUT, 16) @@ -80,40 +86,46 @@ __kernel void arithmetic_add( vstore16(ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr); } -/** This function subtracts one image from another. +/** This function subtracts one tensors from another. * * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8, S16 + * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16 - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8, S16 + * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8, S16 + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor */ __kernel void arithmetic_sub( - IMAGE_DECLARATION(in1), - IMAGE_DECLARATION(in2), - IMAGE_DECLARATION(out)) + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) { // Get pixels pointer - Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); - Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); - Image out = CONVERT_TO_IMAGE_STRUCT(out); + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); // Load values VEC_DATA_TYPE(DATA_TYPE_OUT, 16) -- cgit v1.2.1